In [2]:
import pandas as pd
import time
import spacy

In [3]:
# We are going to make use of the stack over flow Question data set
data = pd.read_csv(r'D:\Datasets\Stack Overflow\Questions.csv',encoding='iSO-8859-1',usecols=['Title','Id'])
data.head()

Unnamed: 0,Id,Title
0,80,SQLStatement.execute() - multiple queries in o...
1,90,Good branching and merging tutorials for Torto...
2,120,ASP.NET Site Maps
3,180,Function for creating color wheels
4,260,Adding scripting functionality to .NET applica...


In [4]:
#Now we need only those rows which contain questions on go language
titles = [_ for _ in data['Title']]

def has_go_lang(text):
    return 'go' in text

g = (title for title in data.loc[lambda df: df['Title'].str.lower().str.contains("go")]['Title'])
[next(g) for i in range(10)]     

['Good branching and merging tutorials for TortoiseSVN?',
 'Good STL-like library for C',
 'My website got hacked... What should I do?',
 "DVCS Choices - What's good for Windows?",
 'Is a "Confirm Email" input good practice when user changes email address?',
 'Any good advice on using emacs for C++ project?',
 'What is a good way to denormalize a mysql database?',
 'Is AnkhSVN any good?',
 'Arguments for going open source',
 'Does Hostmonster support Django']

#### Now since this is not so effective to search for text. We have to build a better mechanism for the searching process. For that we have to make use of SPACY's inherent functions which tells about the liguistic structure of a sentence. 

In [5]:
nlp = spacy.load('en_core_web_sm',disable=['ner'])

In [6]:
[[t,t.pos_,t.dep_] for t in nlp('What is a good way to denormalize a mysql database?')]

[[What, 'PRON', 'nsubj'],
 [is, 'AUX', 'ROOT'],
 [a, 'DET', 'det'],
 [good, 'ADJ', 'amod'],
 [way, 'NOUN', 'attr'],
 [to, 'PART', 'aux'],
 [denormalize, 'VERB', 'relcl'],
 [a, 'DET', 'det'],
 [mysql, 'NOUN', 'compound'],
 [database, 'NOUN', 'dobj'],
 [?, 'PUNCT', 'punct']]

In [7]:
#There is another way to see the gramatical structure of a sentence
spacy.displacy.render(nlp('What is a good way to denormalize a mysql database?'))

#### We can make use of this gramatical notions in our logic to filter the words which has the string 'Go' in it because  we want to have those Questions where 'Go' is been used in the context of the programming language

In [8]:
spacy.displacy.render(nlp('How do I allocate memory for an array in the go programming language?'))

In [9]:
spacy.explain('amod')

'adjectival modifier'

#### As we see that go is used as Noun here in the context of the programming language and it acts a adjectival modifier to language 

In [10]:
for t in nlp("Where does Console.WriteLine go in ASP.NET?"):
    print(t, t.pos_, t.dep_)

Where ADV advmod
does AUX ROOT
Console PROPN nsubj
. PUNCT punct
WriteLine PROPN nsubj
go VERB ROOT
in ADP prep
ASP.NET PROPN pobj
? PUNCT punct


In [12]:
titles = [_ for _ in data.loc[lambda df : df['Title'].str.lower().str.contains('go')]['Title']]

In [13]:
titles

['Good branching and merging tutorials for TortoiseSVN?',
 'Good STL-like library for C',
 'My website got hacked... What should I do?',
 "DVCS Choices - What's good for Windows?",
 'Is a "Confirm Email" input good practice when user changes email address?',
 'Any good advice on using emacs for C++ project?',
 'What is a good way to denormalize a mysql database?',
 'Is AnkhSVN any good?',
 'Arguments for going open source',
 'Does Hostmonster support Django',
 "What's a good way to check if two datetimes are on the same calendar day in TSQL?",
 'Good strategy for leaving an audit trail/change history for DB applications?',
 'Factorial Algorithms in different languages',
 'What is a good dvd burning component for Windows or .Net?',
 'Best .NET Wrapper for Google Maps or Yahoo Maps?',
 'Is there a best .NET algorithm for credit card encryption?',
 'How to generate urls in django',
 'Suggest some good MVC framework in perl',
 'Is there a good Fogbugz client for Mac OS X?',
 'What are some

#### Instead of passing text we are using the pipe function of spacy to send doc object into the object

In [20]:
%%time
def has_golang(doc):
    for t in doc:
        if t.lower_ in ['go','golang']:
            if t.pos_ == 'NOUN':
                return True
    return False

g = (doc for doc in nlp.pipe(titles) if has_golang(doc))
[next(g) for i in range(30)]

Wall time: 12.3 s


[Deploying multiple Java web apps to Glassfish in one go,
 Removing all event handlers in one go,
 Paypal integration to serve multiple sellers in one go for a shopping site,
 How do I disable multiple listboxes in one go using jQuery?,
 multi package makefile example for go,
 Google's 'go' and scope/functions,
 Where is App.config go after publishing?,
 SOAPUI & Groovy Scripts, executing multiple SQL statements in one go,
 What's the simplest way to edit conflicted files in one go when using git and an editor like Vim or textmate?,
 Import large chunk of data into Google App Engine Data Store at one go,
 Saving all nested form objects in one go,
 what's the state of go language IDE support?,
 Decrypt many PDFs in one go using pdftk,
 How do I allocate memory for an array in the go programming language?,
 Is message passing via channels in go guaranteed to be non-blocking?,
 The maximum value for an int type in Go,
 Is there a reason why arrays in memory 'go' down while the function st

#### Writing a full fledged function right now to reduce false negetives

In [21]:
tags_df = pd.read_csv(r'D:\Datasets\Stack Overflow\Tags.csv')

In [22]:
tags_df.head()

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [24]:
go_ids = [_ for _ in tags_df.loc[lambda d: d['Tag'] =='go']['Id']]

In [26]:
go_ids[1:5]

[1726130, 1727250, 1757090, 1766720]

In [29]:
def has_go_token(doc):
    for t in doc:
        if t.lower_ in ['go','golang']:
            if t.pos_ != 'VERB':
                return True
    return False

#Getting all the sentences whose Ids are present in the list 'go_ids'
all_go_sentences = data.loc[lambda df: df['Id'].isin(go_ids)]['Title']
detectable  = [d.text for d in nlp.pipe(all_go_sentences) if has_go_token(d)]

In [30]:
all_go_sentences

34143                                Go language benchmarks?
34175      Go code contribution: license and patent impli...
34207                 Embedding instead of inheritance in Go
34925                                  Shared library in Go?
35167                  multi package makefile example for go
                                 ...                        
1262867    How to connect to Microsoft SQL server using G...
1262941            Golang Pointer and Struct member function
1263284                  Where is the memory allocated for p
1264011    MongoDB is not marshaling a value before stori...
1264162    Is it idiomatic in go to handle all returned e...
Name: Title, Length: 1858, dtype: object

In [31]:
detectable

['Embedding instead of inheritance in Go',
 'Shared library in Go?',
 'multi package makefile example for go',
 "What's the point of having pointers in Go?",
 'Simulate a tcp connection in Go',
 'Trouble reading from a socket in go',
 "Google's 'go' and scope/functions",
 'Convert string to integer type in Go?',
 'Install Google GO Language',
 'Implementing the â\x80\x98deferâ\x80\x99 statement from Go in Objective-C?',
 "what's the state of go language IDE support?",
 'Generating Random Numbers in Go',
 'making generic algorithms in go',
 'How do I allocate memory for an array in the go programming language?',
 'In Go, one type is coerced into another, can a method to determine the type of the receiver?',
 'Is message passing via channels in go guaranteed to be non-blocking?',
 'The maximum value for an int type in Go',
 'Do Sets exist in Go? (like in Python)',
 'Usage of interface in Go',
 'Google Go: Why does the http server package not serve more than 5 simultaneous requests?',
 'H

In [35]:
for t in nlp('The maximum value for an int type in Go'):
    print(t,t.pos_,t.dep_)

The DET det
maximum ADJ amod
value NOUN ROOT
for ADP prep
an DET det
int NOUN compound
type NOUN pobj
in ADP prep
Go NOUN pobj


In [37]:
spacy.displacy.render(nlp('Convert string to integer type in Go?'))

In [34]:
non_detectable = (data.loc[lambda d: ~d['Id'].isin(go_ids)]
                  .loc[lambda d: d['Title'].str.lower().str.contains('go')]
                  ['Title'].tolist())
non_detectable = [d.text for d in nlp.pipe(non_detectable) if has_go_token(d)]

In [33]:
non_detectable

['Good branching and merging tutorials for TortoiseSVN?',
 'Good STL-like library for C',
 'My website got hacked... What should I do?',
 "DVCS Choices - What's good for Windows?",
 'Is a "Confirm Email" input good practice when user changes email address?',
 'Any good advice on using emacs for C++ project?',
 'What is a good way to denormalize a mysql database?',
 'Is AnkhSVN any good?',
 'Arguments for going open source',
 'Does Hostmonster support Django',
 "What's a good way to check if two datetimes are on the same calendar day in TSQL?",
 'Good strategy for leaving an audit trail/change history for DB applications?',
 'Factorial Algorithms in different languages',
 'What is a good dvd burning component for Windows or .Net?',
 'Best .NET Wrapper for Google Maps or Yahoo Maps?',
 'Is there a best .NET algorithm for credit card encryption?',
 'How to generate urls in django',
 'Suggest some good MVC framework in perl',
 'Is there a good Fogbugz client for Mac OS X?',
 'What are some

In [43]:
model_name = "en_core_web_sm"
method = 'not-verb'

correct = sum(has_go_token(doc) for doc in nlp.pipe(detectable))
wrong = sum(has_go_token(doc) for doc in nlp.pipe(non_detectable))
precision = correct / (correct+wrong)
recall = correct/len(detectable)
accuracy = (correct + len(non_detectable) - wrong)/(len(detectable) + len(non_detectable))

f"{precision},{recall},{accuracy},{model_name},{method}" # this is logged

'0.8910984848484849,1.0,0.8910984848484849,en_core_web_sm,not-verb'