In [22]:
import pandas as pd
import time
import spacy
from spacy.matcher import Matcher

In [2]:
# Load the Stack Overflow Questions data set, keeping only the Id and Title columns.
# NOTE(review): absolute local path; adjust it to wherever the CSV lives on your machine.
QUESTIONS_CSV = r'D:\Datasets\Stack Overflow\Questions.csv'
data = pd.read_csv(
    QUESTIONS_CSV,
    usecols=['Title', 'Id'],
    encoding='iSO-8859-1',
)
data.head()

Unnamed: 0,Id,Title
0,80,SQLStatement.execute() - multiple queries in o...
1,90,Good branching and merging tutorials for Torto...
2,120,ASP.NET Site Maps
3,180,Function for creating color wheels
4,260,Adding scripting functionality to .NET applica...


In [5]:
# Load the 'en_core_web_sm' spaCy pipeline; `nlp` is called on raw text in the cells
# below to produce the tokenized documents that has_lang and the matcher work on.
nlp = spacy.load('en_core_web_sm')

#### Unlike our last NLP-based model, where we detected only the Go language, this one goes a step further and enhances the model so that it can detect all of the languages, not just Go.

In [14]:
# Naive token-level check: does any single token equal one of the known language names?
def has_lang(doc):
    """Return True if any token in `doc` is one of the listed language names.

    Each token's lowercased text (`token.lower_`) is compared on its own, so a
    name is only found when it appears as exactly one token.

    Args:
        doc: an iterable of tokens exposing a `lower_` attribute.

    Returns:
        True if at least one token matches, otherwise False.
    """
    language_names = ('go', 'golang', 'python', 'ruby', 'objective-c')
    return any(token.lower_ in language_names for token in doc)

In [25]:
# Try has_lang on a sentence that mentions objective-c; `doc` is reused by the next cell.
doc = nlp('I develop apps in ios and use objective-c to do that.')
has_lang(doc)

False

In [9]:
# Show the individual tokens spaCy produced for the sentence.
list(doc)

[I, develop, apps, in, ios, and, use, objective, -, c, to, do, that, .]

#### As you can see, the function does not return True even though our sentence contains objective-c. This is because the spaCy model splits it into three separate tokens: `objective`, `-` and `c`. To counter this problem we use spaCy's Matcher. We create patterns with specific instructions to find the languages that would otherwise remain unrecognized because of such tokenization constraints.

#### https://spacy.io/usage/rule-based-matching

In [13]:
# Each dict below describes exactly one token of the text to be matched.

# "objective-c" is tokenized as `objective`, `-`, `c`; the punctuation token is optional.
pattern_obj_c1 = [
    {'LOWER': 'objective'},
    {'IS_PUNCT': True, 'OP': '?'},
    {'LOWER': 'c'},
]

# Go: either the token "golang", or the token "go" when it is not tagged as a verb.
golang_pattern1 = [{'LOWER': 'golang'}]
golang_pattern2 = [
    {'LOWER': 'go', 'POS': {'NOT_IN': ['VERB']}},
]

# Single-token names, compared against the lowercased token text.
python_pattern = [{'LOWER': 'python'}]
ruby_pattern = [{'LOWER': 'ruby'}]
js_pattern = [{'LOWER': {'IN': ['js', 'javascript']}}]

### When writing patterns, keep in mind that each dictionary represents one token. If spaCy’s tokenization doesn’t match the tokens defined in a pattern, the pattern is not going to produce any results. When developing complex patterns, make sure to check examples against spaCy’s tokenization:

In [34]:
# Initialize a Matcher on the pipeline's vocabulary and register one rule per language,
# using the token patterns defined above. validate=True asks spaCy to validate each
# pattern as it is added.
# NOTE(review): add(key, on_match, *patterns) is the spaCy v2 call form (None = no
# callback); spaCy v3 expects add(key, [patterns]). Adjust if the environment is upgraded.
matcher = Matcher(nlp.vocab, validate=True)
matcher.add('OBJ_C_LANG', None, pattern_obj_c1)
matcher.add("PYTHON_LANG", None, python_pattern)
matcher.add("GO_LANG", None, golang_pattern1, golang_pattern2)
matcher.add("JS_LANG", None, js_pattern)
matcher.add("RUBY_LANG", None, ruby_pattern)

In [28]:
# Run the matcher on a sentence that mentions several languages and print each matched span.
doc = nlp("I develop apps in ios and use objective-c,golang/go and python to do that.")
matches = matcher(doc)
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span)

objective-c
golang
python


## Tuning the model 

In [32]:
# Lazily yield every question title containing "python" (case-insensitive substring test),
# so we can check whether the matcher captures all of them.
titles = (title for title in data['Title'] if 'python' in title.lower())

In [37]:
# Print up to 200 python-related titles that none of the registered patterns match.
# zip() against range(200) caps the sample and simply stops if `titles` runs out first,
# instead of letting next() raise StopIteration. The generator is single-use, so it may
# already be partly consumed when this cell is re-run.
for _count, title in zip(range(200), titles):
    doc = nlp(title)
    if not matcher(doc):
        print(doc)

how to integrate ZSH and (i)python?
wxpython compilation
wxPython: how to search for text in a TextCtrl?
wxPython SplitterWindow does not expand within a Panel
Is it possible to go into ipython from code?
Need assistance with wxPython (newbie)
Python- about file-handle limits on OS
How to install a module as an egg under IronPython?
wxPython: Changing the color scheme of a wx.stc.StyledTextCtrl
How can I use a VB6 COM 'reference' in IronPython?
How to use Staticgenerator with Django + Apache + mod_python
wxPython: Items in BoxSizer don't expand horizontally, only vertically
mod_python problem?
Running doctests through iPython and pseudo-consoles
How can I impersonate the current user with IronPython?
Set Max Width for Frame with ScrolledWindow in wxPython
Edit python31 file and it opens notepad and starts python26
Potential Memory Leak in my wxPython App
Pythonic way to "flatten" object hierarchy to nested dicts?


In [None]:
# NOTE(review): unfinished draft. The regex is still empty and this pattern is not
# registered with the matcher in any visible cell. TODO: fill in the expression
# (presumably to catch the python variants printed above, e.g. wxPython / IronPython; confirm).
python_pattern2 = [{'TEXT': {'REGEX':''}}]