## What will be learnt here...
- How to use spaCy's `Matcher` class to match patterns that span multiple tokens

In [1]:
import spacy
import pandas as pd
from spacy import displacy

In [2]:
# Load the small English spaCy pipeline; it supplies the tokenizer and the POS tags used below.
nlp = spacy.load("en_core_web_sm")

In [3]:
def has_lang(doc):
    """Tell whether ``doc`` mentions one of the known programming languages.

    A token counts as a mention when its lower-cased text is one of the
    known language names and its part-of-speech tag is not ``"VERB"``.

    Args:
        doc: An iterable of tokens exposing ``lower_`` and ``pos_``.

    Returns:
        True if at least one token qualifies, otherwise False.
    """
    known_langs = ('go', 'golang', 'python', 'ruby', 'objective-c')
    # `and` short-circuits, so the POS tag is only read for tokens whose text matched.
    return any(
        token.lower_ in known_langs and token.pos_ != "VERB"
        for token in doc
    )

In [4]:
doc = nlp("I have a problem in creating functions using Python")

In [5]:
has_lang(doc)

True

##### Now a sensible sentence in which "Python" has nothing to do with programming languages (`has_lang` cannot tell the difference)...

In [6]:
doc = nlp("Python is considered to be a dangerous animal!")

In [7]:
has_lang(doc)

True

##### The main issue occurs when we try to detect languages whose names contain punctuation (like objective-c)

In [8]:
doc = nlp("I like iOS development and i code in objective-c")

In [9]:
has_lang(doc)

False

In [10]:
# Show the individual tokens spaCy produced for the sentence.
list(doc)

[I, like, iOS, development, and, i, code, in, objective, -, c]

###### As we can see, objective-c is split into three separate tokens (objective, -, c), which is why the single-token check above cannot find it

In [33]:
# Note: pattern dictionaries like these can be obtained for a given use case
# from the demo section of spaCy's website.
#
# Each inner list is one pattern; each dict inside it describes one token.
patterns = [
    # "objective-c" / "objective c": the punctuation token in the middle is optional.
    [
        {'LOWER': 'objective'},
        {'IS_PUNCT': True, 'OP': '?'},
        {'LOWER': 'c'},
    ],
    # "go" / "golang", unless the token is tagged as a verb.
    [{'LOWER': {'IN': ['go', 'golang']}, 'POS': {'NOT_IN': ['VERB']}}],
    [{'LOWER': 'python'}],
    [{'LOWER': 'ruby'}],
    [{'LOWER': {'IN': ['js', 'javascript']}}],
]

In [34]:
from spacy.matcher import Matcher

In [35]:
# Build a Matcher that shares the loaded pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [36]:
# Register every language pattern under the single key "langs".
matcher.add("langs", patterns=patterns)

In [37]:
patterns

[[{'LOWER': 'objective'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'c'}],
 [{'LOWER': {'IN': ['go', 'golang']}, 'POS': {'NOT_IN': ['VERB']}}],
 [{'LOWER': 'python'}],
 [{'LOWER': 'ruby'}],
 [{'LOWER': {'IN': ['js', 'javascript']}}]]

In [38]:
# Run the matcher on a sample sentence; each result is a (match_id, start, end)
# tuple, where start/end are token indices into `doc`.
doc = nlp('I like coding in golang and objective-c too')
matcher(doc)

[(13024456332050000378, 4, 5), (13024456332050000378, 6, 9)]

In [39]:
# Print the text of each matched span (start/end are token indices into `doc`).
for match_id, start, end in matcher(doc):
    print(doc[start:end])

golang
objective-c


In [40]:
# Exercise every pattern in one sentence.
# NOTE(review): the recorded output has no match for the bare `go` token —
# presumably the tagger labelled it VERB, which the pattern excludes; confirm.
doc = nlp('JS javascript python ruby golang go and objective-c are good languages')
matcher(doc)

[(13024456332050000378, 0, 1),
 (13024456332050000378, 1, 2),
 (13024456332050000378, 2, 3),
 (13024456332050000378, 3, 4),
 (13024456332050000378, 4, 5),
 (13024456332050000378, 7, 10)]

In [41]:
# Print the text of each matched span (start/end are token indices into `doc`).
for match_id, start, end in matcher(doc):
    print(doc[start:end])

JS
javascript
python
ruby
golang
objective-c


#### Let's check the matcher against our dataset

In [42]:
# pandas is already imported as `pd` in the imports cell at the top of the notebook,
# so it is not re-imported here.
# Read at most the first 1,000,000 rows, keeping only the Title and Id columns.
# NOTE(review): ISO-8859-1 is presumably the encoding the CSV was exported in — confirm.
df = pd.read_csv("Questions.csv", nrows=1_000_000,
                 encoding="ISO-8859-1", usecols=['Title', 'Id'])

In [45]:
# Lazily yield the titles that mention "python" (case-insensitive).
# pandas reads a missing title as NaN (a float), which has no .lower(), so
# non-string values are skipped instead of raising AttributeError mid-iteration.
titles = (title for title in df['Title']
          if isinstance(title, str) and "python" in title.lower())

In [46]:
# Print the titles, among the next 200 python-related ones, that the matcher misses.
# zip() against range(200) caps the sample and ends the loop cleanly if `titles`
# runs out first, where a bare next(titles) would raise StopIteration.
# `titles` is a generator, so re-running this cell continues from where the
# previous run stopped rather than starting over.
for _sample_no, title in zip(range(200), titles):
    doc = nlp(title)
    # An empty match list means no language pattern fired for this title.
    if not matcher(doc):
        print(doc)

mod_python/MySQL error on INSERT with a lot of data: "OperationalError: (2006, 'MySQL server has gone away')"
Running subversion under apache and mod_python
What's the best way to embed IronPython inside my C# App?
How to set the PYTHONPATH in Emacs?
wxPython wxDC object from win32gui.GetDC
Need skeleton code to call Excel VBA from PythonWin
Questions for python->scheme conversion
wxPython and sharing objects between windows
Django on IronPython
IronPython Webframework
A SuggestBox for wxPython?
Intercepting Method Access on the Host Program of IronPython
Is there anything like IPython / IRB for Perl?
