In [1]:
import spacy

In [2]:
# Load spaCy's small English model; `nlp` is then called on raw text in the cells below.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Run one sample sentence through the pipeline; its tokens are inspected below.
doc = nlp('Tesla is looking at buying an U.S. startup for $6 million')

In [4]:
# Show the surface text of every token, one per line.
for tok in doc:
    print(tok.text)

Tesla
is
looking
at
buying
an
U.S.
startup
for
$
6
million


In [5]:
# Per token: text, part-of-speech tag and dependency label.
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_)

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
an DET det
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [6]:
# Inspect the loaded pipeline as (name, component) pairs.
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x7fc47bc82eb8>),
 ('parser', <spacy.pipeline.DependencyParser at 0x7fc47b9f8f10>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x7fc47b9f8f68>)]

In [7]:
# Only the names of the pipeline components.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [8]:
# Second sample sentence, this one containing a contraction ("isn't").
doc2 = nlp("Tesla isn't looking into startups anymore.")

In [9]:
# Text, part-of-speech tag and dependency label of every token in doc2.
for tok in doc2:
    print(tok.text, tok.pos_, tok.dep_)

Tesla PROPN nsubj
is VERB aux
n't ADV neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [10]:
# Tokens can be accessed by index; this is the part-of-speech tag of the first one.
doc2[0].pos_

'PROPN'

In [11]:
# Per token: the is_alpha flag followed by the is_stop (stop word) flag.
for tok in doc:
    print(tok.is_alpha, tok.is_stop)

True False
True True
True False
True True
True False
True True
False False
True False
True True
False False
False False
True False


In [12]:
# Slice of the doc that covers only its first token.
doc[0:1]

Tesla

In [35]:
# Multi-sentence sample text. It is written as adjacent string literals (one per
# sentence) that Python concatenates into a single string before the call.
para = nlp(
    "The political career of John C. Breckinridge included service in the governments of Kentucky, the United States, and the Confederate States of America. "
    "Breckinridge (January 16, 1821 – May 17, 1875) was inaugurated in 1857 as James Buchanan's vice president, and remains the youngest person to ever hold the office. "
    "In 1860 he ran as the presidential candidate of a dissident group of Southern Democrats and won the electoral votes of most of the Southern states, but he finished a distant second among four candidates, losing the election to the Republican candidate, Abraham Lincoln. "
    "Most Southern states seceded, but Kentucky stayed in the Union. "
    "Previously elected to a U.S. Senate term that began in 1861, Breckenridge fled the state, joined the Confederate States Army, and was expelled from the Senate. "
    "Confederate President Jefferson Davis appointed him Secretary of War in February 1865. "
    "I am adding this line for code-testing purposes, or Code-testing or code testing purposes."
)

In [36]:
# Print each sentence span the pipeline found in the paragraph.
for sentence in para.sents:
    print(sentence)

The political career of John C. Breckinridge included service in the governments of Kentucky, the United States, and the Confederate States of America.
Breckinridge
(January 16, 1821 –
May 17, 1875) was inaugurated in 1857 as James Buchanan's vice president, and remains the youngest person to ever hold the office.
In 1860 he ran as the presidential candidate of a dissident group of Southern Democrats and won the electoral votes of most of the Southern states, but he finished a distant second among four candidates, losing the election to the Republican candidate, Abraham Lincoln.
Most Southern states seceded, but Kentucky stayed in the Union.
Previously elected to a U.S. Senate term that began in 1861, Breckenridge fled the state, joined the Confederate States Army, and was expelled from the Senate.
Confederate President Jefferson Davis appointed him Secretary of War in February 1865.
I am adding this line for code-testing purposes, or Code-testing or code testing purposes.


In [37]:
# List every named entity followed by its label.
for ent in para.ents:
    print(f"{ent} {ent.label_}")

John C. Breckinridge PERSON
Kentucky GPE
the United States GPE
the Confederate States of America GPE
January 16, 1821 – DATE
1875 DATE
1857 DATE
James Buchanan's PERSON
1860 DATE
Southern Democrats NORP
Southern NORP
second ORDINAL
four CARDINAL
Republican NORP
Abraham Lincoln PERSON
Southern NORP
Kentucky GPE
U.S. Senate ORG
1861 DATE
Breckenridge ORG
the Confederate States Army ORG
Senate ORG
Jefferson Davis PERSON
War ORG
February 1865 DATE


In [38]:
# For every entity: its text, its label, and spaCy's description of that label,
# each on its own line and separated from the next entity by blank lines.
for ent in para.ents:
    print(f"{ent} \n{ent.label_} \n{spacy.explain(ent.label_)}\n\n")

John C. Breckinridge 
PERSON 
People, including fictional


Kentucky 
GPE 
Countries, cities, states


the United States 
GPE 
Countries, cities, states


the Confederate States of America 
GPE 
Countries, cities, states


January 16, 1821 – 
DATE 
Absolute or relative dates or periods


1875 
DATE 
Absolute or relative dates or periods


1857 
DATE 
Absolute or relative dates or periods


James Buchanan's 
PERSON 
People, including fictional


1860 
DATE 
Absolute or relative dates or periods


Southern Democrats 
NORP 
Nationalities or religious or political groups


Southern 
NORP 
Nationalities or religious or political groups


second 
ORDINAL 
"first", "second", etc.


four 
CARDINAL 
Numerals that do not fall under another type


Republican 
NORP 
Nationalities or religious or political groups


Abraham Lincoln 
PERSON 
People, including fictional


Southern 
NORP 
Nationalities or religious or political groups


Kentucky 
GPE 
Countries, cities, states


U.S. Senate 
ORG 
Compa

In [39]:
# Print the noun chunks of the paragraph, one per line.
for chunk in para.noun_chunks:
    print(chunk)

The political career
John C. Breckinridge
service
the governments
Kentucky
the United States
the Confederate States
America
Breckinridge
(January
May
James Buchanan's vice president
the youngest person
the office
he
the presidential candidate
a dissident group
Southern Democrats
the electoral votes
the Southern states
he
four candidates
the election
the Republican candidate
Abraham Lincoln
Most Southern states
Kentucky
the Union
a U.S. Senate term
Breckenridge
the state
the Confederate States Army
the Senate
Confederate President Jefferson Davis
him
War
February
I
this line
code-testing purposes
Code-testing or code testing purposes


In [40]:
# Visualization: render the parse and the entities with displaCy

from spacy import displacy

In [41]:
# Render the dependency parse for only the first ten tokens (indices 0-9) so the
# diagram stays short; a `distance` option of 110 is passed to displaCy.
displacy.render(para[0:10], style='dep', jupyter=True, options={'distance': 110})

In [42]:
# Render the whole paragraph with the entity visualizer (style='ent').
displacy.render(para, style='ent', jupyter=True)

In [43]:
# Lemmatization

In [44]:
# Lemmatization: list only the tokens whose lowercased text differs from the lemma,
# as two left-aligned columns (text padded to 20 characters, lemma to 10).
for tok in para:
    if tok.text.lower() == tok.lemma_:
        continue
    print(f'{tok.text:20} {tok.lemma_:10}')

included             include   
governments          government
was                  be        
inaugurated          inaugurate
remains              remain    
youngest             young     
he                   -PRON-    
ran                  run       
won                  win       
votes                vote      
states               state     
he                   -PRON-    
finished             finish    
candidates           candidate 
losing               lose      
states               state     
seceded              secede    
stayed               stay      
elected              elect     
began                begin     
fled                 flee      
joined               join      
was                  be        
expelled             expel     
appointed            appoint   
him                  -PRON-    
I                    -PRON-    
am                   be        
adding               add       
purposes             purpose   
purposes             purpose   


In [45]:
# Show the default stop-word collection of the loaded language (printed as a set).
print(nlp.Defaults.stop_words)

{'than', 'themselves', 'moreover', 'further', 'enough', 'take', 'below', 'empty', 'many', 'ca', 'alone', 'among', 'whether', 'perhaps', 'anyway', 'next', 'very', 'once', 'forty', 'beside', 'become', 'keep', 'within', 'almost', 'a', 'most', 'regarding', 'twelve', 'someone', 'whoever', 'for', 'ever', 'hence', 'up', 'yours', 'wherein', 'her', 'across', 'she', 'sometimes', 'whom', 'each', 'it', 'hereafter', 'six', 'seems', 'third', 'latterly', 'fifteen', 'his', 'only', 'here', 'therein', 'elsewhere', 'thereafter', 'via', 'everyone', 'part', 'from', 'top', 'i', 'something', 'all', 'without', 'was', 'are', 'get', 'due', 'herein', 'me', 'whereafter', 'does', 'two', 'cannot', 'call', 'so', 'latter', 'quite', 'thru', 'down', 'others', 'those', 'four', 'since', 'which', 'also', 'may', 'back', 'put', 'there', 'never', 'except', 'while', 'between', 'into', 'that', 'already', 'have', 'now', 'them', 'doing', 'what', 'been', 'wherever', 'how', 'amongst', 'anywhere', 'upon', 'their', 'three', 'against

In [46]:
# Number of default stop words.
len(nlp.Defaults.stop_words)

305

In [47]:
# Look up the vocabulary entry (a Lexeme) for the word 'hello'.
nlp.vocab['hello']

<spacy.lexeme.Lexeme at 0x7fc479e14708>

In [48]:
# Stop-word flag on the vocabulary entry for 'is'.
nlp.vocab['is'].is_stop

True

In [49]:
# Stop-word flag on the vocabulary entry for 'hello'.
nlp.vocab['hello'].is_stop

False

In [50]:
from spacy.matcher import Matcher

In [51]:
# Token-pattern matcher built on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [54]:
# One token whose lowercase form is 'codetesting'.
pattern1 = [{'LOWER': 'codetesting'}]

In [55]:
# Three tokens: 'code', then a punctuation token, then 'testing' (e.g. "code-testing").
pattern2 = [{'LOWER': 'code'}, {'IS_PUNCT': True}, {'LOWER': 'testing'}]

In [56]:
# Two adjacent tokens: 'code' followed by 'testing' (case-insensitive via LOWER).
pattern3 = [{'LOWER': 'code'}, {'LOWER': 'testing'}]

In [68]:
# Register all three patterns under the single key 'CodeTesting'. None is passed in
# the second position (the on-match callback slot of the spaCy 2.x Matcher.add
# signature), i.e. no callback is registered.
matcher.add('CodeTesting', None, pattern1, pattern2, pattern3)

In [69]:
# Run the matcher over the paragraph and keep the resulting matches.
found_matches = matcher(para)

In [70]:
# Raw matches: a list of (match id, start token index, end token index) tuples.
print(found_matches)

[(11500672109852549154, 169, 172), (11500672109852549154, 175, 178), (11500672109852549154, 179, 181)]


In [72]:
# Report the matched text and its token offsets for every match; start/end are
# token indices into `para`, so slicing with them recovers the matched span.
for match_id, start, end in found_matches:
    span = para[start:end]
    print(span.text, " starts at ", start, " and ends at ", end)

code-testing  starts at  169  and ends at  172
Code-testing  starts at  175  and ends at  178
code testing  starts at  179  and ends at  181
