# NLP Cheat Sheet - Python, spaCy - Starter Kit - Nomenclature


Stack
- spacy
- Python3

Installation:
```shell
pip install spacy
python -m spacy download en_core_web_lg
```

In [15]:
import spacy

In [None]:
# Load the large English model (the package downloaded with
# `python -m spacy download en_core_web_lg`).
nlp = spacy.load("en_core_web_lg")
# Alternative: load the small English model instead. Results are less accurate.
# nlp = spacy.load("en_core_web_sm")

# Tokenization

Segmenting text into words, punctuation etc.

In [39]:
# Run the pipeline on one sentence and list the text of each resulting token.
doc = nlp("Larry Page founded Google in early 1990.")
[tok.text for tok in doc]

['Larry', 'Page', 'founded', 'Google', 'in', 'early', '1990', '.']

# Lemmatization

Assigning the base forms of words, for example: 
- "was" → "be"
- "rats" → "rat"

In [41]:
doc = nlp("Was Google founded in early 1990?")
# Pair every token's verbatim text with its lemma (base form).
# A Doc is directly iterable, so no intermediate token list is needed.
[(token.text, token.lemma_) for token in doc]

[('Was', 'be'),
 ('Google', 'Google'),
 ('founded', 'found'),
 ('in', 'in'),
 ('early', 'early'),
 ('1990', '1990'),
 ('?', '?')]

# Spans
A span is a slice of a document. For example, `doc[2:4]` is a span starting at token 2 and running up to – but not including! – token 4.

In [42]:
doc = nlp("Larry Page founded Google in early 1990.")
# Slice tokens 2 and 3 out of the document; the end index 4 is excluded.
span = doc[2:4]
# Show the text covered by the span.
span.text

'founded Google'

# Sentence Detection

Finding and segmenting individual sentences.

In [43]:
# Parse a two-sentence text and list the text of each detected sentence.
doc = nlp("Larry Page founded Google in early 1990. Sergey Brin joined.")
[sentence.text for sentence in doc.sents]

['Larry Page founded Google in early 1990.', 'Sergey Brin joined.']

# Part-of-speech (POS) Tagging

Assigning word types, such as verb or noun, to tokens.


In [34]:
doc = nlp("We are reading a text.")
# For each token: its text, its coarse-grained part-of-speech tag (pos_),
# and spaCy's human-readable description of that tag.
# Iterate the Doc directly instead of building a throwaway token list first.
[(token.text, token.pos_, spacy.explain(token.pos_)) for token in doc]

[('We', 'PRON', 'pronoun'),
 ('are', 'VERB', 'verb'),
 ('reading', 'VERB', 'verb'),
 ('a', 'DET', 'determiner'),
 ('text', 'NOUN', 'noun'),
 ('.', 'PUNCT', 'punctuation')]

In [35]:
# Same tokens as the previous cell (reuses `doc`), now with the fine-grained
# tag (tag_) and spaCy's description of it.
# Iterate the Doc directly instead of building a throwaway token list first.
[(token.text, token.tag_, spacy.explain(token.tag_)) for token in doc]

[('We', 'PRP', 'pronoun, personal'),
 ('are', 'VBP', 'verb, non-3rd person singular present'),
 ('reading', 'VBG', 'verb, gerund or present participle'),
 ('a', 'DT', 'determiner'),
 ('text', 'NN', 'noun, singular or mass'),
 ('.', '.', 'punctuation mark, sentence closer')]

# Dependency Parsing

Assigning syntactic dependency labels, describing the relations between individual tokens, like subject or object.

In [38]:
doc = nlp("We are reading a text.")
# Dependency label of each token, together with spaCy's description of it.
# spacy.explain may return None when it has no description for a label.
# Iterate the Doc directly instead of building a throwaway token list first.
[(token.text, token.dep_, spacy.explain(token.dep_)) for token in doc]

[('We', 'nsubj', 'nominal subject'),
 ('are', 'aux', 'auxiliary'),
 ('reading', 'ROOT', None),
 ('a', 'det', 'determiner'),
 ('text', 'dobj', 'direct object'),
 ('.', 'punct', 'punctuation')]

In [27]:
# Text of each token's syntactic head (its governor); reuses `doc` from the
# previous cell.
[tok.head.text for tok in doc]

['reading', 'reading', 'reading', 'text', 'reading', 'reading']

# Base noun phrases

Extracting flat phrases that have a noun as their head ("noun chunks").


In [47]:
# Collect the text of each base noun phrase found in the sentence.
doc = nlp("I have a red car")
[noun_chunk.text for noun_chunk in doc.noun_chunks]

['I', 'a red car']

# Named Entity Recognition (NER)

Labeling "real-world" objects, like persons, companies or locations.

In [48]:
doc = nlp("Larry Page founded Google in the US in early 1990.")
# One (text, label) pair per named-entity span in the document.
[(entity.text, entity.label_) for entity in doc.ents]

[('Larry Page', 'PERSON'),
 ('Google', 'ORG'),
 ('US', 'GPE'),
 ('early 1990', 'DATE')]

# Text Classification

Assigning categories or labels to a whole document, or parts of a document.

# Similarity
How similar are two documents, sentences, tokens or spans?

In [54]:
# Two short documents that differ only in their last word.
doc1 = nlp("I like cats")
doc2 = nlp("I like dogs")
# Compare the two whole documents; the result is a similarity score (float).
doc1.similarity(doc2)

0.957709143352323

In [59]:
# Token vs. token: doc1[2] is "cats", doc2[2] is "dogs"
# (reuses doc1 and doc2 from the previous cell).
doc1[2].similarity(doc2[2])

0.83117634

In [70]:
# Token vs. span: doc1[0] is "I", doc2[1:3] is the span "like dogs"
# (similarity also works across different object types).
doc1[0].similarity(doc2[1:3])

0.46475163

In [73]:
doc = nlp("I like cats")
# L2 norm of the vector for the whole document "I like cats"
doc.vector_norm

4.706799587675896

In [76]:
# L2 norm of the vector for the single token doc[2] ("cats");
# reuses `doc` from the previous cell.
doc[2].vector_norm

6.933004

In [77]:
# Vector representation of the token doc[2] ("cats"), displayed as an array;
# reuses `doc` from the earlier cell.
doc[2].vector

array([-0.26763  ,  0.029846 , -0.3437   , -0.54409  , -0.49919  ,
        0.15928  , -0.35278  , -0.2036   ,  0.23482  ,  1.5671   ,
       -0.36458  , -0.028713 , -0.27053  ,  0.2504   , -0.18126  ,
        0.13453  ,  0.25795  ,  0.93213  , -0.12841  , -0.18505  ,
       -0.57597  ,  0.18538  , -0.19147  , -0.38465  ,  0.21656  ,
       -0.4387   , -0.27846  , -0.41339  ,  0.37859  , -0.2199   ,
       -0.25907  , -0.019796 , -0.31885  ,  0.12921  ,  0.22168  ,
        0.32671  ,  0.46943  , -0.81922  , -0.20031  ,  0.013561 ,
       -0.14663  ,  0.14438  ,  0.0098044, -0.15439  ,  0.21146  ,
       -0.28409  , -0.4036   ,  0.45355  ,  0.12173  , -0.11516  ,
       -0.12235  , -0.096467 , -0.26991  ,  0.028776 , -0.11307  ,
        0.37219  , -0.054718 , -0.20297  , -0.23974  ,  0.86271  ,
        0.25602  , -0.3064   ,  0.014714 , -0.086497 , -0.079054 ,
       -0.33109  ,  0.54892  ,  0.20076  ,  0.28064  ,  0.037788 ,
        0.0076729, -0.0050123, -0.11619  , -0.23804  ,  0.3302

# Visualization

In [49]:
from spacy import displacy

In [50]:
doc = nlp("This is a sentence")
# Draw the document with displaCy's "dep" style (dependency visualization).
displacy.render(doc, style="dep")

In [52]:
doc = nlp("Larry Page founded Google in the US in early 1990.")
# Draw the document with displaCy's "ent" style (named-entity visualization).
displacy.render(doc, style="ent")

Inspired by: https://www.datacamp.com/community/blog/spacy-cheatsheet