# <span style="color:#0072ce;">spaCy</span> for Natural Language Processing

In [1]:
import spacy

In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 29.7 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


You should consider upgrading via the 'd:\Code\NLTK_Spacy\env\Scripts\python.exe -m pip install --upgrade pip' command.


In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
print(type(nlp))

<class 'spacy.lang.en.English'>


## Tokenization

In [7]:
sentence = "I don't like implementing neural nets in Tensorflow."
doc = nlp(sentence)
print(type(doc))

<class 'spacy.tokens.doc.Doc'>


When we pass our senetence to our language instance nlp, it returns a **Doc** container object. A **Doc** is a sequence of **Token** objects.

In [8]:
print([token.text for token in doc])

['I', 'do', "n't", 'like', 'implementing', 'neural', 'nets', 'in', 'Tensorflow', '.']


In [9]:
print([(token.text, token.i) for token in doc])

[('I', 0), ('do', 1), ("n't", 2), ('like', 3), ('implementing', 4), ('neural', 5), ('nets', 6), ('in', 7), ('Tensorflow', 8), ('.', 9)]


In [10]:
print(doc[-2])
print(type(doc[-2]))

Tensorflow
<class 'spacy.tokens.token.Token'>


In [11]:
print(doc[4:7])
print(type(doc[4:7]))

implementing neural nets
<class 'spacy.tokens.span.Span'>


In [12]:
print(doc.text)

I don't like implementing neural nets in Tensorflow.


In [13]:
karpathy_tweet = """I've been using PyTorch a few months now and I've never felt better.\
I have more energy. My skin is clearer. My eye sight has improved."""

doc = nlp(karpathy_tweet)

[sentence for sentence in doc.sents]

[I've been using PyTorch a few months now and I've never felt better.,
 I have more energy.,
 My skin is clearer.,
 My eye sight has improved.]

You can also use **NLTK** for tokenization if you like

In [14]:
import nltk
# nltk.download('punkt')

from nltk.tokenize import word_tokenize

sentence = "I don't like implementing neural nets in Tensorflow."

print(word_tokenize(sentence))

['I', 'do', "n't", 'like', 'implementing', 'neural', 'nets', 'in', 'Tensorflow', '.']


## Case-Folding

In [15]:
sentence = "He told Dr. Lovato that he was done with the tests and would post the results shortly."
doc = nlp(sentence)

In [16]:
print([token.lower_ for token in doc])

['he', 'told', 'dr.', 'lovato', 'that', 'he', 'was', 'done', 'with', 'the', 'tests', 'and', 'would', 'post', 'the', 'results', 'shortly', '.']


In [17]:
print([token.lower_ if not token.is_sent_start else token for token in doc])

[He, 'told', 'dr.', 'lovato', 'that', 'he', 'was', 'done', 'with', 'the', 'tests', 'and', 'would', 'post', 'the', 'results', 'shortly', '.']


## Stop Word Removal

In [18]:
print(nlp.Defaults.stop_words)

{'everyone', 'to', 'all', 'very', '’s', 'yourselves', 'i', 'during', '‘ll', 'yourself', 'whenever', 'here', 'do', 'she', 'each', 'while', 'only', 'top', 'same', 'as', 'about', 'through', 'formerly', 'via', 'hereafter', 'now', 'seem', 'yet', '‘re', 'eight', 'may', 'him', 'must', 'across', 'give', 'it', 'above', 'a', 'our', 'made', 'cannot', 'within', "'m", 'namely', 'put', 'throughout', 'these', 'how', 'my', 'have', 'though', 'sixty', 'but', 'something', 'this', 'seems', 'not', 'see', 'the', 'his', 'again', "'d", 'doing', 'however', 'latter', 'am', 'wherever', 'perhaps', 'next', 'became', 'of', 'he', 'nothing', 'off', 'hereby', 'wherein', 'toward', 'nor', 'some', 'who', 'become', 'and', 'get', 'until', '’ll', 'might', 'thus', 'often', 'such', 'after', 'bottom', 'someone', 'between', 'either', 'done', 'twelve', 'should', 'if', 'serious', 'be', 'say', 'three', 'still', 'regarding', 'we', 'keep', 'which', 'else', 'amongst', 'among', 'front', 'less', 'when', "'re", 'eleven', 'four', 'so', '

In [19]:
print([token for token in doc if not token.is_stop])

[told, Dr., Lovato, tests, post, results, shortly, .]


## Stemming and Lemmatization

### Lemmatization

Spacy offers **Lemmatization** but not **Stemming**.

In [20]:
[(token.text, token.lemma_) for token in doc]

[('He', 'he'),
 ('told', 'tell'),
 ('Dr.', 'Dr.'),
 ('Lovato', 'Lovato'),
 ('that', 'that'),
 ('he', 'he'),
 ('was', 'be'),
 ('done', 'do'),
 ('with', 'with'),
 ('the', 'the'),
 ('tests', 'test'),
 ('and', 'and'),
 ('would', 'would'),
 ('post', 'post'),
 ('the', 'the'),
 ('results', 'result'),
 ('shortly', 'shortly'),
 ('.', '.')]

### Stemming

If you need stemming for your use case, you can use **NLTK**

In [21]:
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()

In [22]:
plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
            'died', 'agreed', 'owned', 'humbled', 'sized',
            'meeting', 'stating', 'siezing', 'itemization',
            'sensational', 'traditional', 'reference', 'colonizer',
            'plotted']

In [23]:
singles = [porter_stemmer.stem(plural) for plural in plurals]

In [24]:
print(' '.join(singles))

caress fli die mule deni die agre own humbl size meet state siez item sensat tradit refer colon plot


In [25]:
from nltk.stem.snowball import SnowballStemmer

snowball_stemmer = SnowballStemmer('english')

print(snowball_stemmer.stem("running"))

print(snowball_stemmer.stem("having"))

run
have


In [26]:
print(snowball_stemmer.stem("generously"))

generous


In [27]:
print(porter_stemmer.stem("generously"))

gener


## Part-of-Speech Tagging

In [28]:
sentence = "Vijay watched Animal at the cinema and was ashamed of himself."
doc = nlp(sentence)

In [29]:
[(token.text, token.pos_) for token in doc]

[('Vijay', 'PROPN'),
 ('watched', 'VERB'),
 ('Animal', 'PROPN'),
 ('at', 'ADP'),
 ('the', 'DET'),
 ('cinema', 'NOUN'),
 ('and', 'CCONJ'),
 ('was', 'AUX'),
 ('ashamed', 'ADJ'),
 ('of', 'ADP'),
 ('himself', 'PRON'),
 ('.', 'PUNCT')]

In [38]:
[(token.text, token.tag_) for token in doc]

[('Tesla', 'NNP'),
 ("'s", 'POS'),
 ('massive', 'JJ'),
 ('$', '$'),
 ('30', 'CD'),
 ('bn', 'IN'),
 ('plan', 'NN'),
 ('for', 'IN'),
 ('India', 'NNP'),
 (';', ':'),
 ('Musk', 'NNP'),
 ('an', 'DT'),
 ("'", '``'),
 ('admirer', 'NN'),
 ("'", "''"),
 ('of', 'IN'),
 ('Modi', 'NNP')]

In [39]:
spacy.explain("NNP")

'noun, proper singular'

## Named Entity Recognition

In [31]:
sentence = "Tesla's massive $30 bn plan for India; Musk an 'admirer' of Modi"
doc = nlp(sentence)

[(token.text, token.ent_type_) for token in doc]

[('Tesla', 'ORG'),
 ("'s", ''),
 ('massive', ''),
 ('$', ''),
 ('30', 'MONEY'),
 ('bn', ''),
 ('plan', ''),
 ('for', ''),
 ('India', 'GPE'),
 (';', ''),
 ('Musk', 'PERSON'),
 ('an', ''),
 ("'", ''),
 ('admirer', ''),
 ("'", ''),
 ('of', ''),
 ('Modi', 'GPE')]

In [32]:
[(token.text, token.ent_type_) for token in doc if token.ent_type != 0]

[('Tesla', 'ORG'),
 ('30', 'MONEY'),
 ('India', 'GPE'),
 ('Musk', 'PERSON'),
 ('Modi', 'GPE')]

In [33]:
[(ent.text, ent.label_) for ent in doc.ents]

[('Tesla', 'ORG'),
 ('30', 'MONEY'),
 ('India', 'GPE'),
 ('Musk', 'PERSON'),
 ('Modi', 'GPE')]

In [34]:
[(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents]

[('Tesla', 'ORG', 0, 5),
 ('30', 'MONEY', 17, 19),
 ('India', 'GPE', 32, 37),
 ('Musk', 'PERSON', 39, 43),
 ('Modi', 'GPE', 60, 64)]

In [35]:
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)

In [36]:
spacy.explain('GPE')

'Countries, cities, states'