## Spacy Library

In [49]:
# import library
import spacy
nlp = spacy.load('en_core_web_sm')

In [50]:
# testing spacy library
sentence = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(sentence)

tokens = [token.text for token in doc]

print('Original Sentence: {}'.format(sentence))
print()
print(tokens)

Original Sentence: Apple is looking at buying U.K. startup for $1 billion

['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion']


### -  Removing Stopwards & Text Normalization using spaCy

In [51]:
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
nlp  = English()

In [52]:
text = """He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and 
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had 
indeed the vaguest idea where the wood and river in question were."""

# 'nlp' object is used to create documents with Linguistic annotations.
doc = nlp(text)

In [53]:
# create list of word tokens
word_tokens = []

for token in doc:
    word_tokens.append(token.text)

In [54]:
# create a list of word tokens after removing stopwords
filtered_sentence = []

for word in word_tokens:
    lexeme = nlp.vocab[word]
    if lexeme.is_stop == False:
        filtered_sentence.append(word)

print("\n\nOriginal Sentence \n\n")
print(word_tokens)

print("\n\nFiltered Sentence \n\n")
print(filtered_sentence)



Original Sentence 


['He', 'determined', 'to', 'drop', 'his', 'litigation', 'with', 'the', 'monastry', ',', 'and', 'relinguish', 'his', 'claims', 'to', 'the', 'wood', '-', 'cuting', 'and', '\n', 'fishery', 'rihgts', 'at', 'once', '.', 'He', 'was', 'the', 'more', 'ready', 'to', 'do', 'this', 'becuase', 'the', 'rights', 'had', 'become', 'much', 'less', 'valuable', ',', 'and', 'he', 'had', '\n', 'indeed', 'the', 'vaguest', 'idea', 'where', 'the', 'wood', 'and', 'river', 'in', 'question', 'were', '.']


Filtered Sentence 


['determined', 'drop', 'litigation', 'monastry', ',', 'relinguish', 'claims', 'wood', '-', 'cuting', '\n', 'fishery', 'rihgts', '.', 'ready', 'becuase', 'rights', 'valuable', ',', '\n', 'vaguest', 'idea', 'wood', 'river', 'question', '.']



- Stopword removal doesn't take off punctuation marks or newline character. Need to remove them manually.

#### 1. Stemming - spaCy has no module for stemming

#### 2. Lemmatization

In [72]:
lemma_word = []
for token in doc:
    lemma_word.append(token.lemma_)

print(lemma_word)

['-PRON-', 'determine', 'to', 'drop', '-PRON-', 'litigation', 'with', 'the', 'monastry', ',', 'and', 'relinguish', '-PRON-', 'claim', 'to', 'the', 'wood', '-', 'cut', 'and', '\n', 'fishery', 'rihgts', 'at', 'once', '.', '-PRON-', 'be', 'the', 'more', 'ready', 'to', 'do', 'this', 'becuase', 'the', 'right', 'have', 'become', 'much', 'less', 'valuable', ',', 'and', '-PRON-', 'have', '\n', 'indeed', 'the', 'vague', 'idea', 'where', 'the', 'wood', 'and', 'river', 'in', 'question', 'be', '.']


## NLTK Library

In [63]:
doc = nlp(u"""He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and 
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had 
indeed the vaguest idea where the wood and river in question were.""")

lemma_word1 = [] 
for token in doc:
    lemma_word1.append(token.lemma_)
print(lemma_word1)

['-PRON-', 'determine', 'to', 'drop', '-PRON-', 'litigation', 'with', 'the', 'monastry', ',', 'and', 'relinguish', '-PRON-', 'claim', 'to', 'the', 'wood', '-', 'cut', 'and', '\n', 'fishery', 'rihgts', 'at', 'once', '.', '-PRON-', 'be', 'the', 'more', 'ready', 'to', 'do', 'this', 'becuase', 'the', 'right', 'have', 'become', 'much', 'less', 'valuable', ',', 'and', '-PRON-', 'have', '\n', 'indeed', 'the', 'vague', 'idea', 'where', 'the', 'wood', 'and', 'river', 'in', 'question', 'be', '.']


In [4]:
import nltk
nltk.download('punkt')
print('NLTK version: {}'.format(nltk.__version__))

NLTK version: 3.5


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ankitchaudhary/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# testing nltk library
sentence = "Apple is looking at buying U.K. startup for $1 billion"
tokens = nltk.word_tokenize(sentence)
print('Original Sentence: {}'.format(sentence))
print()
print(tokens)

Original Sentence: Apple is looking at buying U.K. startup for $1 billion

['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion']


###  - Removing Stopwards & Text Normalization using NLTK

In [9]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ankitchaudhary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# sample sentence
text = """"He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and 
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had 
indeed the vaguest idea where the wood and river in question were."""

In [13]:
# set of stop words
stop_words = set(stopwords.words('english'))

In [14]:
# tokens of words
word_tokens = word_tokenize(text)

In [25]:
# create a list of word tokens after removing stopwords
filtered_sentence = []

for word in word_tokens:
    if word not in stop_words:
        filtered_sentence.append(word)

print("\n\nOriginal Sentence \n\n")
print(" ".join(word_tokens)) 

print("\n\nFiltered Sentence \n\n")
print(" ".join(filtered_sentence)) 



Original Sentence 


He determined to drop his litigation with the monastry , and relinguish his claims to the wood - cuting and 
 fishery rihgts at once . He was the more ready to do this becuase the rights had become much less valuable , and he had 
 indeed the vaguest idea where the wood and river in question were .


Filtered Sentence 


He determined drop litigation monastry , relinguish claims wood - cuting 
 fishery rihgts . He ready becuase rights become much less valuable , 
 indeed vaguest idea wood river question .


- As we can see that the size of the text has almost reduced to half!

#### 1. Stemming

In [41]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [40]:
stem_words = []
for word in filtered_sentence:
    root_word = ps.stem(word)
    stem_words.append(root_word)

print("\n\nFiltered Sentence \n\n",filtered_sentence)
print("\n\nSentence with Stemming\n\n",stem_words)



Filtered Sentence 

 ['determined', 'drop', 'litigation', 'monastry', ',', 'relinguish', 'claims', 'wood', '-', 'cuting', '\n', 'fishery', 'rihgts', '.', 'ready', 'becuase', 'rights', 'valuable', ',', '\n', 'vaguest', 'idea', 'wood', 'river', 'question', '.']


Sentence with Stemming

 ['determin', 'drop', 'litig', 'monastri', ',', 'relinguish', 'claim', 'wood', '-', 'cute', '\n', 'fisheri', 'rihgt', '.', 'readi', 'becuas', 'right', 'valuabl', ',', '\n', 'vaguest', 'idea', 'wood', 'river', 'question', '.']


#### 2. Lemmatization

In [45]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lem = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ankitchaudhary/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [47]:
lemma_word = []

for word in filtered_sentence:
    word = wordnet_lem.lemmatize(word, pos = 'n')
    word = wordnet_lem.lemmatize(word, pos = 'v')
    word = wordnet_lem.lemmatize(word, pos = 'a')
    lemma_word.append(word)

print("\n\nFiltered Sentence \n\n",filtered_sentence)
print("\n\nSentence with Lemmatization\n\n",lemma_word)



Filtered Sentence 

 ['determined', 'drop', 'litigation', 'monastry', ',', 'relinguish', 'claims', 'wood', '-', 'cuting', '\n', 'fishery', 'rihgts', '.', 'ready', 'becuase', 'rights', 'valuable', ',', '\n', 'vaguest', 'idea', 'wood', 'river', 'question', '.']


Sentence with Lemmatization

 ['determine', 'drop', 'litigation', 'monastry', ',', 'relinguish', 'claim', 'wood', '-', 'cut', '\n', 'fishery', 'rihgts', '.', 'ready', 'becuase', 'right', 'valuable', ',', '\n', 'vague', 'idea', 'wood', 'river', 'question', '.']


- The lemmatizer only lemmatizes those words which match the pos parameter of the lemmatize method. 
- Lemmatization is done on the basis of part-of-speech tagging (POS tagging). Weâ€™ll talk in detail about POS tagging in an upcoming article.