# Vectorization and preprocessing
### 1. Stop words

In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adhocmaster\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
from nltk.corpus import stopwords
# stopwords.words('english')

### 2. Stemming and Lemmatization

Stemming chops off the end of the world, lemmatization finds the root

In [10]:
from nltk.stem import PorterStemmer
porter = PorterStemmer()
print([porter.stem(word) for word in ["walking", "walked", "walks"]])
print([porter.stem(word) for word in ["go", "going", "went"]])
print([porter.stem(word) for word in ["good", "better", "best"]])
print([porter.stem(word) for word in ["is", "was"]])
print([porter.stem(word) for word in ["mouse", "mice"]])

['walk', 'walk', 'walk']
['go', 'go', 'went']
['good', 'better', 'best']
['is', 'wa']
['mous', 'mice']


In [19]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# nltk.download("wordnet")
# nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

print([lemmatizer.lemmatize(word) for word in ["walking", "walked", "walks"]])
print([lemmatizer.lemmatize(word, wordnet.VERB) for word in ["walking", "walked", "walks"]])
print([lemmatizer.lemmatize(word) for word in ["go", "going", "went"]])
print([lemmatizer.lemmatize(word, wordnet.VERB) for word in ["go", "going", "went"]])
print([lemmatizer.lemmatize(word) for word in ["good", "better", "best"]])
print([lemmatizer.lemmatize(word, wordnet.ADJ) for word in ["good", "better", "best"]])
print([lemmatizer.lemmatize(word) for word in ["is", "was"]])
print([lemmatizer.lemmatize(word, wordnet.VERB) for word in ["is", "was"]])
print([lemmatizer.lemmatize(word) for word in ["mouse", "mice"]])

['walking', 'walked', 'walk']
['walk', 'walk', 'walk']
['go', 'going', 'went']
['go', 'go', 'go']
['good', 'better', 'best']
['good', 'good', 'best']
['is', 'wa']
['be', 'be']
['mouse', 'mouse']


### 3. POST Tagging
To correctly lemmatize, we need POS Tagging first. But the tags used by nltk pos tagger is not compatible with the tags that WordNetLemmatizer use. So, we need a mapping. Here goes the mapping:

In [24]:
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\adhocmaster\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [23]:
from nltk.corpus import wordnet
def treebankToWordnetPOS(treebankTag: str) -> str:
    if treebankTag.startswith("J"):
        return wordnet.ADJ
    if treebankTag.startswith("V"):
        return wordnet.VERB
    if treebankTag.startswith("N"):
        return wordnet.NOUN
    if treebankTag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    

In [27]:
sentence = "Alexis Mac Allister scored the opening goal of the game, after Wojciech Szczesny denied Lionel Messi from the penalty spot with a brilliant spot."
wordsAndTags = nltk.pos_tag(sentence.split())
wordsAndTags

[('Alexis', 'NNP'),
 ('Mac', 'NNP'),
 ('Allister', 'NNP'),
 ('scored', 'VBD'),
 ('the', 'DT'),
 ('opening', 'NN'),
 ('goal', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('game,', 'NN'),
 ('after', 'IN'),
 ('Wojciech', 'NNP'),
 ('Szczesny', 'NNP'),
 ('denied', 'VBD'),
 ('Lionel', 'NNP'),
 ('Messi', 'NNP'),
 ('from', 'IN'),
 ('the', 'DT'),
 ('penalty', 'NN'),
 ('spot', 'NN'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('brilliant', 'JJ'),
 ('spot.', 'NN')]

In [29]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()
print([lemmatizer.lemmatize(word, treebankToWordnetPOS(tbTag)) for word, tbTag in wordsAndTags])

['Alexis', 'Mac', 'Allister', 'score', 'the', 'opening', 'goal', 'of', 'the', 'game,', 'after', 'Wojciech', 'Szczesny', 'deny', 'Lionel', 'Messi', 'from', 'the', 'penalty', 'spot', 'with', 'a', 'brilliant', 'spot.']
