In [5]:
from nltk.tokenize import word_tokenize

sentence = "When I first met him he wasn't very quiet. He spoke without stopping during the entire two hour long journey from Washington to New York."

nltk_word = word_tokenize(sentence)

print(nltk_word)

['When', 'I', 'first', 'met', 'him', 'he', 'was', "n't", 'very', 'quiet', '.', 'He', 'spoke', 'without', 'stopping', 'during', 'the', 'entire', 'two', 'hour', 'long', 'journey', 'from', 'Washington', 'to', 'New', 'York', '.']


In [6]:
whiteList = "new"

if whiteList in nltk_word:
    whiteIndex = nltk_word.index(whiteList)

nltk_word[whiteIndex:whiteIndex+2] = [' '.join(nltk_word[whiteIndex:whiteIndex+2])]
print(nltk_word)

['When I', 'first', 'met', 'him', 'he', 'was', "n't", 'very', 'quiet', '.', 'He', 'spoke', 'without', 'stopping', 'during', 'the', 'entire', 'two', 'hour', 'long', 'journey', 'from', 'Washington', 'to', 'New', 'York', '.']


In [7]:
# Remove punctation and stopwords

import string
#   import nltk
#   nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words_en = stopwords.words("english")

filteredWords = [w for w in nltk_word if (w not in list(string.punctuation)) and (w not in stop_words_en)]

print(filteredWords)

['When I', 'first', 'met', "n't", 'quiet', 'He', 'spoke', 'without', 'stopping', 'entire', 'two', 'hour', 'long', 'journey', 'Washington', 'New', 'York']


In [36]:
# Converting to lowercase

lowerWords = [w.lower() for w in nltk_word]

print(lowerWords)

['when', 'i', 'first', 'met', 'him', 'he', 'was', "n't", 'very', 'quiet', '.', 'he', 'spoke', 'without', 'stopping', 'during', 'the', 'entire', 'two', 'hour', 'long', 'journey', 'from', 'washington', 'to', 'new york', '.']


In [37]:
# Number words to numeric

from word2number import w2n

text = "two million twenty three thousand and forty nine"
print(w2n.word_to_num(text))

2023049


In [38]:
# unicode characters

import unidecode

test_word = unidecode.unidecode("ni√±o")
print(test_word)

nino


# Stemming words

In [39]:
# PorterStemmer

from nltk.stem import PorterStemmer

porter = PorterStemmer()

porterWords = [porter.stem(w) for w in nltk_word]

print(porterWords)

['when', 'i', 'first', 'met', 'him', 'he', 'wa', "n't", 'veri', 'quiet', '.', 'he', 'spoke', 'without', 'stop', 'dure', 'the', 'entir', 'two', 'hour', 'long', 'journey', 'from', 'washington', 'to', 'new york', '.']


In [40]:
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()

words = ["multidimensional", "coming", "sing", "stemming" ]

[porter_stemmer.stem(w) for w in words]

#   Porter Stemming Algorithm   - http://snowball.tartarus.org/algorithms/porter/stemmer.html

['multidimension', 'come', 'sing', 'stem']

In [41]:
# SnowballStemmer

from nltk.stem import SnowballStemmer

snowball = SnowballStemmer('english')

snowballWords = [snowball.stem(w) for w in nltk_word]

print(snowballWords)

['when', 'i', 'first', 'met', 'him', 'he', 'was', "n't", 'veri', 'quiet', '.', 'he', 'spoke', 'without', 'stop', 'dure', 'the', 'entir', 'two', 'hour', 'long', 'journey', 'from', 'washington', 'to', 'new york', '.']


In [42]:
# LancasterStemmer

from nltk.stem import LancasterStemmer

lancaster = LancasterStemmer()

lancasterWords = [lancaster.stem(w) for w in nltk_word]

print(lancasterWords)

['when', 'i', 'first', 'met', 'him', 'he', 'was', "n't", 'very', 'quiet', '.', 'he', 'spok', 'without', 'stop', 'dur', 'the', 'entir', 'two', 'hour', 'long', 'journey', 'from', 'washington', 'to', 'new york', '.']


In [43]:
# RegexpStemmer

from nltk.stem import RegexpStemmer

regexp = RegexpStemmer('ing')

regexpWords = [regexp.stem(w) for w in nltk_word]

print(regexpWords)

['When', 'I', 'first', 'met', 'him', 'he', 'was', "n't", 'very', 'quiet', '.', 'He', 'spoke', 'without', 'stopp', 'dur', 'the', 'entire', 'two', 'hour', 'long', 'journey', 'from', 'Washton', 'to', 'New York', '.']


# Lemmatizing words

In [44]:
# lemmatize as noun (default)   :   Valid options are `"n"` for nouns, `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"` for satellite adjectives.

#   import nltk
#   nltk.download('wordnet')
#   nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

lemmaWords = [lemma.lemmatize(w) for w in nltk_word]

print(lemmaWords)

['When', 'I', 'first', 'met', 'him', 'he', 'wa', "n't", 'very', 'quiet', '.', 'He', 'spoke', 'without', 'stopping', 'during', 'the', 'entire', 'two', 'hour', 'long', 'journey', 'from', 'Washington', 'to', 'New York', '.']


In [45]:
# lemmatize as verbs

lemmaWords = [lemma.lemmatize(w, pos = 'v') for w in nltk_word]

print(lemmaWords)

['When', 'I', 'first', 'meet', 'him', 'he', 'be', "n't", 'very', 'quiet', '.', 'He', 'speak', 'without', 'stop', 'during', 'the', 'entire', 'two', 'hour', 'long', 'journey', 'from', 'Washington', 'to', 'New York', '.']


In [46]:
# lemmatize as adjectives

lemma.lemmatize('better', pos = 'a')

'good'

In [47]:
# lemmatize as adverbs

lemma.lemmatize('better', pos = 'r')

'well'

In [48]:
import spacy
#spaCy determines the part-of-speech tag by default and assigns the corresponding lemma. It comes with a bunch of prebuilt models where the 'en' we just downloaded above is one of the standard ones for english.

sentence = ' '.join(nltk_word)

nlp = spacy.load('en_core_web_sm')
doc = nlp(sentence)

# Extract the lemma for each token and join
print([token.lemma_ for token in doc])

['when', 'I', 'first', 'meet', 'he', 'he', 'be', 'not', 'very', 'quiet', '.', 'he', 'speak', 'without', 'stop', 'during', 'the', 'entire', 'two', 'hour', 'long', 'journey', 'from', 'Washington', 'to', 'New', 'York', '.']


In [49]:
doc = nlp("I'm coming in a multimensional")

[token.lemma_ for token in doc]

['I', 'be', 'come', 'in', 'a', 'multimensional']

In [50]:

doc = nlp(" ".join(filteredWords))

print([token.lemma_ for token in doc])

['when', 'I', 'first', 'meet', 'not', 'quiet', 'he', 'speak', 'without', 'stop', 'entire', 'two', 'hour', 'long', 'journey', 'Washington', 'New', 'York']
