In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Remove punctuations

In [3]:
# import the nltk library after you have installed it
import nltk
# download the library subset that is needed for word tokenization
nltk.download('punkt')
# import the word tokenization function
from nltk.tokenize import word_tokenize
# creating a list of our quotes from the input dataset
quotes = ["Learning does not make one learned: there are those who have knowledge and those who have understanding. The first requires memory and the second philosophy.","For all evils there are two remedies - time and silence.","and as imagination bodies forth the forms of things unknown, the poet's pen turns them to shape, and gives to airy nothing a local habitation and a name"]
# define a list for feature engineering output
fe1_quotes = []
for quote in quotes:
    # get the word tokens from each quote
    r_words = word_tokenize(quote)
    # remove punctuations
    p_words = [w for w in r_words if w.isalpha()]
    p_removed = ''
    # put the works with punctuations removed back into a sentence
    for p_word in p_words:
        p_removed += ' ' + p_word
    fe1_quotes.append(p_removed)
    print(p_removed)

 Learning does not make one learned there are those who have knowledge and those who have understanding The first requires memory and the second philosophy
 For all evils there are two remedies time and silence
 and as imagination bodies forth the forms of things unknown the poet pen turns them to shape and gives to airy nothing a local habitation and a name


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Converting to lower case

In [4]:
fe2_quotes = []
for quote in fe1_quotes:
    fe2_quotes.append(quote.lower())
print(fe2_quotes)

[' learning does not make one learned there are those who have knowledge and those who have understanding the first requires memory and the second philosophy', ' for all evils there are two remedies time and silence', ' and as imagination bodies forth the forms of things unknown the poet pen turns them to shape and gives to airy nothing a local habitation and a name']


## Removing stop words

In [5]:
# download the stopwords library
nltk.download('stopwords')
# import the word tokenization function
from nltk.tokenize import word_tokenize
# import the stop words function
from nltk.corpus import stopwords
# Define a new list for storing quotes with the stop words removed
fe3_quotes = []
# get the full list of stop words in the English language
stop_ws = set(stopwords.words('english'))
for quote in fe2_quotes:
    words = word_tokenize(quote)
    s_words = [word for word in words if not word in stop_ws]
    s_removed = ''
    for s_word in s_words:
        s_removed += ' ' + s_word
    fe3_quotes.append(s_removed)
print(fe3_quotes)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


[' learning make one learned knowledge understanding first requires memory second philosophy', ' evils two remedies time silence', ' imagination bodies forth forms things unknown poet pen turns shape gives airy nothing local habitation name']


## Perform stemming

In [6]:
# import the stemming function
from nltk.stem.porter import PorterStemmer
# declare the porter stemmer, the most commonly used stemming function
porter = PorterStemmer()
fe4_quotes = []
for quote in fe3_quotes:
    words = word_tokenize(quote)
    # stem each of the words for each of our quotes
    stem_words = [porter.stem(word) for word in words]
    stem_quote = ''
    for stem_word in stem_words:
        stem_quote += ' ' + stem_word
    fe4_quotes.append(stem_quote)
print(fe4_quotes)

[' learn make one learn knowledg understand first requir memori second philosophi', ' evil two remedi time silenc', ' imagin bodi forth form thing unknown poet pen turn shape give airi noth local habit name']


## Perform lemmatization

In [8]:
# download the wordnet library for lemmatizer
nltk.download('wordnet')
# import the lemmatizer function
from nltk.stem import WordNetLemmatizer
# declare the lemmatizer
lemmatizer = WordNetLemmatizer()
fe5_quotes = []
for quote in fe3_quotes:
    words = word_tokenize(quote)
    # stem each of the words for each of our quotes
    lemma_words = [lemmatizer.lemmatize(word) for word in words]
    lemma_quote = ''
    for lemma_word in lemma_words:
        lemma_quote += ' ' + lemma_word
    fe5_quotes.append(lemma_quote)
print(fe5_quotes)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


[' learning make one learned knowledge understanding first requires memory second philosophy', ' evil two remedy time silence', ' imagination body forth form thing unknown poet pen turn shape give airy nothing local habitation name']
