In [None]:
'''
Problem Statement
Text Analytics
1. Extract Sample document and apply following document preprocessing methods:
Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse Document 
Frequency.
'''

In [1]:
# Creating the sample documents
sentence_1 = "I will walk 500 miles and I would walk 500 more. Just to be the man who walks " + "a thousand miles and fall down on your door!"
sentence_2 = "I played the play playfully as the players were playing in the play with playfullness"

In [None]:
# 1. Tokenization

In [2]:
import nltk
from nltk import word_tokenize, sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Aditya
[nltk_data]     Padwal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
print("Tokenized words from sentense_1: ", word_tokenize(sentence_1))
print("Tokenized words from sentense_2: ", word_tokenize(sentence_2))

print("Tokenized sentences from sentense_1: ", sent_tokenize(sentence_1))
print("Tokenized sentences from sentense_2: ", sent_tokenize(sentence_2))

Tokenized words from sentense_1:  ['I', 'will', 'walk', '500', 'miles', 'and', 'I', 'would', 'walk', '500', 'more', '.', 'Just', 'to', 'be', 'the', 'man', 'who', 'walks', 'a', 'thousand', 'miles', 'and', 'fall', 'down', 'on', 'your', 'door', '!']
Tokenized words from sentense_2:  ['I', 'played', 'the', 'play', 'playfully', 'as', 'the', 'players', 'were', 'playing', 'in', 'the', 'play', 'with', 'playfullness']
Tokenized sentences from sentense_1:  ['I will walk 500 miles and I would walk 500 more.', 'Just to be the man who walks a thousand miles and fall down on your door!']
Tokenized sentences from sentense_2:  ['I played the play playfully as the players were playing in the play with playfullness']


In [6]:
# 2. POS Tagging

In [8]:
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Aditya Padwal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [9]:
token = word_tokenize(sentence_1) + word_tokenize(sentence_2)
tagged = pos_tag(token)                 

print("Tagging Parts of Speech:", tagged)

Tagging Parts of Speech: [('I', 'PRP'), ('will', 'MD'), ('walk', 'VB'), ('500', 'CD'), ('miles', 'NNS'), ('and', 'CC'), ('I', 'PRP'), ('would', 'MD'), ('walk', 'VB'), ('500', 'CD'), ('more', 'JJR'), ('.', '.'), ('Just', 'NNP'), ('to', 'TO'), ('be', 'VB'), ('the', 'DT'), ('man', 'NN'), ('who', 'WP'), ('walks', 'VBZ'), ('a', 'DT'), ('thousand', 'NN'), ('miles', 'NNS'), ('and', 'CC'), ('fall', 'VBP'), ('down', 'RP'), ('on', 'IN'), ('your', 'PRP$'), ('door', 'NN'), ('!', '.'), ('I', 'PRP'), ('played', 'VBD'), ('the', 'DT'), ('play', 'NN'), ('playfully', 'RB'), ('as', 'IN'), ('the', 'DT'), ('players', 'NNS'), ('were', 'VBD'), ('playing', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('play', 'NN'), ('with', 'IN'), ('playfullness', 'NN')]


In [10]:
# 3. Stop-Words Removal 

In [13]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Aditya
[nltk_data]     Padwal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [15]:
stop_words = stopwords.words('english')

token_one = word_tokenize(sentence_1)
cleaned_token_one = []

for word in token_one:
    if word not in stop_words:
        cleaned_token_one.append(word)

print('Unclean version of sentence_1:', token_one)
print('\nCleaned version of sentence_1:', cleaned_token_one)

token_two = word_tokenize(sentence_2)
cleaned_token_two = []

for word in token_two:
    if word not in stop_words:
        cleaned_token_two.append(word)

print('Unclean version of sentence_:', token_two)
print('\nCleaned version:', cleaned_token_two)

Unclean version: ['I', 'will', 'walk', '500', 'miles', 'and', 'I', 'would', 'walk', '500', 'more', '.', 'Just', 'to', 'be', 'the', 'man', 'who', 'walks', 'a', 'thousand', 'miles', 'and', 'fall', 'down', 'on', 'your', 'door', '!']

Cleaned version: ['I', 'walk', '500', 'miles', 'I', 'would', 'walk', '500', '.', 'Just', 'man', 'walks', 'thousand', 'miles', 'fall', 'door', '!']
Unclean version: ['I', 'played', 'the', 'play', 'playfully', 'as', 'the', 'players', 'were', 'playing', 'in', 'the', 'play', 'with', 'playfullness']

Cleaned version: ['I', 'played', 'play', 'playfully', 'players', 'playing', 'play', 'playfullness']


In [16]:
# 4. Stemming

In [17]:
from nltk.stem import PorterStemmer

In [20]:
stemmer = PorterStemmer()

token_one = word_tokenize(sentence_1)
token_two = word_tokenize(sentence_2)

stemmed_one = [stemmer.stem(word) for word in token_one]
stemmed_two = [stemmer.stem(word) for word in token_two]

print("Performing stemming on sentence_1")
print(" ".join(stemmed_one))

print("Performing stemming on sentence_2")
print(" ".join(stemmed_two))

Performing stemming on sentence_1
i will walk 500 mile and i would walk 500 more . just to be the man who walk a thousand mile and fall down on your door !
Performing stemming on sentence_2
i play the play play as the player were play in the play with playful


In [21]:
# 5. Lemmatization

In [26]:
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to C:\Users\Aditya
[nltk_data]     Padwal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Aditya
[nltk_data]     Padwal\AppData\Roaming\nltk_data...


True

In [27]:
lemmatizer = WordNetLemmatizer()

token_one = word_tokenize(sentence_1)
lemmatized_output_one = [lemmatizer.lemmatize(word) for word in token_one]
print(" ".join(lemmatized_output_one))

token_two = word_tokenize(sentence_2)
lemmatized_output_two = [lemmatizer.lemmatize(word) for word in token_two]
print(" ".join(lemmatized_output_two))

I will walk 500 mile and I would walk 500 more . Just to be the man who walk a thousand mile and fall down on your door !
I played the play playfully a the player were playing in the play with playfullness
