# Text Analytics
  1. Extract Sample document and apply following document preprocessing methods: Tokenization, POS Tagging, 
     stop words removal, Stemming and Lemmatization.
  2. Create representation of document by calculating Term Frequency and Inverse Document
     Frequency.


In [1]:
import nltk
import pandas as pd
import math


In [2]:
# nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\C2K19\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\C2K19\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\C2K19\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\C2K19\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
from nltk.corpus import inaugural
corpus = "Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome. The sky is pinkish-blue. You shouldn't eat cardboard"
print(corpus)

Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome. The sky is pinkish-blue. You shouldn't eat cardboard


# Task1:

## Tokenization 

In [4]:
from nltk.tokenize import word_tokenize,sent_tokenize

## Sentence Tokenization

In [5]:
tokenized_text = sent_tokenize(corpus)
print(tokenized_text)

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and city is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard"]


## Word Tokenization

In [6]:
tokenized_word = word_tokenize(corpus)
print(tokenized_word)

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']


## Stopwords Removal

In [7]:
from nltk.corpus import stopwords

In [8]:
stop_words=set(stopwords.words("english"))
print(stop_words)

{'re', 'where', 'his', 'doesn', 'was', 'now', "aren't", 'out', "mustn't", 'doing', 'of', 'won', 'are', 'about', 'over', 'been', 'can', 's', 'which', 'at', 'to', 'but', 'with', 'o', 'from', 'off', "hasn't", "she's", "you'll", "don't", 'herself', 'down', 'most', "didn't", 'should', "needn't", 'or', 'once', "mightn't", "wouldn't", 'for', 'on', 'does', 'and', 'under', 'only', 'yourself', 'so', 'there', "isn't", 'above', 'why', 'had', 'am', "shouldn't", 'all', 'as', 'more', 'couldn', 'in', 'them', 'i', 'against', 'your', 'me', 'he', 'hasn', 'did', 'you', 'an', 'by', 'below', 'here', 'didn', 'needn', 'shouldn', 'being', 'yourselves', 'itself', 'other', 'this', 'no', 'y', 'll', 'yours', "weren't", 'don', 'ourselves', "you've", 'when', "you'd", 'm', 'these', 'if', 'who', "hadn't", "couldn't", 'it', 'is', 'himself', 'aren', 'isn', 'mightn', 'while', "won't", "haven't", 'theirs', 'the', "you're", 'him', 'my', 'ours', 'have', 'after', 'own', 'weren', 'then', 'we', 'between', 'some', 'during', 'ha

In [9]:
filtered_sent = []
for w in tokenized_word:
    if w not in stop_words:
        filtered_sent.append(w)
print("Tokenized words:",tokenized_word)
print('\n')
print("Filterd Sentence:",filtered_sent)

Tokenized words: ['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']


Filterd Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']


## Stemming

In [10]:
from nltk.stem import PorterStemmer

In [11]:
ps = PorterStemmer()
stemmed_words = []
for w in filtered_sent:
    stemmed_words.append(ps.stem(w))
print("Filtered Sentence:",filtered_sent)
print('\n')
print("Stemmed Sentence:",stemmed_words)


Filtered Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']


Stemmed Sentence: ['hello', 'mr.', 'smith', ',', 'today', '?', 'the', 'weather', 'great', ',', 'citi', 'awesom', '.', 'the', 'sky', 'pinkish-blu', '.', 'you', "n't", 'eat', 'cardboard']


## Lemmatization

In [12]:
from nltk.stem.wordnet import WordNetLemmatizer

In [13]:
lem = WordNetLemmatizer()
lemmatized_words = []

for w in filtered_sent:
    lemmatized_words.append(lem.lemmatize(w))

print("Filtered Sentence:",filtered_sent)
print('\n')
print("Stemmed Sentence:",lemmatized_words)


Filtered Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']


Stemmed Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']


In [14]:
word = "flying"
print("Lemmatized Word:",lem.lemmatize(word,"v"))
print("Stemmed Word:",ps.stem(word))

Lemmatized Word: fly
Stemmed Word: fli


## POS Tagging

In [15]:
nltk.pos_tag(tokenized_word)

[('Hello', 'NNP'),
 ('Mr.', 'NNP'),
 ('Smith', 'NNP'),
 (',', ','),
 ('how', 'WRB'),
 ('are', 'VBP'),
 ('you', 'PRP'),
 ('doing', 'VBG'),
 ('today', 'NN'),
 ('?', '.'),
 ('The', 'DT'),
 ('weather', 'NN'),
 ('is', 'VBZ'),
 ('great', 'JJ'),
 (',', ','),
 ('and', 'CC'),
 ('city', 'NN'),
 ('is', 'VBZ'),
 ('awesome', 'JJ'),
 ('.', '.'),
 ('The', 'DT'),
 ('sky', 'NN'),
 ('is', 'VBZ'),
 ('pinkish-blue', 'JJ'),
 ('.', '.'),
 ('You', 'PRP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('eat', 'VB'),
 ('cardboard', 'NN')]

# Task2:

In [16]:
first_sentence = "Data Science is the hardest job of the 21st century"
second_sentence = "machine learning is the key for data science"

first_sentence = first_sentence.split(" ")
second_sentence = second_sentence.split(" ")

total= set(first_sentence).union(set(second_sentence))
print(total)

{'science', 'Data', 'learning', 'key', 'Science', 'machine', 'data', 'century', 'is', 'job', 'of', 'the', '21st', 'for', 'hardest'}


In [17]:
wordDictA = dict.fromkeys(total, 0) 
wordDictB = dict.fromkeys(total, 0)
for word in first_sentence:
    wordDictA[word]+=1
    
for word in second_sentence:
    wordDictB[word]+=1
    
print(wordDictA)
print(wordDictB)

{'science': 0, 'Data': 1, 'learning': 0, 'key': 0, 'Science': 1, 'machine': 0, 'data': 0, 'century': 1, 'is': 1, 'job': 1, 'of': 1, 'the': 2, '21st': 1, 'for': 0, 'hardest': 1}
{'science': 1, 'Data': 0, 'learning': 1, 'key': 1, 'Science': 0, 'machine': 1, 'data': 1, 'century': 0, 'is': 1, 'job': 0, 'of': 0, 'the': 1, '21st': 0, 'for': 1, 'hardest': 0}


In [18]:
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,science,Data,learning,key,Science,machine,data,century,is,job,of,the,21st,for,hardest
0,0,1,0,0,1,0,0,1,1,1,1,2,1,0,1
1,1,0,1,1,0,1,1,0,1,0,0,1,0,1,0


# TF (Term Frequency)

In [19]:
def computeTF(wordDict, doc):
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():
        tfDict[word] = count/float(corpusCount)
    return(tfDict)

In [20]:
#running our sentences through the tf function:
tfFirst = computeTF(wordDictA, first_sentence)
tfSecond = computeTF(wordDictB, second_sentence)

#Converting to dataframe for visualization
tf = pd.DataFrame([tfFirst, tfSecond])
print(tf)

   science  Data  learning    key  Science  machine   data  century     is  \
0    0.000   0.1     0.000  0.000      0.1    0.000  0.000      0.1  0.100   
1    0.125   0.0     0.125  0.125      0.0    0.125  0.125      0.0  0.125   

   job   of    the  21st    for  hardest  
0  0.1  0.1  0.200   0.1  0.000      0.1  
1  0.0  0.0  0.125   0.0  0.125      0.0  


# IDF (Inverse Document Frequency)

In [21]:
def computeIDF(docList):
    idfDict = {}
    N = len(docList)
    
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    
    for word, val in idfDict.items():
        cnt = 0
        for doc in docList:
            if(doc[word] != 0):
                cnt += 1
        idfDict[word] = cnt
#     print(idfDict)
        
        
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / (float(val) ))
        
    return(idfDict)

In [22]:
idfs = computeIDF([wordDictA, wordDictB])
print(idfs)

{'science': 0.3010299956639812, 'Data': 0.3010299956639812, 'learning': 0.3010299956639812, 'key': 0.3010299956639812, 'Science': 0.3010299956639812, 'machine': 0.3010299956639812, 'data': 0.3010299956639812, 'century': 0.3010299956639812, 'is': 0.0, 'job': 0.3010299956639812, 'of': 0.3010299956639812, 'the': 0.0, '21st': 0.3010299956639812, 'for': 0.3010299956639812, 'hardest': 0.3010299956639812}
