## Text Analytics

### Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization

#### Download the required packages

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TANMAY\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TANMAY\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\TANMAY\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\TANMAY\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

#### Import the required libraries

In [1]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

#### Initialize the text

In [3]:
text = "Tokenization is the first step in text analytics. the process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization."

#### Sentence Tokenization

In [6]:
tokenized_text = sent_tokenize(text)
print(tokenized_text)

['Tokenization is the first step in text analytics.', 'the process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']


#### Word Tokenization

In [8]:
tokenized_word = word_tokenize(text)
print(tokenized_word)

['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics', '.', 'the', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']


#### Removing Punctuations and Stop Word

In [7]:
# Print stop words in English
stop_words = set(stopwords.words("english"))
print(stop_words)

{'they', 'myself', "should've", 'whom', 'needn', 'does', 'doing', 'couldn', 'all', 'once', "she's", 't', 'these', 'on', 'doesn', 'm', 'their', 'this', 'd', 'that', 'yourself', 'her', 'an', "you'd", "wasn't", 'should', 'under', 're', "it's", 'out', 'not', 'just', 'because', "aren't", 'his', 'other', 'did', 'above', "hasn't", 'off', 'such', 'the', "mightn't", 'any', 'shan', 'weren', "mustn't", 'yourselves', 'further', "you'll", 'll', 'were', 'won', 'here', 'through', 'how', 'now', 'my', 'so', 'below', 'ours', "shan't", 'both', 'herself', 'at', 'own', 's', 'mustn', 'you', 'there', 'then', 'she', 'from', 'down', 'more', 'him', 'during', 'me', 'been', 'shouldn', 'after', 'himself', 'your', "isn't", 'ma', 'same', 'in', 'when', "wouldn't", 'if', "shouldn't", "you've", 'it', 'are', 'be', "that'll", 'as', 'about', 'up', 'few', 'nor', 'ain', "weren't", 'while', 'its', "don't", 'what', "you're", 'them', 'aren', 'is', 'hadn', 'hers', 'wasn', 'had', 'with', 'our', 'of', "hadn't", 'mightn', 'i', 'ha

In [8]:
# Removing Punctuations
text= "How to remove stop words with NLTK library in Python?"
tokens = word_tokenize(text.lower())
filtered_text=[]
for w in tokens:
    if w not in stop_words:
        filtered_text.append(w)
print("Tokenized Sentence:",tokens)
print("Filterd Sentence:",filtered_text)

Tokenized Sentence: ['how', 'to', 'remove', 'stop', 'words', 'with', 'nltk', 'library', 'in', 'python', '?']
Filterd Sentence: ['remove', 'stop', 'words', 'nltk', 'library', 'python', '?']


#### Perfrom Stemming

In [10]:
e_words = ["wait", "waiting", "waited", "waits"]
ps = PorterStemmer()
for w in e_words:
    rootWord = ps.stem(w)
print(rootWord)

wait


#### Perform Lemmatization

In [14]:
erwordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for w in tokenization:
    print("Lemma for {} is {}".format(w, erwordnet_lemmatizer.lemmatize(w)))

Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


#### Apply POS Tagging to text

In [17]:
data = "The pink sweater fit her perfectly"
words = word_tokenize(data)
for w in words:
    print(nltk.pos_tag([w]))

[('The', 'DT')]
[('pink', 'NN')]
[('sweater', 'NN')]
[('fit', 'NN')]
[('her', 'PRP$')]
[('perfectly', 'RB')]


### Representation of document by calculating TFIDF

#### Import the necessary libraries

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

#### Initialize the Documents

In [13]:
documentA = "Jupiter is the largest Planet"
documentB = "Mars is the fourth planet from the Sun"

#### Create BagofWords for Document A and B

In [14]:
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

#### Create Collection of Unique words from Document A and B

In [15]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

#### Create a dictionary of words and their occurrence for each document in the corpus

In [16]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for w in bagOfWordsA:
    numOfWordsA[w] += 1
    numOfWordsB = dict.fromkeys(uniqueWords,0)
    for w in bagOfWordsB:
        numOfWordsB[w] += 1

#### Compute the term frequency for each of our documents

In [17]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

#### Compute the term Inverse Document Frequency

In [18]:
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

idfs = computeIDF([numOfWordsA,numOfWordsB])
idfs

{'Jupiter': 0.6931471805599453,
 'Planet': 0.6931471805599453,
 'is': 0.0,
 'the': 0.0,
 'Mars': 0.6931471805599453,
 'Sun': 0.6931471805599453,
 'planet': 0.6931471805599453,
 'fourth': 0.6931471805599453,
 'from': 0.6931471805599453,
 'largest': 0.6931471805599453}

#### Compute the term TF/IDF for all words

In [19]:
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

tfidfA = computeTFIDF(tfA,idfs)
tfidfB = computeTFIDF(tfB,idfs)
df = pd.DataFrame([tfidfA,tfidfB])
df

Unnamed: 0,Jupiter,Planet,is,the,Mars,Sun,planet,fourth,from,largest
0,0.138629,0.138629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138629
1,0.0,0.0,0.0,0.0,0.086643,0.086643,0.086643,0.086643,0.086643,0.0
