# Analyzing Text With NLTK

## Overview
Basic text analytics with NLTK

## Runtime
20 mins

## Step 1 : Wordcount on NLTK Corpus
Let's do some basic word counts on a corpus that ships with NLTK.

In [None]:
import nltk
from os.path import expanduser
# Add ~/data/nltk_data to NLTK's data search path so the corpora below can be found there.
nltk.data.path.append( expanduser("~") + "/data/nltk_data")
from nltk.corpus import state_union
from nltk import FreqDist

## -- state of the union
# Show which files the state_union corpus contains.
print(state_union.fileids())
print("---")

# Load the 2006 address as raw text and print its length plus a short preview.
# NOTE: the slice starts at index 1, so the very first character is not shown.
gw2006 = state_union.raw('2006-GWBush.txt')
print ("len of gw2006 : " , len(gw2006))
print ("gw2006 :\n",  gw2006[1:300])
print("---")

## TODO-2 :  get the words (replace '???' with the corpus file id used above)
gw2006_words = state_union.words('???')
## TODO-3 : print number of words and some words (Hint : gw2006_words[1:100])
print("len(gw2006_words) : ", len(???))
print ("gw2006_words : \n", ???)
print("---")

# Count how often each token occurs and show the 50 most frequent ones.
fdist = nltk.FreqDist(gw2006_words)
print ("most common words : " , fdist.most_common(50))
## What do we see?

## Step 2 : Cleaning up text
In the previous example, our top words were 'the', 'and', and 'of'.  Very common words like these are called 'stop words'.  Let's remove them.

In [None]:
import nltk
from os.path import expanduser
# Add ~/data/nltk_data to NLTK's data search path so the corpora below can be found there.
nltk.data.path.append( expanduser("~") + "/data/nltk_data")
from nltk.corpus import stopwords
from nltk.corpus import state_union

# Tokens of the 2006 address, plus a lower-cased copy used for all filtering below.
gw2006_words = state_union.words('2006-GWBush.txt')
gw2006_words_lower = [i.lower() for i in gw2006_words]
print("len(gw2006_words) : ", len(gw2006_words))
print("---")

# English stop words, held in a set for fast membership tests.
stop_words_en = set(stopwords.words('english'))
print ("len(stop_words_en) : ", len(stop_words_en))
print("stop words_en : \n", sorted(stop_words_en))
print("---")

# Pass 1: drop every lower-cased token that is a stop word.
cleaned = [i for i in gw2006_words_lower if i not in stop_words_en]
## TODO : how many words in cleaned
print("len(cleaned) : ", ???)
print("---")

print("cleaned text:\n", cleaned[1:200])
print("---")

fdist = nltk.FreqDist(cleaned)
print ("most common words in cleaned :\n" , fdist.most_common(50))
print("---")

# we need to remove punctuation
# update() mutates stop_words_en in place, so later filters also drop these tokens.
stop_words_en.update(['-', '.', ',', '#', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']) 

# Pass 2: filter again with the punctuation-extended stop word set.
cleaned2 = [i for i in gw2006_words_lower if i not in stop_words_en]
## TODO : how many words in cleaned2
print("len(cleaned2) : ", ???)
print("---")

## TODO - calculate FreqDist for cleaned2
fdist = nltk.FreqDist(???)
print ("most common words in cleaned2 :\n" , fdist.most_common(50))
print("---")

## TODO :  further cleanup
## Inspecting the output, we see a couple of punctuations that need cleaning up
## add them to stop words list and clean up again
stop_words_en.update(['.)', '??']) 
# Pass 3: filter once more with the newly added tokens.
cleaned3 = [i for i in gw2006_words_lower if i not in stop_words_en]
## TODO : len of cleaned3
print("len(cleaned3) : ", ???)
print("---")

## TODO : Compute FreqDist for cleaned3
fdist = nltk.FreqDist(???)
print ("most common words in cleaned3 :\n" , fdist.most_common(50))
print("---")

## Step 3 : Analyze another text dataset
In the previous exercise we analyzed data bundled with NLTK.
In this section, we are going to load and analyze a custom dataset.

We will use [State of the union 2014 by President Obama](../data/text/sotu-2014-obama.txt)

In [None]:
import nltk
from os.path import expanduser
nltk.data.path.append( expanduser("~") + "/data/nltk_data")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize

f = open('../data/text/sotu-2014-obama.txt')
text = f.read()
print("len(text)", len(text))
print("text:\n", text[1:500])
print("---")

# Tokenize, split into words
words = word_tokenize(text)
words_lower = [i.lower() for i in words]
print("len(words) : ", ???)
print("words:\n", words[:50])
print ("---")
print("words_lower:\n", words_lower[:50])
print ("---")

# fdist
fdist = nltk.FreqDist(words)
print ("most common in words :\n" , fdist.most_common(50))
print ("---")


## TODO : now use the above example to eliminate stop words from text
## and run distribution

stop_words_en = set(stopwords.words('english'))

cleaned1 = [i for i in words_lower if i not in stop_words_en]
print("len(cleaned1) : ", len(cleaned))
print("---")

fdist = nltk.FreqDist(???)
print ("most common in cleaned1 :\n" , fdist.most_common(50))
print ("---")

## TODO : continue cleaning up further
## remove punctuation
## remove any other 

## Final output is 
## [('applause', 97), ('america', 39), ('help', 32), ('cheers', 32)

## Step 4 : Stemming Part 1
Let's explore the different stemming algorithms available in NLTK.  Run the code below and observe the differences in stemming (marked by \*).

In [None]:
import nltk
from os.path import expanduser
# Add ~/data/nltk_data to NLTK's data search path.
nltk.data.path.append( expanduser("~") + "/data/nltk_data")
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

# Sample vocabulary to run through each stemmer.
words = [
    'run', 'running', 'like', 'liked', 'snow', 'snowing', 'dog', 'dogs',
    'maximum', 'multiply', 'crying', 'leaves', 'fairly',
]

# One instance of each stemming algorithm under comparison.
stemmer_snowball = SnowballStemmer("english")
stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()

# Print one row per word; rows where the three stemmers do not all agree
# are prefixed with "* ".
print("word,  snowball_stem,   porter_stem, lancaster stem")
for word in words:
    snow, porter, lanc = (
        s.stem(word)
        for s in (stemmer_snowball, stemmer_porter, stemmer_lancaster)
    )
    # More than one distinct stem means at least two stemmers disagree.
    flag = "* " if len({snow, porter, lanc}) > 1 else ""
    print(f"{flag}{word},  {snow},  {porter},  {lanc}")

## Step 5: Putting it all together
Now that we know all the algorithms, let's do a final analysis on 2014 SOTU.

In [None]:
import nltk
from os.path import expanduser
nltk.data.path.append( expanduser("~") + "/data/nltk_data")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

f = open('../data/text/sotu-2014-obama.txt')
text = f.read()
words = word_tokenize(text)

## TODO : Cleanup 1 : lowercase it all
words_lower = [i.???() for i in words]


## cleanup 2 - remove stop words
stop_words_english = set(stopwords.words('english'))
stop_words_english.update(['-', '.', ',', '#', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
stop_words_english.update(["'s", '--'])
## TODO : iterate through 'words_lower'
cleaned1 = [i for i in ??? if i not in stop_words_english]
print("len(cleaned1) : ", len(cleaned1))
print ("cleaned1:\n", cleaned1[:50])
print("---")

##  Cleanup 3 - stem
stemmer = SnowballStemmer("english")
## TODO : iterate through 'cleaned1'
cleaned2 = [stemmer.stem(word) for word in ???]
print ("len(cleaned2) : ", len(cleaned2))
print ("cleaned2:\n", cleaned2[:50])
print("---")

## TODO : Wordcount on 'cleaned2'
wc = FreqDist(???)
print("top 20 word count(cleaned2) : " )
for word, frequency in wc.most_common(20):
    print(f"{word} = {frequency}")
print("---")