In [None]:
import sys
import nltk
import pandas as pd
import matplotlib as plt
import sklearn
import seaborn

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [None]:
#Corpus - Body of text, singular. Corpora is the plural of this. Example: A collection of medical journals.

#Lexicon - Words and their meanings. Example: English dictionary. Consider, however, that various fields will have different lexicons.

#Token - Each "entity" that is a part of whatever was split up based on rules.
#        For example, each word is a token when a sentence is "tokenized" into words.
#        Each sentence can also be a token, if you tokenized the sentences out of a paragraph.
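# Preprocessing - converting raw text into a form the computer can work with (e.g. tokenizing, removing stop words, stemming)
# Stop words - very common words (e.g. "is", "the") that carry little meaning on their own and are usually filtered out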

In [None]:
# Sample paragraph (three sentences) shared by the sentence- and word-tokenizer demos
text = "Hello students, how are you doing today? The olympics are inspiring, and Python is awesome. You look nice today."

print(sent_tokenize(text)) # sentence tokenizer

['Hello students, how are you doing today?', 'The olympics are inspiring, and Python is awesome.', 'You look nice today.']


In [None]:
print(word_tokenize(text)) # word tokenizer

['Hello', 'students', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'olympics', 'are', 'inspiring', ',', 'and', 'Python', 'is', 'awesome', '.', 'You', 'look', 'nice', 'today', '.']


In [None]:
print(set(stopwords.words('english')))  # show the English stopword list (nothing is filtered here)

{'above', 'yourself', 'should', 'any', "you're", 'all', "isn't", 'out', "doesn't", 'being', 'haven', 're', 'he', 'wouldn', 'doesn', "weren't", 't', 'does', 'yourselves', 'with', 'couldn', 'each', 'now', "haven't", 'have', "it's", "should've", 'just', 'they', 'up', 'mightn', 'other', 'yours', "needn't", "shouldn't", 'how', 'more', "wasn't", 'both', 'needn', 'ours', 'o', 'was', 'most', 'once', 'very', 'am', 'an', 'over', 'having', 'against', "don't", 'been', "you've", 'why', 'is', 'for', 'll', 'where', 'which', 'don', 'these', 'hers', 'm', 'her', 'between', 'themselves', 'myself', 'those', 'a', 'there', 'do', 'the', 'them', "couldn't", 'our', 'did', 'i', "that'll", 's', 'itself', "aren't", "didn't", 'because', 'didn', 'at', 'its', 'through', 'when', 'of', 'mustn', 'she', "she's", "you'd", 'ma', "shan't", 'before', 'you', "you'll", 'their', 've', 'whom', 'shouldn', 'won', 'while', 'here', "won't", 'same', 'to', 'after', 'on', 'herself', 'are', 'down', 'me', 'than', 'will', 'isn', 'then', 

In [None]:
stopword_eg = ' This is sample text, showing off stop words filtration.'
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(stopword_eg)

# Keep only the tokens that are not in the stopword set.
# The comparison is case-sensitive and the NLTK stopword list is lowercase,
# which is why the capitalised 'This' is kept in the filtered output.
filtered_sentence = [w for w in word_tokens if w not in stop_words]

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'sample', 'text', ',', 'showing', 'off', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'text', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [None]:
# Stemming words
from nltk.stem import PorterStemmer

# A single stemmer instance; it is reused by the sentence-stemming cell below
ps = PorterStemmer()

# NOTE(review): 'rider' is listed twice - possibly a different form was intended; confirm
example_words = ['rider', 'riding', 'rider', 'rides']

# Print the stem of each example word, one per line
for w in example_words:
  print(ps.stem(w))

rider
ride
rider
ride


In [None]:
# Stemming an entire sentence
new_text = 'When riders are riding their horses, they often think of how cowboys rode horses.'

# Split into word and punctuation tokens first; the stemmer is applied to one token at a time
words = word_tokenize(new_text)

# `ps` is the PorterStemmer instance created in the previous cell
for i in words:
  print(ps.stem(i))

when
rider
are
ride
their
hors
,
they
often
think
of
how
cowboy
rode
hors
.


In [None]:
nltk.download('udhr')

[nltk_data] Downloading package udhr to /root/nltk_data...
[nltk_data]   Unzipping corpora/udhr.zip.


True

In [None]:
from nltk.corpus import udhr
print(udhr.raw('English-Latin1'))

Universal Declaration of Human Rights
Preamble
Whereas recognition of the inherent dignity and of the equal and inalienable rights of all members of the human family is the foundation of freedom, justice and peace in the world, 

Whereas disregard and contempt for human rights have resulted in barbarous acts which have outraged the conscience of mankind, and the advent of a world in which human beings shall enjoy freedom of speech and belief and freedom from fear and want has been proclaimed as the highest aspiration of the common people, 

Whereas it is essential, if man is not to be compelled to have recourse, as a last resort, to rebellion against tyranny and oppression, that human rights should be protected by the rule of law, 

Whereas it is essential to promote the development of friendly relations between nations, 

Whereas the peoples of the United Nations have in the Charter reaffirmed their faith in fundamental human rights, in the dignity and worth of the human person and in

In [None]:
nltk.download('state_union')

[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.


True

In [None]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

# Raw text of two State of the Union files: one used to train the sentence
# tokenizer, the other used as the text to tokenize
train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

In [None]:
print(train_text)

In [None]:
# Training sentence tokenizer

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [None]:
# Now we use this to tokenize the sample text

tokenized = custom_sent_tokenizer.tokenize(sample_text)
print(tokenized)

["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.", 'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.', 'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.', '(Applause.)', "President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan. 31, 2006. White House photo by Eric DraperEvery time I'm invited to this rostrum, I'm humbled by the privilege, and mindful of the history we've seen together.", 'We have gathered under this Capitol dome in moments of national mourning and national achievement.', '

In [None]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Tag every token of the first five tokenized sentences with its part of speech

def process_content():
    """Print the POS-tagged tokens of the first five sentences in ``tokenized``.

    Reads the notebook-level ``tokenized`` list. Each sentence is split into
    word tokens and tagged, and the resulting list is printed. Any exception
    raised along the way is caught and only its message is printed.
    """
    try:
        for sentence in tokenized[:5]:
            print(nltk.pos_tag(nltk.word_tokenize(sentence)))

    except Exception as err:
        print(str(err))


# Each printed line is a list of tuples - the word with its part-of-speech tag
process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

##### Chunking with NLTK

Now that each word has been tagged with a part of speech, we can move on to chunking: grouping the words into meaningful clusters.  The main goal of chunking is to group words into "noun phrases": a noun together with any associated verbs, adjectives, or adverbs.

The part of speech tags that were generated in the previous step will be combined with regular expressions, such as the following:

In [None]:
'''
+ = match 1 or more repetitions
? = match 0 or 1 repetitions
* = match 0 or more repetitions
. = any character except a newline
'''

In [None]:
# Repeats the setup from the earlier tokenizer cells: same corpus files,
# same trained sentence tokenizer, same resulting sentence list.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Train the sentence tokenizer on the 2005 text
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# Split the 2006 text into sentences
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """POS-tag and chunk every sentence in the notebook-level ``tokenized`` list.

    Each sentence is word-tokenized, tagged with ``nltk.pos_tag`` and parsed
    with a regular-expression chunk grammar. A "Chunk" is any run of tags
    matching: zero or more ``RB.?`` tags, zero or more ``VB.?`` tags, one or
    more ``NNP`` tags, and an optional ``NN`` tag.

    The parse result is currently not printed or returned; uncomment the
    ``draw()`` call to inspect the chunk trees. Any exception is caught and
    only its message is printed.
    """
    try:
        # The grammar and parser do not depend on the sentence, so build them
        # once instead of once per loop iteration.
        chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkGram)

        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            # combine the part-of-speech tags with the regular expression
            chunked = chunkParser.parse(tagged)

            # draw the chunks with nltk
            # chunked.draw()

    except Exception as e:
        print(str(e))


process_content()

In [None]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

### Text Classification

##### Text classification using NLTK

Now that we have covered the basics of preprocessing for Natural Language Processing, we can move on to text classification using simple machine learning classification algorithms.

In [None]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [None]:
import random
import nltk
from nltk.corpus import movie_reviews

# Pair every review's list of words with the category it is filed under
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents with a dedicated, seeded generator so the order (and
# therefore the later train/test split) is the same on every run, without
# touching the global `random` state.
random.Random(1).shuffle(documents)

print('Number of Documents: {}'.format(len(documents)))
# Index 0 is the first review after shuffling
print('First Review: {}'.format(documents[0]))

# Case-insensitive frequency count over every word in the corpus
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

print('Most common words: {}'.format(all_words.most_common(25)))
print('The word happy: {}'.format(all_words["happy"]))
print('The word movie: {}'.format(all_words["movie"]))

Number of Documents: 2000
First Review: (['a', 'fully', 'loaded', 'entertainment', 'review', '-', 'website', 'coming', 'in', 'july', '!', '>', 'from', 'ace', 'ventura', 'to', 'truman', 'burbank', ',', 'jim', 'carrey', 'has', 'run', 'the', 'whole', 'gamut', 'of', 'comic', ',', 'yet', 'sympathetic', ',', 'characters', '.', '1996', "'", 's', 'the', 'cable', 'guy', 'was', 'supposed', 'to', 'be', 'his', 'big', '"', 'breakthrough', '"', 'role', 'from', 'zany', 'humor', 'into', 'darker', ',', 'more', 'dramatic', 'acting', '.', 'as', 'most', 'everyone', 'knows', ',', 'the', 'results', 'were', ',', 'well', ',', 'less', '-', 'than', '-', 'stellar', '.', 'not', 'only', 'did', 'the', 'film', 'not', 'do', 'so', 'hot', 'at', 'the', 'box', 'office', ',', 'but', 'it', 'was', 'also', 'panned', 'by', 'critics', '.', 'as', 'far', 'as', 'i', 'know', ',', 'gene', 'siskel', 'and', 'i', 'are', 'the', 'only', 'ones', 'willing', 'to', 'admit', 'that', 'we', 'dug', 'it', '.', 'the', 'first', 'time', 'i', 'saw',

In [None]:
# We'll use the 4000 most common words as features.
# The keys of a FreqDist are not sorted by frequency, so slicing keys() would
# just take the first 4000 distinct words seen; most_common() ranks by count.
print(len(all_words))
word_features = [word for word, _count in all_words.most_common(4000)]

39768


In [None]:
# find_features reports, for every word in word_features, whether the review contains it
def find_features(document):
    """Build a presence/absence feature dict for one document.

    Args:
        document: iterable of word tokens.

    Returns:
        A dict with one key per entry of the notebook-level ``word_features``
        list (in that order); the value is True when the word occurs in
        ``document`` and False otherwise.
    """
    vocabulary = set(document)
    return {feature: (feature in vocabulary) for feature in word_features}


# Let's use an example from a negative review and list every feature word it contains
features = find_features(movie_reviews.words('neg/cv000_29416.txt'))
for key, value in features.items():
    if value:
        print(key)

In [None]:
# Now let's do it for all the documents: one (feature dict, category) pair per review
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [None]:
# we can split the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# define a seed for reproducibility
seed = 1

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [None]:
print(len(training))
print(len(testing))

In [None]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

# Wrap a linear-kernel SVC so it can be trained through NLTK's classifier interface
model = SklearnClassifier(SVC(kernel='linear'))

# Fit on the training split
model.train(training)

# Score on the testing split, expressed as a percentage
accuracy = 100 * nltk.classify.accuracy(model, testing)
print("SVC Accuracy: {}".format(accuracy))