# Tokenizing

In [None]:
import nltk
nltk.download()

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize

EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

print(sent_tokenize(EXAMPLE_TEXT))

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]


In [2]:
print(word_tokenize(EXAMPLE_TEXT))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


# Stop Words

We would not want these words taking up space in our database, or taking up valuable processing time. As such, we call these words "stop words" because they are useless, and we wish to do nothing with them.

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration."

stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sent)

filtered_sentence = [w for w in word_tokens if not w in stop_words]

filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [4]:
print(stop_words)

{'are', 'but', 'hasn', 're', 'myself', 'ma', 'we', "weren't", 'some', 'before', 'why', 'd', 'the', "you'd", "mightn't", 'did', "don't", "you'll", 'his', 'them', 'mustn', 'as', "mustn't", "hadn't", 'if', 'whom', 'doing', 'does', 'just', 'once', 'our', "haven't", 'few', 'she', 'yourself', 'those', 'am', 'themselves', 'over', 'its', 'about', 'into', 'now', "aren't", 'hers', 'while', 'own', 'me', 'through', 'has', "won't", 'below', 'hadn', 'should', 'been', 'do', 'same', 'needn', 'or', 'weren', 'of', 'll', 'here', "she's", 'above', 'all', 'further', 'aren', 'doesn', 'what', 'shan', 'that', 'wasn', "wouldn't", "shouldn't", "you're", "didn't", 'having', 'because', 'more', 'theirs', 'then', 'until', 'out', 's', 'her', "shan't", 'ourselves', "you've", 'at', 'such', 'ain', 'himself', 'during', 'shouldn', "needn't", 'you', 'so', "couldn't", 'o', 'is', 'had', 'by', 'there', 'my', "should've", 'these', "it's", 'who', 'don', 'm', 'not', 'this', 'each', 'off', 't', 've', 'against', 'were', 'was', 'f

# Stemming

In [5]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

In [6]:
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

In [7]:
for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli


In [8]:
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

In [9]:
words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))

It
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


# Part of Speech Tagging

In [None]:
# POS tag list:

# CC	coordinating conjunction
# CD	cardinal digit
# DT	determiner
# EX	existential there (like: "there is" ... think of it like "there exists")
# FW	foreign word
# IN	preposition/subordinating conjunction
# JJ	adjective	'big'
# JJR	adjective, comparative	'bigger'
# JJS	adjective, superlative	'biggest'
# LS	list marker	1)
# MD	modal	could, will
# NN	noun, singular 'desk'
# NNS	noun plural	'desks'
# NNP	proper noun, singular	'Harrison'
# NNPS	proper noun, plural	'Americans'
# PDT	predeterminer	'all the kids'
# POS	possessive ending	parent\'s
# PRP	personal pronoun	I, he, she
# PRP$	possessive pronoun	my, his, hers
# RB	adverb	very, silently,
# RBR	adverb, comparative	better
# RBS	adverb, superlative	best
# RP	particle	give up
# TO	to	go 'to' the store.
# UH	interjection	errrrrrrrm
# VB	verb, base form	take
# VBD	verb, past tense	took
# VBG	verb, gerund/present participle	taking
# VBN	verb, past participle	taken
# VBP	verb, sing. present, non-3d	take
# VBZ	verb, 3rd person sing. present	takes
# WDT	wh-determiner	which
# WP	wh-pronoun	who, what
# WP$	possessive wh-pronoun	whose
# WRB	wh-abverb	where, when

In [10]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [11]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [12]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [13]:
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [14]:
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))


process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

# Lemmatizing

In [16]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))

cat
cactus
goose
rock
python
good
best
run
run


# WordNet

In [24]:
from nltk.corpus import wordnet

syns = wordnet.synsets("good")

In [25]:
print(syns[0].name())

good.n.01


In [33]:
for s in syns:
    for l in s.lemmas(): # Lemmas are Synonyms
        print(l.name())

good
good
goodness
good
goodness
commodity
trade_good
good
good
full
good
good
estimable
good
honorable
respectable
beneficial
good
good
good
just
upright
adept
expert
good
practiced
proficient
skillful
skilful
good
dear
good
near
dependable
good
safe
secure
good
right
ripe
good
well
effective
good
in_effect
in_force
good
good
serious
good
sound
good
salutary
good
honest
good
undecomposed
unspoiled
unspoilt
good
well
good
thoroughly
soundly
good


In [19]:
print(syns[0].lemmas()[0].name())

plan


In [20]:
print(syns[0].definition())

a series of steps to be carried out or goals to be accomplished


In [21]:
print(syns[0].examples())

['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [22]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))

{'safe', 'salutary', 'undecomposed', 'dear', 'unspoiled', 'proficient', 'near', 'ripe', 'commodity', 'practiced', 'trade_good', 'secure', 'honest', 'unspoilt', 'serious', 'skilful', 'full', 'well', 'honorable', 'expert', 'dependable', 'thoroughly', 'goodness', 'in_force', 'in_effect', 'sound', 'just', 'estimable', 'soundly', 'good', 'respectable', 'beneficial', 'skillful', 'effective', 'adept', 'upright', 'right'}
{'bad', 'evil', 'badness', 'evilness', 'ill'}


In [34]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))

0.9090909090909091


In [36]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('car.n.01')
print(w1.wup_similarity(w2))

0.6956521739130435


In [37]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('cat.n.01')
print(w1.wup_similarity(w2))

0.32


# Text Classification

In [46]:
import nltk
import random
from nltk.corpus import movie_reviews

movie_reviews.categories()

['neg', 'pos']

In [48]:
# movie_reviews.fileids('neg') -> this gives all negative review text file names as output

In [54]:
list(movie_reviews.words('neg/cv160_10848.txt'))[0:20]

['deceiver',
 'is',
 'a',
 'plot',
 'twist',
 'in',
 'search',
 'of',
 'a',
 'movie',
 '.',
 'this',
 'overly',
 'constructed',
 'film',
 'succeeds',
 'in',
 'having',
 'many',
 'surprises']

In [50]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [51]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [55]:
print(all_words["stupid"]) # Returns the frequency of word "Stupid"

253


# Converting words to Features

In [56]:
import nltk
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000] # Keys -> Words in all_words, this contains 3000 most common words

In [61]:
word_features[0:10]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']

In [62]:
len(word_features)

3000

### find_features function will find these top 3,000 words in our positive and negative documents, marking their presence as either positive or negative

In [63]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [64]:
print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) # Example



In [65]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Naive Bayes Classifier

In [66]:
# set that we'll train our classifier with
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]

In [67]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [68]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 86.0


In [69]:
classifier.show_most_informative_features(15)

Most Informative Features
                   sucks = True              neg : pos    =     15.7 : 1.0
                  annual = True              pos : neg    =      9.7 : 1.0
                 idiotic = True              neg : pos    =      9.6 : 1.0
                 frances = True              pos : neg    =      9.0 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
                  turkey = True              neg : pos    =      6.6 : 1.0
                    lame = True              neg : pos    =      6.4 : 1.0
             silverstone = True              neg : pos    =      6.3 : 1.0
               atrocious = True              neg : pos    =      6.2 : 1.0

## Let's take an example

In [70]:
sample_text = "This movie is idiotic and it sucks"

In [79]:
tokenized_text = word_tokenize(sample_text)
print(tokenized_text)

['This', 'movie', 'is', 'idiotic', 'and', 'it', 'sucks']


In [81]:
test_text = find_features(tokenized_text)
print(test_text)



In [83]:
classifier.classify(test_text)

'neg'

# Saving Classifiers

### Dumping classfier data into a file

In [85]:
import pickle

save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

### Loading a classfier from a pickle file

In [86]:
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

# Sci-Kit Learn Incorporation

In [87]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, testing_set))

BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, testing_set))

MultinomialNB accuracy percent: 0.86
BernoulliNB accuracy percent: 0.87


In [89]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 86.0
Most Informative Features
                   sucks = True              neg : pos    =     15.7 : 1.0
                  annual = True              pos : neg    =      9.7 : 1.0
                 idiotic = True              neg : pos    =      9.6 : 1.0
                 frances = True              pos : neg    =      9.0 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
                  turkey = True              neg : pos    =      6.6 : 1.0
                    lame = True              neg : pos    =      6.4 : 1.0
             silverstone = True              neg : pos    =      6.3 : 1.0
               atrocious 

# Combining Algorithms

In [91]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode


class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

#print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

featuresets = [(find_features(rev), category) for (rev, category) in documents]
        
training_set = featuresets[:1900]
testing_set =  featuresets[1900:]

#classifier = nltk.NaiveBayesClassifier.train(training_set)

classifier_f = open("naivebayes.pickle","rb")
classifier = pickle.load(classifier_f)
classifier_f.close()


print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(training_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)


voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)

Original Naive Bayes Algo accuracy percent: 91.0
Most Informative Features
                   sucks = True              neg : pos    =     15.7 : 1.0
                  annual = True              pos : neg    =      9.7 : 1.0
                 idiotic = True              neg : pos    =      9.6 : 1.0
                 frances = True              pos : neg    =      9.0 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
                  turkey = True              neg : pos    =      6.6 : 1.0
                    lame = True              neg : pos    =      6.4 : 1.0
             silverstone = True              neg : pos    =      6.3 : 1.0
               atrocious 

# Improving Training Data

In [105]:
short_pos = open("short_reviews/positive.txt","r").read()
short_neg = open("short_reviews/negative.txt","r").read()

documents = []

for r in short_pos.split('\n'):
    documents.append( (r, "pos"))

for r in short_neg.split('\n'):
    documents.append( (r, "neg"))


all_words = []

short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

for w in short_pos_words:
    all_words.append(w.lower())

for w in short_neg_words:
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf3 in position 4645: invalid continuation byte