In [2]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [4]:
from nltk.corpus import names
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import string

**Exercise:  calculate the tf-idf for the movie_review corpus and the top ranking words for that corpus.**

In [6]:
import sklearn
from sklearn.datasets import load_files

In [7]:
moviedir = r'C:\Users\Aether Analytics\Desktop\movie_reviews'

In [8]:
# loading all files as training data. 
movie_train = load_files(moviedir, shuffle=True)
len(movie_train.data)

2000

In [9]:
# target names ("classes") are automatically generated from subfolder names
movie_train.target_names

['neg', 'pos']

In [10]:
# First file seems to be about a Schwarzenegger movie. 
movie_train.data[0][:500]

"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so cal"

In [11]:
# first file is in "neg" folder
movie_train.filenames[0]

'C:\\Users\\Aether Analytics\\Desktop\\movie_reviews\\neg\\cv405_21868.txt'

In [12]:
 #first file is a negative review and is mapped to 0 index 'neg' in target_names
movie_train.target[0]

0

**CountVectorizer & TF-IDF**

In [14]:
# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Turn off pretty printing of jupyter notebook... it generates long lines
%pprint

Pretty printing has been turned OFF


In [16]:
import nltk

In [17]:
sents = ['A rose is a rose is a rose is a rose.',
         'Oh, what a fine day it is.',
        "It ain't over till it's over, I tell you!!"]

In [18]:
# Initialize a CoutVectorizer to use NLTK's tokenizer instead of its 
# default one (which ignores punctuation and stopwords). 
# Minimum document frequency set to 1. 
foovec = CountVectorizer(min_df=1, tokenizer=nltk.word_tokenize)

In [19]:
# sents turned into sparse vector of word frequency counts
sents_counts = foovec.fit_transform(sents)
# foovec now contains vocab dictionary which maps unique words to indexes
foovec.vocabulary_

{u'!': 0, u"'s": 1, u'ai': 5, u'over': 13, u'it': 10, u'fine': 7, u'day': 6, u'a': 4, u'what': 17, u'oh': 12, u'i': 8, u'rose': 14, u',': 2, u'.': 3, u"n't": 11, u'till': 16, u'tell': 15, u'you': 18, u'is': 9}

In [20]:
# sents_counts has a dimension of 3 (document count) by 19 (# of unique words)
sents_counts.shape

(3, 19)

In [21]:
# this vector is small enough to view in full! 
sents_counts.toarray()

array([[0, 0, 0, 1, 4, 0, 0, 0, 0, 3, 0, 0, 0, 0, 4, 0, 0, 0, 0],
       [0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0],
       [2, 1, 1, 0, 0, 1, 0, 0, 1, 0, 2, 1, 0, 2, 0, 1, 1, 0, 1]], dtype=int64)

In [22]:
# Convert raw frequency counts into TF-IDF (Term Frequency -- Inverse Document Frequency) values
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
sents_tfidf = tfidf_transformer.fit_transform(sents_counts)

In [23]:
# TF-IDF values
# raw counts have been normalized against document length, 
# terms that are found across many docs are weighted down
sents_tfidf.toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.13650997,  0.54603988,
         0.        ,  0.        ,  0.        ,  0.        ,  0.40952991,
         0.        ,  0.        ,  0.        ,  0.        ,  0.71797683,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.28969526,  0.28969526,  0.28969526,
         0.        ,  0.38091445,  0.38091445,  0.        ,  0.28969526,
         0.28969526,  0.        ,  0.38091445,  0.        ,  0.        ,
         0.        ,  0.        ,  0.38091445,  0.        ],
       [ 0.47282517,  0.23641258,  0.17979786,  0.        ,  0.        ,
         0.23641258,  0.        ,  0.        ,  0.23641258,  0.        ,
         0.35959573,  0.23641258,  0.        ,  0.47282517,  0.        ,
         0.23641258,  0.23641258,  0.        ,  0.23641258]])

**transforming movie reviews**

In [24]:
# initialize movie_vector object, and then turn movie train data into a vector 
movie_vec = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize)         # use all 25K words. 82.2% acc.
# movie_vec = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize, max_features = 3000) # use top 3000 words only. 78.5% acc.
movie_counts = movie_vec.fit_transform(movie_train.data)

In [25]:
# 'screen' is found in the corpus, mapped to index 19637
movie_vec.vocabulary_.get('screen')

19637

In [26]:
# Likewise, Mr. Steven Seagal is present...
movie_vec.vocabulary_.get('seagal')

19690

In [27]:
# huge dimensions! 2,000 documents, 25K unique terms. 
movie_counts.shape

(2000, 25313)

In [28]:
# Convert raw frequency counts into TF-IDF values
tfidf_transformer = TfidfTransformer()
movie_tfidf = tfidf_transformer.fit_transform(movie_counts)

In [29]:
# Same dimensions, now with tf-idf values instead of raw frequency counts
movie_tfidf.shape

(2000, 25313)

**Training and testing a Naive Bayes classifier**

In [30]:
# Now ready to build a classifier. 
# We will use Multinominal Naive Bayes as our model
from sklearn.naive_bayes import MultinomialNB

In [31]:
# Split data into training and test sets
# from sklearn.cross_validation import train_test_split  # deprecated in 0.18
from sklearn.model_selection import train_test_split
docs_train, docs_test, y_train, y_test = train_test_split(
    movie_tfidf, movie_train.target, test_size = 0.20, random_state = 12)

In [32]:
# Train a Multimoda Naive Bayes classifier
clf = MultinomialNB().fit(docs_train, y_train)

In [33]:
# Predicting the Test set results, find accuracy
y_pred = clf.predict(docs_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

0.82250000000000001

In [34]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[176,  30],
       [ 41, 153]], dtype=int64)

**Trying classifier on fake movie reviews**

In [36]:
# very short and fake movie reviews
reviews_new = ['This movie was excellent', 'Absolute joy ride', 
            'Steven Seagal was terrible', 'Steven Seagal shined through.', 
              'This was certainly a movie', 'Two thumbs up', 'I fell asleep halfway through', 
              "We can't wait for the sequel!!", '!', '?', 'I cannot recommend this highly enough', 
              'instant classic.', 'Steven Seagal was amazing. His performance was Oscar-worthy.']
reviews_new_counts = movie_vec.transform(reviews_new)
reviews_new_tfidf = tfidf_transformer.transform(reviews_new_counts)

In [37]:
# have classifier make a prediction
pred = clf.predict(reviews_new_tfidf)

In [38]:
# print out results
for review, category in zip(reviews_new, pred):
    print('%r => %s' % (review, movie_train.target_names[category]))

'This movie was excellent' => pos
'Absolute joy ride' => pos
'Steven Seagal was terrible' => neg
'Steven Seagal shined through.' => neg
'This was certainly a movie' => neg
'Two thumbs up' => neg
'I fell asleep halfway through' => neg
"We can't wait for the sequel!!" => neg
'!' => neg
'?' => neg
'I cannot recommend this highly enough' => pos
'instant classic.' => pos
'Steven Seagal was amazing. His performance was Oscar-worthy.' => neg


In [39]:
import nltk
import random
from nltk.corpus import movie_reviews
import pprint
from nltk.corpus import stopwords
stop_words = stopwords.words("english")
import pickle

In [40]:
movie_reviews.categories()

[u'neg', u'pos']

In [41]:
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)
            ]
random.shuffle(documents)

**Getting the list of all words to store the most frequently occuring ones**

In [42]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

In [43]:
#making a frequency distribution of the words
all_words = nltk.FreqDist(all_words)
all_words.most_common(20)

[(u',', 77717), (u'the', 76529), (u'.', 65876), (u'a', 38106), (u'and', 35576), (u'of', 34123), (u'to', 31937), (u"'", 30585), (u'is', 25195), (u'in', 21822), (u's', 18513), (u'"', 17612), (u'it', 16107), (u'that', 15924), (u'-', 15595), (u')', 11781), (u'(', 11664), (u'as', 11378), (u'with', 10792), (u'for', 9961)]

In [44]:
all_words["hate"]  ## counting the occurences of a single word

134

In [45]:
#train first 5000 words on the list
feature_words = list(all_words.keys())[:5000]

In [46]:
def find_features(document):
    words = set(document)
    feature = {}
    for w in feature_words:
        feature[w] = (w in words)
    return feature

In [47]:
feature_sets = [(find_features(rev), category) for (rev, category) in documents]

In [48]:
#Training the classifier
training_set = feature_sets[:1900]
testing_set = feature_sets[1900:]

**Naive Bayes Algorithm**

In [49]:
## TO-DO: To build own naive bais algorithm
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [50]:
## Testing it's accuracy
print("Naive bayes classifier accuracy percentage : ", (nltk.classify.accuracy(classifier, testing_set))*100)

('Naive bayes classifier accuracy percentage : ', 70.0)


In [51]:
classifier.show_most_informative_features(20)

Most Informative Features
               insulting = True              neg : pos    =     11.1 : 1.0
                  doubts = True              pos : neg    =      9.6 : 1.0
                    sans = True              neg : pos    =      8.4 : 1.0
             wonderfully = True              pos : neg    =      8.4 : 1.0
                    scum = True              pos : neg    =      8.2 : 1.0
              mediocrity = True              neg : pos    =      7.7 : 1.0
                   tripe = True              neg : pos    =      7.7 : 1.0
               overboard = True              pos : neg    =      7.6 : 1.0
            coincidences = True              neg : pos    =      7.1 : 1.0
               dismissed = True              pos : neg    =      6.9 : 1.0
                    taxi = True              pos : neg    =      6.5 : 1.0
             bruckheimer = True              neg : pos    =      6.4 : 1.0
                 wasting = True              neg : pos    =      6.4 : 1.0

In [52]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [53]:
# This is how the Naive Bayes classifier expects the input
def create_word_features(words):
    
    # Remove all stopwords
    useful_words = [word for word in words if word not in stopwords.words("english")]
    
    # For each word, we create a dictionary with all the words and True. 
    # Why a dictionary? So that words are not repeated. If a word already exists, it won’t be added to the dictionary.
    my_dict = dict([(word, True) for word in useful_words])
    
    return my_dict

In [54]:
create_word_features(["python", "is", "better", "than", "r", "and", "r", "is", "better", "than", "java"])

{'python': True, 'better': True, 'r': True, 'java': True}

In [55]:
# We create an empty list called neg_reviews. Next, we loop over all the files in the neg folder.
neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    # We get all the words in that file.
    words = movie_reviews.words(fileid)
    # Then we use the function we wrote earlier to create word features in the format nltk expects.
    neg_reviews.append((create_word_features(words), "negative"))

In [56]:
print(neg_reviews[0])    
print(len(neg_reviews))

({u'concept': True, u'-': True, u'insight': True, u'salvation': True, u'playing': True, u'executed': True, u'go': True, u'still': True, u'find': True, u'seemed': True, u'member': True, u'touches': True, u'thrilling': True, u'craziness': True, u'somewhere': True, u'(': True, u'excites': True, u'seems': True, u',': True, u'snag': True, u'presents': True, u'going': True, u'4': True, u'pretty': True, u'skip': True, u'folks': True, u'8': True, u'main': True, u'might': True, u'good': True, u'7': True, u'get': True, u'big': True, u'showing': True, u'continues': True, u'watch': True, u'break': True, u'feels': True, u'every': True, u'know': True, u'half': True, u'world': True, u'bit': True, u'password': True, u'exact': True, u'dreams': True, u'cool': True, u'entire': True, u'like': True, u'lost': True, u'always': True, u'dig': True, u'wrapped': True, u'bad': True, u'highway': True, u'arrow': True, u'meantime': True, u'rarely': True, u'giving': True, u'looooot': True, u'mean': True, u'flick': Tr

In [57]:
# Let’s do the same for the positive reviews. The code is exactly the same:
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pos_reviews.append((create_word_features(words), "positive"))

In [58]:
print(pos_reviews[0])    
print(len(pos_reviews))

({u'childs': True, u'steve': True, u'surgical': True, u'/': True, u'go': True, u'certainly': True, u'watchmen': True, u'song': True, u'simpsons': True, u'novel': True, u'jack': True, u'surgeon': True, u'level': True, u'turns': True, u'michael': True, u'flashy': True, u'sooty': True, u'direct': True, u'past': True, u'street': True, u'design': True, u'befriends': True, u'odd': True, u'even': True, u'new': True, u'supporting': True, u'never': True, u'quell': True, u'les': True, u'102': True, u'strong': True, u'great': True, u'kids': True, u'30': True, u'creepy': True, u'nervous': True, u'rafael': True, u'named': True, u'love': True, u'brought': True, u'color': True, u'ians': True, u'would': True, u'indians': True, u'chooses': True, u'music': True, u'films': True, u'oscar': True, u'holm': True, u'arthouse': True, u'keeping': True, u'graphic': True, u'word': True, u'car': True, u'era': True, u'crazy': True, u'coltrane': True, u'carrot': True, u'sense': True, u'needs': True, u'end': True, u'

In [59]:
# We will now create our test and train samples
train_set = neg_reviews[:750] + pos_reviews[:750]
test_set =  neg_reviews[750:] + pos_reviews[750:]

print(len(train_set),  len(test_set))

(1500, 500)


In [60]:
# Let’s create our Naive Bayes Classifier, and train it with our training set.
classifier = NaiveBayesClassifier.train(train_set)

# And let’s use our test set to find the accuracy
accuracy = nltk.classify.util.accuracy(classifier, test_set)
print(accuracy * 100)

72.4


In [61]:
review_santa = '''

It would be impossible to sum up all the stuff that sucks about this film, so I'll break it down into what I remember most strongly: a man in an ingeniously fake-looking polar bear costume (funnier than the "bear" from Hercules in New York); an extra with the most unnatural laugh you're ever likely to hear; an ex-dope addict martian with tics; kid actors who make sure every syllable of their lines are slowly and caaarreee-fulll-yyy prrooo-noun-ceeed; a newspaper headline stating that Santa's been "kidnaped", and a giant robot. Yes, you read that right. A giant robot.

The worst acting job in here must be when Mother Claus and her elves have been "frozen" by the "Martians'" weapons. Could they be *more* trembling? I know this was the sixties and everyone was doped up, but still.
'''
print(review_santa )



It would be impossible to sum up all the stuff that sucks about this film, so I'll break it down into what I remember most strongly: a man in an ingeniously fake-looking polar bear costume (funnier than the "bear" from Hercules in New York); an extra with the most unnatural laugh you're ever likely to hear; an ex-dope addict martian with tics; kid actors who make sure every syllable of their lines are slowly and caaarreee-fulll-yyy prrooo-noun-ceeed; a newspaper headline stating that Santa's been "kidnaped", and a giant robot. Yes, you read that right. A giant robot.

The worst acting job in here must be when Mother Claus and her elves have been "frozen" by the "Martians'" weapons. Could they be *more* trembling? I know this was the sixties and everyone was doped up, but still.



In [62]:
words = word_tokenize(review_santa)
words = create_word_features(words)
classifier.classify(words)

'negative'

In [63]:
review_spirit = '''
Spirited Away' is the first Miyazaki I have seen, but from this stupendous film I can tell he is a master storyteller. A hallmark of a good storyteller is making the audience empathise or pull them into the shoes of the central character. Miyazaki does this brilliantly in 'Spirited Away'. During the first fifteen minutes we have no idea what is going on. Neither does the main character Chihiro. We discover the world as Chihiro does and it's truly amazing to watch. But Miyazaki doesn't seem to treat this world as something amazing. The world is filmed just like our workaday world would. The inhabitants of the world go about their daily business as usual as full with apathy as us normal folks. Places and buildings are not greeted by towering establishing shots and majestic music. The fact that this place is amazing doesn't seem to concern Miyazaki.

What do however, are the characters. Miyazaki lingers upon the characters as if they were actors. He infixes his animated actors with such subtleties that I have never seen, even from animation giants Pixar. Twenty minutes into this film and I completely forgot these were animated characters; I started to care for them like they were living and breathing. Miyazaki treats the modest achievements of Chihiro with unashamed bombast. The uplifting scene where she cleanses the River God is accompanied by stirring music and is as exciting as watching gladiatorial combatants fight. Of course, by giving the audience developed characters to care about, the action and conflicts will always be more exciting, terrifying and uplifting than normal, generic action scenes. 
'''
print(review_spirit)


Spirited Away' is the first Miyazaki I have seen, but from this stupendous film I can tell he is a master storyteller. A hallmark of a good storyteller is making the audience empathise or pull them into the shoes of the central character. Miyazaki does this brilliantly in 'Spirited Away'. During the first fifteen minutes we have no idea what is going on. Neither does the main character Chihiro. We discover the world as Chihiro does and it's truly amazing to watch. But Miyazaki doesn't seem to treat this world as something amazing. The world is filmed just like our workaday world would. The inhabitants of the world go about their daily business as usual as full with apathy as us normal folks. Places and buildings are not greeted by towering establishing shots and majestic music. The fact that this place is amazing doesn't seem to concern Miyazaki.

What do however, are the characters. Miyazaki lingers upon the characters as if they were actors. He infixes his animated actors with such 

In [64]:
words = word_tokenize(review_spirit)
words = create_word_features(words)
classifier.classify(words)

'positive'