In [54]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [55]:
import tarfile

In [56]:
import numpy as np
import nltk
import regex as re
import os
import collections
import math
import os
import random
import zipfile

#file download utilities
from six.moves import urllib
from six.moves import xrange

import numpy as np
import tensorflow as tf


In [57]:
print(np.__version__)
print(nltk.__version__)

1.14.1
3.2.5


In [58]:
DOWNLOADED_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    if not os.path.exists(DOWNLOADED_FILENAME):
        filename, _ = urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)
    
    print('Found and verified file fom this path: ', url_path)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

In [85]:
TOKEN_REGEX = re.compile("[^A-Za-z0-9]+")

def get_reviews(dirname, positive=True):
    label = 1 if positive else 0
    
    reviews = []
    
    for filename in os.listdir(dirname):
        if filename.endswith(".txt"):
#             print(filename)
            with open(dirname + filename, 'r+', encoding='utf8') as f:
                review = f.read()
                review = review.lower().replace("<br />", " ")
                review = re.sub(TOKEN_REGEX, '', review)
                
                # Return a tuple of the review tex and the label
                # whether it is a positive or a negative review
                #1 - positive review
                #0 - negative review
                reviews.append((review,label))
    print(len(reviews))
    return reviews


    
        
    

In [62]:
def extract_reviews():
    
    #If the file has not already been extracted
    if not os.path.exists('aclImdb'):
        with tarfile.open(DOWNLOADED_FILENAME)  as tar:
            tar.extractall()
            tar.close()
    
    positive_reviews = get_reviews("aclImdb/train/pos/", positive=True)
#     print(positive_reviews)
    negative_reviews = get_reviews("aclImdb/train/neg/", positive=False)
    
    return positive_reviews, negative_reviews
    

In [63]:
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
download_file(URL_PATH)

Found and verified file fom this path:  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Downloaded file:  ImdbReviews.tar.gz


In [86]:
positive_reviews, negative_reviews = extract_reviews()

5457
12500


In [87]:
positive_reviews[:2]

[('bromwellhighisacartooncomedyitranatthesametimeassomeotherprogramsaboutschoollifesuchasteachersmy35yearsintheteachingprofessionleadmetobelievethatbromwellhighssatireismuchclosertorealitythanisteachersthescrambletosurvivefinanciallytheinsightfulstudentswhocanseerightthroughtheirpatheticteacherspompthepettinessofthewholesituationallremindmeoftheschoolsiknewandtheirstudentswhenisawtheepisodeinwhichastudentrepeatedlytriedtoburndowntheschooliimmediatelyrecalledathighaclassiclineinspectorimheretosackoneofyourteachersstudentwelcometobromwellhighiexpectthatmanyadultsofmyagethinkthatbromwellhighisfarfetchedwhatapitythatitisnt',
  1),
 ('ilikedthefilmsomeoftheactionsceneswereveryinterestingtenseandwelldoneiespeciallylikedtheopeningscenewhichhadasemitruckinitaverytenseactionscenethatseemedwelldonesomeofthetransitionalsceneswerefilmedininterestingwayssuchastimelapsephotographyunusualcolorsorinterestinganglesalsothefilmisfunnyisseveralpartsialsolikedhowtheevilguywasportrayedtooidgivethefilman8out

In [88]:
len(positive_reviews)

5457

In [89]:
len(negative_reviews)

12500

In [90]:
TRAIN_DATA = 5000
TOTAL_DATA = 6000

train_reviews = positive_reviews[:TRAIN_DATA] + negative_reviews[:TRAIN_DATA]

test_positive_reviews = positive_reviews[TRAIN_DATA:TOTAL_DATA]
test_negative_reviews = negative_reviews[TRAIN_DATA:TOTAL_DATA]


In [94]:
len(train_reviews)

10000

In [91]:
def get_vocabulary(train_reviews):
    words_set = set()
    
    for review in train_reviews:
        words_set.update(review[0].split())
    
    return list(words_set)



In [92]:
vocabulary = get_vocabulary(train_reviews)

In [93]:
len(vocabulary)

9979

In [72]:
vocabulary[:5]

['easilyoneofthetenbestmoviesofthe20thcenturyincoldbloodisbrilliantinthesimplicityandrealismofitsstorytellingandabsolutelyrivetingrobertblakewalksawaywiththefilmthestoryseemstobepresentedalmostentirelyfromperrysviewpointdespitedickbeingtheleaderandplannerofthepairtheviewerwillinvariableperceivedickasbeingmoreunstableimmatureandgenerallyfeellikeperrywouldnothavebeenpulledintothisnightmarebutfordickandhisneedtobesomebodyandpulloffabigscorebasedonatruestorywithparticularattentiontoaccuracyincoldblooddepictsthestorybehindthebrutalandsenselessmurderofaruralkansasfamilyonecoldwindynightbecausedickhasboughtintoanageoldruralmythaboutprosperousfarmershavingasafefullofcashintheirhomeasprosecutoracharacterthatisntgivenanameinthescriptplayedbywillgeersoastutelypointsouttheirlivesareboughtforonly10aheaddirectorrichardbrookswiselychoosesnottosharewithusthegruesomedetailsofthemurdersuntiltheendofthefilmpriortothisweonlyknowithashappenedandwatchthelivesofdickandperryslowlyunravelastheyattempttoescapen

In [73]:
#outputs the data the way our ML model expects it to be
def extract_features(review_text):
    
    #Split the review into words and create a set of words
    review_words = set(review_text.split())
    
    features = {}#dictionary of (word, boolean)
    for word in vocabulary:
        features[word] = (word in review_words)
        #very similar to the one-hot notation
        
    return features

In [74]:
train_features = nltk.classify.apply_features(extract_features, train_reviews)

In [75]:
trained_classifier = nltk.NaiveBayesClassifier.train(train_features)

In [76]:
#the above line of code will give us a trained Machine Learning Model

In [77]:
def sentiment_calculator(review_text):
    features = extract_features(review_text)
    return trained_classifier.classify(features)

In [78]:
sentiment_calculator("What an amazing moveie!")

1

In [79]:
sentiment_calculator("Was a great movie until I realised it was not")

1

In [80]:
sentiment_calculator("wasn't a bad movie I should say")

1

In [81]:
sentiment_calculator("was not a bad movie I should say")

1

In [82]:
sentiment_calculator("was not a great movie I should say")

1

In [83]:
def classify_test_reviews(test_positive_reviews, test_negative_reviews, sentiment_calculator):
    
    positive_results = [sentiment_calculator(review[0]) for review in test_positive_reviews]
    negative_results = [sentiment_calculator(review[0]) for review in test_negative_reviews]
    
    true_positives = sum(x > 0 for x in positive_results)
    true_negatives = sum(x == 0 for x in negative_results)
    
    percent_true_positive = float(true_positives/len(positive_results))
    percent_true_negative = float(true_negatives/len(negative_results))
    
    total_accurate = true_positives + true_negatives
    total = len(positive_results) + len(negative_results)
    
    print("Accuracy on positive reviews = " + "%.2f" % (percent_true_positive * 100) + "%")
    print("Accuracy on negative reviews = " + "%.2f" % (percent_true_negative * 100) + "%")
    print("Overall accuracy = " + "%.2f" % (total_accurate * 100/ total) + "%")
    

In [84]:
classify_test_reviews(test_positive_reviews, test_negative_reviews, sentiment_calculator)

Accuracy on positive reviews = 100.00%
Accuracy on negative reviews = 0.00%
Overall accuracy = 31.37%
