In [1]:
from nltk.corpus import movie_reviews
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import NaiveBayesClassifier
from nltk import pos_tag
import nltk
import random
import string

In [2]:
Categories = movie_reviews.categories() #Whether the movie review is positive or negative

In [3]:
FileIds = movie_reviews.fileids() #IDs of all movie reviews

In [4]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))    #List of tuple of fileid and its category
random.shuffle(documents)  #Shuffle the documents list

In [5]:
lemmatizer = WordNetLemmatizer()  # Creating a Lemmatization Object

In [6]:
stops = set(stopwords.words('english'))   #Set of all stopwords of english language
punctuations = list(string.punctuation)   #List of all punctuations
stops.update(punctuations)                #Appending Punctuations in Stopwords Set

In [7]:
def get_simple_pos(tag):            #Convert POS Complicated Tags into simple Tags
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [8]:
def clean_review(words):                         #We get words after lemmatization - Basically cleaning of words
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))  #Lemmatization requires pos
            output_words.append(clean_word.lower())
    return output_words

In [9]:
#All the cleaned words in a review with category
documents = [(clean_review(document),category) for document,category in documents]

In [10]:
training_document = documents[:1500]  #Training Data - 75%
testing_document = documents[1500:]  #Testing Data - 25%

In [11]:
all_words = []
for doc in training_document:             #Containing just the words
    all_words += doc[0]
freq = nltk.FreqDist(all_words)           #Frequency of each word
common = freq.most_common(5000)           #Most Common words : With highest frequency
features = [i[0] for i in common]         #Containing just the words: Feature Set

In [12]:
def get_feature_dict(words):             #Takes a review and check if a word from feature set is present in that review or not 
    current_features = {}                # And return true or false for that word accordingly
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [13]:
training_data = [(get_feature_dict(doc),category)for doc,category in training_document]
testing_data = [(get_feature_dict(doc),category)for doc,category in testing_document]

In [14]:
classifier = NaiveBayesClassifier.train(training_data)
nltk.classify.accuracy(classifier,testing_data)

0.782

In [15]:
classifier.show_most_informative_features(25)

Most Informative Features
               marvelous = True              pos : neg    =     15.3 : 1.0
             outstanding = True              pos : neg    =     14.2 : 1.0
                   anger = True              pos : neg    =     12.9 : 1.0
             beautifully = True              pos : neg    =     12.5 : 1.0
                 astound = True              pos : neg    =     11.9 : 1.0
               mesmerize = True              pos : neg    =     11.9 : 1.0
              schumacher = True              neg : pos    =     10.8 : 1.0
               stupidity = True              neg : pos    =     10.4 : 1.0
                  hudson = True              neg : pos    =     10.1 : 1.0
             wonderfully = True              pos : neg    =      9.9 : 1.0
               atrocious = True              neg : pos    =      9.5 : 1.0
                     sat = True              neg : pos    =      9.2 : 1.0
             fascination = True              pos : neg    =      9.2 : 1.0

In [16]:
#Checking for a review 
from nltk.tokenize import word_tokenize
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_review = word_tokenize(custom_review.lower())
clean_review(custom_review)
custom_review_set = get_feature_dict(custom_review)
classifier.classify(custom_review_set)
prob_result = classifier.prob_classify(custom_review_set)
print (prob_result)
print (prob_result.max())
print (prob_result.prob("neg")) 
print (prob_result.prob("pos")) 

<ProbDist with 2 samples>
neg
0.9999999997381025
2.6190830804520114e-10


In [17]:
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_review = word_tokenize(custom_review.lower())
clean_review(custom_review)
custom_review_set = get_feature_dict(custom_review)
classifier.classify(custom_review_set)
prob_result = classifier.prob_classify(custom_review_set)
print (prob_result)
print (prob_result.max())
print (prob_result.prob("neg"))  #Due to bad accuracy
print (prob_result.prob("pos")) 

<ProbDist with 2 samples>
neg
0.9999999914902032
8.50980464205583e-09


In [18]:
from sklearn.svm import SVC #Using sklearn Classifier
from nltk.classify.scikitlearn import SklearnClassifier
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)
classifier_sklearn.train(training_data)
nltk.classify.accuracy(classifier_sklearn,testing_data)

0.862

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
categories = [category for document,category in documents]

In [21]:
text_document = [" ".join(document) for document,category in documents]

In [22]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(text_document,categories)

In [23]:
count_vec = CountVectorizer(max_features = 2000, ngram_range(2,3))
x_train_features = count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 3, 0],
        [0, 0, 0, ..., 0, 1, 0],
        ...,
        [0, 0, 0, ..., 1, 3, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '17',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '2001',
 '30',
 '50',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adapt',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'afraid',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'agrees',
 'ahead',
 'aid',
 'aim',
 'air',
 'al',
 'ala',
 'alan',
 'albeit',
 'alex',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'amy',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'anne',
 'annie',
 'annoy',
 'another',
 'answer',
 'anthony',
 'anti',
 'anyone',
 'anything',
 'anyw

In [25]:
x_test_features = count_vec.transform(x_test)