In [76]:
import pandas as pd
import numpy as np
import nltk
from sklearn import naive_bayes
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

In [34]:
df = pd.read_csv("movie_reviews.txt",sep = '\t', names = ['polarity','review'])

In [89]:
df.sample(10)

Unnamed: 0,polarity,review
3547,1,Corean cinema can be quite surprising for an o...
21010,0,"The title should have been ""The walker"". That ..."
10109,1,"I had the pleasure of screening ""The Big Bad S..."
17453,0,This was one of the worst movies I've ever see...
10795,1,I have been a fan of Pushing Daisies since the...
18593,0,"No ,I'm not kidding. If they ever propose a mo..."
22429,0,I'm only rating this film as a 3 out of pity b...
20916,0,Is this the same Kim Ki Duk who directed the p...
5919,1,After not having much luck at selling his scre...
8034,1,I had never heard of this film prior to seeing...


In [69]:

stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf = True, lowercase = True, strip_accents= 'ascii', stop_words = stopset)


In [91]:
y = df.polarity  # dependent variable
X = vectorizer.fit_transform(df.review) # independent variable 
X_train, X_test,y_train, y_test = train_test_split(X, y,test_size=0.80)

In [39]:
print y.shape
print X.shape

(25000,)
(25000, 75710)


In [40]:
X_train, X_test,y_train, y_test = train_test_split(X, y, random_state = 42)

In [94]:
NB = naive_bayes.MultinomialNB()
NB.fit(X_train,y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [95]:
roc_auc_score(y_test,NB.predict_proba(X_test)[:,1])

0.93024295388726241

In [100]:

words_polarity = []

for word in vectorizer.vocabulary_:
    index =  vectorizer.vocabulary_[word]
    words_polarity.append((word, NB.coef_[0][index]))

words_polarity = sorted(words_polarity, key=lambda x: x[1])


print("10 Negative words :" + ", ".join([str(x[0]) for x in words_polarity[:10]]))

print("10 Positive words :" + ", ".join([str(x[0]) for x in words_polarity[-10:]]))


10 Negative words :fawn, nunnery, vani, spiders, trawling, localized, disobeying, yougoslavia, canet, acurately
10 Positive words :time, well, see, story, great, like, good, one, film, movie


In [99]:
for word in [["I liked the movie"],["I hate this character"],["The plot of the movie was interesting"],["It has been and amazing depiction of charaters"],["Movie was boring"]]:
    review_word = np.array(word)
    review_vector = vectorizer.transform(review_word)
    print(str(word[0]) + " " + str(NB.predict(review_vector)[0]))
    
    

I liked the movie 1
I hate this character 0
The plot of the movie was interesting 0
It has been and amazing depiction of charaters 1
Movie was boring 0
