In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [9]:
# Load the training reviews; the CSV is decoded as ISO-8859-1.
train = pd.read_csv(
    'movie_review_train.csv',
    encoding='ISO-8859-1',
)

In [10]:
# Preview the first five rows of the raw training frame.
train.head(n=5)

Unnamed: 0,class,text
0,Pos,a common complaint amongst film critics is ...
1,Pos,whew this film oozes energy the kind of b...
2,Pos,steven spielberg s amistad which is bas...
3,Pos,he has spent his entire life in an awful litt...
4,Pos,being that it is a foreign language film with...


In [15]:
# Class balance of the training labels.
# NOTE: the execution counts show this cell (In [15]) was last run after the
# label-encoding cell below it (In [14]), which is why the recorded output
# lists 1/0. On a fresh top-to-bottom run it executes first and would list
# the raw 'Pos'/'Neg' labels instead.
train['class'].value_counts()

1    800
0    800
Name: class, dtype: int64

In [14]:
# Encode the string labels as integers (Pos -> 1, Neg -> 0).
# Series.map turns every value missing from the mapping into NaN, so
# re-running this cell on already-encoded labels would wipe the column.
# The guard makes the cell safe to re-run: the mapping is applied only while
# string labels are still present.
if train['class'].isin(['Pos', 'Neg']).any():
    train['class'] = train['class'].map({'Pos': 1, 'Neg': 0})

In [16]:
# Preview the first five rows again to confirm the label encoding.
train.head(n=5)

Unnamed: 0,class,text
0,1,a common complaint amongst film critics is ...
1,1,whew this film oozes energy the kind of b...
2,1,steven spielberg s amistad which is bas...
3,1,he has spent his entire life in an awful litt...
4,1,being that it is a foreign language film with...


In [17]:
# Split the training frame into review text (features) and class label (target).
X_train, y_train = train['text'], train['class']

In [18]:
# Baseline bag-of-words vectorizer: English stop words removed and no
# document-frequency pruning. Its vocabulary size is printed in the next cell;
# `vect` is later re-created with min_df/max_df thresholds.
vect = CountVectorizer(stop_words='english')
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [19]:
# Number of distinct terms learned by the vectorizer fitted above.
print(len(vect.vocabulary_))

35858


In [20]:
# Rebuild the vectorizer with document-frequency pruning: float thresholds are
# proportions of documents, so terms found in fewer than 3% or in more than
# 80% of the training reviews are dropped. fit() returns the estimator itself.
vect = CountVectorizer(stop_words='english', min_df=0.03, max_df=0.8).fit(X_train)
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=0.03,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [21]:
# Vocabulary size after the min_df / max_df pruning applied above.
print(len(vect.vocabulary_))

1643


In [22]:
# Load the held-out test reviews; the CSV is decoded as ISO-8859-1.
test = pd.read_csv(
    'movie_review_test.csv',
    encoding='ISO-8859-1',
)

In [23]:
# Preview the first five rows of the raw test frame.
test.head(n=5)

Unnamed: 0,class,text
0,Pos,films adapted from comic books have had plent...
1,Pos,every now and then a movie comes along from a...
2,Pos,you ve got mail works alot better than it des...
3,Pos,jaws is a rare film that grabs your atte...
4,Pos,moviemaking is a lot like being the general m...


In [24]:
# Encode the test labels the same way as the training labels (Pos -> 1, Neg -> 0).
# Series.map turns every value missing from the mapping into NaN, so
# re-running this cell on already-encoded labels would wipe the column.
# The guard applies the mapping only while string labels are still present.
if test['class'].isin(['Pos', 'Neg']).any():
    test['class'] = test['class'].map({'Pos': 1, 'Neg': 0})

In [25]:
# Preview the first five rows again to confirm the label encoding.
test.head(n=5)

Unnamed: 0,class,text
0,1,films adapted from comic books have had plent...
1,1,every now and then a movie comes along from a...
2,1,you ve got mail works alot better than it des...
3,1,jaws is a rare film that grabs your atte...
4,1,moviemaking is a lot like being the general m...


In [27]:
# Split the test frame into review text (features) and class label (target).
X_test, y_test = test['text'], test['class']

In [28]:
# Turn both text collections into term-count matrices using the vocabulary
# already fitted on the training reviews (the vectorizer is not re-fitted).
X_train_trans, X_test_trans = vect.transform(X_train), vect.transform(X_test)

In [29]:
# Inspect the container type returned by vect.transform for the test reviews.
type(X_test_trans)

scipy.sparse.csr.csr_matrix

In [32]:
# Number of non-zero entries in the test term-count matrix.
X_test_trans.count_nonzero()

51663

In [37]:
# Term stored in column 49 of the count matrix.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. vocabulary_ maps each term to its column index, so ordering
# the terms by that index gives the same list on every scikit-learn version.
print(sorted(vect.vocabulary_, key=vect.vocabulary_.get)[49])

albeit


In [38]:
# Term stored 50 columns from the end of the count matrix.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. vocabulary_ maps each term to its column index, so ordering
# the terms by that index gives the same list on every scikit-learn version.
print(sorted(vect.vocabulary_, key=vect.vocabulary_.get)[-50])

wide


In [40]:
# Fit a multinomial Naive Bayes classifier on the training term counts.
mnb = MultinomialNB()
mnb.fit(X_train_trans, y_train)

# Hard class predictions and per-class probabilities for the test reviews.
y_pred_class = mnb.predict(X_test_trans)
y_pred_proba = mnb.predict_proba(X_test_trans)

# Test-set accuracy. `metrics` is already bound by the notebook's first cell
# (import sklearn.metrics as metrics), so it is not re-imported here.
metrics.accuracy_score(y_test, y_pred_class)

0.8275

In [41]:
# Confusion matrix on the test set: rows are true labels, columns are
# predicted labels, in label order 0 (Neg) then 1 (Pos).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[172,  28],
       [ 41, 159]], dtype=int64)

In [43]:
# Dimensions of the per-class token count table: (classes, vocabulary terms).
np.shape(mnb.feature_count_)

(2, 1643)

In [45]:
# Total token count per class (one sum per row of the count table).
np.sum(mnb.feature_count_, axis=1)

array([137807., 153000.])

In [47]:
# Column index of the term 'stupid'.
# vocabulary_ maps each term directly to its column, which avoids both the
# get_feature_names() call (removed in scikit-learn 1.2) and a linear scan
# of the feature list. int() gives a plain Python integer for display.
int(vect.vocabulary_['stupid'])

1390

In [49]:
# Occurrences of 'stupid' in class 0 (Neg) training reviews.
# The column is looked up from the fitted vocabulary rather than pasted in as
# a literal index, so the cell stays correct if the vocabulary changes.
mnb.feature_count_[0, vect.vocabulary_['stupid']]

161.0

In [51]:
# Per-class token counts from the fitted model: row 0 of feature_count_ is
# class 0 (Neg) and row 1 is class 1 (Pos), following the Pos -> 1 / Neg -> 0
# label encoding.
Neg_token_count = mnb.feature_count_[0, :]
Pos_token_count = mnb.feature_count_[1, :]

# One row per vocabulary term, indexed by the term itself.
# Terms are listed in column order by sorting vocabulary_ on its column index;
# this replaces get_feature_names(), which was removed in scikit-learn 1.2.
tokens = pd.DataFrame(
    {
        'words': sorted(vect.vocabulary_, key=vect.vocabulary_.get),
        'Pos': Pos_token_count,
        'Neg': Neg_token_count,
    }
).set_index('words')

# Counts for a single example term.
tokens.loc['stupid']

Neg    161.0
Pos     35.0
Name: stupid, dtype: float64

In [52]:
# Per-class counts for another example term.
tokens.loc['painfully']

Neg    41.0
Pos    10.0
Name: painfully, dtype: float64

In [53]:
# Negativity ratio: a token's share of all Neg tokens divided by its share of
# all Pos tokens. Values above 1 mean the token is relatively more frequent
# in negative reviews.
# The class totals are summed from the table itself (they equal
# mnb.feature_count_.sum(axis=1)) instead of being pasted in as literals, so
# the ratio stays correct if the data or the vocabulary changes.
# NOTE: a token with a Pos count of 0 yields inf (NaN if Neg is also 0).
tokens['negativity'] = (
    (tokens['Neg'] / tokens['Neg'].sum())
    / (tokens['Pos'] / tokens['Pos'].sum())
)

In [54]:
# Token with the highest negativity ratio.
# idxmax() returns the index label (the word). Series.argmax() only did so on
# old pandas, with a deprecation warning, and returns the integer position on
# current versions.
tokens['negativity'].idxmax()

  """Entry point for launching an IPython kernel.


'waste'