In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [34]:
df = pd.read_csv('imdb_labelled.txt',
            sep='\t',header=None)

In [35]:
columns = ['review', 'sentiment']

In [36]:
df.columns = columns

In [37]:
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [38]:
x = df['review'].values
y = df['sentiment'].values

In [39]:
x[:5]

array(['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ',
       'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ',
       'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  ',
       'Very little music or anything to speak of.  ',
       'The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  '],
      dtype=object)

In [40]:
y[:5]

array([0, 0, 0, 0, 1])

In [41]:
for i in range(len(x)):
    x[i] = word_tokenize(x[i].lower())

In [42]:
print(x[0])

['a', 'very', ',', 'very', ',', 'very', 'slow-moving', ',', 'aimless', 'movie', 'about', 'a', 'distressed', ',', 'drifting', 'young', 'man', '.']


In [43]:
stop_words = stopwords.words('english')

In [44]:
stop_words.extend([',','.','-'])

In [45]:
stop_words = set(stop_words)

In [46]:
for i in range(len(x)):
    x[i] = list(set(x[i]) - stop_words)

In [47]:
print(x[0])

['movie', 'young', 'man', 'aimless', 'distressed', 'drifting', 'slow-moving']


In [48]:
lemmatizer = WordNetLemmatizer()

In [49]:
for i in range(len(x)):
    for j in range(len(x[i])):
        x[i][j] = lemmatizer.lemmatize(x[i][j], pos='v')

In [50]:
print(x[0])

['movie', 'young', 'man', 'aimless', 'distress', 'drift', 'slow-moving']


In [51]:
for i in range(len(x)):
    x[i] = ' '.join(x[i])

In [52]:
print(x[0])

movie young man aimless distress drift slow-moving


In [53]:
vect = TfidfVectorizer()

In [54]:
matrix = vect.fit_transform(x)

In [55]:
x = matrix.toarray()

In [56]:
nb = GaussianNB()

In [58]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [59]:
nb.fit(x_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [60]:
y_pred = nb.predict(x_test)

In [61]:
accuracy_score(y_test,y_pred)

0.6666666666666666

In [62]:
logistic = LogisticRegression()

In [63]:
logistic.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [64]:
y_pred = logistic.predict(x_test)

In [65]:
accuracy_score(y_test,y_pred)

0.7133333333333334

In [66]:
confusion_matrix(y_test,y_pred)

array([[44, 27],
       [16, 63]])