In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('dataset/imdb_labelled.txt', sep='\t', header=None)
df.columns = ['review', 'sentiment']

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
x = df['review']
y = df['sentiment']

In [5]:
x[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [7]:
for i in range(len(x)):
    x[i] = word_tokenize( x[i].lower() )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
print(x[0])

['a', 'very', ',', 'very', ',', 'very', 'slow-moving', ',', 'aimless', 'movie', 'about', 'a', 'distressed', ',', 'drifting', 'young', 'man', '.']


In [9]:
s_words = stopwords.words('english')
s_words.append(",")
s_words.append(".")
s_words.append("-")

In [10]:
newX = []
for i in range(len(x)):
    newX.append( list( set(x[i]) - set(s_words) ) )

In [11]:
newX[0]

['aimless', 'distressed', 'slow-moving', 'man', 'drifting', 'young', 'movie']

In [12]:
newX[1]

['sure', 'nearly', 'lost', 'half', 'audience', 'characters', 'flat', 'walked']

In [13]:
wordnet = WordNetLemmatizer()

In [14]:
for i in range(len(newX)):
    for j in range(len(newX[i])):
        newX[i][j] = wordnet.lemmatize( newX[i][j], pos='v' )

In [15]:
newX[0]

['aimless', 'distress', 'slow-moving', 'man', 'drift', 'young', 'movie']

In [16]:
newX[1]

['sure', 'nearly', 'lose', 'half', 'audience', 'character', 'flat', 'walk']

In [17]:
type(newX)

list

In [18]:
newX = np.array(newX)

In [19]:
newX[0]

['aimless', 'distress', 'slow-moving', 'man', 'drift', 'young', 'movie']

In [20]:
' '.join(newX[0])

'aimless distress slow-moving man drift young movie'

In [21]:
for i in range(len(newX)):
    newX[i] = ' '.join(newX[i])

In [22]:
newX[1]

'sure nearly lose half audience character flat walk'

In [23]:
vect = TfidfVectorizer()

In [24]:
newX = vect.fit_transform(newX)

In [25]:
newX

<748x2613 sparse matrix of type '<class 'numpy.float64'>'
	with 6890 stored elements in Compressed Sparse Row format>

In [26]:
newX = newX.toarray()

In [27]:
reg = LogisticRegression()

In [28]:
x_train, x_test, y_train, y_test = train_test_split(newX, y, test_size = 0.25)

In [29]:
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
y_pred = reg.predict(x_test)

In [31]:
accuracy_score(y_test, y_pred)

0.7379679144385026

In [32]:
nb = GaussianNB()
nb.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [33]:
y_pred2 = nb.predict(x_test)

In [34]:
accuracy_score(y_test, y_pred2)

0.7058823529411765

In [35]:
newX.shape

(748, 2613)

In [36]:
confusion_matrix(y_test, y_pred)

array([[56, 41],
       [ 8, 82]])

In [39]:
y_test

23     1
43     0
578    0
661    0
597    1
      ..
337    0
718    1
220    0
285    1
590    1
Name: sentiment, Length: 187, dtype: int64