In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics

### Load Data

In [2]:
# Relative location of the IMDB reviews CSV (resolved against the working directory).
path = 'data/IMDB Dataset.csv'
# Read the whole file into a DataFrame.
reviews = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Preview the first five rows of the loaded data.
reviews.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Count missing values per column.
reviews.isna().sum()

review       0
sentiment    0
dtype: int64

### Transform data

Create a numerical target variable. I decided not to overwrite the "sentiment" column, although doing so would have saved memory.

In [5]:
# Encode the sentiment labels as integers (negative -> 0, positive -> 1) in a new
# "target" column, leaving the original "sentiment" text column untouched.
reviews['target'] = reviews.sentiment.map({'negative': 0, 'positive': 1})
# Series.map yields NaN for any label that is not a key of the dict; fail loudly here
# instead of silently passing NaN targets on to the train/test split and the models.
assert reviews['target'].notna().all(), 'unexpected value in "sentiment" column'

In [6]:
# Preview the first five rows, now including the numeric "target" column.
reviews.head(n=5)

Unnamed: 0,review,sentiment,target
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


### Create X and y

In [7]:
# Features: the raw review text. Labels: the 0/1 sentiment encoding.
X = reviews['review']
y = reviews['target']

In [8]:
# Hold out 25% of the reviews for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
# Show the shape of each of the four splits as a tuple of shapes.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((37500,), (12500,), (37500,), (12500,))

### Vectorize train & test set

Build the vocabulary from the training set only, then merely transform the test set. This simulates real-world use, where the model will come across words that are not present in its vocabulary.

In [9]:
# Bag-of-words vectorizer that drops English stop words; counts are stored as
# int32 to save memory.
vectorizer = CountVectorizer(
    stop_words='english',
    dtype=np.int32,
)

In [10]:
# Learn the vocabulary from the training reviews and build their document-term matrix.
X_train_dtm = vectorizer.fit_transform(raw_documents=X_train)
# Display the matrix summary (shape, dtype, number of stored elements).
X_train_dtm

<37500x90404 sparse matrix of type '<class 'numpy.int32'>'
	with 3321482 stored elements in Compressed Sparse Row format>

In [11]:
# Reuse the already-fitted vectorizer: the test reviews are only transformed, never fitted on.
X_test_dtm = vectorizer.transform(raw_documents=X_test)
# Display the matrix summary (shape, dtype, number of stored elements).
X_test_dtm

<12500x90404 sparse matrix of type '<class 'numpy.int32'>'
	with 1100873 stored elements in Compressed Sparse Row format>

In order to save memory, the sparse matrix is represented in Compressed Sparse Row format

### Create model instances

In [12]:
# Logistic regression, allowed up to 500 solver iterations.
logreg = LogisticRegression(max_iter=500)
# Multinomial naive Bayes with its default settings.
naive_bayes = MultinomialNB()

Fit models with training data

In [13]:
%%timeit
# Time the naive Bayes fit on the training document-term matrix. The later prediction
# cell uses `naive_bayes` directly, so this cell is also what fits the model.
# NOTE(review): %%timeit refits the model on every loop (7 runs x 10 loops in the
# recorded output); %%time would fit once -- consider switching.
naive_bayes.fit(X_train_dtm, y_train)

23.2 ms ± 542 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# Time the logistic regression fit on the training document-term matrix. The later
# prediction cell uses `logreg` directly, so this cell is also what fits the model.
# NOTE(review): %timeit repeats the fit (7 runs of ~9 s each in the recorded output);
# %time would fit once -- consider switching.
%timeit logreg.fit(X_train_dtm, y_train)

9.11 s ± 106 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


The naive Bayes model fits far faster (about 23 ms versus about 9 s per fit); however, logistic regression generally produces better-calibrated class prediction probabilities.

### Model Evaluation

Get predictions from the 2 classifiers

In [15]:
# Predicted class labels for the test set from the naive Bayes model.
y_pred_NB = naive_bayes.predict(X=X_test_dtm)

In [16]:
# Predicted class labels for the test set from the logistic regression model.
y_pred_LR = logreg.predict(X=X_test_dtm)

**Classification accuracy metrics**

Naive Bayes

In [17]:
# Fraction of test reviews the naive Bayes model classified correctly.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_NB)

0.8564

In [18]:
# Flatten the naive Bayes confusion matrix and unpack its four cells.
tn, fp, fn, tp = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_NB).ravel()

True Negative, False Positive, False Negative, True Positive

In [19]:
# Display the naive Bayes confusion-matrix counts.
(tn, fp, fn, tp)

(5402, 755, 1040, 5303)

Logistic Regression

In [20]:
# Fraction of test reviews the logistic regression model classified correctly.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_LR)

0.8828

In [21]:
# Flatten the logistic regression confusion matrix and unpack its four cells
# (this overwrites the naive Bayes counts held in the same names).
tn, fp, fn, tp = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_LR).ravel()

True Negative, False Positive, False Negative, True Positive

In [22]:
# Display the logistic regression confusion-matrix counts.
(tn, fp, fn, tp)

(5403, 754, 711, 5632)

**Baseline accuracy**

In [23]:
# Relative frequency of each sentiment label in the full dataset.
reviews['sentiment'].value_counts(normalize=True)

negative    0.5
positive    0.5
Name: sentiment, dtype: float64