In [1]:
import pandas as pd

In [2]:
# Load the review dataset; the relative path is resolved against the kernel's
# current working directory, so the CSV must sit next to where the notebook runs.
imdb_df = pd.read_csv("./IMDB Dataset.csv")

In [3]:
# Preview the first rows to confirm the `review` and `sentiment` columns loaded.
imdb_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Class-balance check: number of reviews per sentiment value.
imdb_df["sentiment"].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [5]:
# Encode the sentiment strings as integer class labels (positive -> 1, negative -> 0).
# An explicit mapping replaces the per-class `.loc` assignments: those create the
# new column as float (1.0 / 0.0) and silently leave NaN for any other value.
label_by_sentiment = {"positive": 1, "negative": 0}
encoded_labels = imdb_df["sentiment"].map(label_by_sentiment)

# `map` yields NaN for values missing from the dict; fail loudly here instead of
# letting unlabeled rows flow into the train/dev/test split.
if encoded_labels.isna().any():
    unexpected = sorted(
        imdb_df.loc[encoded_labels.isna(), "sentiment"].astype(str).unique()
    )
    raise ValueError(f"Unexpected sentiment values: {unexpected}")

imdb_df["label"] = encoded_labels.astype("int64")

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Fixed random seed, passed as `random_state` to both train_test_split calls
# so the splits are reproducible across runs.
seed = 42

In [20]:
# Three-way split of the reviews and their labels.
# Step 1 holds out one third of the data as the test set; step 2 cuts the
# remaining two thirds in half, so train, dev and test are each about a third.
all_reviews = imdb_df["review"].tolist()
all_labels = imdb_df["label"].tolist()

X_traindev, X_test, y_traindev, y_test = train_test_split(
    all_reviews,
    all_labels,
    test_size=(1/3),
    random_state=seed,
)
X_train, X_dev, y_train, y_dev = train_test_split(
    X_traindev,
    y_traindev,
    test_size=0.5,
    random_state=seed,
)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the words in the reviews.
# The vectorizer is fitted on the training reviews only; the dev reviews are
# transformed with that already-fitted vectorizer.
# NOTE: X_train / X_dev are overwritten with the feature matrices, so re-running
# this cell without first re-running the split cell would pass the
# already-vectorized matrices back into the vectorizer.
tfidf = TfidfVectorizer()
X_train, X_dev = tfidf.fit_transform(X_train), tfidf.transform(X_dev)

In [22]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier (library defaults) on the TF-IDF
# training features. `fit` returns the estimator itself, so it can be bound
# directly; the trailing expression displays the fitted model as cell output.
model = LogisticRegression().fit(X_train, y_train)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# Evaluate the fitted model on the training and dev splits: hard predictions
# plus class probabilities for each split.
pred_train, pred_prob_train = model.predict(X_train), model.predict_proba(X_train)
pred_dev, pred_prob_dev = model.predict(X_dev), model.predict_proba(X_dev)

# Accuracy = fraction of predictions equal to the gold label.
train_accuracy = (pred_train == y_train).mean()
dev_accuracy = (pred_dev == y_dev).mean()

print("Accuracy of training:", train_accuracy)
print("Accuracy of validation:", dev_accuracy)

Accuracy of training: 0.9335773430937238
Accuracy of validation: 0.8822823543529129


In [26]:
import plotly.express as px

# Accuracy of each model compared in the chart. The values are entered by hand
# (not computed in this cell), one [name, accuracy] pair per model.
models = [
    ['Random Model', 0.500],
    ['Baseline Model', 0.679],
    ['Unigram BoW', 0.855],
    ['Unigram TF-IDF', 0.882],
]
models_df = pd.DataFrame(models, columns=['Model', 'Accuracy'])

# One bar per model, colored by its accuracy; the y axis is pinned to [0, 1]
# so bar heights read directly as accuracy.
fig = px.bar(models_df, x='Model', y='Accuracy', color='Accuracy')
fig.update_layout(
    margin={'l': 50, 'r': 50, 't': 50, 'b': 50},
    bargap=0.75,
    yaxis={'range': [0, 1]},
)
fig.show()