In [247]:
import pandas as pd

In [248]:
# Load the IMDB review dataset from a CSV file in the current working directory.
imdb_df = pd.read_csv(filepath_or_buffer="./IMDB Dataset.csv")

In [249]:
# Preview the first five rows to sanity-check the loaded columns.
imdb_df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [250]:
# Class balance check: how many reviews carry each sentiment value.
sentiment_counts = imdb_df["sentiment"].value_counts()
sentiment_counts

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [251]:
# transforming labels to 0 and 1
# "positive" becomes 1.0 and "negative" becomes 0.0; any other sentiment
# value is left as NaN, so the resulting column is float-typed.
label_by_sentiment = {"positive": 1.0, "negative": 0.0}
imdb_df["label"] = imdb_df["sentiment"].map(label_by_sentiment)

In [253]:
# featurizing, checking the occurrences of the following words in review
words = ["good", "awesome", "great", "amazing", "recommend", 
         "bad", "terrible", "awful", "boring", "long", "not"]

# One boolean column per word: True when the word occurs anywhere in the
# review text, ignoring case.
# NOTE(review): this is substring matching, so "not" also matches "nothing"
# and "long" also matches "along" -- kept as-is so results stay comparable.
for word in words:
    # na=False: a missing review counts as "word absent" instead of yielding
    # NaN, which would otherwise produce ragged or missing feature rows.
    imdb_df[word] = imdb_df['review'].str.contains(word, case=False, na=False)

# Collect the word columns into one list of booleans per row. Selecting the
# columns by name (rather than by position) keeps this cell correct when it is
# re-run, because the existing "features" column is never part of the selection.
imdb_df["features"] = pd.Series(imdb_df[words].values.tolist(), index=imdb_df.index)

In [263]:
from sklearn.model_selection import train_test_split

In [255]:
# Random seed passed as random_state to the data splits for reproducibility.
seed: int = 42

In [256]:
# Hold out one third of the rows as the test set, then split the remainder
# in half into training and development sets (both splits use the same seed).
features_all = imdb_df["features"].tolist()
labels_all = imdb_df["label"].tolist()

X_traindev, X_test, y_traindev, y_test = train_test_split(
    features_all, labels_all, test_size=(1/3), random_state=seed
)
X_train, X_dev, y_train, y_dev = train_test_split(
    X_traindev, y_traindev, test_size=0.5, random_state=seed
)

In [260]:
# model training
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be
# chained; the trailing expression displays the fitted model's settings.
model = LogisticRegression().fit(X_train, y_train)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [264]:
# model evaluations
# Hard class predictions and class probabilities for the training split,
# then for the development (validation) split.
pred_train, pred_prob_train = model.predict(X_train), model.predict_proba(X_train)
pred_dev, pred_prob_dev = model.predict(X_dev), model.predict_proba(X_dev)

# Accuracy = fraction of predictions that equal the true label.
train_accuracy = (pred_train == y_train).mean()
dev_accuracy = (pred_dev == y_dev).mean()
print("Accuracy of training:", train_accuracy)
print("Accuracy of validation:", dev_accuracy)

Accuracy of training: 0.6744269770790832
Accuracy of validation: 0.6788264234715305


In [285]:
import plotly.express as px

# Accuracy per model as (name, accuracy) rows; the values are entered by hand.
models = [
    ['Random Model', 0.500],
    ['Baseline Model', 0.679],
]
column_names = ['Model', 'Accuracy']
models_df = pd.DataFrame(models, columns=column_names)

# Bar chart of accuracy per model, coloured by accuracy, with the y-axis
# fixed to the full [0, 1] range.
fig = px.bar(models_df, x='Model', y='Accuracy', color='Accuracy')
fig.update_layout(
    margin={'l': 20, 'r': 20, 't': 20, 'b': 20},
    bargap=0.9,
    yaxis={'range': [0, 1]},
)
fig.show()