# Import and inspect data

The IMDB dataset contains 50,000 movie reviews from IMDB, labeled by sentiment (positive: 1 / negative: 0). The dataset is split into 25,000 reviews for training and 25,000 reviews for testing.

In [1]:
import pandas as pd

train_df = pd.read_parquet("../data/raw/train.parquet")
test_df = pd.read_parquet("../data/raw/test.parquet")

print(f"Train samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")


Train samples: 25000
Test samples: 25000


In [2]:
train_df.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [3]:

X_train_text = train_df["text"].tolist()
y_train = train_df["label"].tolist()

X_test_text = test_df["text"].tolist()
y_test = test_df["label"].tolist()


In [4]:
X_test_text[3]


"STAR RATING: ***** Saturday Night **** Friday Night *** Friday Morning ** Sunday Night * Monday Morning <br /><br />Former New Orleans homicide cop Jack Robideaux (Jean Claude Van Damme) is re-assigned to Columbus, a small but violent town in Mexico to help the police there with their efforts to stop a major heroin smuggling operation into their town. The culprits turn out to be ex-military, lead by former commander Benjamin Meyers (Stephen Lord, otherwise known as Jase from East Enders) who is using a special method he learned in Afghanistan to fight off his opponents. But Jack has a more personal reason for taking him down, that draws the two men into an explosive final showdown where only one will walk away alive.<br /><br />After Until Death, Van Damme appeared to be on a high, showing he could make the best straight to video films in the action market. While that was a far more drama oriented film, with The Shepherd he has returned to the high-kicking, no brainer action that firs

# TF-IDF Vectorizer

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [6]:
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2),
    min_df=2
)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

X_train.shape, X_test.shape


((25000, 50000), (25000, 50000))

In [7]:
X_train[3]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 150 stored elements and shape (1, 50000)>

In [8]:
tfidf.get_feature_names_out()[20000:20010]

array(['hysteria', 'hysterical', 'hysterically', 'iago', 'ian',
       'ian holm', 'ian mcshane', 'ice', 'ice age', 'ice cold'],
      dtype=object)

# Baseline model with Logistic Regression

In [9]:
clf = LogisticRegression(max_iter=1000, n_jobs=1)
clf.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [10]:
y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {acc:.4f}")
print(f"F1-score: {f1:.4f}")

print(classification_report(y_test, y_pred, target_names=["neg", "pos"]))


Accuracy: 0.8945
F1-score: 0.8949
              precision    recall  f1-score   support

         neg       0.90      0.89      0.89     12500
         pos       0.89      0.90      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000



# Analyse results

In [11]:
import numpy as np

test_df["pred"] = y_pred
test_df["correct"] = (test_df["pred"] == test_df["label"])

# Some wrong positives (pred=pos, label=neg)
wrong_pos = test_df[(test_df["pred"] == 1) & (test_df["label"] == 0)].head(5)
wrong_neg = test_df[(test_df["pred"] == 0) & (test_df["label"] == 1)].head(5)

wrong_pos[["text", "label", "pred"]].head(3)


Unnamed: 0,text,label,pred
4,"First off let me say, If you haven't enjoyed a...",0,1
18,"Ben, (Rupert Grint), is a deeply unhappy adole...",0,1
22,The Forgotten (AKA: Don't Look In The Basement...,0,1


In [12]:
def display_review(df, index):
    from IPython.display import display, HTML
    text = df.iloc[index]["text"]
    display(HTML(text))



display_review(wrong_neg, 0)

### Some false positives

#### 1.

In [13]:
display_review(wrong_pos, 0)

Tbh the review is quite positive like the model predicted. It would be interesting to see how humans would label this review.

#### 2.

In [14]:
display_review(wrong_pos, 1)

This one would be a little easier to classify as negative. Surely not the easiest to classify and I can see why the model made a mistake here.

### Some false negatives

#### 1.

In [15]:
display_review(wrong_neg, 0)

Yeah okay, this one is clearly positive. Not sure how the model got this one wrong.

#### 2.

In [16]:
display_review(wrong_neg, 1)

Only slightly positive so it's understandable that the model got this one wrong.

# 2nd baseline model with logistic regression 

Use 5-fold cross-validation to find the best hyperparameters for the Logistic Regression model.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV

param_grid = {
    "Cs": [0.1, 1.0, 10.0],
    "cv": [3, 5, 10],
    "penalty": ["none", "l2", "elasticnet"],
}
clf_2 = GridSearchCV(LogisticRegressionCV(max_iter=1000, n_jobs=1), param_grid)
clf_2.fit(X_train, y_train)

ValueError: Invalid parameter 'C' for estimator LogisticRegressionCV(max_iter=1000, n_jobs=1). Valid parameters are: ['Cs', 'class_weight', 'cv', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratios', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'refit', 'scoring', 'solver', 'tol', 'verbose'].

In [20]:
clf_2 = LogisticRegressionCV(max_iter=1000, n_jobs=1, cv=5)
clf_2.fit(X_train, y_train)

0,1,2
,Cs,10
,fit_intercept,True
,cv,5
,dual,False
,penalty,'l2'
,scoring,
,solver,'lbfgs'
,tol,0.0001
,max_iter,1000
,class_weight,


In [21]:
# validate the model
y_pred_2 = clf_2.predict(X_test)
acc_2 = accuracy_score(y_test, y_pred_2)
f1_2 = f1_score(y_test, y_pred_2)
print(f"Accuracy: {acc_2:.4f}")
print(f"F1-score: {f1_2:.4f}")
print(classification_report(y_test, y_pred_2, target_names=["neg", "pos"]))

Accuracy: 0.9008
F1-score: 0.9009
              precision    recall  f1-score   support

         neg       0.90      0.90      0.90     12500
         pos       0.90      0.90      0.90     12500

    accuracy                           0.90     25000
   macro avg       0.90      0.90      0.90     25000
weighted avg       0.90      0.90      0.90     25000



#### 1.