# Imports

In [1]:
import pandas as pd
import numpy as np

import string
import re

from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score

# Data

The dataset is from kaggle 
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [2]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Clean the data

In [4]:
X = df[['review']]
y = df.sentiment.map({'positive': 1, 'negative': 0})

In [5]:
def clean(text):
    #remove html tags
    CLEANR = re.compile('<.*?>') 
    text = re.sub(CLEANR, '', text)
    
    #remove punctuation and lowercase
    for punc in string.punctuation:
        text = text.replace(punc,'').lower()
    
    #remove numeric char
    text = ''.join([i for i in text if not i.isdigit()])
    
    #tokenize(list of words)
    text_token = [word for word in text.split()]
    
    #lemmatize verbs and nouns
    lemmatizer = WordNetLemmatizer()
    v_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v") for word in text_token]
    v_n_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n") for word in v_lemmatized]

    res = " ".join(v_n_lemmatized)
    
    return res

Applying the clean function to the 'review' column and store the result in 'clean' column. 

In [6]:
X['clean'] = X['review'].apply(clean)

In [7]:
X.head()

Unnamed: 0,review,clean
0,One of the other reviewers has mentioned that ...,one of the other reviewer have mention that af...
1,A wonderful little production. <br /><br />The...,a wonderful little production the film techniq...
2,I thought this was a wonderful way to spend ti...,i think this be a wonderful way to spend time ...
3,Basically there's a family where a little boy ...,basically there a family where a little boy ja...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love in the time of money be a ...


## Train-Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X.clean, y, test_size=0.3)

# Model

After trying Neural Network, I realized the model was over-fitting and too complex for the dataset.<br>
Therefore I decided to go instead with below algorithms to see which one performs the best. For each, I used Random Search to find the best parameters possible. 

## Naive Bayes with TfidfVectorizer

In [9]:
vectorizer = TfidfVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [10]:
naivebayes = MultinomialNB()

In [11]:
cv_nb = cross_validate(
    naivebayes,
    X_train_bow,
    y_train,
    scoring = "accuracy",
    cv = 5
)
baseline_accuracy = cv_nb['test_score'].mean()
print(f'baseline accuracy on train: {baseline_accuracy*100:.3f}%')


baseline accuracy on train: 85.574%


# Tune the vectorizer and the model

In [12]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])
pipeline.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())],
 'verbose': False,
 'tfidf': TfidfVectorizer(),
 'nb': MultinomialNB(),
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'nb__alpha': 1.0,
 'nb__class_prior': None,
 'nb__fit_prior': True,
 'nb__force_alpha': 'warn'}

In [13]:
parameters = {
    'tfidf__ngram_range': ((1,1), (2,2)),
    'tfidf__min_df': (0.05,0.5,1),
    'tfidf__max_df': (0.9,1.0),
    'nb__alpha': (0.1,1,10)
}

grid_search = GridSearchCV(
    pipeline, parameters, n_jobs=-1, 
    verbose=1, scoring = "accuracy", 
    refit=True, cv=5
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [14]:
best_model = grid_search.best_estimator_
best_model

In [15]:
cv_nb = cross_validate(
    best_model,
    X_train,
    y_train,
    scoring = "accuracy",
    cv = 5
)
accuracy = cv_nb['test_score'].mean()
print(f'Fine-tuned model accuracy on train: {accuracy*100:.3f}%')

Fine-tuned model accuracy on train: 89.000%


# Evaluate on test set

In [16]:
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Fine-tuned model accuracy on test: {accuracy*100:.3f}%')

Fine-tuned model accuracy on test: 88.967%
