In [None]:
import os
import re
import xgboost
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
train = pd.read_csv("IMDB_Dataset.csv")
train.head()

In [None]:
print(train.isnull().sum())

In [None]:
def processing_text(text):
    text = re.sub(r'<.*?>', ' ', text)  # Remove HTML tags
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^0-9a-zA-Z\s]', ' ', text)  # Remove non-alphanumeric characters
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in text.split()]
    return " ".join(words)

In [None]:
train['review_clean'] = train['review'].apply(processing_text)

In [None]:
stop_words = set(stopwords.words("english"))
words_to_keep = {"not", "no", "nor", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven", "isn", 
"mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn"}
custom_stop_words = stop_words - words_to_keep

In [None]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),      # Use both unigrams and bigrams
    max_features=15000,      # Keep only the top 15,000 most frequent features
    min_df = 5,              # Ignore terms that appear in less than 5 documents
    stop_words=list(custom_stop_words)
)
## vectorize the clean text
X = vectorizer.fit_transform(train["review_clean"])
y = train['sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Model training and testing

In [None]:
## Naive Bayes model
nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)
y_pred_nb = nb_clf.predict(X_test)
print("\n-- Naive Bayes --")
print(f"Accuracy:, {accuracy_score(y_test, y_pred_nb)* 100:.2f}%")
print(classification_report(y_test, y_pred_nb, target_names=['Negative', 'Positive']))

## Logistic Regression
lr_clf = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
lr_clf.fit(X_train, y_train)
y_pred_lr = lr_clf.predict(X_test)
print("\n-- Logistic Regression --")
print(f"Accuracy:, {accuracy_score(y_test, y_pred_lr)* 100:.2f}%")
print(classification_report(y_test, y_pred_lr, target_names=['Negative', 'Positive']))

## Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

lr_model = LogisticRegression(max_iter=1000, random_state=42)

param_grid = [
    {
        'penalty': ['l1', 'l2'],               ## liblinear support both l1 and l2
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear']
    },
    {
        'penalty': ['l2'],                     ## lbfgs supports only l2
        'solver': ['lbfgs'],
        'C': [0.01, 0.1, 1, 10, 100]
    }
]

In [52]:
grid_search = GridSearchCV(estimator=lr_model,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5,           ## number of cross-validation folds
                           verbose=1,     
                           n_jobs=-1)      ## use all available cpus


grid_search.fit(X_train, y_train)
print("\nBest parameters found: ", grid_search.best_params_)
print(f"{grid_search.best_score_ * 100:.2f}% accuracy on training set")

## test set evaluation
best_lr_clf = grid_search.best_estimator_
y_pred_best_lr = best_lr_clf.predict(X_test)
print(f"{accuracy_score(y_test, y_pred_best_lr) * 100:.2f}% accuracy on test set")
print(classification_report(y_test, y_pred_best_lr, target_names=['Negative', 'Positive']))

Fitting 5 folds for each of 15 candidates, totalling 75 fits

Best parameters found:  {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
89.42% accuracy on training set
89.64% accuracy on test set
              precision    recall  f1-score   support

    Negative       0.90      0.89      0.89      4961
    Positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

