In [48]:
import os
import re
import xgboost
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aditi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aditi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [39]:
train = pd.read_csv("IMDB_Dataset.csv")
train.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [40]:
print(train.isnull().sum())

review       0
sentiment    0
dtype: int64


In [43]:
def processing_text(text):
    text = re.sub(r'<.*?>', ' ', text)  # Remove HTML tags
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^0-9a-zA-Z\s]', ' ', text)  # Remove non-alphanumeric characters
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in text.split()]
    return " ".join(words)

In [44]:
train['review_clean'] = train['review'].apply(processing_text)

In [45]:
stop_words = set(stopwords.words("english"))
words_to_keep = {"not", "no", "nor", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven", "isn", 
"mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn"}
custom_stop_words = stop_words - words_to_keep

In [None]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),      # Use both unigrams and bigrams
    max_features=15000,      # Keep only the top 15,000 most frequent features
    min_df = 5,              # Ignore terms that appear in less than 5 documents
    stop_words=list(custom_stop_words)
)
## vectorize the clean text
X = vectorizer.fit_transform(train["review_clean"])
y = train['sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
## Naive Bayes model
nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)
y_pred_nb = nb_clf.predict(X_test)
print("\n-- Naive Bayes --")
print(f"Accuracy:, {accuracy_score(y_test, y_pred_nb)* 100:.2f}%")
print(classification_report(y_test, y_pred_nb, target_names=['Negative', 'Positive']))

## Logistic Regression
lr_clf = LogisticRegression(solver='liblinear', C=1.0, random_state=42)   ## C is the regularization strength, higher number meaning less regularization and overfitting
lr_clf.fit(X_train, y_train)
y_pred_lr = lr_clf.predict(X_test)
print("\n-- Logistic Regression --")
print(f"Accuracy:, {accuracy_score(y_test, y_pred_lr)* 100:.2f}%")
print(classification_report(y_test, y_pred_lr, target_names=['Negative', 'Positive']))


-- Naive Bayes --
Accuracy:, 86.58%
              precision    recall  f1-score   support

    Negative       0.85      0.88      0.87      4961
    Positive       0.88      0.85      0.86      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000


-- Logistic Regression --
Accuracy:, 89.53%
              precision    recall  f1-score   support

    Negative       0.91      0.88      0.89      4961
    Positive       0.88      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

