In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

In [2]:
# 1. Load Dataset
# Read the review CSV from the notebook's working directory into `df`.
df = pd.read_csv(filepath_or_buffer="IMDB_dataset.csv")

In [3]:
# Normalise the header names (lower-cased, surrounding whitespace removed) so
# the column lookups below do not depend on how the CSV spelled them.
normalized_columns = df.columns.str.lower().str.strip()
df.columns = normalized_columns

# Discard rows that lack either the review text or its sentiment label.
required_columns = ["review", "sentiment"]
df = df.dropna(subset=required_columns)

In [4]:
# Encode target: "positive" -> 1, "negative" -> 0.
#
# Series.map() turns every value that is missing from the lookup into NaN
# without raising, so two guards are needed:
#   * skip the mapping when the column already holds 0/1, otherwise re-running
#     this cell would wipe every label to NaN;
#   * fail loudly on labels that are neither "positive" nor "negative" instead
#     of silently passing NaN targets on to the train/test split.
sentiment_codes = {"positive": 1, "negative": 0}

if not df["sentiment"].isin(list(sentiment_codes.values())).all():
    # Tolerate case / whitespace variants such as " Positive".
    normalized_labels = df["sentiment"].astype(str).str.strip().str.lower()
    encoded_sentiment = normalized_labels.map(sentiment_codes)

    unmapped = encoded_sentiment.isna()
    if unmapped.any():
        unknown_labels = sorted(df.loc[unmapped, "sentiment"].astype(str).unique())
        raise ValueError(f"Unexpected sentiment labels: {unknown_labels}")

    df["sentiment"] = encoded_sentiment.astype("int64")

In [5]:
# Features are the review column, target is the sentiment column.
X, y = df["review"], df["sentiment"]

In [6]:
# 2. Train-Test Split
# Hold out 20% of the rows for the final evaluation; the split is stratified
# on `y` and seeded so it is reproducible.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [7]:
# 3. Define Models
# Candidate classifiers keyed by the display name used when reporting scores.
# Every estimator is seeded with random_state=42.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)

In [8]:
# 4. Model Comparison using Cross Validation
# Each candidate is wrapped in a pipeline with its own TF-IDF step and scored
# by 5-fold cross-validation accuracy on the training split only.

print("Model Comparison (CV Accuracy)\n")

for name in models:
    model = models[name]

    # Putting the vectorizer inside the pipeline makes it part of the estimator
    # that cross_val_score evaluates.
    tfidf_step = ("tfidf", TfidfVectorizer(stop_words="english", max_features=10000))
    model_step = ("model", model)
    pipeline = Pipeline(steps=[tfidf_step, model_step])

    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="accuracy")

    print(f"{name} Mean Accuracy: {scores.mean():.4f}")

Model Comparison (CV Accuracy)

Logistic Regression Mean Accuracy: 0.8877
Decision Tree Mean Accuracy: 0.7193
Random Forest Mean Accuracy: 0.8478


In [9]:
# 5. Final Evaluation (Best Model)
# Fit a TF-IDF + logistic regression pipeline on the training split, then
# score it on the held-out test split.

final_steps = [
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=10000)),
    ("model", LogisticRegression(max_iter=1000, random_state=42)),
]
final_pipeline = Pipeline(final_steps)
final_pipeline.fit(X_train, y_train)

y_pred = final_pipeline.predict(X_test)

print("\n✅ Final Model Evaluation\n")

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)


✅ Final Model Evaluation

Accuracy: 0.8971

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.89      0.90      5000
           1       0.89      0.91      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

