In [4]:
# 1. Upload the Dataset (run this cell and upload IMDB Dataset.csv manually)
from google.colab import files
uploaded = files.upload()  # Use file picker, select 'IMDB Dataset.csv'

# 2. Load and Inspect the Data
import pandas as pd
df = pd.read_csv('IMDB Dataset.csv')
print(df.head())
print(df['sentiment'].value_counts())

# 3. Preprocess and Prepare Data
import re

def clean_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    tokens = text.split()
    return ' '.join(tokens)

df['clean_review'] = df['review'].apply(clean_text)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# 4. Apply TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)  # You can increase max_features for even larger vocab
X = tfidf.fit_transform(df['clean_review'])
y = df['sentiment']

# 5. Split Data into Train/Test Sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Train Logistic Regression and Random Forest Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Logistic Regression
lr = LogisticRegression(max_iter=200)
lr.fit(X_train, y_train)

# Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

# 7. Evaluate and Compare Results
from sklearn.metrics import accuracy_score, classification_report

print("\nLogistic Regression Results:")
y_pred_lr = lr.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification report:\n", classification_report(y_test, y_pred_lr))

print("\nRandom Forest Results:")
y_pred_rf = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification report:\n", classification_report(y_test, y_pred_rf))


Saving IMDB Dataset.csv to IMDB Dataset (1).csv
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
sentiment
positive    25000
negative    25000
Name: count, dtype: int64

Logistic Regression Results:
Accuracy: 0.8942
Classification report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


Random Forest Results:
Accuracy: 0.8504
Classification report:
               precision    recall  f1-s