In [None]:
!pip install pandas scikit-learn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re

In [None]:
from google.colab import files
files.upload()  # choose kaggle.json file
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
!pip install kaggle
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!unzip imdb-dataset-of-50k-movie-reviews.zip

In [4]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()
df['sentiment'].value_counts()


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [5]:
def clean_text(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r"<.*?>", "", text)  # remove HTML tags
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # remove special characters
    return text

df['review'] = df['review'].apply(clean_text)

In [6]:
X = df['review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [7]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

In [9]:
y_pred = model.predict(X_test_tfidf)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Classification Report
print(classification_report(y_test, y_pred))

# Confusion Matrix
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8832
              precision    recall  f1-score   support

    negative       0.90      0.87      0.88      4961
    positive       0.87      0.90      0.89      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

[[4294  667]
 [ 501 4538]]


In [10]:
def predict_sentiment(review):
    review_clean = clean_text(review)
    review_tfidf = vectorizer.transform([review_clean])
    pred = model.predict(review_tfidf)[0]
    return pred

# Example
sample_review_1 = "The movie wasn’t so bad, actually I enjoyed it a lot."
print(f"Review: '{sample_review_1}' -> Predicted sentiment: {predict_sentiment(sample_review_1)}")

sample_review_positive = "This movie was absolutely fantastic! I loved every moment of it."
print(f"Review: '{sample_review_positive}' -> Predicted sentiment: {predict_sentiment(sample_review_positive)}")

sample_review_negative = "This is the worst movie I have ever seen. A complete waste of time."
print(f"Review: '{sample_review_negative}' -> Predicted sentiment: {predict_sentiment(sample_review_negative)}")

Review: 'The movie wasn’t so bad, actually I enjoyed it a lot.' -> Predicted sentiment: negative
Review: 'This movie was absolutely fantastic! I loved every moment of it.' -> Predicted sentiment: positive
Review: 'This is the worst movie I have ever seen. A complete waste of time.' -> Predicted sentiment: negative
