In [4]:
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Synthetic dataset with enough examples
reviews = [
    "I absolutely loved this movie, it was fantastic and thrilling!",
    "Worst movie ever. Complete waste of time and money.",
    "The acting was okay, but the story was boring and predictable.",
    "Amazing direction and great storyline, I enjoyed every moment!",
    "I didn't like it at all, very disappointing and dull.",
    "Superb performance by the cast, highly recommend watching it!",
    "Terrible. Would not watch again, horrible experience.",
    "This movie was a masterpiece, brilliant from start to finish.",
    "Awful script and poor acting, not worth watching.",
    "Loved the cinematography and music, really beautiful film!",
    "Brilliant acting and amazing plot, loved every second!",
    "Disgusting plot and terrible execution, hate it.",
    "Great fun, I was smiling all through the movie.",
    "It was so bad I walked out of the theatre.",
    "One of the best films I've seen this year.",
    "A complete mess of a movie, awful editing."
]

sentiments = [
    "positive", "negative", "negative", "positive", "negative",
    "positive", "negative", "positive", "negative", "positive",
    "positive", "negative", "positive", "negative", "positive", "negative"
]

df = pd.DataFrame({"review": reviews, "sentiment": sentiments})

# Text cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

df['clean_review'] = df['review'].apply(clean_text)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['clean_review'], df['sentiment'], test_size=0.25, random_state=42)

# TF-IDF
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Predictions
y_pred = model.predict(X_test_vec)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Sample prediction
sample_text = ["I really enjoyed this film", "It was a horrible experience"]
sample_clean = [clean_text(t) for t in sample_text]
sample_vec = vectorizer.transform(sample_clean)
print("\nSample Predictions:", model.predict(sample_vec))


Accuracy: 0.5

Classification Report:
               precision    recall  f1-score   support

    negative       0.33      1.00      0.50         1
    positive       1.00      0.33      0.50         3

    accuracy                           0.50         4
   macro avg       0.67      0.67      0.50         4
weighted avg       0.83      0.50      0.50         4


Sample Predictions: ['negative' 'negative']


[nltk_data] Downloading package stopwords to C:\Users\VATSAL
[nltk_data]     PARIKH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
