In [2]:
# Install (in notebook if needed)
# pip install scikit-learn

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")
import numpy as np

# ---------------------------------------
# 1. Sample data: 20 movie reviews, each stored next to its sentiment
#    label (1 = positive, 0 = negative)
# ---------------------------------------
labeled_reviews = [
    ("One of the best movies I have seen in a long time. Brilliant acting and a strong story.", 1),
    ("This movie was a complete waste of time. Poor script and terrible execution.", 0),
    ("I really enjoyed the film. The characters were relatable and the story was engaging.", 1),
    ("The plot was boring and predictable. I struggled to finish watching it.", 0),
    ("A visually stunning movie with an emotional and powerful message.", 1),
    ("The acting was very weak and the dialogues felt unnatural.", 0),
    ("An excellent performance by the lead actor. Truly inspiring and well made.", 1),
    ("I expected much more, but the movie failed to impress me at any point.", 0),
    ("A heartwarming story that keeps you emotionally connected till the end.", 1),
    ("The movie was too slow and dragged unnecessarily in many scenes.", 0),
    ("Amazing direction and background score. Definitely worth watching.", 1),
    ("I did not like this movie at all. The story made no sense to me.", 0),
    ("A beautiful film with strong performances and meaningful dialogues.", 1),
    ("Terrible movie. Bad acting, bad script, and bad direction.", 0),
    ("This movie exceeded my expectations. I loved every moment of it.", 1),
    ("The film was confusing and poorly edited. Not recommended.", 0),
    ("One of the finest movies of its genre. Highly entertaining.", 1),
    ("I was bored throughout the movie. It lacked depth and creativity.", 0),
    ("A well-crafted movie that delivers both entertainment and emotion.", 1),
    ("Disappointing experience. The movie was messy and poorly written.", 0),
]

# Unzip the pairs into the two parallel containers used by the rest of the
# script: a list of raw review strings and a NumPy array of integer labels.
texts = [review for review, _sentiment in labeled_reviews]
labels = np.array([sentiment for _review, sentiment in labeled_reviews])


# ---------------------------------------
# 2. Bag of Words (CountVectorizer)
# ---------------------------------------
# Turn every review into a row of raw term counts; text is lower-cased and
# English stop words are removed before counting.
bow_vectorizer = CountVectorizer(stop_words="english", lowercase=True)
X_bow = bow_vectorizer.fit_transform(texts)

# Show the size of the resulting matrix and a peek at the first 15
# vocabulary entries (dict iteration yields the terms, i.e. the keys).
bow_shape = X_bow.shape
bow_vocab_preview = list(bow_vectorizer.vocabulary_)[:15]
print("BoW shape:", bow_shape)
print("Sample vocabulary:", bow_vocab_preview)

# ---------------------------------------
# 3. TF-IDF representation
# ---------------------------------------
# Vectorize the reviews with TF-IDF weighting; text is lower-cased and
# English stop words are removed first.
tfidf_vectorizer = TfidfVectorizer(stop_words="english", lowercase=True)
X_tfidf = tfidf_vectorizer.fit_transform(texts)

# Show the size of the resulting matrix and the first 15 feature names.
tfidf_shape = X_tfidf.shape
tfidf_terms_preview = tfidf_vectorizer.get_feature_names_out()[:15]
print("\nTF-IDF shape:", tfidf_shape)
print("Example terms:", tfidf_terms_preview)

# ---------------------------------------
# 4. Train / Test split
# ---------------------------------------
# Split both feature matrices and the labels in ONE call, so the BoW rows,
# the TF-IDF rows and the labels are guaranteed to share the same train/test
# partition (instead of relying on two separate calls happening to use the
# same random_state). test_size=0.3 holds out 30% of the reviews and
# stratify=labels keeps the class balance in both parts. train_test_split
# returns a (train, test) pair for each input array, in the order passed.
#
# NOTE(review): both vectorizers above are fitted on the full corpus before
# this split, so their vocabulary / IDF statistics also see the held-out
# reviews. For an unbiased evaluation, split the raw texts first and fit the
# vectorizers on the training portion only.
(
    X_bow_train,
    X_bow_test,
    X_tfidf_train,
    X_tfidf_test,
    y_train,
    y_test,
) = train_test_split(
    X_bow,
    X_tfidf,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

# ---------------------------------------
# 4a. Logistic Regression with BoW
# ---------------------------------------
# Fit a logistic-regression classifier on the BoW training rows and
# evaluate it on the held-out rows.
bow_clf = LogisticRegression(max_iter=1000)
bow_clf.fit(X_bow_train, y_train)

y_pred_bow = bow_clf.predict(X_bow_test)

print("\n=== BoW + Logistic Regression ===")
# labels=[0, 1] pins the row order so target_names always lines up with the
# label encoding (0 = negative, 1 = positive), even if a class is absent
# from the test split or the predictions.
# zero_division=0 reports undefined precision/recall (a class that is never
# predicted) as 0 explicitly, rather than depending on the file-wide
# warnings filter to hide the resulting warning.
print(
    classification_report(
        y_test,
        y_pred_bow,
        labels=[0, 1],
        target_names=["negative", "positive"],
        zero_division=0,
    )
)
# Rows are true classes, columns are predicted classes, both ordered [0, 1].
print("Confusion Matrix (BoW):\n", confusion_matrix(y_test, y_pred_bow, labels=[0, 1]))

# ---------------------------------------
# 4b. Logistic Regression with TF-IDF
# ---------------------------------------
# Fit a logistic-regression classifier on the TF-IDF training rows and
# evaluate it on the held-out rows.
tfidf_clf = LogisticRegression(max_iter=1000)
tfidf_clf.fit(X_tfidf_train, y_train)

y_pred_tfidf = tfidf_clf.predict(X_tfidf_test)

print("\n=== TF-IDF + Logistic Regression ===")
# labels=[0, 1] pins the row order so target_names always lines up with the
# label encoding (0 = negative, 1 = positive), even if a class is absent
# from the test split or the predictions.
# zero_division=0 reports undefined precision/recall (a class that is never
# predicted) as 0 explicitly, rather than depending on the file-wide
# warnings filter to hide the resulting warning.
print(
    classification_report(
        y_test,
        y_pred_tfidf,
        labels=[0, 1],
        target_names=["negative", "positive"],
        zero_division=0,
    )
)
# Rows are true classes, columns are predicted classes, both ordered [0, 1].
print("Confusion Matrix (TF-IDF):\n", confusion_matrix(y_test, y_pred_tfidf, labels=[0, 1]))

# ---------------------------------------
# 5. Predict sentiment for new movie reviews
# ---------------------------------------
new_reviews = [
    "The movie was boring and the story made no sense",
    "Fantastic film with excellent acting and direction"
]

# Reuse the already-fitted TF-IDF vectorizer (transform, not fit_transform)
# so the new reviews are projected onto the vocabulary the classifier was
# trained with.
new_X = tfidf_vectorizer.transform(new_reviews)
new_preds = tfidf_clf.predict(new_X)
new_probs = tfidf_clf.predict_proba(new_X)

# predict_proba columns are ordered like tfidf_clf.classes_. Look the column
# up by class label instead of using the label itself as an index, which is
# only correct while the labels happen to be exactly 0..n-1.
proba_column = {cls: col for col, cls in enumerate(tfidf_clf.classes_)}

for review, label, prob in zip(new_reviews, new_preds, new_probs):
    sentiment = "positive" if label == 1 else "negative"
    # Probability the model assigns to the class it actually predicted.
    confidence = prob[proba_column[label]]
    print("\nReview:", review)
    print(f"Predicted sentiment: {sentiment} (probability={confidence:.2f})")


BoW shape: (20, 94)
Sample vocabulary: ['best', 'movies', 'seen', 'long', 'time', 'brilliant', 'acting', 'strong', 'story', 'movie', 'complete', 'waste', 'poor', 'script', 'terrible']

TF-IDF shape: (20, 94)
Example terms: ['acting' 'actor' 'amazing' 'background' 'bad' 'beautiful' 'best' 'bored'
 'boring' 'brilliant' 'characters' 'complete' 'confusing' 'connected'
 'crafted']

=== BoW + Logistic Regression ===
              precision    recall  f1-score   support

    negative       0.50      1.00      0.67         3
    positive       0.00      0.00      0.00         3

    accuracy                           0.50         6
   macro avg       0.25      0.50      0.33         6
weighted avg       0.25      0.50      0.33         6

Confusion Matrix (BoW):
 [[3 0]
 [3 0]]

=== TF-IDF + Logistic Regression ===
              precision    recall  f1-score   support

    negative       0.50      1.00      0.67         3
    positive       0.00      0.00      0.00         3

    accuracy     