In [None]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [None]:

# Fetch the movie_reviews corpus into the local NLTK data directory; per the
# recorded cell output this does nothing when the package is already up to date.
nltk.download("movie_reviews")
from nltk.corpus import movie_reviews

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [None]:
# Build dataset: one (raw review text, category) pair per corpus file,
# walking the corpus one category at a time.
docs = []
for category in movie_reviews.categories():
    docs.extend(
        (movie_reviews.raw(fileid), category)
        for fileid in movie_reviews.fileids(category)
    )

df = pd.DataFrame(docs, columns=["review", "label"])

In [None]:
# Preprocessing: lowercasing & stopword removal
# The stopword list ships as its own NLTK data package. Download it here so
# this cell also works on a fresh kernel/machine instead of raising LookupError.
nltk.download("stopwords")
# Stored as a set because preprocess() does one membership test per token.
stop_words = set(stopwords.words("english"))

In [None]:
def preprocess(text):
    """Return a cleaned, space-joined version of ``text``.

    The text is lowercased and split with ``nltk.word_tokenize``; only tokens
    that are purely alphabetic and not in the module-level ``stop_words`` set
    are kept, in their original order.
    """
    lowered = text.lower()
    kept = (
        token
        for token in nltk.word_tokenize(lowered)
        if token.isalpha() and token not in stop_words
    )
    return " ".join(kept)

In [None]:
# Download the "punkt_tab" tokenizer data before cleaning: preprocess()
# tokenizes with nltk.word_tokenize, which is presumably what needs this resource.
nltk.download("punkt_tab")
# Store the cleaned text in a new column; the raw "review" column is left untouched.
df["clean_review"] = df["review"].apply(preprocess)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Train-test split: hold out 20% of the rows (test_size=0.2) for evaluation.
# random_state=42 fixes the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(df["clean_review"], df["label"],
                                                    test_size=0.2, random_state=42)

In [None]:
# Vectorizers: the two text representations compared below, both constructed
# with default settings (raw term counts vs. TF-IDF weights).
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

In [None]:

# Fit each vectorizer on the training reviews only, then reuse the fitted
# vectorizer to transform the held-out test reviews.
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Sanity check: print the shape of every feature matrix. Train and test must
# agree on the second dimension (number of features) for each vectorizer.
print(f"Shape of X_train_count: {X_train_count.shape}")
print(f"Shape of X_test_count: {X_test_count.shape}")
print(f"Shape of X_train_tfidf: {X_train_tfidf.shape}")
print(f"Shape of X_test_tfidf: {X_test_tfidf.shape}")

Shape of X_train_count: (1600, 34677)
Shape of X_test_count: (400, 34677)
Shape of X_train_tfidf: (1600, 34677)
Shape of X_test_tfidf: (400, 34677)


In [None]:
# Classifiers to compare, keyed by the display name that ends up in the
# results table. Logistic regression is given max_iter=1000; the other two
# are constructed with library defaults.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "SVM": LinearSVC()
}

In [None]:
# Accumulators filled by the training loop below.
# results: one [vectorizer name, model name, accuracy, F1] row per combination.
results = []
# trained_models: (vectorizer name, model name) -> (model, vectorizer).
trained_models = {}

In [None]:
# Train and score every (vectorizer, classifier) combination.
# Start from empty accumulators so that re-running this cell does not append
# duplicate rows to `results`.
results.clear()
trained_models.clear()

feature_sets = {
    "Count": (X_train_count, X_test_count, count_vectorizer),
    "TF-IDF": (X_train_tfidf, X_test_tfidf, tfidf_vectorizer),
}
for vec_name, (Xtr, Xte, vectorizer) in feature_sets.items():
    for model_name, prototype in models.items():
        # Fit an unfitted copy of the estimator. Calling fit() on the shared
        # instance again for the next vectorizer would overwrite its learned
        # parameters in place, so the object stored under ("Count", ...) would
        # end up trained on TF-IDF features while paired with the Count
        # vectorizer.
        model = clone(prototype)
        model.fit(Xtr, y_train)
        y_pred = model.predict(Xte)
        acc = accuracy_score(y_test, y_pred)
        # F1 is computed for the "pos" class.
        f1 = f1_score(y_test, y_pred, pos_label="pos")
        results.append([vec_name, model_name, acc, f1])
        # Keep the fitted model with the vectorizer that produced its features
        # so new text can be transformed consistently later.
        trained_models[(vec_name, model_name)] = (model, vectorizer)

In [None]:
# Display results: one row per (vectorizer, model) combination with the
# accuracy and F1-score computed on the held-out test split.
results_df = pd.DataFrame(results, columns=["Vectorizer", "Model", "Accuracy", "F1-score"])
display(results_df)

Unnamed: 0,Vectorizer,Model,Accuracy,F1-score
0,Count,Logistic Regression,0.82,0.818182
1,Count,Naive Bayes,0.8075,0.8
2,Count,SVM,0.81,0.807107
3,TF-IDF,Logistic Regression,0.815,0.81592
4,TF-IDF,Naive Bayes,0.795,0.78534
5,TF-IDF,SVM,0.8275,0.828784


In [None]:
def manual_evaluate(examples):
    """Print every trained model's predicted label for hand-written reviews.

    Each (vectorizer, model) pair stored in the module-level ``trained_models``
    dict is applied to every example. Results are printed; nothing is returned.

    Parameters
    ----------
    examples : iterable of str
        Raw review texts. Each is cleaned with ``preprocess`` before being
        vectorized, and the original text is echoed next to its prediction.
    """
    # Cleaning does not depend on the model, so do it once up front instead of
    # once per (vectorizer, model) pair. Materialising the pairs also lets a
    # one-shot iterator be replayed for every model.
    prepared = [(text, preprocess(text)) for text in examples]

    print("\nManual Evaluation of Custom Examples:\n")
    for (vec_name, model_name), (model, vectorizer) in trained_models.items():
        print(f"\n=== {vec_name} + {model_name} ===")
        for text, clean in prepared:
            # transform() expects an iterable of documents, hence the 1-item list.
            X_vec = vectorizer.transform([clean])
            pred = model.predict(X_vec)[0]
            print(f"Review: {text}\n Predicted: {pred}\n")

In [None]:
# Hand-written sanity-check reviews, grouped by intended sentiment and then
# interleaved so the final list alternates positive, negative, positive, ...
custom_positive = [
    "The movie was fantastic, with brilliant acting and a touching story.",
    "A wonderful experience, the story was engaging and the music was perfect.",
    "I enjoyed the movie, the plot was engaging and the acting was strong.",
    "One of the best movies this year, with inspiring story and great acting.",
    "An emotional story with good acting and strong performances.",
]
custom_negative = [
    "I hated the movie, it was boring and way too long.",
    "Terrible acting and poor direction, the story made no sense.",
    "The acting was weak and the movie felt boring and predictable.",
    "Disappointing movie, boring script and bad acting throughout.",
    "Not worth watching, bad story and dull acting from start to finish.",
]
custom_reviews = [
    review
    for pair in zip(custom_positive, custom_negative)
    for review in pair
]


In [None]:
# Run every stored (vectorizer, model) pair on the hand-written reviews and
# print the predicted label for each one.
manual_evaluate(custom_reviews)


Manual Evaluation of Custom Examples:


=== Count + Logistic Regression ===
Review: The movie was fantastic, with brilliant acting and a touching story.
 Predicted: neg

Review: I hated the movie, it was boring and way too long.
 Predicted: neg

Review: A wonderful experience, the story was engaging and the music was perfect.
 Predicted: pos

Review: Terrible acting and poor direction, the story made no sense.
 Predicted: neg

Review: I enjoyed the movie, the plot was engaging and the acting was strong.
 Predicted: neg

Review: The acting was weak and the movie felt boring and predictable.
 Predicted: neg

Review: One of the best movies this year, with inspiring story and great acting.
 Predicted: pos

Review: Disappointing movie, boring script and bad acting throughout.
 Predicted: neg

Review: An emotional story with good acting and strong performances.
 Predicted: pos

Review: Not worth watching, bad story and dull acting from start to finish.
 Predicted: neg


=== Count + Naive Ba

#### TF-IDF + Naive Bayes gave the best output: almost all of its predictions on the custom reviews were correct.