# Model Prototyping

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics import precision_recall_fscore_support

In [3]:
df = pd.read_feather("../data/preprocessed_reviews.feather")

## Adding Features
- `word_count`
- `has_spoiler_tag`

In [4]:
df["word_count"] = df["review_text"].apply(len)
df["has_spoiler_tag"] = df["review_text"].apply(lambda x : "spoiler" in x)

## Splitting the data for training and testing

In [5]:
X = df[["review_text", "word_count", "has_spoiler_tag"]]
y = df["is_spoiler"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train_text = X_train["review_text"].astype(str)
X_test_text = X_test["review_text"].astype(str)
y_test = y_test.astype(str)

## Vectorizing the `review_text` column

In [50]:
# hash_vect = HashingVectorizer(n_features=50000, ngram_range=(1,2), alternate_sign=False)
# X_train_hash = hash_vect.transform(X_train_text)
# X_test_hash = hash_vect.transform(X_test_text)

tfidf_full = TfidfVectorizer(
    max_features=100000,
    ngram_range=(1,2),
    stop_words='english'
)

X_train_tfidf = tfidf_full.fit_transform(X_train_text)
X_test_tfidf = tfidf_full.fit_transform(X_test_text)

In [51]:
keywords = ["plot twist", "ending", "spoiler", "dies", "surprise", "reveal", "betrayal", "twist", "revealed", "kill", "death", "betrays"]
tfidf_kw = TfidfVectorizer(vocabulary=keywords)
X_train_kw = tfidf_kw.fit_transform(X_train_text)
X_test_kw = tfidf_kw.transform(X_test_text)

## Putting the everything together

In [None]:
X_train_numeric = X_train[["word_count", "has_spoiler_tag"]].copy()

X_train_numeric["has_spoiler_tag"] = X_train_numeric["has_spoiler_tag"].astype(int)

X_test_numeric = X_test[["word_count", "has_spoiler_tag"]].copy()
X_test_numeric["has_spoiler_tag"] = X_test_numeric["has_spoiler_tag"].astype(int)

X_train_combined = hstack([
    # X_train_hash,
    X_train_tfidf,
    X_train_kw,
    csr_matrix(X_train_numeric.values)
])

X_test_combined = hstack(threshold = 0.4[
    # X_test_hash,
    X_test_tfidf,
    X_test_kw,
    csr_matrix(X_test_numeric.values)
])

# Training with Naive Bayes

In [None]:
nb = MultinomialNB()
nb.fit(X_train_combined, y_train)
nb_probs = nb.predict_proba(X_test_combined)[:, 1]

In [106]:
# for i in range(1, 11):
#     threshold = 0.1 * i
#     y_pred = np.where(probs > threshold, 'True', 'False')
#     print(f"Threhold: {threshold}\n\n")
#     print(classification_report(y_test, y_pred))
#     print(confusion_matrix(y_test, y_pred))

threshold = 0.36
y_pred = np.where(probs > threshold, 'True', 'False')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.83      0.63      0.72     84598
        True       0.39      0.65      0.48     30185

    accuracy                           0.64    114783
   macro avg       0.61      0.64      0.60    114783
weighted avg       0.72      0.64      0.66    114783

[[53654 30944]
 [10622 19563]]


In [None]:
thresholds = np.arange(0.0, 1.01, 0.01)
f1_scores = []

for t in thresholds:
    y_pred = (nb_probs >= t).astype(int)
    _, _, f1, _ = precision_recall_fscore_support(y_test.map({'False':0,'True':1}), y_pred, average='binary')
    f1_scores.append(f1)

best_threshold = thresholds[np.argmax(f1_scores)]
best_f1 = np.max(f1_scores)

print(f"Best Threshold: {best_threshold:.2f}")
print(f"Best F1-Score: {best_f1:.4f}")

âœ… Best Threshold: 0.52
ðŸ“Š Best F1-Score: 0.4875


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Training with Logistic Regression

In [None]:
lr = LogisticRegression(
    max_iter=500,
    class_weight='balanced',
    solver='saga',
    n_jobs=-1
)
lr.fit(X_train_combined, y_train)
lr_probs = lr.predict_proba(X_test_combined)[:, 1]



In [107]:
threshold = 0.52
y_pred = np.where(lr_probs > threshold, 'True', 'False')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.84      0.63      0.72     84598
        True       0.39      0.66      0.49     30185

    accuracy                           0.64    114783
   macro avg       0.61      0.64      0.60    114783
weighted avg       0.72      0.64      0.66    114783

[[53391 31207]
 [10398 19787]]


In [None]:
thresholds = np.arange(0.0, 1.01, 0.01)
f1_scores = []

for t in thresholds:
    y_pred = (lr_probs >= t).astype(int)
    _, _, f1, _ = precision_recall_fscore_support(y_test.map({'False':0,'True':1}), y_pred, average='binary')
    f1_scores.append(f1)

best_threshold = thresholds[np.argmax(f1_scores)]
best_f1 = np.max(f1_scores)

print(f"Best Threshold: {best_threshold:.2f}")
print(f"Best F1-Score: {best_f1:.4f}")

âœ… Best Threshold: 0.52
ðŸ“Š Best F1-Score: 0.4875


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
