In [1]:
# load matched reviews
import pandas as pd
matched_reviews = pd.read_csv("matched_reviews_with_metadata_ner.csv")

In [2]:
# Baseline Classification Models without NER Metadata

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Prepare data for modeling
df = matched_reviews.copy()

# Ensure text is string
df["text"] = df["text"].astype(str)

X = df["text"]
y = df["label"]     # 0 = negative, 1 = positive

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    max_features=20_000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)



In [3]:
# Logistic Regression Model
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train_vec, y_train)

# Evaluation
pred_lr = log_reg.predict(X_test_vec)

print("==== Logistic Regression ====")
print("Accuracy:", accuracy_score(y_test, pred_lr))
print("F1 Score:", f1_score(y_test, pred_lr))
print(classification_report(y_test, pred_lr))

==== Logistic Regression ====
Accuracy: 0.8712465878070974
F1 Score: 0.8700045934772623
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      1122
           1       0.86      0.88      0.87      1076

    accuracy                           0.87      2198
   macro avg       0.87      0.87      0.87      2198
weighted avg       0.87      0.87      0.87      2198



In [4]:
# SVM Model
svm_clf = LinearSVC()
svm_clf.fit(X_train_vec, y_train)

# Evaluation
pred_svm = svm_clf.predict(X_test_vec)

print("==== Linear SVM ====")
print("Accuracy:", accuracy_score(y_test, pred_svm))
print("F1 Score:", f1_score(y_test, pred_svm))
print(classification_report(y_test, pred_svm))


==== Linear SVM ====
Accuracy: 0.8689717925386715
F1 Score: 0.8675252989880404
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      1122
           1       0.86      0.88      0.87      1076

    accuracy                           0.87      2198
   macro avg       0.87      0.87      0.87      2198
weighted avg       0.87      0.87      0.87      2198



In [5]:
# Baseline Classification Models with NER Metadata
import numpy as np
from textblob import TextBlob

def compute_entity_features(row):
    titles = row["detected_titles"]

    # actors/directors are lists, not strings
    actors = row.get("actors", [])
    directors = row.get("directors", [])

    num_titles = len(titles)
    num_actors = len(actors)
    num_directors = len(directors)

    text = row["text"].lower()

    # Count actor mentions
    actor_mentions = 0
    for a in actors:
        actor_mentions += text.count(a.lower())

    # Count director mentions
    director_mentions = 0
    for d in directors:
        director_mentions += text.count(d.lower())

    # Sentiment toward entity names
    entity_tokens = actors + directors
    entity_sentiment = 0

    if entity_tokens:
        combined = " ".join(entity_tokens)
        try:
            entity_sentiment = TextBlob(combined).sentiment.polarity
        except:
            entity_sentiment = 0

    return pd.Series({
        "num_titles": num_titles,
        "num_actors": num_actors,
        "num_directors": num_directors,
        "actor_mentions": actor_mentions,
        "director_mentions": director_mentions,
        "entity_sentiment": entity_sentiment
    })


# Compute entity features
entity_features = matched_reviews.apply(compute_entity_features, axis=1)
full_df = pd.concat([matched_reviews, entity_features], axis=1)

In [6]:
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler

# Split BEFORE vectorization
X_text_raw = full_df["text"]
X_entity = full_df[[
    "num_titles", "num_actors", "num_directors",
    "actor_mentions", "director_mentions",
    "entity_sentiment"
]].fillna(0).values
y = full_df["label"]

# Train-test split on raw data
X_text_train, X_text_test, X_entity_train, X_entity_test, y_train, y_test = train_test_split(
    X_text_raw, X_entity, y, test_size=0.2, random_state=42, stratify=y
)

# Now fit TF-IDF only on training text
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_text = tfidf.fit_transform(X_text_train)  # Learn from train only
X_test_text = tfidf.transform(X_text_test)        # Apply to test

# Scale entity features (fit on train, transform both)
scaler = StandardScaler(with_mean=False)  # Sparse-compatible
X_entity_train_scaled = scaler.fit_transform(X_entity_train)
X_entity_test_scaled = scaler.transform(X_entity_test)

# Combine features
X_train = hstack([X_train_text, X_entity_train])
X_test = hstack([X_test_text, X_entity_test])

In [7]:
# Logistic Regression with NER features
log_clf = LogisticRegression(max_iter=500)
log_clf.fit(X_train, y_train)

pred_log = log_clf.predict(X_test)

print("==== Entity-Aware Logistic Regression ====")
print("Accuracy:", accuracy_score(y_test, pred_log))
print("F1 Score:", f1_score(y_test, pred_log))
print(classification_report(y_test, pred_log))

==== Entity-Aware Logistic Regression ====
Accuracy: 0.8516833484986351
F1 Score: 0.8473782771535581
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      1122
           1       0.85      0.84      0.85      1076

    accuracy                           0.85      2198
   macro avg       0.85      0.85      0.85      2198
weighted avg       0.85      0.85      0.85      2198



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# SVM with NER features
svm_clf = LinearSVC()
svm_clf.fit(X_train, y_train)

pred_svm = svm_clf.predict(X_test)

print("==== Entity-Aware SVM ====")
print("Accuracy:", accuracy_score(y_test, pred_svm))
print("F1 Score:", f1_score(y_test, pred_svm))
print(classification_report(y_test, pred_svm))

==== Entity-Aware SVM ====
Accuracy: 0.873066424021838
F1 Score: 0.869198312236287
              precision    recall  f1-score   support

           0       0.87      0.88      0.88      1122
           1       0.88      0.86      0.87      1076

    accuracy                           0.87      2198
   macro avg       0.87      0.87      0.87      2198
weighted avg       0.87      0.87      0.87      2198



In [11]:
# prepare data for modeling
df = full_df[["text", "label"]].dropna()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

In [12]:
# Tokenization with DistilBERT tokenizer
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased",
                                                     use_fast=True,
                                                    local_files_only=False
                                                    )     

def tokenize_batch(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )


In [13]:
# Convert to Hugging Face Datasets because transformers work well with them instead of dataframes
from datasets import Dataset

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

train_ds = train_ds.map(tokenize_batch, batched=True)
test_ds = test_ds.map(tokenize_batch, batched=True)

train_ds = train_ds.rename_column("label", "labels")
test_ds = test_ds.rename_column("label", "labels")

train_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map: 100%|██████████| 8792/8792 [00:02<00:00, 3552.09 examples/s]
Map: 100%|██████████| 2198/2198 [00:00<00:00, 3067.89 examples/s]


In [14]:
# Load DistilBERT model 
num_labels = df["label"].nunique()

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# import torch
# import accelerate # Accelerate must be 0.33.0
# import transformers

# print(torch.__version__)
# print(accelerate.__version__)
# print(transformers.__version__)

# import sys
# print(sys.executable)
# print(sys.path)

# import importlib
# import transformers
# import accelerate

# importlib.reload(transformers)
# importlib.reload(accelerate)

import sys
print(sys.executable)
import transformers, accelerate
print(transformers.__version__)
print(accelerate.__version__)


C:\Python314\python.exe
4.57.2
1.12.0


In [16]:
# Training with Hugging Face Trainer
from transformers import TrainingArguments
from transformers import Trainer

training_args = TrainingArguments(
    output_dir="./distilbert-sentiment",
    
    eval_steps=500,
    save_steps=500,
    
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    learning_rate=5e-5,
    num_train_epochs=1,
    weight_decay=0.01,

    logging_steps=50,
    do_train=True,
    do_eval=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)

In [None]:
trainer.train()



Step,Training Loss


In [15]:
trainer.evaluate()



{'eval_loss': 0.5090044140815735,
 'eval_runtime': 192.6551,
 'eval_samples_per_second': 10.771,
 'eval_steps_per_second': 0.675,
 'epoch': 3.0}

In [16]:
import numpy as np

# Make predictions on the test dataset
predictions_output = trainer.predict(test_ds)  # test_ds is your HF Dataset

# predictions_output is a PredictionOutput object
logits = predictions_output.predictions  # raw model outputs
labels = predictions_output.label_ids   # true labels

# Convert logits to predicted class indices
preds = np.argmax(logits, axis=1)

In [17]:
acc = accuracy_score(labels, preds)
f1 = f1_score(labels, preds, average='weighted')  # or 'macro' if you prefer

print("==== DistilBERT Evaluation ====")
print("Accuracy:", acc)
print("F1 Score:", f1)
print("\nClassification Report:\n")
print(classification_report(labels, preds))

==== DistilBERT Evaluation ====
Accuracy: 0.8790361445783132
F1 Score: 0.8790490691928209

Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      1059
           1       0.87      0.89      0.88      1016

    accuracy                           0.88      2075
   macro avg       0.88      0.88      0.88      2075
weighted avg       0.88      0.88      0.88      2075



In [13]:
# # DistilBERT Model for Sentiment Classification
# import torch
# from torch.utils.data import Dataset, DataLoader
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# from torch.optim import AdamW
# from sklearn.metrics import classification_report, accuracy_score, f1_score
# from tqdm import tqdm
# import numpy as np
#
# # Check for GPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")
#
# # ------------------------------
# # Custom Dataset Class
# # ------------------------------
# class ReviewDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_length=512):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_length = max_length
#
#     def __len__(self):
#         return len(self.texts)
#
#     def __getitem__(self, idx):
#         text = str(self.texts.iloc[idx])
#         label = self.labels.iloc[idx]
#
#         encoding = self.tokenizer.encode_plus(
#             text,
#             add_special_tokens=True,
#             max_length=self.max_length,
#             padding='max_length',
#             truncation=True,
#             return_attention_mask=True,
#             return_tensors='pt'
#         )
#
#         return {
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#             'label': torch.tensor(label, dtype=torch.long)
#         }
#
# # ------------------------------
# # Prepare data
# # ------------------------------
# # Use the same train-test split as before
# X_train_text, X_test_text, y_train, y_test = train_test_split(
#     full_df["text"], full_df["label"],
#     test_size=0.2,
#     random_state=42,
#     stratify=full_df["label"]
# )
#
# # Initialize tokenizer
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
#
# # Create datasets
# train_dataset = ReviewDataset(X_train_text, y_train, tokenizer)
# test_dataset = ReviewDataset(X_test_text, y_test, tokenizer)
#
# # Create dataloaders
# batch_size = 16
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=batch_size)
#
# # ------------------------------
# # Initialize Model
# # ------------------------------
# model = DistilBertForSequenceClassification.from_pretrained(
#     'distilbert-base-uncased',
#     num_labels=2  # binary classification
# )
# model.to(device)
#
# # Optimizer
# optimizer = AdamW(model.parameters(), lr=2e-5)
#
# # ------------------------------
# # Training Function
# # ------------------------------
# def train_epoch(model, dataloader, optimizer, device):
#     model.train()
#     total_loss = 0
#
#     for batch in tqdm(dataloader, desc="Training"):
#         optimizer.zero_grad()
#
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['label'].to(device)
#
#         outputs = model(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             labels=labels
#         )
#
#         loss = outputs.loss
#         total_loss += loss.item()
#
#         loss.backward()
#         optimizer.step()
#
#     return total_loss / len(dataloader)
#
# # ------------------------------
# # Evaluation Function
# # ------------------------------
# def evaluate(model, dataloader, device):
#     model.eval()
#     predictions = []
#     true_labels = []
#
#     with torch.no_grad():
#         for batch in tqdm(dataloader, desc="Evaluating"):
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['label'].to(device)
#
#             outputs = model(
#                 input_ids=input_ids,
#                 attention_mask=attention_mask
#             )
#
#             logits = outputs.logits
#             preds = torch.argmax(logits, dim=1)
#
#             predictions.extend(preds.cpu().numpy())
#             true_labels.extend(labels.cpu().numpy())
#
#     return np.array(predictions), np.array(true_labels)
#
# # ------------------------------
# # Train the Model
# # ------------------------------
# num_epochs = 3
#
# print("Starting training...")
# for epoch in range(num_epochs):
#     print(f"\nEpoch {epoch + 1}/{num_epochs}")
#
#     train_loss = train_epoch(model, train_loader, optimizer, device)
#     print(f"Average training loss: {train_loss:.4f}")
#
#     # Evaluate on test set after each epoch
#     predictions, true_labels = evaluate(model, test_loader, device)
#     accuracy = accuracy_score(true_labels, predictions)
#     f1 = f1_score(true_labels, predictions)
#
#     print(f"Test Accuracy: {accuracy:.4f}")
#     print(f"Test F1 Score: {f1:.4f}")
#
# # ------------------------------
# # Final Evaluation
# # ------------------------------
# print("\n==== DistilBERT Final Results ====")
# predictions, true_labels = evaluate(model, test_loader, device)
#
# print("Accuracy:", accuracy_score(true_labels, predictions))
# print("F1 Score:", f1_score(true_labels, predictions))
# print(classification_report(true_labels, predictions))
#
# # ------------------------------
# # Save model (optional)
# # ------------------------------
# model.save_pretrained("./distilbert_sentiment_model")
# tokenizer.save_pretrained("./distilbert_sentiment_model")
# print("\nModel saved to ./distilbert_sentiment_model")

In [None]:
# TinyBERT Model for Sentiment Classification
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import classification_report, accuracy_score, f1_score
from tqdm import tqdm
import numpy as np

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ------------------------------
# Custom Dataset Class
# ------------------------------
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# ------------------------------
# Prepare data
# ------------------------------
# Use the same train-test split as before
X_train_text, X_test_text, y_train, y_test = train_test_split(
    full_df["text"], full_df["label"],
    test_size=0.2,
    random_state=42,
    stratify=full_df["label"]
)

# Initialize TinyBERT tokenizer and model
print("Loading TinyBERT model...")
tokenizer = AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
model = AutoModelForSequenceClassification.from_pretrained(
    'huawei-noah/TinyBERT_General_4L_312D',
    num_labels=2
)
model.to(device)
print("Model loaded successfully!")

# Create datasets (using max_length=256 for faster training)
train_dataset = ReviewDataset(X_train_text, y_train, tokenizer, max_length=256)
test_dataset = ReviewDataset(X_test_text, y_test, tokenizer, max_length=256)

# Create dataloaders (increased batch size for speed)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# ------------------------------
# Training Function
# ------------------------------
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)

# ------------------------------
# Evaluation Function
# ------------------------------
def evaluate(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return np.array(predictions), np.array(true_labels)

# ------------------------------
# Train the Model
# ------------------------------
num_epochs = 2  # Reduced to 2 for faster training

print("\nStarting training...")
print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")
print(f"Batch size: {batch_size}")
print(f"Epochs: {num_epochs}\n")

for epoch in range(num_epochs):
    print(f"\n{'='*50}")
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print('='*50)

    train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f"Average training loss: {train_loss:.4f}")

    # Evaluate on test set after each epoch
    predictions, true_labels = evaluate(model, test_loader, device)
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)

    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test F1 Score: {f1:.4f}")

# ------------------------------
# Final Evaluation
# ------------------------------
print("\n" + "="*50)
print("==== TinyBERT Final Results ====")
print("="*50)
predictions, true_labels = evaluate(model, test_loader, device)

accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nDetailed Classification Report:")
print(classification_report(true_labels, predictions, target_names=['Negative', 'Positive']))

# ------------------------------
# Save model (optional)
# ------------------------------
save_model = input("\nSave model? (y/n): ").lower()
if save_model == 'y':
    model.save_pretrained("./tinybert_sentiment_model")
    tokenizer.save_pretrained("./tinybert_sentiment_model")
    print("Model saved to ./tinybert_sentiment_model")

Using device: cpu
Loading TinyBERT model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully!

Starting training...
Training samples: 8297
Test samples: 2075
Batch size: 32
Epochs: 2


Epoch 1/2


Training:  74%|███████▍  | 192/260 [18:03<09:10,  8.10s/it]