In [None]:
# Read the movie-review dataset (already carries the NER metadata columns; 7816 rows)

import pandas as pd

matched_reviews = pd.read_csv("clean_dataset.csv")
# Confirmation message followed by a preview of the first rows
print("Dataset loaded successfully.", matched_reviews.head(), sep="\n")

In [None]:
# Baseline Classification Models (Logistic regression and SVM) without NER Metadata

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Work on a copy of the loaded frame whose review text is forced to str
df = matched_reviews.assign(text=matched_reviews["text"].astype(str))

# Inputs (raw review text) and targets
X, y = df["text"], df["label"]

# Hold out 20% of the rows, keeping the label proportions equal in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Uni- and bi-gram TF-IDF, English stop words removed, vocabulary capped at 20k terms
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=20_000,
)

# The vocabulary and IDF weights are learned from the training text only
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Baseline 1: logistic regression on the TF-IDF text features only (no NER metadata)

log_reg = LogisticRegression(max_iter=500).fit(X_train_vec, y_train)

# Predict the held-out split and report the metrics
pred_lr = log_reg.predict(X_test_vec)

print("==== Logistic Regression (Without NER Metadata)====")
# f1_score is called with its default averaging here
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=pred_lr))
print("F1 Score:", f1_score(y_true=y_test, y_pred=pred_lr))
print(classification_report(y_true=y_test, y_pred=pred_lr))

==== Logistic Regression (Without NER Metadata)====
Accuracy: 0.760230179028133
F1 Score: 0.8500599760095962
              precision    recall  f1-score   support

           0       0.75      0.27      0.40       459
           1       0.76      0.96      0.85      1105

    accuracy                           0.76      1564
   macro avg       0.76      0.62      0.63      1564
weighted avg       0.76      0.76      0.72      1564



In [None]:
# Baseline 2: linear SVM on the TF-IDF text features only (no NER metadata)

svm_clf = LinearSVC().fit(X_train_vec, y_train)

# Predict the held-out split and report the metrics
pred_svm = svm_clf.predict(X_test_vec)

print("==== Linear SVM (Without NER Metadata) ====")
# f1_score is called with its default averaging here
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=pred_svm))
print("F1 Score:", f1_score(y_true=y_test, y_pred=pred_svm))
print(classification_report(y_true=y_test, y_pred=pred_svm))



==== Linear SVM (Without NER Metadata) ====
Accuracy: 0.7736572890025576
F1 Score: 0.8493617021276596
              precision    recall  f1-score   support

           0       0.66      0.46      0.54       459
           1       0.80      0.90      0.85      1105

    accuracy                           0.77      1564
   macro avg       0.73      0.68      0.70      1564
weighted avg       0.76      0.77      0.76      1564



In [None]:
# Baseline Classification Models (Logistic regression and SVM) with NER Metadata

from textblob import TextBlob 

# Compute per-review entity features: how many actors/directors are tagged, how many
# times their names are mentioned in the review text, and the polarity of the names.
def compute_entity_features(row):
    """Derive numeric entity features for one review row.

    Args:
        row: mapping-like object (a DataFrame row or a dict) with a "text" entry and
            optional "actors" / "directors" entries. Each entity entry may be a real
            list/tuple of names or the string representation of one (e.g.
            "['tom hanks', 'julia roberts']"), which is how list columns come back
            from a CSV file.

    Returns:
        pd.Series with the keys "num_actors", "num_directors", "actor_mentions",
        "director_mentions" and "entity_sentiment".
    """
    import ast  # local import so the function does not depend on a later cell

    def _as_name_list(value):
        """Return `value` as a list of non-empty name strings ([] if it cannot be read)."""
        if isinstance(value, str):
            # Without this step len() would count characters and the mention loops
            # would count occurrences of single characters.
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                return []
        if not isinstance(value, (list, tuple)):
            # Missing values (None / NaN) and any other unexpected type
            return []
        # Empty names are dropped: str.count("") would otherwise return len(text) + 1
        return [str(name) for name in value if str(name).strip()]

    actors = _as_name_list(row.get("actors", []))
    directors = _as_name_list(row.get("directors", []))

    # Case-insensitive matching against the review text
    text = str(row["text"]).lower()

    actor_mentions = sum(text.count(name.lower()) for name in actors)
    director_mentions = sum(text.count(name.lower()) for name in directors)

    # Polarity of the entity names themselves (all names joined into one string)
    entity_tokens = actors + directors
    entity_sentiment = 0
    if entity_tokens:
        try:
            entity_sentiment = TextBlob(" ".join(entity_tokens)).sentiment.polarity
        except Exception:
            # Best effort, as before: a failure to score the names must not abort
            # the feature extraction, so the feature falls back to neutral.
            entity_sentiment = 0

    return pd.Series({
        "num_actors": len(actors),
        "num_directors": len(directors),
        "actor_mentions": actor_mentions,
        "director_mentions": director_mentions,
        "entity_sentiment": entity_sentiment
    })


# Derive the entity features row by row and append them to the reviews as new columns
entity_features = matched_reviews.apply(compute_entity_features, axis="columns")
full_df = pd.concat(objs=[matched_reviews, entity_features], axis="columns")
# Optional sanity check of the new feature columns: print(full_df.head())

In [None]:
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler

# Separate the raw inputs BEFORE fitting anything, so no statistic is learned from
# the test rows. Text is cast to str, matching the text-only baseline's preprocessing.
X_text_raw = full_df["text"].astype(str)
X_entity = full_df[[
    "num_actors", "num_directors",
    "actor_mentions", "director_mentions",
    "entity_sentiment"
]].fillna(0)
y = full_df["label"]

# Same split parameters as the text-only baseline (20% test, seed 42, stratified)
X_text_train, X_text_test, X_entity_train, X_entity_test, y_train, y_test = train_test_split(
    X_text_raw, X_entity, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF uses the SAME configuration as the text-only baseline, so the with/without-NER
# comparison differs only in the added entity features and not in the text
# representation. Fitted on the training text only.
tfidf = TfidfVectorizer(
    max_features=20_000,
    ngram_range=(1, 2),
    stop_words="english"
)
X_train_text = tfidf.fit_transform(X_text_train)  # learn vocabulary/IDF from train only
X_test_text = tfidf.transform(X_text_test)        # apply to test

# Scale entity features (fit on train, transform both). with_mean=False rescales
# without centering, so zero counts stay exactly zero.
scaler = StandardScaler(with_mean=False)
X_entity_train_scaled = scaler.fit_transform(X_entity_train.values)
X_entity_test_scaled = scaler.transform(X_entity_test.values)

# Combine text and entity features column-wise; request CSR output directly
X_train = hstack([X_train_text, X_entity_train_scaled], format="csr")
X_test = hstack([X_test_text, X_entity_test_scaled], format="csr")

In [None]:
# Logistic regression on the combined TF-IDF + entity feature matrix

log_clf = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Predict the held-out split
pred_log = log_clf.predict(X_test)

print("==== Logistic Regression (with NER Metadata) ====")
# f1_score is called with its default averaging here
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=pred_log))
print("F1 Score:", f1_score(y_true=y_test, y_pred=pred_log))
print(classification_report(y_true=y_test, y_pred=pred_log))

==== Entity-Aware Logistic Regression ====
Accuracy: 0.7794117647058824
F1 Score: 0.8570244508910071
              precision    recall  f1-score   support

           0       0.72      0.40      0.52       459
           1       0.79      0.94      0.86      1105

    accuracy                           0.78      1564
   macro avg       0.76      0.67      0.69      1564
weighted avg       0.77      0.78      0.76      1564



In [None]:
# Linear SVM on the combined TF-IDF + entity feature matrix

svm_clf = LinearSVC().fit(X_train, y_train)

# Predict the held-out split
pred_svm = svm_clf.predict(X_test)

print("==== SVM (with NER Metadata) ====")
# f1_score is called with its default averaging here
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=pred_svm))
print("F1 Score:", f1_score(y_true=y_test, y_pred=pred_svm))
print(classification_report(y_true=y_test, y_pred=pred_svm))

==== SVM (with NER Metadata) ====
Accuracy: 0.7896419437340153
F1 Score: 0.8572668112798265
              precision    recall  f1-score   support

           0       0.68      0.54      0.60       459
           1       0.82      0.89      0.86      1105

    accuracy                           0.79      1564
   macro avg       0.75      0.72      0.73      1564
weighted avg       0.78      0.79      0.78      1564



In [None]:
# DistilBERT Model with NER Metadata

from transformers import DistilBertTokenizerFast
from datasets import Dataset
import torch
torch.backends.cudnn.benchmark = True

# Names of the numeric entity-feature columns carried alongside each review
entity_cols = [
    "num_actors",
    "num_directors",
    "actor_mentions",
    "director_mentions",
    "entity_sentiment",
]

# Keep only the columns used downstream, drop incomplete rows, then make a
# stratified 80/20 split with a fixed seed
df = full_df[["text", "label", "actors", "directors", *entity_cols]].dropna()
train_df, test_df = train_test_split(
    df,
    stratify=df["label"],
    test_size=0.2,
    random_state=42,
)

# Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

MAX_LEN = 128   # token cap per review (cut 256 → 128 for speed)


def tokenize_batch(batch):
    """Tokenize a batch of reviews and carry the entity feature columns through.

    Text is truncated to MAX_LEN tokens and left unpadded; padding is deferred
    (original note: dynamic padding is enabled later).
    """
    encoded = tokenizer(
        batch["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding=False,
    )
    # Copy each entity feature column into the encoded batch
    for col in entity_cols:
        encoded[col] = batch[col]
    return encoded


# Columns that survive the mapping step; every other original column is removed
cols_to_keep = ["input_ids", "attention_mask", "label", *entity_cols]

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

train_ds = train_ds.map(
    tokenize_batch,
    batched=True,
    remove_columns=[name for name in train_ds.column_names if name not in cols_to_keep],
).rename_column("label", "labels")

test_ds = test_ds.map(
    tokenize_batch,
    batched=True,
    remove_columns=[name for name in test_ds.column_names if name not in cols_to_keep],
).rename_column("label", "labels")

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 6252/6252 [00:00<00:00, 26221.92 examples/s]
Map: 100%|██████████| 1564/1564 [00:00<00:00, 19078.16 examples/s]

                                                   text  \
0     steve mcqueen provided a thrilling motorcycle ...   
1     liza minnelli and joel gray won oscars for the...   
2     what is that tom hanks and julia roberts movie...   
3     what is the movie making fun of macgyver by re...   
4     i am thinking of an animated film based on a c...   
...                                                 ...   
7811  you see this 1965 musical masterpiece regularl...   
7812  young traveler allan gray discovers evidence o...   
7813  yul bryner recreated his broadway role in this...   
7814  yul brynner won an oscar for his role in this ...   
7815  zac efron is a soldier searching for the woman...   

                              actors                   directors  label  \
0                  ['steve mcqueen']                          []      1   
1     ['liza minnelli', 'joel gray']                          []      1   
2     ['tom hanks', 'julia roberts']                          []  




In [None]:
# Normalize entity features
#
# NOTE(review): the two scaler lines below standardize the entity columns of `df`
# in place. `train_df`/`test_df` and the `train_ds`/`test_ds` datasets were already
# built in the previous cell and are not rebuilt here, so the values handed to the
# Trainer appear to be the raw (unscaled) features. The scaler is also fit on all
# rows of `df` (train and test together). Confirm whether scaling was meant to
# happen before the split / dataset creation.

from sklearn.preprocessing import StandardScaler
from transformers import TrainingArguments, Trainer, DistilBertModel
import torch.nn as nn

scaler = StandardScaler()
df[entity_cols] = scaler.fit_transform(df[entity_cols])

# Dynamic padding: the collator pads each batch when it is assembled
# (tokenization above used padding=False)
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

class DistilBertWithEntities(nn.Module):
    """DistilBERT encoder whose first-token vector is concatenated with entity features.

    The 768-dimensional hidden state of the first ([CLS]) token is joined with five
    scalar entity features, passed through dropout and mapped to class logits by a
    single linear layer.

    Args:
        num_labels: number of output classes.
        entity_dim: number of scalar entity features appended to the text vector.
            `forward` concatenates exactly five features, hence the default of 5.
    """

    def __init__(self, num_labels, entity_dim=5):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(768 + entity_dim, num_labels)

    @staticmethod
    def _balanced_class_weights(labels, num_classes):
        """Inverse-frequency class weights for one batch, always of length `num_classes`.

        `minlength` keeps the weight vector the right size when the highest class is
        absent from the batch; clamping the counts avoids a division by zero for any
        absent class (its weight is never used by the loss for that batch).
        """
        counts = torch.bincount(labels, minlength=num_classes).float()
        return 1.0 / counts.clamp(min=1.0)

    # DistilBERT forward method
    def forward(self,input_ids=None,attention_mask=None,labels=None,num_actors=None,num_directors=None,actor_mentions=None,director_mentions=None,
        entity_sentiment=None):
        """Compute logits (and the loss when `labels` is given).

        Each entity feature is expected as a 1-D tensor with one value per example.

        Returns:
            dict with "loss" (None when `labels` is None) and "logits".
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # first ([CLS]) token

        # One column per entity feature, in a fixed order
        entity_tensor = torch.stack(
            [
                num_actors.float(),
                num_directors.float(),
                actor_mentions.float(),
                director_mentions.float(),
                entity_sentiment.float(),
            ],
            dim=1,
        )

        # Combine text + entity features
        combined = torch.cat((pooled_output, entity_tensor), dim=1)
        combined = self.dropout(combined)
        logits = self.fc(combined)

        # Class-weighted cross entropy; weights are inverse class counts of this batch
        loss = None
        if labels is not None:
            class_weights = self._balanced_class_weights(
                labels, self.fc.out_features
            ).to(logits.device)
            loss_fn = nn.CrossEntropyLoss(weight=class_weights)
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits}

# One output unit per distinct label; one input slot per entity feature column
num_labels = df["label"].nunique()
model = DistilBertWithEntities(num_labels, entity_dim=len(entity_cols))

# Training Arguments
training_args = TrainingArguments(
    output_dir="./distilbert-entity-improved",

    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,

    gradient_accumulation_steps=2,   # effective batch = 32
    learning_rate=3e-5,

    num_train_epochs=2,
    save_strategy="epoch",

    # Mixed precision is requested only when a CUDA device is present, so this cell
    # can also be executed on a CPU-only machine instead of failing at construction.
    fp16=torch.cuda.is_available(),
    optim="adamw_torch",
    dataloader_num_workers=4,
    dataloader_pin_memory=True,

    logging_steps=100,
    report_to="none",
)

# test_ds doubles as the evaluation set; the collator pads each batch dynamically
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
)



In [None]:
# Train: run the Trainer's training loop with the arguments and datasets it was built with

trainer.train()



Step,Training Loss
100,0.8254
200,0.5064
300,0.3358




TrainOutput(global_step=392, training_loss=0.4936037890765132, metrics={'train_runtime': 879.3719, 'train_samples_per_second': 14.219, 'train_steps_per_second': 0.446, 'total_flos': 0.0, 'train_loss': 0.4936037890765132, 'epoch': 2.0})

In [None]:
# Evaluate on the Trainer's eval_dataset (test_ds). No compute_metrics was passed to
# the Trainer, so accuracy/F1 are computed separately from trainer.predict().


trainer.evaluate()




{'eval_loss': 0.35099494457244873,
 'eval_runtime': 47.4878,
 'eval_samples_per_second': 32.935,
 'eval_steps_per_second': 1.032,
 'epoch': 2.0}

In [None]:
# Run the trained model over the held-out split
predictions = trainer.predict(test_ds)

# True labels, and logits converted to predicted class indices
labels = predictions.label_ids
preds = predictions.predictions.argmax(axis=-1)

acc = accuracy_score(labels, preds)
# "F1 Score" uses the same (default) averaging as the Logistic Regression / SVM
# cells so the headline numbers are directly comparable. The weighted average is
# reported separately.
f1 = f1_score(labels, preds)
f1_weighted = f1_score(labels, preds, average='weighted')

print("==== DistilBERT Evaluation (With NER Metadata) ====")
print("Accuracy:", acc)
print("F1 Score:", f1)
print("Weighted F1 Score:", f1_weighted)
print("\nClassification Report:\n")
print(classification_report(labels, preds))



==== Entity Aware - DistilBERT Evaluation ====
Accuracy: 0.8714833759590793
F1 Score: 0.874136443853217

Classification Report:

              precision    recall  f1-score   support

           0       0.74      0.86      0.80       459
           1       0.94      0.87      0.91      1105

    accuracy                           0.87      1564
   macro avg       0.84      0.87      0.85      1564
weighted avg       0.88      0.87      0.87      1564



In [None]:
# Bias Evaluation - Token Level Attribution Score

from captum.attr import IntegratedGradients

# Function to compute attributions for tokens
def attribute_tokens(text, entity_values, tokenizer, model, max_len=128, target_class=1):
    """Integrated-Gradients attribution of each token towards one class probability.

    Args:
        text: review text to explain.
        entity_values: sequence of the five numeric entity features, in the order
            num_actors, num_directors, actor_mentions, director_mentions,
            entity_sentiment.
        tokenizer: tokenizer used to encode `text`.
        model: model exposing `.bert`, `.dropout` and `.fc`.
        max_len: maximum number of tokens kept after truncation.
        target_class: index of the class whose softmax probability is explained.

    Returns:
        (tokens, token_attributions, delta): the token strings, a NumPy array with one
        attribution per token (summed over the embedding dimension) and the
        convergence delta reported by Integrated Gradients as a float.
    """
    # Run everything on the device the model lives on (it may have been moved to a GPU)
    device = next(model.parameters()).device

    # Tokenize
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_len
    )
    input_ids = inputs["input_ids"].long().to(device)
    attention_mask = inputs["attention_mask"].long().to(device)

    # Entity features as a (1, n_features) float tensor
    entity_tensor = torch.tensor(
        [float(value) for value in entity_values], device=device
    ).unsqueeze(0)

    # Forward function handed to Captum: embeddings -> probability of target_class
    def model_forward(embeddings, attention_mask, entity_tensor):
        # Pass embeddings directly into DistilBERT
        outputs = model.bert(
            inputs_embeds=embeddings,
            attention_mask=attention_mask
        )
        pooled = outputs.last_hidden_state[:, 0]

        # First five entity columns, matching the model's own forward pass
        entity_feat = entity_tensor[:, :5]

        combined = torch.cat((pooled, entity_feat), dim=1)
        combined = model.dropout(combined)
        logits = model.fc(combined)

        probs = torch.softmax(logits, dim=-1)
        return probs[:, target_class]

    # Dropout must be off, otherwise repeated calls give different attributions.
    # The previous train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        # NOTE(review): this feeds the embedding module's full output back in as
        # `inputs_embeds`; whether DistilBertModel re-applies position embeddings /
        # LayerNorm to `inputs_embeds` depends on the installed transformers
        # version. Verify against the version in use.
        with torch.no_grad():
            embeddings = model.bert.embeddings(input_ids)  # shape: (1, seq_len, 768)

        # Integrated Gradients
        ig = IntegratedGradients(model_forward)
        attributions, delta = ig.attribute(
            embeddings,
            target=None,
            additional_forward_args=(attention_mask, entity_tensor),
            n_steps=50,
            return_convergence_delta=True
        )
    finally:
        model.train(was_training)

    # Collapse the embedding dimension to get one score per token
    token_attributions = attributions.sum(dim=-1).squeeze(0).detach().cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    return tokens, token_attributions, float(delta)


In [None]:
# Explain a single review, selected by its position in full_df
review_row = full_df.iloc[2006]
review_text = review_row['text']
print("Token Level Attribution Score")
print(" Review Text: " +review_text)

# Entity feature values of that review, in the order attribute_tokens expects
entity_features = [
    review_row[name]
    for name in (
        'num_actors',
        'num_directors',
        'actor_mentions',
        'director_mentions',
        'entity_sentiment',
    )
]

tokens, atts, delta = attribute_tokens(review_text, entity_features, tokenizer, model)

# One line per token: "<token>: <attribution>"
for idx in range(min(len(tokens), len(atts))):
    print(f"{tokens[idx]}: {atts[idx]:.4f}")

# Example: reviews with arnold schwarzenegger as actor (56, 2006)
# First result:
#   arnold: 0.0131
#   schwarz: -0.0075
#
# Second result:
#   arnold: -0.0054
#   schwarz: -0.0111

# Actor - adam sandler (285, 517, 574, 726, 842, 1239, 1265, 1404, 1525, 1699, 1872, 1960, 1961, 2582, 2595, 2646, 2694, 3661, 3693, 3756, 3848,
# 4010, 4236, 4246, 4286)

Token Level Attribution Score
 Review Text: arnold schwarzenegger s mission to mars gets a bit wacky when he does n t know if what he remembers actually happened or were implanted memories
[CLS]: -0.0004
arnold: -0.0054
schwarz: -0.0111
##ene: -0.0090
##gger: -0.0153
s: -0.0071
mission: 0.0068
to: -0.0103
mars: -0.0096
gets: -0.0215
a: -0.0368
bit: -0.0180
wa: -0.0039
##cky: -0.0151
when: -0.0279
he: -0.0103
does: -0.0247
n: -0.0163
t: -0.0307
know: -0.0218
if: -0.0368
what: -0.0240
he: -0.0056
remembers: -0.0013
actually: -0.0182
happened: -0.0094
or: -0.0209
were: -0.0393
implant: -0.0137
##ed: -0.0232
memories: -0.0109
[SEP]: -0.0706


In [None]:
# Bias Evaluation - Entity-Level Skew 

# Helper used to build the 'entities' column
def combine_entities(row):
    """Return the row's actors followed by its directors (a missing entry counts as [])."""
    return row.get("actors", []) + row.get("directors", [])

# List-valued columns may be stored as their string representation; parse those back
import ast


def _parse_if_str(value):
    """Evaluate `value` as a Python literal when it is a str; otherwise return it unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


for col in ['actors', 'directors']:
    test_df[col] = test_df[col].apply(_parse_if_str)

test_df['entities'] = test_df.apply(combine_entities, axis=1)

# 'sentiment' holds the model's predicted class for each test review
test_df['sentiment'] = preds

# One record per (review, entity) pair; reviews without entities contribute nothing
entity_rows = [
    {'entity': entity, 'sentiment': row['sentiment']}
    for _, row in test_df.iterrows()
    if row['entities']
    for entity in row['entities']
]
df_entities = pd.DataFrame(entity_rows)

# Global positive rate, taken over the entity-level records
global_pos_rate = df_entities['sentiment'].mean()

# Per-entity positive rate, number of records, and deviation from the global rate
entity_stats = (
    df_entities
    .groupby('entity')['sentiment']
    .agg(pos_rate='mean', num_reviews='count')
    .reset_index()
)
entity_stats['sentiment_skew'] = entity_stats['pos_rate'] - global_pos_rate

entity_stats

Unnamed: 0,entity,pos_rate,num_reviews,sentiment_skew
0,aaaron johnson,1.00,1,0.234386
1,aang,1.00,1,0.234386
2,aaron sorkin,0.00,1,-0.765614
3,adam elliot,1.00,2,0.234386
4,adam sandler,0.25,4,-0.515614
...,...,...,...,...
672,zach snyder,0.00,1,-0.765614
673,zachary gordon,1.00,1,0.234386
674,zachary levi,1.00,1,0.234386
675,zack galifianakis,1.00,1,0.234386
