# Sentiment Classification Model 8

Trained on the merged dataset (SST-3, DynaSent R1 and R2), which includes neutral reviews.

### 1. Import Dependencies

In [1]:
import pandas as pd
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from transformers import (
    BertTokenizer, 
    BertForSequenceClassification, 
    Trainer, 
    TrainingArguments
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support, f1_score

from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss

### 2. Load Data 

In [4]:
from datasets import load_dataset

# Fetch the merged three-class sentiment corpus from the Hugging Face Hub.
dataset = load_dataset("jbeno/sentiment_merged")

# Keep a handle on each of the three Hub splits ...
train_ds, valid_ds, test_ds = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# ... and materialise every split as a pandas DataFrame for inspection.
df_train, df_valid, df_test = (
    pd.DataFrame(hub_split) for hub_split in (train_ds, valid_ds, test_ds)
)

# Preview the first rows of the training frame as the cell output.
df_train.head()



Unnamed: 0,sentence,label,source,split
0,Those 2 drinks are part of the HK culture and ...,negative,dynasent_r2,train
1,I was told by the repair company that was doin...,negative,dynasent_r1,train
2,It is there to give them a good time .,neutral,sst_local,train
3,Like leafing through an album of photos accomp...,negative,sst_local,train
4,Johnny was a talker and liked to have fun.,positive,dynasent_r1,train


### 3. Map String Labels to Integer IDs

In [None]:
# String label -> integer class id expected by the 3-way classification head.
label2id = {"negative": 0, "neutral": 1, "positive": 2}

# Add an integer `label_id` column to every split (train/validation/test),
# not only to the test frame, so each split carries model-ready targets.
for split_name, split_df in (("train", df_train), ("validation", df_valid), ("test", df_test)):
    split_df["label_id"] = split_df["label"].map(label2id)

    # `.map` yields NaN for any label missing from `label2id`; fail here with a
    # clear message instead of producing invalid targets further downstream.
    unmapped = split_df.loc[split_df["label_id"].isna(), "label"].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unmapped labels in {split_name} split: {list(unmapped)!r}"
        )

### 4. Create a Dataset Class

In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of raw strings. Objects exposing ``tolist()`` (for
            example a ``pd.Series``) are converted to a plain list so that
            items are always looked up by position.
        labels: Sequence of class ids, one per text; each must be accepted by
            ``torch.tensor(label, dtype=torch.long)``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` and
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # texts and labels can be lists or pd.Series
        self.texts = texts.tolist() if hasattr(texts, "tolist") else texts
        self.labels = labels.tolist() if hasattr(labels, "tolist") else labels
        # Catch misaligned inputs up front rather than as an IndexError in the
        # middle of training.
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus the label."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # Drop only the leading batch axis (presumably size 1 for a single
            # text); a bare squeeze() would also collapse the sequence axis
            # when max_length == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

### 5. Instantiate Tokenizer & Dataset

In [None]:
# Tokenizer matching the `bert-base-uncased` checkpoint used for the model.
tokenizer_uncased = BertTokenizer.from_pretrained("bert-base-uncased")

# The `label` column holds strings ("negative"/"neutral"/"positive"), but
# SentimentDataset turns each label into a torch.long tensor. Map the strings
# to integer class ids first; passing the raw strings makes torch.tensor fail.
train_ds_uncased = SentimentDataset(df_train["sentence"], df_train["label"].map(label2id), tokenizer_uncased)
val_ds_uncased   = SentimentDataset(df_valid["sentence"], df_valid["label"].map(label2id), tokenizer_uncased)
test_ds_uncased  = SentimentDataset(df_test["sentence"],  df_test["label"].map(label2id),  tokenizer_uncased)

### 6. Initialize Model and Trainer

In [None]:
# Training configuration for the merged-dataset run: evaluate and checkpoint
# once per epoch, log every 50 steps.
training_args_uncased = TrainingArguments(
    output_dir="outputs_uncased_8",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # `eval_strategy` is the current name of this option and the one the
    # bunker fine-tuning arguments later in this notebook already use; the
    # old `evaluation_strategy` spelling is deprecated.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="logs_uncased",
    logging_steps=50,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

### 7. Train

In [None]:
# Pretrained BERT encoder with a 3-way (negative/neutral/positive) classification head.
model_uncased = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)

In [None]:
# Bundle the model, its training configuration and the train/validation
# datasets into a single Trainer.
trainer_uncased = Trainer(
    args=training_args_uncased,
    model=model_uncased,
    eval_dataset=val_ds_uncased,
    train_dataset=train_ds_uncased,
)

In [None]:
# Run fine-tuning on the merged dataset using the configuration held by
# `trainer_uncased`; the value returned by train() is shown as the cell output.
print("Training bert-base-uncased model...")
trainer_uncased.train()

In [None]:
# Score the held-out test split, then take the highest-scoring class per row
# as the predicted label id.
predictions_uncased = trainer_uncased.predict(test_ds_uncased)
pred_labels_uncased = np.argmax(predictions_uncased.predictions, axis=1)

In [None]:
# 1. Build a DataFrame with the true labels + the model's predictions.
#    The true labels must be the integer ids (0/1/2) so they are comparable
#    with the argmax outputs; the raw `label` column holds strings, which
#    would make every metric below meaningless or raise.
df_eval = pd.DataFrame({
    "sentence": df_test["sentence"].tolist(),     # the raw text
    "true_label":  df_test["label_id"].tolist(),  # 0/1/2
    "pred_label":  pred_labels_uncased            # model's 0/1/2 outputs
})

# 2. Compute overall accuracy and macro-F1
acc  = accuracy_score(df_eval["true_label"], df_eval["pred_label"])
f1m  = f1_score(df_eval["true_label"], df_eval["pred_label"], average="macro")
print(f"Accuracy: {acc:.4f}")
print(f"Macro‑F1 : {f1m:.4f}\n")

# 3. Full classification report
#    `labels` pins the id order so `target_names` always lines up with
#    0/1/2, even if a class is absent from this particular evaluation.
print("Classification Report:")
print(classification_report(
    df_eval["true_label"],
    df_eval["pred_label"],
    labels=[0,1,2],
    target_names=["negative","neutral","positive"]
))

# 4. Confusion matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(
    df_eval["true_label"],
    df_eval["pred_label"],
    labels=[0,1,2]
)
cm_df = pd.DataFrame(
    cm,
    index=["true_neg","true_neu","true_pos"],
    columns=["pred_neg","pred_neu","pred_pos"]
)
print("\nConfusion Matrix:")
display(cm_df)

# 5. (Optional) Save out predictions
df_eval.to_csv("hf_sentiment_predictions.csv", index=False)
print("\nSaved predictions and true labels to hf_sentiment_predictions.csv")


### 8. Fine-tune on Manually Labeled WWII Bunker Reviews

In [None]:
# 1. Load labeled bunker data (now includes neutral!)
# NOTE(review): pd.read_pickle can execute arbitrary code contained in the
# file; only load pickles that come from a trusted source.
# NOTE(review): this rebinds `df_train`, which earlier in the notebook held
# the Hugging Face training split; re-running earlier cells after this one
# will see the bunker frame instead.
df_train = pd.read_pickle("../data/processed/bunker_reviews_fine_tuning.pkl")
print("Total labeled reviews:", len(df_train))

# 2. Map text labels → integer IDs for all three classes
label2id = {"negative":0, "neutral":1, "positive":2}
df_train["label_id"] = df_train["manual_classification"].map(label2id)

# `.map` leaves NaN for any value that is not a key of `label2id` (missing or
# differently spelled labels). Stop here with a clear message instead of
# feeding NaN targets into the split and the loss.
unmapped_mask = df_train["label_id"].isna()
if unmapped_mask.any():
    bad_values = df_train.loc[unmapped_mask, "manual_classification"].unique()
    raise ValueError(f"Unmapped manual_classification values: {list(bad_values)!r}")
df_train["label_id"] = df_train["label_id"].astype(int)

texts  = df_train["clean_text"].tolist()
labels = df_train["label_id"].tolist()

# 3. Split into train / validation (e.g. 80/20 here), stratified so both
#    parts keep the same class proportions; fixed seed for repeatability.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, 
    test_size=0.2, 
    random_state=42,
    stratify=labels
)
print(" → train:", len(train_texts), "  val:", len(val_texts))

# 4. Compute class weights for the 3 classes
classes = np.unique(train_labels)
# The weight tensor is indexed by class id inside the loss, so it must hold
# exactly one entry per id in `label2id`; a class missing from the training
# split would yield a shorter vector that no longer matches the 3 logits.
if len(classes) != len(label2id):
    raise ValueError(
        f"Training split contains classes {classes.tolist()}, "
        f"expected all of {sorted(label2id.values())}"
    )
weights = compute_class_weight("balanced", classes=classes, y=train_labels)
class_weights = torch.tensor(weights, dtype=torch.float)
print("Class weights:", dict(zip(classes, weights)))

# 5. Dataset wrapper
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of raw strings (list or ``pd.Series``).
        labels: Sequence of integer class ids, one per text.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` and
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Convert Series-like inputs to plain lists so __getitem__ always
        # indexes by position; a pd.Series with a non-default index would
        # otherwise be looked up by label and raise KeyError.
        self.texts = texts.tolist() if hasattr(texts, "tolist") else texts
        self.labels = labels.tolist() if hasattr(labels, "tolist") else labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus the label."""
        text  = self.texts[idx]
        label = self.labels[idx]
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # Remove only the leading batch axis; a bare squeeze() would also
            # collapse the sequence axis when max_length == 1.
            "input_ids":      enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels":         torch.tensor(label, dtype=torch.long)
        }

# 6. Custom Trainer to apply weighted loss
class WeightedTrainer(Trainer):
    """Trainer variant that uses class-weighted cross-entropy as its loss.

    The per-class weights are read from the module-level ``class_weights``
    tensor, which must hold one weight per output class.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute weighted cross-entropy between the model logits and labels.

        Args:
            model: The model being trained (possibly wrapped by the Trainer).
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted because recent ``Trainer`` versions
                pass it to ``compute_loss``; not used by this loss.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Place the weights on the logits' device rather than `model.device`:
        # a wrapped model (e.g. nn.DataParallel) does not expose `.device`.
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# 7. Tokenizer & Datasets
# Use the vocabulary of the checkpoint being fine-tuned, then wrap the bunker
# train/validation splits.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_ds, val_ds = (
    SentimentDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in ((train_texts, train_labels), (val_texts, val_labels))
)

# 8. Training arguments: evaluate and checkpoint once per epoch, log every
#    50 steps, mixed precision only when a CUDA device is present.
training_args = TrainingArguments(
    output_dir="outputs/bunker_multi_class",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
)

# 9. Model with 3 output labels
# NOTE(review): this starts from the stock "bert-base-uncased" weights, not
# from the model trained on the merged dataset above — confirm that is intended.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# 10. Build the class-weighted trainer, run training and persist the result.
trainer = WeightedTrainer(
    args=training_args,
    model=model,
    eval_dataset=val_ds,
    train_dataset=train_ds,
)

print("Fine‑tuning three‑way (neg/neu/pos) BERT on bunker data …")
trainer.train()
trainer.save_model("outputs/bunker_multi_class/final_model")
print("Done, model saved.")


### 9. Evaluate on WWII Bunker Reviews

In [None]:
# 1. Restore the tokenizer and the fine-tuned 3-class checkpoint saved by the
#    fine-tuning step.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "outputs/bunker_multi_class/final_model",
    num_labels=3,
)

# 2. Read the manually labeled test set (all three labels are kept).
df_test = pd.read_pickle("../data/processed/bunker_reviews_test_set.pkl")
print(f"Full manual test set: {len(df_test)} reviews")

# Label name <-> class id lookups, then attach the integer id of each
# manual label to the frame.
label2id = {"negative":0, "neutral":1, "positive":2}
id2label = {class_id: name for name, class_id in label2id.items()}
df_test["label_id"] = df_test["manual_classification"].map(label2id)

# 3. Build a simple inference Dataset
class InferenceDataset(torch.utils.data.Dataset):
    """Label-free map-style dataset that tokenizes one text per item.

    Args:
        texts: Sequence of raw strings (list or ``pd.Series``).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` and
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        # Convert Series-like inputs to a plain list so __getitem__ always
        # indexes by position, matching how SentimentDataset treats its inputs.
        self.texts = texts.tolist() if hasattr(texts, "tolist") else texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize text ``idx`` and return its input tensors (no label)."""
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # Remove only the leading batch axis; a bare squeeze() would also
            # collapse the sequence axis when max_length == 1.
            "input_ids":      enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

test_texts = df_test["clean_text"].tolist()
test_ds    = InferenceDataset(test_texts, tokenizer)

# 4. Run predictions
#    A bare Trainer is used only as a batched inference loop here.
trainer = Trainer(model=model, args=TrainingArguments(output_dir="tmp_eval", per_device_eval_batch_size=8))
preds_output = trainer.predict(test_ds)
logits = preds_output.predictions                     # shape (N,3)
probs  = F.softmax(torch.tensor(logits), dim=1).numpy()
pred_ids = probs.argmax(axis=1)

# 5. Map back to string labels
df_test["predicted_sentiment"] = [id2label[int(i)] for i in pred_ids]

# 6. Compute metrics
y_true = df_test["label_id"].tolist()
y_pred = pred_ids.tolist()

acc = accuracy_score(y_true, y_pred)
print(f"\nOverall accuracy: {acc:.4f}\n")

print("Confusion matrix:")
cm = confusion_matrix(y_true, y_pred, labels=[0,1,2])
print(pd.DataFrame(cm, index=["neg","neu","pos"], columns=["neg","neu","pos"]))

# `labels` pins the class ids so `target_names` always lines up with 0/1/2;
# without it the report raises when a class appears in neither y_true nor
# y_pred, which can happen on a small manual test set.
print("\nClassification report:")
print(classification_report(
    y_true,
    y_pred,
    labels=[0,1,2],
    target_names=["negative","neutral","positive"]
))

# 7. Inspect the errors: rows whose predicted id differs from the manual label.
errors = df_test[df_test["label_id"] != df_test["predicted_sentiment"].map(label2id)]
print(f"\nNumber of misclassified examples: {len(errors)}")
display(errors[["clean_text","manual_classification","predicted_sentiment"]].head(10))
