# Sentiment Classification Model 8

Trained on merged dataset (SST-3, DynaSent R1, R2) with neutral reviews

### 1. Import Dependencies

In [44]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from datasets import ClassLabel, load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss
from transformers import (
    BertTokenizer, 
    BertForSequenceClassification, 
    Trainer, 
    TrainingArguments
)

### 2. Load Data 

In [45]:
# Load the merged sentiment corpus by its dataset id.
# `load_dataset` and `ClassLabel` come from the notebook's import cell so that
# every dependency is declared in one place.
dataset = load_dataset("jbeno/sentiment_merged")

# Position in this list is the integer class id used throughout the notebook.
label_names = ["negative", "neutral", "positive"]

# Re-type the `label` column as a ClassLabel carrying those names.
dataset = dataset.cast_column("label", ClassLabel(names=label_names))


### 3. Create a Dataset Class

In [46]:
# Tokenizer for the same checkpoint name the model is loaded from below.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)

In [47]:
def tokenize_batch(examples):
    """Tokenize the ``sentence`` field of one batch passed in by ``dataset.map``.

    Each sequence is truncated and padded to exactly 512 positions.
    Returns whatever the module-level ``tokenizer`` returns for the batch.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(examples["sentence"], **tokenizer_options)

# batched=True: the function receives many rows per call rather than one.
dataset = dataset.map(tokenize_batch, batched=True)



Map:   0%|          | 0/102097 [00:00<?, ? examples/s]

Map:   0%|          | 0/5421 [00:00<?, ? examples/s]

Map:   0%|          | 0/6530 [00:00<?, ? examples/s]

### 4. Instantiate Tokenizer & Dataset

In [48]:
# Return these three columns as torch tensors when the dataset is indexed.
torch_columns = ["input_ids", "attention_mask", "label"]
dataset.set_format(type="torch", columns=torch_columns)

### 5. Initialize Model and Trainer

In [50]:
# Hyper-parameters for the run on the merged corpus.
training_args = TrainingArguments(
    output_dir="outputs_uncased_8",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Same spelling as the bunker fine-tuning arguments later in this
    # notebook; `evaluation_strategy` is the deprecated alias.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="logs_uncased",
    logging_steps=50,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)



### 6. Train

In [51]:
# One output unit per entry in `label_names`.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
    num_labels=len(label_names),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
def compute_metrics(eval_pred):
    """Turn an evaluation result into scalar metrics.

    Args:
        eval_pred: pair ``(logits, labels)``. The predicted class is the
            argmax of ``logits`` over axis 1.

    Returns:
        dict with ``accuracy`` plus support-weighted ``precision``,
        ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    # zero_division=0 gives the same values as the default ("warn") but does
    # not emit an UndefinedMetricWarning when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Gather the constructor arguments first so the wiring is easy to scan.
# NOTE(review): the captured output shows a warning pointing at this call —
# possibly the `tokenizer=` argument being deprecated in newer transformers
# releases; confirm against the installed version.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

  trainer = Trainer(


In [53]:
# Announce the run, then train on the merged training split.
print("Training bert-base-uncased model...")
train_result = trainer.train()
train_result

Training bert-base-uncased model...


Epoch,Training Loss,Validation Loss
1,0.6673,0.723802
2,0.4975,0.741225
3,0.2705,1.160512


TrainOutput(global_step=38289, training_loss=0.49712436580082603, metrics={'train_runtime': 3359.4996, 'train_samples_per_second': 91.172, 'train_steps_per_second': 11.397, 'total_flos': 8.058927182932685e+16, 'train_loss': 0.49712436580082603, 'epoch': 3.0})

In [64]:
# Score the held-out test split.
metrics = trainer.evaluate(eval_dataset=dataset["test"])

# One row per metric (name in the index, number in `value`), ordered by name.
df_metrics = pd.Series(metrics).rename_axis("metric").to_frame("value")
df_metrics = df_metrics.sort_values(by="metric")
display(df_metrics)


Unnamed: 0_level_0,value
metric,Unnamed: 1_level_1
eval_accuracy,0.73951
eval_f1,0.74148
eval_loss,1.11131
eval_model_preparation_time,0.0245
eval_precision,0.749701
eval_recall,0.73951
eval_runtime,19.63
eval_samples_per_second,332.655
eval_steps_per_second,41.62


In [66]:
# Raw predictions for the merged-corpus test split.
# `trainer_uncased` / `test_ds_uncased` are not defined in any visible cell,
# so the old call raised NameError on a fresh kernel; use the trainer and the
# test split created earlier in this notebook instead.
predictions_uncased = trainer.predict(dataset["test"])
pred_labels_uncased = predictions_uncased.predictions.argmax(axis=1)

Epoch,Training Loss,Validation Loss


### 7. Fine-tune on Manually Labeled WWII Bunker Reviews

In [70]:
# 1) load + label-encode
# NOTE(review): unpickling can execute code embedded in the file; only load
# pickles produced by this project.
label2id = {"negative":0, "neutral":1, "positive":2}
df_train = (
    pd.read_pickle("../data/processed/bunker_reviews_fine_tuning.pkl")
    .assign(label_id=lambda frame: frame["manual_classification"].map(label2id))
    # drop anything that couldn't be mapped
    .dropna(subset=["label_id"])
    .astype({"label_id": int})
)

# 2) Split (stratified so both parts keep the class proportions)
texts  = df_train["clean_text"].tolist()
labels = df_train["label_id"].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

# 3) Compute per-class weights
classes = np.unique(train_labels)
# The weight tensor is used positionally by the loss (entry i <-> class id i),
# so every class id must be present in the training split.
missing_ids = sorted(set(label2id.values()) - set(classes.tolist()))
if missing_ids:
    raise ValueError(
        f"Training split has no examples for class ids {missing_ids}; "
        "class_weights would not line up with the model's output classes."
    )
weights = compute_class_weight("balanced", classes=classes, y=train_labels)
class_weights = torch.tensor(weights, dtype=torch.float)

# 4. Dataset wrapper (as before)
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one labelled text per lookup.

    Args:
        texts: sequence of raw strings.
        labels: sequence of integer class ids, parallel to ``texts``.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_length: length every sequence is truncated / padded to.

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # squeeze(0) removes only the leading (batch) axis; a bare
            # squeeze() would also collapse a length-1 sequence axis.
            "input_ids":      enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels":         torch.tensor(self.labels[idx], dtype=torch.long)
        }

# 5. Weighted Trainer
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy for one batch.

        Args:
            model: callable model; invoked as ``model(**inputs)`` and expected
                to return an object with a ``logits`` attribute.
            inputs: batch mapping containing ``"labels"``; that key is popped
                before the forward pass, so the mapping is modified.
            return_outputs: when true, return ``(loss, outputs)``.
            **kwargs: accepted for signature compatibility and ignored.

        Returns:
            The loss, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Take the device from the logits rather than `model.device`: the
        # object passed in can be a wrapper (e.g. torch.nn.DataParallel)
        # that does not expose a `.device` attribute.
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# 6. Tokenizer & Datasets
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)
# Same wrapper for both splits; max_length is left at the class default.
train_ds, val_ds = (
    SentimentDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

# 7. Training args
use_fp16 = torch.cuda.is_available()  # half precision only with a CUDA device
bunker_arg_values = {
    "output_dir": "outputs/bunker_multi_class",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    # evaluate and checkpoint once per epoch
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_steps": 50,
    "learning_rate": 2e-5,
    "fp16": use_fp16,
}
training_args = TrainingArguments(**bunker_arg_values)

# 8. Model & Trainer
# NOTE(review): this loads the base `bert-base-uncased` weights, so the model
# trained on the merged corpus in section 6 is not carried into this stage —
# confirm that is intended.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
    num_labels=3,
)
trainer = WeightedTrainer(
    model=model, args=training_args, train_dataset=train_ds, eval_dataset=val_ds
)

# 9. Train, then persist the final weights next to the epoch checkpoints.
final_model_dir = "outputs/bunker_multi_class/final_model"
print("Fine‑tuning three‑way (neg/neu/pos) BERT on bunker data …")
trainer.train()
trainer.save_model(final_model_dir)
print("Done, model saved.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fine‑tuning three‑way (neg/neu/pos) BERT on bunker data …


Epoch,Training Loss,Validation Loss
1,1.0534,0.952271
2,0.5994,0.694078
3,0.4014,0.913246


Done, model saved.


### 8. Evaluate on WWII Bunker Reviews

In [72]:
# 1. Load the fine-tuned 3-way model
model = BertForSequenceClassification.from_pretrained(
    "outputs/bunker_multi_class/final_model",
    num_labels=3
)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 2. Load your manual test set
# NOTE(review): unpickling can execute code embedded in the file; only load
# pickles produced by this project.
df_test = pd.read_pickle("../data/processed/bunker_reviews_test_set.pkl")
print(f"Full manual test set: {len(df_test)} reviews")

# Create mappings label<->ID
label2id = {"negative":0, "neutral":1, "positive":2}
id2label = {v:k for k,v in label2id.items()}

# Encode the true labels. Rows whose label is not one of the three known
# classes are dropped, mirroring the fine-tuning cell; left in place they
# would put NaN ids into y_true and break the metric calls below.
encoded = df_test.assign(label_id=df_test["manual_classification"].map(label2id))
unmapped = int(encoded["label_id"].isna().sum())
if unmapped:
    print(f"Dropping {unmapped} reviews with an unrecognised manual_classification")
df_test = encoded.dropna(subset=["label_id"]).astype({"label_id": int})

# 3. Build an inference Dataset
class InferenceDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one unlabelled text per lookup.

    Args:
        texts: sequence of raw strings.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_length: length every sequence is truncated / padded to.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids`` and ``attention_mask`` for text ``idx``."""
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # squeeze(0) removes only the leading (batch) axis; a bare
            # squeeze() would also collapse a length-1 sequence axis.
            "input_ids":      enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

test_texts = df_test["clean_text"].tolist()
test_ds = InferenceDataset(test_texts, tokenizer)

# 4. Run predictions
eval_arg_values = {
    "output_dir": "tmp_eval",
    "per_device_eval_batch_size": 8,
    "dataloader_drop_last": False,  # keep the final partial batch
}
eval_args = TrainingArguments(**eval_arg_values)
trainer = Trainer(model=model, args=eval_args)
preds_output = trainer.predict(test_ds)

# Logits -> per-class probabilities -> most likely class id per review.
logits = preds_output.predictions           # shape (N, 3)
logit_tensor = torch.tensor(logits)
probs = F.softmax(logit_tensor, dim=1).numpy()
pred_ids = probs.argmax(axis=1)

# 5. Map back to string labels
df_test["predicted_sentiment"] = [id2label[i] for i in pred_ids]

# 6. Compute metrics
y_true = df_test["label_id"].tolist()
y_pred = pred_ids.tolist()

# Fixed class order shared by the confusion matrix and the report.
class_ids = [0, 1, 2]
class_names = ["negative", "neutral", "positive"]

acc = accuracy_score(y_true, y_pred)
print(f"\nOverall accuracy: {acc:.4f}\n")

print("Confusion matrix:")
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
display(cm_df)

print("\nClassification report:")
# Pass `labels` explicitly, as for the confusion matrix: without it the report
# infers the classes from the data and raises when one of the three is absent
# from both y_true and y_pred (target_names would no longer match).
print(classification_report(y_true, y_pred,
                            labels=class_ids,
                            target_names=class_names,
                            zero_division=0))

# 7. Inspect a few of the misclassified reviews
report_columns = ["clean_text","manual_classification","predicted_sentiment"]
predicted_ids = df_test["predicted_sentiment"].map(label2id)
is_wrong = df_test["label_id"].ne(predicted_ids)
errors = df_test.loc[is_wrong]
print(f"\nNumber of misclassified examples: {len(errors)}")
display(errors[report_columns].head(10))

# 8. Save all misclassified reviews to disk (same three columns, no index)
errors.to_csv(
    "../data/processed/bunker_multi_class_misclassified.csv",
    columns=report_columns,
    index=False,
)
print("All misclassified reviews saved to data/processed/bunker_multi_class_misclassified.csv")


Full manual test set: 194 reviews



Overall accuracy: 0.9124

Confusion matrix:


Unnamed: 0,negative,neutral,positive
negative,12,3,0
neutral,2,14,6
positive,0,6,151



Classification report:
              precision    recall  f1-score   support

    negative       0.86      0.80      0.83        15
     neutral       0.61      0.64      0.62        22
    positive       0.96      0.96      0.96       157

    accuracy                           0.91       194
   macro avg       0.81      0.80      0.80       194
weighted avg       0.91      0.91      0.91       194


Number of misclassified examples: 17


Unnamed: 0,clean_text,manual_classification,predicted_sentiment
4,"Great ship, only the guide makes it difficult ...",neutral,positive
8,I guess it's ok but not really any indication ...,negative,neutral
35,It's okay and you get to see a lot. But this s...,negative,neutral
49,Magic,positive,neutral
53,"Only recommended with a guided tour, as you ca...",neutral,positive
54,Underwater base and pool of lights (idem) Restful,neutral,positive
75,"Super interesting! However, I felt a little un...",positive,neutral
87,Top!,positive,neutral
93,Interesting museum in principle and is a histo...,negative,neutral
104,"Interesting place, too bad I didn’t get inside...",neutral,positive


All misclassified reviews saved to data/processed/bunker_multi_class_misclassified.csv
