# Sentiment Classification Model

Trained on the IMDB movie-review dataset; used to predict sentiment on WW2 bunker reviews.

### 1. Import Dependencies

In [1]:
import pandas as pd
import torch
from transformers import (
    BertTokenizer, 
    BertForSequenceClassification, 
    Trainer, 
    TrainingArguments
)
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


### 2. Load Data

In [2]:
# Location of the preprocessed movie-review frame, relative to this notebook.
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only
# point this at files produced by a trusted pipeline.
processed_reviews_path = "../data/processed/processed_movie_reviews.pkl"
df = pd.read_pickle(processed_reviews_path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,review,sentiment,clean_text,tokens
0,One of the other reviewers has mentioned that ...,1,One of the other reviewers has mentioned that ...,"{'input_ids': [101, 2028, 1997, 1996, 2060, 15..."
1,A wonderful little production. <br /><br />The...,1,A wonderful little production. The filming tec...,"{'input_ids': [101, 1037, 6919, 2210, 2537, 10..."
2,I thought this was a wonderful way to spend ti...,1,I thought this was a wonderful way to spend ti...,"{'input_ids': [101, 1045, 2245, 2023, 2001, 10..."
3,Basically there's a family where a little boy ...,0,Basically there's a family where a little boy ...,"{'input_ids': [101, 10468, 2045, 1005, 1055, 1..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,Petter Mattei's Love in the Time of Money is a...,"{'input_ids': [101, 9004, 3334, 4717, 7416, 10..."


### 3. Prepare Train/Validation Splits

In [3]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical from run to run.
review_texts = df["clean_text"]
review_labels = df["sentiment"]
train_texts, val_texts, train_labels, val_labels = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    random_state=42,
)

### 4. Create a Dataset Class

In [4]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one labelled text per item.

    Args:
        texts: Sequence of raw strings. Objects exposing ``tolist()`` (such as
            a pandas Series) are converted to a plain list, so items are
            addressed by position rather than by the object's own index.
        labels: Sequence of class ids aligned with ``texts``; converted the
            same way as ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` that
            returns a mapping containing ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts.tolist() if hasattr(texts, "tolist") else texts
        self.labels = labels.tolist() if hasattr(labels, "tolist") else labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and pair it with its label.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            output with its leading batch axis removed) and ``"labels"``
            (a scalar ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # squeeze(0) removes only the leading batch axis. A bare squeeze()
        # would also drop the sequence axis whenever it has size 1
        # (max_length == 1), yielding a 0-d tensor that cannot be batched
        # consistently with longer sequences.
        item = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }
        return item


### 5. Instantiate Tokenizer & Dataset

In [5]:
# Load the uncased BERT vocabulary that both datasets tokenize with.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Build the training and validation datasets from their (texts, labels) pairs.
train_dataset, val_dataset = (
    SentimentDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

### 6. Initialize Model and Trainer

In [7]:
# Pretrained BERT encoder with a two-class classification head on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Training settings gathered in one mapping so they can be read at a glance.
training_config = dict(
    output_dir="outputs",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="logs",
    logging_steps=50,
    # bf16 is requested only when the MPS backend reports itself available.
    bf16=bool(torch.backends.mps.is_available()),
)
training_args = TrainingArguments(**training_config)

# The Trainer trains on the training split and evaluates on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 7. Train

In [8]:
# Launch fine-tuning with the Trainer configured above. As the cell's last
# expression, the value returned by train() is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

### 8. Evaluate

In [None]:
# Run evaluation through the Trainer (it was given val_dataset as its
# eval_dataset) and print the resulting metrics.
# NOTE(review): no compute_metrics is passed to the Trainer, so this presumably
# reports loss/runtime figures only, not accuracy — confirm.
eval_results = trainer.evaluate()
print(eval_results)

### 9. Predict on Unlabeled Data

In [None]:
# Load the unlabeled bunker reviews. The path is anchored at "../data" so it
# resolves from the same working directory as the movie-review load earlier in
# this notebook, which reads from "../data/processed/".
df_bunker = pd.read_csv("../data/processed/processed_bunker_sentiment.csv")
# or read_pickle if needed
# read_csv parses empty fields as NaN (a float), which the tokenizer cannot
# encode. Replace them with empty strings so every row stays tokenizable and
# the texts remain row-aligned with df_bunker.
bunker_texts = df_bunker["clean_text"].fillna("").astype(str)

# Inference-only dataset: items carry no labels, just the encoded text.
class InferenceDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one unlabeled text per item.

    Args:
        texts: Sequence of raw strings. Objects exposing ``tolist()`` (such as
            a pandas Series) are converted to a plain list, so items are
            addressed by position rather than by the object's own index.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` that
            returns a mapping containing ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=128):
        self.texts = texts.tolist() if hasattr(texts, "tolist") else texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"``: the tokenizer
            output with its leading batch axis removed.
        """
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # squeeze(0) removes only the leading batch axis. A bare squeeze()
        # would also drop the sequence axis whenever it has size 1
        # (max_length == 1), yielding a 0-d tensor.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0)
        }

# Wrap the bunker texts so the Trainer can batch them for inference.
bunker_dataset = InferenceDataset(bunker_texts, tokenizer)

# predictions.predictions holds the per-class scores; argmax over axis 1 picks
# the highest-scoring class for each row.
predictions = trainer.predict(bunker_dataset)
predicted_classes = predictions.predictions.argmax(axis=1)  # 0 or 1

# Attach the predicted class to each review and persist the result. The path is
# anchored at "../data" to match the "../data/processed/" location used by the
# movie-review load earlier in this notebook.
df_bunker["predicted_sentiment"] = predicted_classes
df_bunker.to_csv("../data/processed/bunker_predictions.csv", index=False)