# Predict Average Bunker Score

Predicts the average score for each bunker based solely on sentiment analysis of the text of each review.

### 1. Imports

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt


### 2: File paths

In [None]:
# Directory handed to both `from_pretrained` calls (tokenizer and classifier).
model_path = "notebooks/outputs_final/bunker_multi_class/final_model"
# Pickled review dataframe that is loaded with `pd.read_pickle`.
data_path = "data/processed/processed_bunker_sentiment.pkl"

### 3: Load and filter data

In [None]:
# Minimum number of reviews a bunker needs in order to be kept. Defined once
# so the filter and the log message below cannot drift apart.
min_reviews = 100

# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that were produced by this project.
df = pd.read_pickle(data_path)

# Number of review rows per bunker name.
bunker_counts = df["bunker_name"].value_counts()
valid_bunkers = bunker_counts[bunker_counts >= min_reviews].index

# .copy() gives an independent frame, so columns added later do not write
# into a slice of the unfiltered data.
df = df[df["bunker_name"].isin(valid_bunkers)].copy()
print(f"Filtered to {len(valid_bunkers)} bunkers with >= {min_reviews} reviews")

### 4: Load model and tokenizer

In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and classifier are both restored from the same directory.
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Place the weights on the chosen device and switch the module to
# evaluation mode before any predictions are made.
model.to(device)
model.eval()
print(f"Using device: {device}")

### 5. Prediction function

In [None]:
def predict_sentiment_batch(token_batch):
    """Return the predicted class index for every item in ``token_batch``.

    Args:
        token_batch: Sequence of mappings, each providing ``"input_ids"`` and
            ``"attention_mask"``. The per-item sequences are stacked into a
            single tensor, so they must all have the same length (assumes the
            stored tokens are already padded/truncated -- TODO confirm against
            the preprocessing step).

    Returns:
        A list of ints, one per input item, holding the index of the
        highest-scoring class. An empty batch yields an empty list.
    """
    # Nothing to score. Without this guard an empty batch becomes a 1-D
    # tensor that the model cannot consume. len() is used instead of
    # truthiness so array-like batches are handled as well.
    if len(token_batch) == 0:
        return []

    # Create the tensors directly on the target device rather than building
    # them on the CPU and copying them over afterwards.
    input_ids = torch.tensor(
        [item["input_ids"] for item in token_batch], device=device
    )
    attention_mask = torch.tensor(
        [item["attention_mask"] for item in token_batch], device=device
    )

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # softmax is monotonic, so the argmax of the raw logits picks the same
    # class as the argmax of the probabilities; the softmax pass is skipped.
    return logits.argmax(dim=1).cpu().tolist()

### 6: Predict in batches

In [None]:
batch_size = 32
tokens = df["tokens"].tolist()
predictions = []

# Start offsets of consecutive slices of `tokens`; the final slice may hold
# fewer than `batch_size` items.
batch_starts = range(0, len(tokens), batch_size)
for start in tqdm(batch_starts):
    # Predictions are appended in input order, so they stay aligned with
    # the rows of `df`.
    predictions += predict_sentiment_batch(tokens[start:start + batch_size])

### 7. Add predictions to dataframe

In [None]:
# Maps each predicted class index to a signed score in {-1, 0, 1}.
# NOTE(review): presumably 0/1/2 are negative/neutral/positive -- confirm
# against the label encoding used when the model was trained.
label_to_score = {0: -1, 1: 0, 2: 1}

df["predicted_label"] = predictions
# Labels that are missing from the mapping become NaN.
df["sentiment_score"] = df["predicted_label"].map(label_to_score)

### 8. Aggregate scores per bunker

In [None]:
# One row per bunker with its mean sentiment score, best-scoring first.
bunker_scores = (
    df.groupby("bunker_name")["sentiment_score"]
    .mean()
    .reset_index()
    .sort_values(by="sentiment_score", ascending=False)
)
# Last expression of the cell: shows the top rows in the notebook output.
bunker_scores.head()

### 9: Save results

In [None]:
# Destination of the per-bunker ranking table.
output_path = "data/processed/bunker_sentiment_ranking.csv"

# index=False: the row index is not written to the CSV.
bunker_scores.to_csv(path_or_buf=output_path, index=False)
print(f"Saved bunker sentiment ranking to: {output_path}")

### 10. Distribution of Average Sentiment Scores Across Bunkers

In [None]:
# Histogram (20 bins) with a KDE overlay of the per-bunker mean scores.
plt.figure(figsize=(10, 5))
ax = sns.histplot(
    bunker_scores["sentiment_score"],
    bins=20,
    kde=True,
    color="steelblue",
)
# Title and axis labels are set on the Axes that histplot drew into.
ax.set_title("Distribution of Average Sentiment Scores Across Bunkers")
ax.set_xlabel("Average Sentiment Score")
ax.set_ylabel("Number of Bunkers")
plt.tight_layout()
plt.show()