# EDA

# Fine-tuning

In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import pickle

import pandas as pd  # type: ignore
import numpy as np

from conformal_hybrid import *

In [2]:
# Switch for the training loop in the next cell: when True every ensemble
# member is trained and its weights are saved; when False that loop is skipped
# and only the checkpoint-loading code runs.
RETRAIN = False

In [3]:
save_dir = Path("models_hybrid")
# Create the output directory before anything is written into it. The pickle
# dump right below (and every later checkpoint / figure export) targets this
# folder, so it has to exist even when RETRAIN is False.
save_dir.mkdir(parents=True, exist_ok=True)

all_data = load_and_preprocess_data_hybrid()
# Context manager so the file handle is flushed and closed deterministically.
with open(save_dir / "all_data.pkl", "wb") as all_data_file:
    pickle.dump(all_data, all_data_file)

# Hyper-parameters used both for training and for rebuilding the network when
# the checkpoints are loaded further down.
dropout_rate = 0.25
hidden_dimension_1 = 2048
hidden_dimension_2 = 128
weight_decay = 0.1
lr = 4e-5

# NOTE(review): `tokenizer` and `best_threshold` are only assigned in this cell
# when RETRAIN is True, yet later cells use `tokenizer`. With RETRAIN = False
# it must already be in scope (presumably via the star import) - confirm.
if RETRAIN:
    for i in range(ENSEMBLE_MODELS):
        print("##############################################################")
        print(f"Training model {i}")
        print(f"Loading and preprocessing data for model {i}")

        # One pre-processed data dict and one checkpoint file per member.
        data = all_data[i]
        save_path = Path(save_dir, f"submission_{i}.pth")

        model, tokenizer, best_threshold, epoch_metrics, step_metrics = (
            train_model_hybrid(
                data,
                num_epochs=EPOCHS,
                learning_rate=lr,
                log_steps=25,
                name=f"model_{i}",
                save_dir=save_dir,
                hidden_dimension_1=hidden_dimension_1,
                hidden_dimension_2=hidden_dimension_2,
                dropout_rate=dropout_rate,
                weight_decay=weight_decay,
            )
        )
        model.model_summary()

        torch.save(model.state_dict(), save_path)
        # Drop the reference before the next member is trained.
        del model
        print(f"Done training model {i}")
        print(
            "##############################################################\n"
        )


# Rebuild every ensemble member from its saved weights.
models = []
for i in range(ENSEMBLE_MODELS):
    model = GenreClassifierHybrid(
        n_genres=20,
        hidden_dimension_1=hidden_dimension_1,
        hidden_dimension_2=hidden_dimension_2,
        dropout_rate=dropout_rate,
    )
    save_path = Path(save_dir, f"submission_{i}.pth")
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(save_path, map_location=device))
    model = model.to(device)
    model.eval()  # Set to evaluation mode
    models.append({"model": model, "save_path": save_path})
    print(f"Loaded model {i}")


print("Models loaded successfully!")



In [4]:
# Threshold handed to both alpha searches below.
best_threshold = 0.5
print("Computing calibration scores for conformal prediction...")

submissions = []
# Only the first ensemble member is used in this cell.
i = 0
data, model = all_data[i], models[i]["model"]

# Split the validation row positions 50/50 with a fixed random_state: the
# first half goes into `cal_data` as its validation part, the second half is
# kept aside as an evaluation set.
indices = np.arange(len(data["val_texts"]))
train_idx, test_idx = train_test_split(indices, test_size=0.5, random_state=42)

val_texts, val_numeric, val_labels = (
    data[field][train_idx]
    for field in ("val_texts", "val_numeric", "val_labels")
)
eval_texts, eval_numeric, eval_labels = (
    data[field][test_idx]
    for field in ("val_texts", "val_numeric", "val_labels")
)

# Data dictionary restricted to the calibration half; the held-out half takes
# the place of the test set.
cal_data = dict(
    train_texts=None,
    val_texts=val_texts,
    train_numeric=data["train_numeric"],
    val_numeric=val_numeric,
    train_labels=None,
    val_labels=val_labels,
    test_texts=eval_texts,
    test_numeric=eval_numeric,
    test_ids=np.arange(len(eval_texts)),
    genre_columns=data["genre_columns"],
)

# NOTE(review): both searches receive the full `data` dict, not `cal_data` -
# confirm that this is intended.
search_args = (model, data, tokenizer, best_threshold)

# Alpha search for the "max" variant of the conformal score / prediction pair.
best_alpha_max, best_q_max, best_metric_max = find_optimal_alpha_hybrid(
    *search_args,
    alpha_values=np.linspace(0.025, 0.2, 20),
    conformal_scores_function=compute_calibration_scores_hybrid_max,
    prediction_function=predict_with_conformal_hybrid_max,
)

print("")
# Same search for the plain variant.
best_alpha, best_q, best_metric = find_optimal_alpha_hybrid(
    *search_args,
    alpha_values=np.linspace(0.025, 0.2, 20),
    conformal_scores_function=compute_calibration_scores_hybrid,
    prediction_function=predict_with_conformal_hybrid,
)







































































































































































In [5]:
# Report the outcome of both alpha searches.
print(
    f"Best alpha for hybrid max: {best_alpha_max}, "
    f"q: {best_q_max}, metric: {best_metric_max}"
)
print(f"Best alpha for hybrid: {best_alpha}, q: {best_q}, metric: {best_metric}")

# "max" variant: calibration scores come from `cal_data`, predictions are made
# on the full `data` dict.
cal_scores_max = compute_calibration_scores_hybrid_max(model, cal_data, tokenizer)
submission_max, probs, q, standard_preds = predict_with_conformal_hybrid_max(
    model, data, tokenizer, cal_scores_max, alpha=best_alpha_max, threshold=0.5
)

# Plain variant. `probs`, `q` and `standard_preds` are overwritten here, so
# after this cell they hold the plain variant's values.
cal_scores = compute_calibration_scores_hybrid(model, cal_data, tokenizer)
submission, probs, q, standard_preds = predict_with_conformal_hybrid(
    model, data, tokenizer, cal_scores, alpha=best_alpha, threshold=0.5
)

print("Saving submission file...")
submission.to_csv(save_dir / "submission.csv", index=False)
submission_max.to_csv(save_dir / "submission_max.csv", index=False)

print("Done!")









In [6]:
df_sample = pd.read_csv(data_dir + "/sample.csv")

# Every column after the first is treated as a genre name; its position in
# that list is used as the genre's index.
genre_names = df_sample.columns[1:]
genre_to_idx = dict(zip(genre_names, range(len(genre_names))))
idx_to_genre = {position: name for name, position in genre_to_idx.items()}

# Keep the network in evaluation mode for the analysis cells that follow.
model.eval()

# Peek at the first few test texts.
data["test_texts"][:15]



In [7]:
# Position (within the test split) of the example to explain.
# NOTE: this rebinds `train_idx`, which previously held the calibration split.
train_idx = 9

val_numeric = data["test_numeric"][train_idx]
# Only the first 216 characters of the text are kept - presumably the title
# and overview part of the prompt; confirm against the printed output.
val_text = data["test_texts"][train_idx][:216]
print(val_text)

# Tokenize the shortened text and report how many tokens it produced.
encoding = tokenizer(val_text, return_tensors="pt").to(device)
input_ids = encoding["input_ids"].to(device)
print(input_ids.shape[1])



In [8]:
attention_mask = encoding["attention_mask"].to(device)

# Look the token ids up by hand so that the embedding output is a tensor we
# can take gradients with respect to.
embedding_layer = model.bert.embeddings.word_embeddings
embeddings = embedding_layer(input_ids)

# Make the tensor part of the autograd graph *before* asking it to keep its
# gradient: retain_grad() raises if requires_grad is still False (e.g. when
# the embedding weights are frozen). On a tensor that already requires grad
# the first step is skipped; on a leaf tensor retain_grad() is a no-op.
if not embeddings.requires_grad:
    embeddings.requires_grad_()
embeddings.retain_grad()

# Run the encoder on the explicit embeddings instead of on token ids.
outputs = model.bert(inputs_embeds=embeddings, attention_mask=attention_mask)

cls_output = outputs.last_hidden_state[:, 0, :]  # shape: [1, hidden_dim]

# Build the numeric features directly as float32 on the target device so the
# concatenation below does not mix dtypes.
numeric_tensor = torch.tensor(
    val_numeric, dtype=torch.float, device=device
).unsqueeze(0)  # [1, num_features]

hybrid_input = torch.cat(
    [cls_output, numeric_tensor], dim=1
)  # [1, hidden_dim + num_features]
hybrid_input = hybrid_input.to(torch.float)

# Forward through the classification head.
logits = model.classifier(hybrid_input)

In [9]:
category1 = "Comedy"
category2 = "Science.Fiction"

# --- Saliency for category1 -------------------------------------------------
target_class = genre_to_idx[category1]
score = logits[:, target_class]

model.zero_grad()
# model.zero_grad() only resets *parameter* gradients. `embeddings` is an
# activation with a retained gradient, so its .grad has to be cleared by hand;
# otherwise each backward() adds onto whatever is already stored there (also
# across re-runs of this cell).
embeddings.grad = None
score.backward(retain_graph=True)  # the graph is reused for category2 below

saliency_gradients = embeddings.grad.detach().abs().squeeze(
    0
)  # [seq_len, emb_size]
token_gradients_1 = saliency_gradients.mean(dim=1)  # [seq_len]

# --- Saliency for category2 -------------------------------------------------
target_class = genre_to_idx[category2]
score = logits[:, target_class]

model.zero_grad()
# Clear the category1 gradient so the result below is category2 only.
embeddings.grad = None
score.backward(retain_graph=True)

saliency_gradients = embeddings.grad.detach().abs().squeeze(
    0
)  # [seq_len, emb_size]
token_gradients_2 = saliency_gradients.mean(dim=1)  # [seq_len]

In [10]:
# Per-token contrast (category2 saliency minus category1 saliency), rescaled
# so that its largest entry becomes 1.
saliency_gap = token_gradients_2 - token_gradients_1
token_gradients = saliency_gap / saliency_gap.max()

In [11]:
import matplotlib.pyplot as plt
import numpy as np

# Token strings and their scores, one per input position.
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
token_gradients_np = token_gradients.cpu().numpy()

# A single-row matrix so the scores can be drawn as a heat strip.
gradient_array = token_gradients_np[np.newaxis, :]  # shape [1, seq_len]

# Half an inch of width per token keeps the rotated labels legible.
fig, ax = plt.subplots(figsize=(len(tokens) * 0.5, 2))
cax = ax.matshow(gradient_array, cmap="Reds", aspect="auto")

# Tokens along the x-axis, nothing on the y-axis.
ax.set_xticks(range(len(tokens)))
ax.set_xticklabels(tokens, rotation=90, fontsize=12)
ax.set_yticks([])

# Colour scale for reference.
fig.colorbar(cax, ax=ax, orientation="vertical", pad=0.02, fraction=0.025)

ax.set_title("Saliency map", fontsize=14, pad=20)
fig.savefig("saliency_space_odyssey.pdf", bbox_inches="tight")
plt.show()



In [12]:
import matplotlib.pyplot as plt
import numpy as np

# Uses `tokens` and `token_gradients_np` produced by the heatmap cell.

# --- Step 1. Wrap tokens into lines based on a character limit per line ---
# Approximate budget: each token counts as its text plus one trailing space.
char_limit = 100
lines = []
current_line = []
current_length = 0

for token, grad in zip(tokens, token_gradients_np):
    token_length = len(token) + 1  # +1 for the space drawn after the token
    # Start a new line only if this token would overflow AND the current line
    # already has content (a single over-long token still gets its own line).
    if current_length + token_length > char_limit and current_line:
        lines.append(current_line)
        current_line = [(token, grad)]
        current_length = token_length
    else:
        current_line.append((token, grad))
        current_length += token_length
if current_line:
    lines.append(current_line)

# --- Step 2. Plot tokens line by line with gradient-coded backgrounds ---
cmap = plt.cm.Reds  # scores outside [0, 1] are clipped by the colormap

# Figure height grows with the number of lines; never let it collapse to zero.
line_count = len(lines)
fig, ax = plt.subplots(figsize=(10, max(line_count, 1) * 0.7))
ax.axis("off")  # Hide axes

x_margin = 0.05  # starting x in data coordinates
y_start = 0.9  # y of the first line
y_gap = 0.15  # vertical distance between consecutive lines

# One draw is enough to initialise the renderer; get_window_extent() can then
# measure each text with it, so the canvas is not redrawn for every token.
fig.canvas.draw()
renderer = fig.canvas.get_renderer()
# Display (pixel) -> data transform. Adding text does not change the axes
# limits, so it is computed once for the whole loop.
display_to_data = ax.transData.inverted()

for i, line in enumerate(lines):
    x_pos = x_margin
    y_pos = y_start - i * y_gap
    for token, grad in line:
        # Token drawn on a background box coloured by its score.
        text_obj = ax.text(
            x_pos,
            y_pos,
            token + " ",
            fontsize=12,
            ha="left",
            va="center",
            bbox=dict(facecolor=cmap(grad), edgecolor="none", pad=2),
        )
        # Measure the rendered text and advance the cursor by its width,
        # converted from pixels back to data units.
        extent = text_obj.get_window_extent(renderer=renderer)
        (x0, _), (x1, _) = display_to_data.transform(
            [(extent.x0, extent.y0), (extent.x1, extent.y1)]
        )
        x_pos += x1 - x0

plt.title(
    f"({category2} gradient - {category1} gradient)\n '2001: A Space Odyssey' (title and overview only)",
    fontsize=14,
    pad=20,
)
plt.savefig(
    Path(save_dir, "saliency_space_odyssey_wrapped.pdf"), bbox_inches="tight"
)
plt.show()



In [65]:
import re

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import torch
import numpy as np


perplexity = 150  # t-SNE perplexity; keep it below the number of samples
n_movies = 250  # how many test texts are embedded
proba = 0.7  # probability that a point is left WITHOUT a title label
annotation_seed = 42  # fixes which points get labelled, so the figure is reproducible

# Select a subset of test sentences for visualization
sample_texts = data["test_texts"][:n_movies]

# Collect the [CLS] vector of the encoder for every selected text.
model.eval()
cls_features = []
for text in sample_texts:
    # truncation=True mirrors the tokenizer call in prediction_fn and protects
    # the encoder from over-long inputs.
    encoding = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        outputs = model.bert(**encoding)
    cls_features.append(outputs.last_hidden_state[0, 0, :].cpu().numpy())

features = np.array(cls_features)
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
tsne_results = tsne.fit_transform(features)

plt.figure(figsize=(12, 9))
ax = plt.gca()
ax.scatter(tsne_results[:, 0], tsne_results[:, 1], c="darkred", alpha=0.25)

plt.title("t-SNE Visualization of Model's CLS Features")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.grid(True)

# Label a random subset of the points with their movie title. A dedicated,
# seeded generator is used instead of the global NumPy state so that the same
# points are labelled on every run.
annotation_rng = np.random.default_rng(annotation_seed)
for i, text in enumerate(sample_texts):
    if annotation_rng.uniform(0, 1) < proba:
        continue
    # The title is whatever follows "* Title:" up to the first "[SEP]".
    match = re.search(r"\* Title:\s*([^[]+)\[SEP\]", text)
    if match:
        title = match.group(1).strip()
    else:
        title = "Unknown"
    ax.annotate(
        title, (tsne_results[i, 0], tsne_results[i, 1]), fontsize=8, alpha=0.7
    )
plt.savefig(
    Path(save_dir, f"tsne_{perplexity}_p_{proba}.pdf"), bbox_inches="tight"
)
plt.show()



In [57]:
import shap
import torch

# Initialize SHAP's JavaScript support in the notebook; the SHAP plots drawn
# further down in this cell rely on it.
shap.initjs()


def prediction_fn(texts, n_numeric=4):
    """Return per-genre probabilities for a batch of texts.

    Only the text branch carries information: the numeric inputs of the
    hybrid classifier are filled with zeros.

    Args:
        texts: A single string or any iterable of strings (list, tuple,
            NumPy array, ...).
        n_numeric: Number of numeric features appended to the text
            embedding. Defaults to 4, the value this notebook assumes the
            classifier expects.

    Returns:
        NumPy array with one row per text holding the sigmoid of the
        classifier outputs.
    """
    # A bare string is one text; list("abc") would split it into characters.
    if isinstance(texts, str):
        texts = [texts]
    elif not isinstance(texts, list):
        texts = list(texts)

    # Tokenize the whole batch at once (padded to a common length).
    encoding = tokenizer(
        texts, return_tensors="pt", padding=True, truncation=True
    ).to(device)
    with torch.no_grad():
        outputs = model.bert(**encoding)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        # Zero placeholders for the numeric features, created directly with
        # the embeddings' device and dtype so the concatenation is uniform.
        dummy_numeric = torch.zeros(
            cls_embeddings.size(0),
            n_numeric,
            device=cls_embeddings.device,
            dtype=cls_embeddings.dtype,
        )
        combined = torch.cat([cls_embeddings, dummy_numeric], dim=1)
        logits = model.classifier(combined)
        probs = torch.sigmoid(logits)
    return probs.cpu().numpy()


# Text explainer around prediction_fn; the tokenizer doubles as the masker
# that decides how the input text is split and hidden.
explainer = shap.Explainer(
    prediction_fn,
    masker=shap.maskers.Text(tokenizer),
    # One name per model output. prediction_fn returns one probability per
    # genre, so a single label would not match the outputs.
    # NOTE(review): assumes data["genre_columns"] lists the genres in the
    # classifier's output order - confirm.
    output_names=list(data["genre_columns"]),
)
# Explain only the first five texts of the t-SNE sample.
sample_texts_explainer = list(sample_texts[:5])

shap_values = explainer(sample_texts_explainer)
# Show the token-level attributions of the first explained text.
shap.plots.text(shap_values[0])



