# In [None]:
from my_import import *
# Load the three pre-made splits plus the full cleaned dataset.
df_train, df_val, df_test, df_full = (
    pd.read_csv(csv_path)
    for csv_path in (
        'df_train.csv',
        'df_val.csv',
        'df_test.csv',
        'final_cleaned_dataset_df.csv',
    )
)


def _parse_genres(raw):
    """Turn the stringified genre collection read from CSV back into a list."""
    return list(ast.literal_eval(raw))


def _prepare_split(frame):
    """Parse the ``genres`` column in place and return the frame without 'title'/'index'."""
    # The genres column must hold lists, not strings.
    # This has to be redone every time the dataset is exported to CSV and reloaded.
    frame['genres'] = frame['genres'].apply(_parse_genres)
    return frame.drop(columns=['title', 'index'])


df_train, df_val, df_test = (
    _prepare_split(split) for split in (df_train, df_val, df_test)
)

display(df_train)




# ========== STEP 1: Setup & Data Prep ==========
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
from sklearn.metrics import (
    f1_score, jaccard_score, hamming_loss, accuracy_score,precision_score, recall_score
)
import torch
from torch.nn import BCEWithLogitsLoss

# Report CUDA availability and the name of device 0 (or "CPU" when CUDA is absent).
_cuda_ready = torch.cuda.is_available()
print("GPU available:", _cuda_ready)
_device_name = torch.cuda.get_device_name(0) if _cuda_ready else "CPU"
print("Using device:", _device_name)

# ========== STEP 2: Label Setup ==========
# Assuming df_train, df_val, df_test exist and contain "synopsis" and "genres" columns
# Build the label vocabulary from every split, not just the training set:
# encode_labels() indexes label2id directly, so a genre that occurs only in
# the validation or test split would otherwise raise KeyError there.
# When all genres already occur in df_train the result is unchanged.
all_genres = sorted({
    genre
    for split in (df_train, df_val, df_test)
    for genres in split["genres"]
    for genre in genres
})
# Genre name -> column index of the multi-hot vector, and the inverse mapping.
label2id = {genre: idx for idx, genre in enumerate(all_genres)}
id2label = dict(enumerate(all_genres))
num_labels = len(all_genres)

def encode_labels(genres):
    """Return a multi-hot float32 vector marking every genre in ``genres``.

    The vector has ``num_labels`` entries and positions are taken from the
    module-level ``label2id`` mapping. A genre missing from ``label2id``
    raises KeyError.
    """
    multi_hot = np.zeros(num_labels, dtype=np.float32)
    # Explicit integer dtype keeps the empty-genre case a valid (empty) index array.
    positions = np.array([label2id[name] for name in genres], dtype=np.intp)
    multi_hot[positions] = 1.0
    return multi_hot

# Attach a multi-hot "labels" column to each split, then show the training frame.
for _split in (df_train, df_val, df_test):
    _split["labels"] = _split["genres"].apply(encode_labels)
display(df_train)



# ========== STEP 3: Tokenization ==========
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def _tokenize_synopses(frame):
    """Tokenize the ``synopsis`` column of ``frame`` with truncation enabled."""
    return tokenizer(frame["synopsis"].tolist(), truncation=True)


# Same order as the original: train, then validation, then test.
train_encodings, val_encodings, test_encodings = map(
    _tokenize_synopses, (df_train, df_val, df_test)
)

# ========== STEP 4: Create Datasets ==========
def _to_hf_dataset(encodings, frame):
    """Bundle token ids, attention masks and the frame's label vectors into a Dataset."""
    columns = {key: encodings[key] for key in ("input_ids", "attention_mask")}
    columns["labels"] = list(frame["labels"])
    return Dataset.from_dict(columns)


train_dataset = _to_hf_dataset(train_encodings, df_train)
val_dataset = _to_hf_dataset(val_encodings, df_val)
test_dataset = _to_hf_dataset(test_encodings, df_test)

# ========== STEP 5: Model Setup ==========
# Checkpoint configuration extended with the multi-label head settings.
_config_overrides = {
    "num_labels": num_labels,
    "problem_type": "multi_label_classification",
    "id2label": id2label,
    "label2id": label2id,
}
config = AutoConfig.from_pretrained(model_ckpt, **_config_overrides)

# Load the checkpoint with that configuration and place it on the GPU when one is available.
_target_device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, config=config
).to(_target_device)

# ========== STEP 6: Custom Trainer ==========
class MultiLabelTrainer(Trainer):
    """Trainer whose loss is BCEWithLogitsLoss over the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the binary cross-entropy loss for one batch.

        ``labels`` is popped from ``inputs`` so the forward pass receives only
        the remaining entries. Extra keyword arguments are accepted and ignored.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
        just the loss.
        """
        targets = inputs.pop("labels").float()
        outputs = model(**inputs)
        criterion = BCEWithLogitsLoss()
        loss = criterion(outputs.logits, targets)
        if return_outputs:
            return loss, outputs
        return loss


# ========== STEP 7: Metrics ==========
def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``.

    Evaluated through ``exp(-|x|)`` so the exponent is never positive: very
    negative inputs no longer overflow ``np.exp`` (which produced a
    RuntimeWarning in the naive form). Results for ``x >= 0`` use the same
    expression as before.

    Args:
        x: scalar or array-like of real values.

    Returns:
        Values in [0, 1] with the same shape as ``x``; a scalar input yields
        a NumPy scalar.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always within [0, 1], so it cannot overflow
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    return result[()]  # unwrap 0-d results to a scalar; arrays pass through

def compute_metrics(pred):
    """Compute multi-label evaluation metrics from a prediction object.

    Args:
        pred: object exposing ``predictions`` (raw model scores, converted to
            probabilities with ``sigmoid`` here) and ``label_ids`` (reference
            multi-hot labels).

    Returns:
        dict with sample-averaged F1, precision, recall and Jaccard, the hit
        rate (share of samples with at least one correctly predicted label),
        the Hamming loss and the exact-match accuracy.
    """
    labels = pred.label_ids
    probability = sigmoid(pred.predictions)
    preds = (probability > 0.5).astype(int)  # a label is predicted when its probability exceeds 0.5

    # Samples with no predicted (or no true) labels make the per-sample ratios
    # undefined. zero_division=0 scores them as 0 -- the same value the default
    # "warn" mode returns -- without emitting a warning on every evaluation.
    per_sample = {"average": "samples", "zero_division": 0}
    f1 = f1_score(labels, preds, **per_sample)
    precision = precision_score(labels, preds, **per_sample)
    recall = recall_score(labels, preds, **per_sample)
    jaccard = jaccard_score(labels, preds, **per_sample)

    # A sample counts as a hit when at least one true label was predicted.
    hits = float((np.logical_and(labels, preds).sum(axis=1) > 0).mean())
    hamming = hamming_loss(labels, preds)
    exact = accuracy_score(labels, preds)

    return {
        "f1_samples": f1,
        "precision": precision,
        "recall": recall,
        "jaccard": jaccard,
        "hit_rate": hits,
        "hamming_loss": hamming,
        "exact_match": exact
    }

# ========== STEP 8: Data Collator ==========
# Collator that pads each batch with the tokenizer and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="pt",
)

# ========== STEP 9: Training Arguments ==========
_training_options = dict(
    output_dir="./7k_distilbert-base-uncased_batch12_LRlindecay0.1",
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Batch sizes and schedule length.
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=8,
    # Optimizer schedule: linear decay with a warmup ratio of 0.1.
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    # Restore the checkpoint with the lowest validation loss when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Logging and progress reporting.
    logging_steps=10,
    logging_dir="./logs",
    report_to="none",
    disable_tqdm=False,
    log_level="info",
)
training_args = TrainingArguments(**_training_options)

# ========== STEP 10: Trainer ==========
# Wire the model, arguments, datasets, collator and metrics into the custom trainer.
trainer = MultiLabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class` is its
    # replacement. The **kwargs already needed on compute_loss suggests the
    # installed transformers is recent enough to accept it -- verify the version.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

# ========== STEP 11: Train ==========
# Start fine-tuning. Per training_args, evaluation and checkpointing run once per
# epoch, and the checkpoint with the lowest eval_loss is reloaded at the end.
trainer.train()
