In [1]:
from my_import import *
df_train = pd.read_csv('df_train.csv')
df_val = pd.read_csv('df_val.csv')
df_test = pd.read_csv('df_test.csv')
df_full = pd.read_csv('final_cleaned_dataset_df.csv')

#Make sure the genre collumns is in lists not strings
#NEED TO DO THIS EVERYTIME EXPORT DATASET
df_train['genres'] = df_train['genres'].apply(lambda x: list(ast.literal_eval(x)))
df_val['genres'] = df_val['genres'].apply(lambda x: list(ast.literal_eval(x)))
df_test['genres'] = df_test['genres'].apply(lambda x: list(ast.literal_eval(x)))

df_train=df_train.drop(columns=['title','index'])
df_val=df_val.drop(columns=['title','index'])
df_test=df_test.drop(columns=['title','index'])


display(df_train)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,synopsis,genres
0,"Sometime in the future, the world was complete...","[Action, Adventure]"
1,"Set in 2014, the anime follows the adventures ...",[Comedy]
2,Follows a pig whose family's mission is to col...,"[Comedy, Kids]"
3,"In honor of the 2018 World Cup, this season of...","[Kids, Sci-Fi, Sports, Super Power]"
4,"Fairies living in a fluffy forest, where both ...","[Fantasy, Kids]"
...,...,...
9189,The Konohagakure Grand Sports Festival has beg...,"[Action, Comedy, Fantasy, Other, Shounen, Sports]"
9190,Special bundled with the Blu-ray/DVD volume of .,[Ecchi]
9191,"According to the official Hobby Japan website,...","[Comedy, Ecchi, Fantasy, Parody]"
9192,A series of comedic shorts featuring chibi ver...,"[Adventure, Comedy, Fantasy, Parody]"


In [None]:
# ========== STEP 1: Setup & Data Prep ==========
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, jaccard_score
import torch
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import f1_score, jaccard_score, hamming_loss, accuracy_score

# Confirm GPU availability
print("GPU available:", torch.cuda.is_available())
print("Using device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

# Example: df_train and df_test should have columns "synopsis" and "genres" (list of strings)
# df_train = pd.read_csv("your_train_data.csv")
# df_test = pd.read_csv("your_test_data.csv")

# ========== STEP 2: Label Setup ==========
# Extract unique genres
all_genres = sorted(set(genre for sublist in df_train["genres"] for genre in sublist))
label2id = {genre: idx for idx, genre in enumerate(all_genres)}
id2label = {idx: genre for genre, idx in label2id.items()}
num_labels = len(label2id)

# One-hot encode labels
def encode_labels(genres):
    vec = np.zeros(num_labels, dtype=np.float32)
    for genre in genres:
        vec[label2id[genre]] = 1.0
    return vec

df_train["labels"] = df_train["genres"].apply(encode_labels)
df_val["labels"] = df_val["genres"].apply(encode_labels)
df_test["labels"] = df_test["genres"].apply(encode_labels)

# ========== STEP 3: Tokenization ==========
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

train_encodings = tokenizer(df_train["synopsis"].tolist(), padding=True, truncation=True, return_tensors="np", max_length=200)
val_encodings = tokenizer(df_val["synopsis"].tolist(), padding=True, truncation=True, return_tensors="np", max_length=256)
test_encodings = tokenizer(df_test["synopsis"].tolist(), padding=True, truncation=True, return_tensors="np", max_length=256)

# ========== STEP 4: Create Datasets ==========
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": list(df_train["labels"])
})

val_dataset = Dataset.from_dict({
    "input_ids": val_encodings["input_ids"],
    "attention_mask": val_encodings["attention_mask"],
    "labels": list(df_val["labels"])
})
test_dataset = Dataset.from_dict({
    "input_ids": test_encodings["input_ids"],
    "attention_mask": test_encodings["attention_mask"],
    "labels": list(df_test["labels"])
})

# ========== STEP 5: Model Setup ==========
config = AutoConfig.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id
)

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
model = model.to("cuda")

# ========== STEP 6: Custom Trainer ==========
class MultiLabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = BCEWithLogitsLoss()(logits, labels.float())
        return (loss, outputs) if return_outputs else loss

# ========== STEP 7: Metrics ==========
def compute_metrics(pred):
    labels = pred.label_ids
    preds = (pred.predictions > 0.5).astype(int)
    f1 = f1_score(labels, preds, average="samples")
    jaccard = jaccard_score(labels, preds, average="samples")
    hits = (np.logical_and(labels, preds).sum(axis=1) > 0).mean()
    return {"f1_samples": f1, "jaccard": jaccard, "hit_rate": hits}

# ========== STEP 8: TrainingArgs ==========
training_args = TrainingArguments(
    output_dir="./anime-genre-model",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=10,
    logging_dir="./logs",
    report_to="none",
    disable_tqdm=False,
    log_level="info",
   
)

# ========== STEP 9: Trainer & Train ==========
trainer = MultiLabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


GPU available: True
Using device: NVIDIA GeForce GTX 1650


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = MultiLabelTrainer(
***** Running training *****
  Num examples = 9,194
  Num Epochs = 2
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 3
  Gradient Accumulation steps = 1
  Total optimization steps = 6,130
  Number of trainable parameters = 66,975,004


Epoch,Training Loss,Validation Loss,F1 Samples,Jaccard,Hit Rate
1,0.2533,0.249846,0.234319,0.17868,0.42236
2,0.2712,0.236609,0.302998,0.233347,0.528838



***** Running Evaluation *****
  Num examples = 1127
  Batch size = 3
Saving model checkpoint to ./anime-genre-model\checkpoint-3065
Configuration saved in ./anime-genre-model\checkpoint-3065\config.json
Model weights saved in ./anime-genre-model\checkpoint-3065\model.safetensors
tokenizer config file saved in ./anime-genre-model\checkpoint-3065\tokenizer_config.json
Special tokens file saved in ./anime-genre-model\checkpoint-3065\special_tokens_map.json

***** Running Evaluation *****
  Num examples = 1127
  Batch size = 3
Saving model checkpoint to ./anime-genre-model\checkpoint-6130
Configuration saved in ./anime-genre-model\checkpoint-6130\config.json
Model weights saved in ./anime-genre-model\checkpoint-6130\model.safetensors
tokenizer config file saved in ./anime-genre-model\checkpoint-6130\tokenizer_config.json
Special tokens file saved in ./anime-genre-model\checkpoint-6130\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models

TrainOutput(global_step=6130, training_loss=0.25619256471148516, metrics={'train_runtime': 1006.3924, 'train_samples_per_second': 18.271, 'train_steps_per_second': 6.091, 'total_flos': 1218469973139456.0, 'train_loss': 0.25619256471148516, 'epoch': 2.0})

In [6]:
x=train_encodings['input_ids']

In [None]:
# df_val["labels"] = df_val["genres"].apply(encode_labels)
# val_encodings = tokenizer(df_val["synopsis"].tolist(), padding=True, truncation=True, return_tensors="np", max_length=256)
# val_dataset = Dataset.from_dict({
#     "input_ids": val_encodings["input_ids"],
#     "attention_mask": val_encodings["attention_mask"],
#     "labels": list(df_val["labels"])
# })
prediction=trainer.predict(val_dataset)


***** Running Prediction *****
  Num examples = 1139
  Batch size = 3


PredictionOutput(predictions=array([[-1.8221236 , -3.1561813 , -1.5802286 , ..., -5.560172  ,
        -4.028331  , -1.2129418 ],
       [-4.501064  , -3.9590275 ,  0.84210795, ..., -3.5361032 ,
        -5.0380325 , -3.8415973 ],
       [ 0.6901433 ,  0.01577828, -2.5583565 , ..., -4.143129  ,
        -3.4288852 , -3.1410546 ],
       ...,
       [-4.694147  , -4.54762   , -0.9086509 , ..., -3.0604844 ,
        -5.1731577 , -4.563735  ],
       [-2.0607045 , -2.9925125 ,  1.304122  , ..., -3.7645273 ,
        -3.5458267 , -2.9565072 ],
       [ 1.0861759 , -1.0807424 , -2.6614997 , ..., -4.4800687 ,
        -2.2402742 , -1.5804557 ]], dtype=float32), label_ids=array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32), metrics={'test_loss': 0.23532433807849884, 'test_f1_samples': 0.29647015748

In [10]:
# Get all unique genres
all_genres = sorted(set(genre for sublist in df_train["genres"] for genre in sublist))

# Create mappings
label2id = {genre: idx for idx, genre in enumerate(all_genres)}
id2label = {idx: genre for genre, idx in label2id.items()}
num_labels = len(label2id)

def encode_labels(genres):
    vec = np.zeros(num_labels, dtype=np.float32)
    for genre in genres:
        vec[label2id[genre]] = 1.0
    return vec

df_train["labels"] = df_train["genres"].apply(encode_labels)
df_test["labels"] = df_test["genres"].apply(encode_labels)

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenize the text
encodings = tokenizer(df_train["synopsis"].tolist(), padding=True, truncation=True, return_tensors="np")
encode_test=tokenizer(df_test["synopsis"].tolist(), padding=True, truncation=True, return_tensors="np")


dataset = Dataset.from_dict({
    "input_ids": encodings["input_ids"],
    "attention_mask": encodings["attention_mask"],
    "labels": list(df_train["labels"])
})
dataset_test=Dataset.from_dict({
    "input_ids": encode_test["input_ids"],
    "attention_mask": encode_test["attention_mask"],
    "labels": list(df_test["labels"])
})


config = AutoConfig.from_pretrained(model_ckpt,
                                    num_labels=num_labels,
                                    problem_type="multi_label_classification",
                                    id2label=id2label,
                                    label2id=label2id)

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
model = model.to("cuda")

from torch.nn import BCEWithLogitsLoss

class MultiLabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = BCEWithLogitsLoss()(logits, labels.float())
        return (loss, outputs) if return_outputs else loss
    

def compute_metrics(pred):
    labels = pred.label_ids
    preds = (pred.predictions > 0.5).astype(int)

    f1 = f1_score(labels, preds, average="samples")
    jaccard = jaccard_score(labels, preds, average="samples")
    hits = (np.logical_and(labels, preds).sum(axis=1) > 0).mean()

    return {"f1_samples": f1, "jaccard": jaccard, "hit_rate": hits}


training_args = TrainingArguments(
    output_dir="./anime-genre-model",
    eval_strategy="epoch",         # Evaluate each epoch
    save_strategy="epoch",               # Save each epoch
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1_samples",
    logging_steps=10,                    # Log every 10 steps
    logging_dir="./logs",                # Optional: store logs for TensorBoard
    report_to="none",                    # or 'tensorboard' if you want
    disable_tqdm=False,                  # Show progress bar
    log_level="info",                     # Show training logs
    fp16=True
)

trainer = MultiLabelTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = MultiLabelTrainer(
Using auto half precision backend
***** Running training *****
  Num examples = 9,194
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1,152
  Number of trainable parameters = 66,975,004


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [21]:
unique_genres =list(set(genre for sublist in df_train['genres'] for genre in sublist))
# Ensure genres are a sorted list
unique_genres_list = sorted(list(unique_genres))
print(unique_genres_list)
print("Num of unique genres:",len(unique_genres))

id2label = {k:v for k,v in enumerate(unique_genres_list)}
label2id = {v:k for k,v in enumerate(unique_genres_list)}

print(id2label)
print(label2id)
# Your genre-to-index dictionary
genre2id = {'Action': 0, 'Adventure': 1, 'Comedy': 2, 'Demons': 3, 'Drama': 4, 'Ecchi': 5,
            'Fantasy': 6, 'Harem': 7, 'Hentai': 8, 'Historical': 9, 'Horror': 10, 'Kids': 11,
            'Mecha': 12, 'Military': 13, 'Music': 14, 'Mystery': 15, 'Other': 16, 'Parody': 17,
            'Romance': 18, 'School': 19, 'Sci-Fi': 20, 'Seinen': 21, 'Shoujo': 22, 'Shounen': 23,
            'Slice of Life': 24, 'Sports': 25, 'Super Power': 26, 'Supernatural': 27}

num_labels = len(genre2id)

def encode_multilabel(example):
    vec = np.zeros(num_labels, dtype=np.float32)
    for genre_id in example["genres"]:
        vec[genre_id] = 1.0
    example["labels"] = vec
    return example


['Action', 'Adventure', 'Comedy', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Harem', 'Hentai', 'Historical', 'Horror', 'Kids', 'Mecha', 'Military', 'Music', 'Mystery', 'Other', 'Parody', 'Romance', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen', 'Slice of Life', 'Sports', 'Super Power', 'Supernatural']
Num of unique genres: 28
{0: 'Action', 1: 'Adventure', 2: 'Comedy', 3: 'Demons', 4: 'Drama', 5: 'Ecchi', 6: 'Fantasy', 7: 'Harem', 8: 'Hentai', 9: 'Historical', 10: 'Horror', 11: 'Kids', 12: 'Mecha', 13: 'Military', 14: 'Music', 15: 'Mystery', 16: 'Other', 17: 'Parody', 18: 'Romance', 19: 'School', 20: 'Sci-Fi', 21: 'Seinen', 22: 'Shoujo', 23: 'Shounen', 24: 'Slice of Life', 25: 'Sports', 26: 'Super Power', 27: 'Supernatural'}
{'Action': 0, 'Adventure': 1, 'Comedy': 2, 'Demons': 3, 'Drama': 4, 'Ecchi': 5, 'Fantasy': 6, 'Harem': 7, 'Hentai': 8, 'Historical': 9, 'Horror': 10, 'Kids': 11, 'Mecha': 12, 'Military': 13, 'Music': 14, 'Mystery': 15, 'Other': 16, 'Parody': 17, 'Romance': 18, 'Sch

In [3]:
from datasets import Features, ClassLabel, Value, Dataset, DatasetDict ,Sequence

ds_features = Features({"synopsis": Value("string"), "genres": Sequence(ClassLabel(names=unique_genres_list))})

dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train.reset_index(drop=True),features=ds_features),
    "valid": Dataset.from_pandas(df_val.reset_index(drop=True),features=ds_features),
    "test": Dataset.from_pandas(df_test.reset_index(drop=True),features=ds_features)})

dataset
dataset_encoded = dataset_encoded.map(encode_multilabel)

NameError: name 'unique_genres_list' is not defined

In [4]:
from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [24]:
def tokenize(batch):
    return tokenizer(batch["synopsis"], padding=True, truncation=True)

dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

Map: 100%|██████████| 9194/9194 [00:02<00:00, 4106.86 examples/s]
Map: 100%|██████████| 1139/1139 [00:00<00:00, 5056.65 examples/s]
Map: 100%|██████████| 1127/1127 [00:00<00:00, 5428.44 examples/s]


In [25]:
print(dataset_encoded["train"].column_names)
print(dataset_encoded['train']['genres'][2])
print(dataset_encoded['train']['input_ids'][2])
decoded_text=tokenizer.decode(dataset_encoded['train']['input_ids'][2])
print(decoded_text)

['synopsis', 'genres', 'input_ids', 'attention_mask']
[2, 11]
[101, 4076, 1037, 10369, 3005, 2155, 1005, 1055, 3260, 2003, 2000, 8145, 2104, 27578, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [5]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
from transformers import AutoModelForSequenceClassification, AutoConfig

num_labels = len(unique_genres_list)

config = (AutoConfig
          .from_pretrained(model_ckpt, num_labels=num_labels, 
                           label2id=label2id, id2label=id2label))

model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, config=config)
         .to(device))

NameError: name 'unique_genres_list' is not defined

In [7]:
from sklearn.metrics import f1_score, jaccard_score
import numpy as np

def compute_metrics(pred):
    labels = pred.label_ids
    probs = pred.predictions
    preds = (probs > 0.5).astype(int)  # apply sigmoid thresholding

    f1 = f1_score(labels, preds, average="samples")
    jaccard = jaccard_score(labels, preds, average="samples")
    
    # Hit rate: at least one correct label per sample
    hits = (np.logical_and(labels, preds).sum(axis=1) > 0).mean()

    return {
        "f1_samples": f1,
        "jaccard": jaccard,
        "hit_rate": hits
    }

In [8]:
from transformers import Trainer, TrainingArguments

batch_size = 16
logging_steps = len(dataset_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=4,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  eval_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
#                                   push_to_hub=True, 
                                  log_level="error",
                                  save_total_limit=1
                                  )

NameError: name 'dataset_encoded' is not defined

In [14]:
class MultiLabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):  # <== added **kwargs
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fct = BCEWithLogitsLoss()
        loss = loss_fct(logits, labels.float())

        return (loss, outputs) if return_outputs else loss
    
trainer = MultiLabelTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()


  trainer = MultiLabelTrainer(


KeyError: 'labels'

In [9]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args, 
                  compute_metrics=compute_metrics,
                  train_dataset=dataset_encoded["train"],
                  eval_dataset=dataset_encoded["valid"],
                  tokenizer=tokenizer)
trainer.train()

NameError: name 'model' is not defined