In [1]:
from my_import import *
def load_split(csv_path, drop_columns=('title', 'index')):
    """Read one exported dataset split and restore its `genres` column.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    drop_columns : iterable of str
        Columns removed from the returned frame.

    Returns
    -------
    pandas.DataFrame
        The split with `genres` holding Python lists and `drop_columns` removed.
    """
    split_df = pd.read_csv(csv_path)
    # The CSV stores each genre list as its string repr, so it has to be parsed
    # back into a real list every time an exported split is loaded.
    split_df['genres'] = split_df['genres'].apply(lambda raw: list(ast.literal_eval(raw)))
    return split_df.drop(columns=list(drop_columns))


df_train = load_split('df_train.csv')
df_val = load_split('df_val.csv')
df_test = load_split('df_test.csv')
df_full = pd.read_csv('final_cleaned_dataset_df.csv')


display(df_train)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,synopsis,genres
0,"Sometime in the future, the world was complete...","[Action, Adventure]"
1,"Set in 2014, the anime follows the adventures ...",[Comedy]
2,Follows a pig whose family's mission is to col...,"[Comedy, Kids]"
3,"In honor of the 2018 World Cup, this season of...","[Kids, Sci-Fi, Sports, Super Power]"
4,"Fairies living in a fluffy forest, where both ...","[Fantasy, Kids]"
...,...,...
9189,The Konohagakure Grand Sports Festival has beg...,"[Action, Comedy, Fantasy, Other, Shounen, Sports]"
9190,Special bundled with the Blu-ray/DVD volume of .,[Ecchi]
9191,"According to the official Hobby Japan website,...","[Comedy, Ecchi, Fantasy, Parody]"
9192,A series of comedic shorts featuring chibi ver...,"[Adventure, Comedy, Fantasy, Parody]"


In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from sklearn.metrics import f1_score, jaccard_score
import numpy as np
import pandas as pd
from tqdm import tqdm

# ========== STEP 1: Device & Data ==========
# Run on the GPU when CUDA reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Label vocabulary: every distinct genre seen in the training split, sorted so
# that the genre <-> index assignment is the same on every run.
all_genres = sorted({name for row_genres in df_train["genres"] for name in row_genres})
label2id = dict(zip(all_genres, range(len(all_genres))))
id2label = dict(enumerate(all_genres))
num_labels = len(all_genres)

def encode_labels(genres, label_map=None):
    """Encode a collection of genre names as a multi-hot float32 vector.

    Parameters
    ----------
    genres : iterable of str
        Genre names attached to one sample.
    label_map : dict[str, int] or None
        Mapping from genre name to vector position. Positions are expected to
        be the dense range 0..len(label_map)-1. When omitted, the notebook-level
        `label2id` is used.

    Returns
    -------
    numpy.ndarray
        1-D float32 array with 1.0 at the position of every known genre.

    Notes
    -----
    Genres missing from the mapping are skipped instead of raising `KeyError`.
    The vocabulary is built from the training split only, so another split may
    contain a genre the mapping has no position for.
    """
    mapping = label2id if label_map is None else label_map
    vec = np.zeros(len(mapping), dtype=np.float32)
    for genre in genres:
        position = mapping.get(genre)
        if position is not None:
            vec[position] = 1.0
    return vec

# Attach a multi-hot target vector to every row of the train and test frames.
df_train["labels"] = df_train["genres"].apply(encode_labels)
df_test["labels"] = df_test["genres"].apply(encode_labels)

# ========== STEP 2: Tokenize ==========
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

MAX_LENGTH = 256  # max_length passed to the tokenizer (truncation is enabled)
BATCH_SIZE = 8


def tokenize_synopses(frame):
    """Tokenize the `synopsis` column of `frame` into padded PyTorch tensors."""
    return tokenizer(
        frame["synopsis"].tolist(),
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=MAX_LENGTH,
    )


def stack_labels(frame):
    """Stack the per-row label vectors of `frame` into one 2-D tensor.

    The rows are combined with `np.stack` first: calling `torch.tensor` on a
    Python list of ndarrays converts element by element and emits a slow-path
    UserWarning.
    """
    return torch.from_numpy(np.stack(frame["labels"].to_list()))


train_encodings = tokenize_synopses(df_train)
test_encodings = tokenize_synopses(df_test)

train_labels = stack_labels(df_train)
test_labels = stack_labels(df_test)

# ========== STEP 3: Create Dataloaders ==========
train_dataset = TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], train_labels)
test_dataset = TensorDataset(test_encodings["input_ids"], test_encodings["attention_mask"], test_labels)

# Only the training loader shuffles; the test loader keeps the row order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# ========== STEP 4: Load Model ==========
# Classification-head settings layered on top of the pretrained checkpoint's
# configuration; the label maps are stored in the config alongside the head size.
classifier_settings = {
    "num_labels": num_labels,
    "problem_type": "multi_label_classification",
    "id2label": id2label,
    "label2id": label2id,
}
config = AutoConfig.from_pretrained(model_ckpt, **classifier_settings)

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
model = model.to(device)

# ========== STEP 5: Optimizer ==========
# The loss is applied to the raw logits with one independent sigmoid per label.
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# ========== STEP 6: Training Loop ==========
num_epochs = 4


def train_one_epoch(loader):
    """Run one optimisation pass over `loader` and return the mean batch loss.

    Uses the notebook-level `model`, `optimizer`, `loss_fn` and `device`.
    """
    model.train()
    running_loss = 0.0
    for batch in tqdm(loader, desc="Training"):
        input_ids, attention_mask, labels = [t.to(device) for t in batch]

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(loader)


def evaluate_model(loader, threshold=0.5):
    """Predict every batch in `loader` without tracking gradients.

    Parameters
    ----------
    loader : iterable of (input_ids, attention_mask, labels) batches
    threshold : float
        A label is predicted when its sigmoid probability exceeds this value.

    Returns
    -------
    (y_true, y_pred) : tuple of numpy.ndarray
        Row-aligned 2-D arrays of reference labels and boolean predictions.
    """
    model.eval()
    pred_chunks = []
    label_chunks = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids, attention_mask, labels = [t.to(device) for t in batch]

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            preds = torch.sigmoid(logits) > threshold

            pred_chunks.append(preds.cpu().numpy())
            label_chunks.append(labels.cpu().numpy())
    return np.vstack(label_chunks), np.vstack(pred_chunks)


for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    avg_loss = train_one_epoch(train_loader)
    print(f"Train Loss: {avg_loss:.4f}")

    # ========== STEP 7: Evaluation ==========
    # NOTE(review): per-epoch scores are computed on the test split, while the
    # validation split (df_val) loaded earlier is never used. Consider scoring
    # on a validation loader here and keeping the test split for a final check.
    y_true, y_pred = evaluate_model(test_loader)

    f1 = f1_score(y_true, y_pred, average="samples")
    jaccard = jaccard_score(y_true, y_pred, average="samples")
    # Hit rate: share of samples with at least one correctly predicted genre.
    hits = (np.logical_and(y_true, y_pred).sum(axis=1) > 0).mean()

    print(f"F1 (samples): {f1:.4f} | Jaccard: {jaccard:.4f} | Hit Rate: {hits:.4f}")


Using device: cuda


  train_labels = torch.tensor(list(df_train["labels"].values))
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/4


Training: 100%|██████████| 1150/1150 [12:44<00:00,  1.50it/s]


Train Loss: 0.2997


Evaluating: 100%|██████████| 141/141 [00:20<00:00,  6.79it/s]


F1 (samples): 0.3306 | Jaccard: 0.2505 | Hit Rate: 0.5865

Epoch 2/4


Training: 100%|██████████| 1150/1150 [08:10<00:00,  2.34it/s]


Train Loss: 0.2364


Evaluating: 100%|██████████| 141/141 [00:29<00:00,  4.79it/s]


F1 (samples): 0.4300 | Jaccard: 0.3365 | Hit Rate: 0.7090

Epoch 3/4


Training: 100%|██████████| 1150/1150 [08:07<00:00,  2.36it/s]


Train Loss: 0.2021


Evaluating: 100%|██████████| 141/141 [00:20<00:00,  6.79it/s]


F1 (samples): 0.4973 | Jaccard: 0.3919 | Hit Rate: 0.8004

Epoch 4/4


Training: 100%|██████████| 1150/1150 [07:50<00:00,  2.44it/s]


Train Loss: 0.1725


Evaluating: 100%|██████████| 141/141 [00:20<00:00,  6.78it/s]

F1 (samples): 0.4922 | Jaccard: 0.3854 | Hit Rate: 0.8048





In [7]:
def predict_genres(synopsis, threshold=0.5):
    """Predict genre names for one synopsis or for a batch of synopses.

    Parameters
    ----------
    synopsis : str or iterable of str
        A single synopsis, or several synopses to classify in one forward pass.
    threshold : float
        A genre is returned when its sigmoid probability exceeds this value.

    Returns
    -------
    list[str] when `synopsis` is a string; list[list[str]] (one inner list per
    input, in input order) otherwise. An empty iterable yields an empty list.

    Notes
    -----
    Uses the notebook-level `model`, `tokenizer` and `device`. Label names are
    read from `model.config.id2label`, so a checkpoint reloaded with
    `from_pretrained` carries its own label mapping and does not depend on the
    `id2label` dict built in the training cell.
    """
    single_input = isinstance(synopsis, str)
    texts = [synopsis] if single_input else list(synopsis)
    if not texts:
        return []

    model.eval()
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=256)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.sigmoid(outputs.logits).cpu().numpy()

    labels_by_id = model.config.id2label
    predictions = [
        [labels_by_id[i] for i, p in enumerate(row) if p > threshold]
        for row in probs
    ]
    # A bare string keeps the original flat-list return shape.
    return predictions[0] if single_input else predictions

# Example use: classify one hand-written synopsis at the default 0.5 cut-off
# and print the predicted genre names.
synopsis = "Student council president Miyuki Shirogane and vice-president Kaguya Shinomiya appear to be the perfect couple; but both are too proud to confess their love and scheme to make the other confess their love first."
genres = predict_genres(synopsis, threshold=0.5)
print("Genres:", genres)


Genres: ['Romance', 'School']


In [8]:
# Write the fine-tuned weights and the tokenizer files into the same directory
# so both can later be restored from one path.
save_dir = "anime-genre-model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('anime-genre-model\\tokenizer_config.json',
 'anime-genre-model\\special_tokens_map.json',
 'anime-genre-model\\vocab.txt',
 'anime-genre-model\\added_tokens.json',
 'anime-genre-model\\tokenizer.json')

In [11]:
# Restore the tokenizer and the model from the saved directory, then run
# a prediction to check that the reloaded checkpoint works.
checkpoint_dir = "anime-genre-model"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
model = model.to(device)

synopsis = "The year is 2052—an era of unprecedented peace and prosperity prevails across the globe. The reason for this: mankind has been freed from sickness and pain. Nobel Prize winning neuroscientist Dr. Skinner has developed a miracle cure-all drug with no apparent drawbacks called Hapuna. Hapuna soon becomes ubiquitous... and essential. However, soon after Hapuna is officially introduced, Dr. Skinner vanishes."
genres = predict_genres(synopsis, threshold=0.5)
print("Genres:", genres)


Genres: ['Sci-Fi']
