In [2]:
from my_import import *

from datasets import load_from_disk

# Hugging Face datasets for each split, read back from disk.
train_dataset = load_from_disk("path/to/train_dataset")
val_dataset = load_from_disk("path/to/val_dataset")
test_dataset = load_from_disk("path/to/test_dataset")

# Tabular copies of the splits, plus the full cleaned dataset.
df_train = pd.read_csv('df_train.csv')
df_val = pd.read_csv('df_val.csv')
df_test = pd.read_csv('df_test.csv')
df_full = pd.read_csv('final_cleaned_dataset_df.csv')

# The genres column comes back from CSV as strings, not lists.
# It has to be parsed back into real lists every time the dataset is re-exported.
df_train['genres'] = df_train['genres'].map(ast.literal_eval).map(list)
df_val['genres'] = df_val['genres'].map(ast.literal_eval).map(list)
df_test['genres'] = df_test['genres'].map(ast.literal_eval).map(list)

# Drop the columns that are not used further down.
df_train = df_train.drop(['title', 'index'], axis=1)
df_val = df_val.drop(['title', 'index'], axis=1)
df_test = df_test.drop(['title', 'index'], axis=1)

# Label vocabulary: the distinct genres of the training split in sorted order,
# with forward (genre -> index) and reverse (index -> genre) lookups.
all_genres = sorted({genre for genre_list in df_train["genres"] for genre in genre_list})
label2id = dict(zip(all_genres, range(len(all_genres))))
id2label = dict(enumerate(all_genres))
num_labels = len(label2id)

In [7]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Normalize a synopsis into a space-joined string of lemmas.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that are English stop words are discarded, and the
    remaining tokens are lemmatized.
    """
    kept = []
    for token in nltk.word_tokenize(text.lower()):
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)


# Add the cleaned text next to the raw synopsis in every split.
df_train['clean_synopsis'] = df_train['synopsis'].map(clean_text)
df_val['clean_synopsis'] = df_val['synopsis'].map(clean_text)
df_test['clean_synopsis'] = df_test['synopsis'].map(clean_text)

In [3]:
# One entry per (sample, genre) occurrence in the training split.
all_genres_comb = [genre for genre_list in df_train['genres'] for genre in genre_list]

# Occurrences per genre and the number of training samples.
genre_counts = Counter(all_genres_comb)
total_samples = df_train.shape[0]

# Prior probability of a genre = share of training samples tagged with it.
genre_priors = dict(
    (genre, count / total_samples) for genre, count in genre_counts.items()
)

# Print the priors alphabetically and collect them in that same order.
prior_threshold = []
for genre in sorted(genre_priors):
    prob = genre_priors[genre]
    print(f"{genre}: {prob:.4f}")
    prior_threshold.append(prob)
prior_threshold

Action: 0.3128
Adventure: 0.2180
Comedy: 0.4091
Drama: 0.2069
Fantasy: 0.2368
Historical: 0.0743
Kids: 0.0834
Mecha: 0.0763
Music: 0.0514
Mystery: 0.0666
Romance: 0.1678
School: 0.1360
Sci-Fi: 0.1993
Seinen: 0.0712
Shoujo: 0.0565
Shounen: 0.1603
Slice of Life: 0.1410
Sports: 0.0507
Super Power: 0.0500
Supernatural: 0.1277


[0.3127808988764045,
 0.21797752808988763,
 0.40912921348314607,
 0.20688202247191012,
 0.23679775280898877,
 0.07429775280898876,
 0.08342696629213484,
 0.07626404494382022,
 0.05140449438202247,
 0.06657303370786517,
 0.1678370786516854,
 0.13595505617977527,
 0.19929775280898876,
 0.07120786516853933,
 0.056460674157303374,
 0.16025280898876404,
 0.14101123595505619,
 0.050702247191011234,
 0.05,
 0.12766853932584268]

In [5]:
from sklearn.metrics import confusion_matrix, matthews_corrcoef

def optimize_thresholds_with_precision_constraint(probs, labels, thresholds=np.linspace(0, 1, 1000), min_precision=0.5):
    """Pick, per label column, the threshold with the best F1 among those meeting a precision floor.

    Args:
        probs: 2-D array of scores; one column per label.
        labels: 2-D array of ground-truth indicators, indexed like ``probs``.
        thresholds: Candidate cut-offs tried for every label.
        min_precision: A candidate is only accepted if its precision is at least this value.

    Returns:
        List with one threshold per label. A label keeps the fallback 0.5
        when no candidate both satisfies the precision floor and yields F1 > 0.
    """
    best_thresholds = []
    for label_idx in range(probs.shape[1]):
        scores = probs[:, label_idx]
        truth = labels[:, label_idx]
        best_f1, best_thresh = 0, 0.5

        for cutoff in thresholds:
            pred = (scores >= cutoff).astype(int)
            precision = precision_score(truth, pred, zero_division=0)
            f1 = f1_score(truth, pred, zero_division=0)
            # Only a strict F1 improvement that also respects the precision floor counts.
            if not (f1 > best_f1 and precision >= min_precision):
                continue
            best_f1, best_thresh = f1, cutoff

        best_thresholds.append(best_thresh)
        print(f"Label {label_idx}: Best threshold = {best_thresh:.2f}, F1 = {best_f1:.4f}")
    return best_thresholds

def compute_tpr_tnr(y_true, y_pred):
    """Return ``(tpr, tnr)`` for binary 0/1 labels.

    A rate whose denominator is zero (no positives, or no negatives, in
    ``y_true``) is reported as 0.0.
    """
    (tn, fp), (fn, tp) = confusion_matrix(y_true, y_pred, labels=[0, 1])

    positives = tp + fn
    if positives > 0:
        tpr = tp / positives
    else:
        tpr = 0.0

    negatives = tn + fp
    if negatives > 0:
        tnr = tn / negatives
    else:
        tnr = 0.0

    return tpr, tnr

def optimize_thresholds_with_mcc_constraint(probs, labels, thresholds=np.linspace(0, 1, 1000)):
    """Pick, per label column, the threshold that maximizes the Matthews correlation coefficient.

    Despite the name, no additional constraint is applied: the search is a
    plain arg-max of MCC over ``thresholds``.

    Args:
        probs: 2-D array of scores; one column per label.
        labels: 2-D array of ground-truth indicators, indexed like ``probs``.
        thresholds: Candidate cut-offs tried for every label.

    Returns:
        List with one threshold per label. The fallback 0.5 is kept only if
        no candidate produces an MCC greater than -1.
    """
    best_thresholds = []

    for i in range(probs.shape[1]):
        best_mcc = -1
        best_thresh = 0.5

        for t in thresholds:
            pred = (probs[:, i] >= t).astype(int)
            # Only MCC drives the selection; the TPR/TNR values that used to be
            # computed here were never read, so that per-threshold call is gone.
            mcc = matthews_corrcoef(labels[:, i], pred)
            if mcc > best_mcc:
                best_mcc = mcc
                best_thresh = t

        best_thresholds.append(best_thresh)
        print(f"Label {i}: Best threshold = {best_thresh:.2f}, MCC = {best_mcc:.4f}")

    return best_thresholds

# optimised_thresh=optimize_thresholds_with_precision_constraint(val_prob,val_labels)
# optimised_thresh=optimize_thresholds_with_mcc_constraint(val_prob,val_labels)

In [37]:
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score,
    hamming_loss, jaccard_score, roc_auc_score
)

# Fine-tuned checkpoint directory; both the tokenizer and the classifier are read from it.
model_path = "./7k_distilbert-base-uncased_batch12_LRcosinedecay0.1"

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Place the model on the chosen device and switch it to inference mode.
model = model.to(device).eval()

# Prediction and evaluation function
def predict_genres_with_id2label(dataset, df_source, id2label, threshold=0.5, batch_size=8):
    """Run the loaded model over ``dataset`` and score its multi-label predictions.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        dataset: Dataset whose collated batches expose ``input_ids``,
            ``attention_mask`` and ``labels``.
        df_source: DataFrame with ``synopsis`` and ``genres`` columns; its rows
            must line up one-to-one with ``dataset``.
        id2label: Mapping from label column index to genre name.
        threshold: Scalar cut-off, or a sequence of per-label cut-offs that
            broadcasts against the probability matrix.
        batch_size: Number of samples per inference batch.

    Returns:
        Tuple ``(result_df, metrics_df, probabilities, labels, predictions)``.

    Raises:
        ValueError: If ``df_source`` does not have one row per prediction.
    """
    data_collator = DataCollatorWithPadding(tokenizer)
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)

    all_probs = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="🔍 Predicting"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Independent sigmoid per label: this is a multi-label problem.
            probs = torch.sigmoid(outputs.logits).cpu().numpy()

            all_probs.append(probs)
            all_labels.append(labels)

    probabilities = np.vstack(all_probs)
    labels = np.vstack(all_labels)

    # np.asarray makes a plain list of per-label cut-offs broadcast across rows.
    predictions = (probabilities >= np.asarray(threshold)).astype(int)

    # Fail with a clear message before printing metrics; otherwise the mismatch
    # only surfaces when the result DataFrame is assembled.
    if len(df_source) != len(predictions):
        raise ValueError(
            f"df_source has {len(df_source)} rows but the dataset produced "
            f"{len(predictions)} predictions"
        )

    # Decode predicted genres using id2label
    predicted_genres = [
        [id2label[i] for i, val in enumerate(row) if val == 1]
        for row in predictions
    ]

    # === Calculate metrics ===
    # zero_division=0 gives the same scores as the default "warn" setting, but
    # without an UndefinedMetricWarning for labels that are never predicted.
    f1 = f1_score(labels, predictions, average='macro', zero_division=0)
    precision = precision_score(labels, predictions, average='macro', zero_division=0)
    recall = recall_score(labels, predictions, average='macro', zero_division=0)
    accuracy = accuracy_score(labels, predictions)
    hamming = hamming_loss(labels, predictions)
    jaccard = jaccard_score(labels, predictions, average='macro', zero_division=0)
    # Share of samples with at least one correctly predicted genre.
    hit_rate = (np.logical_and(labels, predictions).sum(axis=1) > 0).mean()
    try:
        roc_auc = roc_auc_score(labels, probabilities, average='macro')
    except ValueError:
        # roc_auc_score rejected the inputs; report NaN instead of aborting.
        roc_auc = np.nan

    # Print metrics
    print("\n📊 Evaluation Metrics:")
    metrics_df = pd.DataFrame({
        'Metric': [
            'F1 Score', 'Precision', 'Recall', 'Exact Match Accuracy',
            'Hamming Loss', 'Jaccard Score', 'Hit Rate', 'ROC AUC'
        ],
        'Value': [
            f1, precision, recall, accuracy,
            hamming, jaccard, hit_rate, roc_auc
        ]
    })
    print(metrics_df)

    # Build result DataFrame
    result_df = pd.DataFrame({
        "synopsis": df_source["synopsis"].values,
        "true_genres": df_source["genres"].values,
        "predicted_genres": predicted_genres
    })

    return result_df, metrics_df, probabilities, labels, predictions

# Run for each dataset
# thresholds=[0.45000000000000007,
#  0.30000000000000004,
#  0.25,
#  0.45000000000000007,
#  0.30000000000000004,
#  0.25,
#  0.30000000000000004,
#  0.35,
#  0.25,
#  0.30000000000000004,
#  0.45000000000000007,
#  0.4,
#  0.4,
#  0.30000000000000004,
#  0.2,
#  0.4,
#  0.45000000000000007,
#  0.2,
#  0.5,
#  0.30000000000000004]
# --- Fixed 0.5 cut-off: validation and test (train evaluation is disabled) ---
print("0.5 threshoold")
thresholds = 0.5
# df_train_results, train_metrics, train_prob, train_labels, train_pred = predict_genres_with_id2label(train_dataset, df_train, id2label, threshold=thresholds)
df_val_results, val_metrics, val_prob, val_labels, val_pred = predict_genres_with_id2label(
    dataset=val_dataset, df_source=df_val, id2label=id2label, threshold=thresholds
)
df_test_results, test_metrics, test_prob, test_labels, test_pred = predict_genres_with_id2label(
    dataset=test_dataset, df_source=df_test, id2label=id2label, threshold=thresholds
)

# --- Per-genre training priors as cut-offs: test only ---
# Each call below repeats inference on the test split and overwrites the test_* results.
print("prior_threshold")
thresholds = prior_threshold
df_test_results, test_metrics, test_prob, test_labels, test_pred = predict_genres_with_id2label(
    dataset=test_dataset, df_source=df_test, id2label=id2label, threshold=thresholds
)

# --- F1-optimized cut-offs with a precision floor (disabled) ---
# print("F1 constraint")
# optimised_thresh = optimize_thresholds_with_precision_constraint(val_prob, val_labels)
# df_test_results, test_metrics, test_prob, test_labels, test_pred = predict_genres_with_id2label(test_dataset, df_test, id2label, threshold=optimised_thresh)

# --- MCC-optimized cut-offs: tuned on validation, applied to test ---
print("MCC constraint")
optimised_thresh = optimize_thresholds_with_mcc_constraint(probs=val_prob, labels=val_labels)
df_test_results, test_metrics, test_prob, test_labels, test_pred = predict_genres_with_id2label(
    dataset=test_dataset, df_source=df_test, id2label=id2label, threshold=optimised_thresh
)

0.5 threshoold


🔍 Predicting: 100%|██████████| 112/112 [00:20<00:00,  5.51it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



📊 Evaluation Metrics:
                 Metric     Value
0              F1 Score  0.417524
1             Precision  0.585934
2                Recall  0.342807
3  Exact Match Accuracy  0.094170
4          Hamming Loss  0.112500
5         Jaccard Score  0.289048
6              Hit Rate  0.770179
7               ROC AUC  0.849290


🔍 Predicting: 100%|██████████| 112/112 [00:19<00:00,  5.83it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



📊 Evaluation Metrics:
                 Metric     Value
0              F1 Score  0.426427
1             Precision  0.599033
2                Recall  0.348882
3  Exact Match Accuracy  0.089787
4          Hamming Loss  0.109933
5         Jaccard Score  0.297562
6              Hit Rate  0.755331
7               ROC AUC  0.855030
prior_threshold


🔍 Predicting: 100%|██████████| 112/112 [00:19<00:00,  5.77it/s]



📊 Evaluation Metrics:
                 Metric     Value
0              F1 Score  0.472842
1             Precision  0.362804
2                Recall  0.781380
3  Exact Match Accuracy  0.005612
4          Hamming Loss  0.221717
5         Jaccard Score  0.322181
6              Hit Rate  0.940516
7               ROC AUC  0.855030
MCC constraint
Label 0: Best threshold = 0.46, MCC = 0.5580
Label 1: Best threshold = 0.30, MCC = 0.4837
Label 2: Best threshold = 0.31, MCC = 0.4384
Label 3: Best threshold = 0.27, MCC = 0.3254
Label 4: Best threshold = 0.33, MCC = 0.5585
Label 5: Best threshold = 0.26, MCC = 0.5072
Label 6: Best threshold = 0.14, MCC = 0.4819
Label 7: Best threshold = 0.34, MCC = 0.5994
Label 8: Best threshold = 0.28, MCC = 0.6443
Label 9: Best threshold = 0.33, MCC = 0.4540
Label 10: Best threshold = 0.46, MCC = 0.5488
Label 11: Best threshold = 0.37, MCC = 0.6012
Label 12: Best threshold = 0.63, MCC = 0.6110
Label 13: Best threshold = 0.04, MCC = 0.1627
Label 14: Best thresho

🔍 Predicting: 100%|██████████| 112/112 [00:19<00:00,  5.76it/s]



📊 Evaluation Metrics:
                 Metric     Value
0              F1 Score  0.526741
1             Precision  0.526066
2                Recall  0.625889
3  Exact Match Accuracy  0.014590
4          Hamming Loss  0.172447
5         Jaccard Score  0.372431
6              Hit Rate  0.894501
7               ROC AUC  0.855030


In [39]:
def predict_new_synopses(new_synopses, id2label, threshold=0.5):
    """Predict genres for free-text synopses with the notebook-level model.

    Args:
        new_synopses: Synopsis texts to classify.
        id2label: Mapping from label column index to genre name.
        threshold: Scalar cut-off, or per-label cut-offs, applied to the
            sigmoid probabilities.

    Returns:
        DataFrame with ``synopsis`` and ``predicted_genres`` columns; the same
        frame is also printed.
    """
    model.eval()

    # Pad to the longest text in the batch and cut anything beyond 512 tokens.
    encoded = tokenizer(
        new_synopses,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        logits = model(**encoded).logits
        probs = torch.sigmoid(logits).cpu().numpy()

    # 1 where the probability reaches the cut-off, 0 elsewhere.
    predictions = (probs >= threshold).astype(int)

    # Translate every row of 0/1 flags into the matching genre names.
    predicted_genres = [
        [id2label[idx] for idx, flag in enumerate(row) if flag == 1]
        for row in predictions
    ]

    result_df = pd.DataFrame({
        "synopsis": new_synopses,
        "predicted_genres": predicted_genres
    })

    print("\n🎬 Predicted Genres for New Synopses:")
    print(result_df)

    return result_df

# Three hand-written synopses to sanity-check the classifier.
new_synopses = [
    "A young boy discovers his hidden powers and joins a group of heroes to fight evil forces.",
    "A romantic comedy where two strangers meet at a cafe and end up falling in love despite their differences.",
    "Love girl school romance",
]

# Classify them with the MCC-tuned per-label thresholds and show the table.
predicted_df = predict_new_synopses(
    new_synopses=new_synopses,
    id2label=id2label,
    threshold=optimised_thresh,
)
display(predicted_df)



🎬 Predicted Genres for New Synopses:
                                            synopsis  \
0  A young boy discovers his hidden powers and jo...   
1  A romantic comedy where two strangers meet at ...   
2                           Love girl school romance   

                                    predicted_genres  
0  [Action, Adventure, Fantasy, Kids, Shounen, Su...  
1                   [Comedy, Drama, Romance, Seinen]  
2                   [Comedy, Drama, Romance, Seinen]  


Unnamed: 0,synopsis,predicted_genres
0,A young boy discovers his hidden powers and jo...,"[Action, Adventure, Fantasy, Kids, Shounen, Su..."
1,A romantic comedy where two strangers meet at ...,"[Comedy, Drama, Romance, Seinen]"
2,Love girl school romance,"[Comedy, Drama, Romance, Seinen]"


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model and tokenizer
# NOTE(review): this rebinds the notebook-level `tokenizer` and `model`, which
# predict_genres_with_id2label and predict_new_synopses read as globals. The
# freshly loaded model is not moved to `device` here — confirm that is intended
# before re-running the evaluation cells after this one.
model_name = "./7k_distilbert-base-uncased_batch12_LRcosinedecay0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict(text):
    """Return the per-label sigmoid probabilities for one text as a 1-D NumPy array.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: The text to score.

    Returns:
        1-D array with one probability per label.
    """
    # Truncate like predict_new_synopses does (512 tokens) so long synopses do
    # not exceed the limit, and put the tensors on whatever device the model
    # currently lives on.
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=512
    ).to(model.device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.sigmoid(outputs.logits).cpu().numpy()[0]
    return probs

def word_importance(sentence):
    """Print a leave-one-word-out importance ranking for ``sentence``.

    Each whitespace-separated word is removed in turn; its score is the mean
    absolute change in the predicted label probabilities. Nothing is returned.
    """
    baseline = predict(sentence)
    tokens = sentence.split()

    ranking = []
    for position in range(len(tokens)):
        # Sentence with only the word at `position` left out.
        reduced = tokens[:position] + tokens[position + 1:]
        shifted = predict(" ".join(reduced))

        # Mean absolute probability change across all labels.
        delta = abs(baseline - shifted)
        ranking.append((tokens[position], delta.mean()))

    # Largest impact first.
    ranking = sorted(ranking, key=lambda pair: pair[1], reverse=True)

    print("Word Importance Ranking:")
    for token, score in ranking:
        print(f"{token}: {score:.4f}")

# Example usage
# NOTE(review): the output saved with this cell lists words such as "evil" and
# "battles" that do not occur in this sentence, so it appears to come from an
# earlier run — re-execute the cell to refresh it.
sentence = "A young hero goes on an adventure at a far away land to find the lost infinity stones that can help save the world."
word_importance(sentence)



Word Importance Ranking:
evil: 0.0377
hero: 0.0343
battles: 0.0261
forces.: 0.0189
A: 0.0160
young: 0.0148


In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

# Load model and tokenizer
# NOTE(review): this rebinds the notebook-level `tokenizer` and `model`, which
# predict_genres_with_id2label and predict_new_synopses read as globals. The
# freshly loaded model is not moved to `device` here — confirm that is intended
# before re-running the evaluation cells after this one.
model_name = "./7k_distilbert-base-uncased_batch12_LRcosinedecay0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict(text):
    """Return the per-label sigmoid probabilities for one text as a 1-D NumPy array.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: The text to score.

    Returns:
        1-D array with one probability per label.
    """
    # Truncate like predict_new_synopses does (512 tokens) so long synopses do
    # not exceed the limit, and put the tensors on whatever device the model
    # currently lives on.
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=512
    ).to(model.device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.sigmoid(outputs.logits).cpu().numpy()[0]
    return probs

def word_importance(sentence):
    """Print a normalized leave-one-word-out importance ranking for ``sentence``.

    Each whitespace-separated word is removed in turn; its score is the mean,
    over labels, of the absolute probability change divided by the original
    probability (plus a small epsilon). Nothing is returned.
    """
    epsilon = 1e-6  # keeps the division finite when a baseline probability is ~0
    baseline = predict(sentence)
    tokens = sentence.split()

    ranking = []
    for position, token in enumerate(tokens):
        # Sentence with only the word at `position` left out.
        remaining = tokens[:position] + tokens[position + 1:]
        ablated = predict(" ".join(remaining))

        # Change relative to the baseline probability of each label.
        relative_shift = np.abs(baseline - ablated) / (baseline + epsilon)
        ranking.append((token, relative_shift.mean()))

    # Largest impact first.
    ranking = sorted(ranking, key=lambda pair: pair[1], reverse=True)

    print("Normalized Word Importance Ranking:")
    for token, score in ranking:
        print(f"{token}: {score:.4f}")

# Example usage: rank the words of a multi-sentence synopsis by normalized impact.
sentence = "A Chinese prince meets a regular civilian during his travels and quickly falls in love. They marry and have a daughter, but the prince is soon separated from them during a rebellion. Many years later, the prince has become emperor and locates his lost daughter. She comes to live with him, but unfortunately has picked up several unrefined habits during her times as a civilian."
word_importance(sentence)


Normalized Word Importance Ranking:
habits: 0.8353
love.: 0.2241
prince: 0.2183
civilian.: 0.2004
travels: 0.1911
rebellion.: 0.1905
unrefined: 0.1758
Chinese: 0.1750
regular: 0.1708
prince: 0.1673
emperor: 0.1589
his: 0.1540
prince: 0.1427
civilian: 0.1347
unfortunately: 0.1214
marry: 0.1176
picked: 0.1122
A: 0.1107
and: 0.1006
a: 0.0970
meets: 0.0959
years: 0.0952
locates: 0.0937
daughter.: 0.0927
the: 0.0917
a: 0.0881
They: 0.0878
them: 0.0844
soon: 0.0841
during: 0.0825
live: 0.0818
comes: 0.0809
and: 0.0806
up: 0.0799
quickly: 0.0771
during: 0.0743
a: 0.0738
several: 0.0731
his: 0.0714
the: 0.0712
from: 0.0700
falls: 0.0694
but: 0.0691
her: 0.0656
separated: 0.0647
with: 0.0640
and: 0.0638
in: 0.0636
She: 0.0624
during: 0.0616
but: 0.0609
as: 0.0595
daughter,: 0.0569
to: 0.0558
become: 0.0554
has: 0.0553
a: 0.0502
has: 0.0493
have: 0.0481
Many: 0.0471
him,: 0.0445
is: 0.0364
lost: 0.0338
times: 0.0336
later,: 0.0322


In [26]:
# Tune per-label cut-offs on the validation probabilities, then re-score the test split with them.
optimised_thresh = optimize_thresholds_with_mcc_constraint(probs=val_prob, labels=val_labels)
df_test_results, test_metrics, test_prob, test_labels, test_pred = predict_genres_with_id2label(
    dataset=test_dataset,
    df_source=df_test,
    id2label=id2label,
    threshold=optimised_thresh,
)


Label 0: Best threshold = 0.46, MCC = 0.5580
Label 1: Best threshold = 0.30, MCC = 0.4837
Label 2: Best threshold = 0.31, MCC = 0.4384
Label 3: Best threshold = 0.27, MCC = 0.3254
Label 4: Best threshold = 0.33, MCC = 0.5585
Label 5: Best threshold = 0.26, MCC = 0.5072
Label 6: Best threshold = 0.14, MCC = 0.4819
Label 7: Best threshold = 0.34, MCC = 0.5994
Label 8: Best threshold = 0.28, MCC = 0.6443
Label 9: Best threshold = 0.33, MCC = 0.4540
Label 10: Best threshold = 0.46, MCC = 0.5488
Label 11: Best threshold = 0.37, MCC = 0.6012
Label 12: Best threshold = 0.63, MCC = 0.6110
Label 13: Best threshold = 0.04, MCC = 0.1627
Label 14: Best threshold = 0.22, MCC = 0.3936
Label 15: Best threshold = 0.11, MCC = 0.3047
Label 16: Best threshold = 0.29, MCC = 0.4146
Label 17: Best threshold = 0.22, MCC = 0.8024
Label 18: Best threshold = 0.04, MCC = 0.1954
Label 19: Best threshold = 0.30, MCC = 0.5757


🔍 Predicting: 100%|██████████| 112/112 [00:46<00:00,  2.42it/s]



📊 Evaluation Metrics:
                 Metric     Value
0              F1 Score  0.526741
1             Precision  0.526066
2                Recall  0.625889
3  Exact Match Accuracy  0.014590
4          Hamming Loss  0.172447
5         Jaccard Score  0.372431
6              Hit Rate  0.894501
7               ROC AUC  0.855030


In [42]:
# Only the test-split predictions are shown; the train/validation views are disabled.
# display(df_train_results)
# display(df_val_results)
display(df_test_results)

# Persist the test predictions next to the notebook.
df_test_results.to_csv(path_or_buf="Predict transformers.csv")

Unnamed: 0,synopsis,true_genres,predicted_genres
0,The daily life of the Motsumoto family. The th...,"[Comedy, Slice of Life]","[Comedy, Seinen, Slice of Life]"
1,"The Soul Tree, the great source of our race. ...","[Action, Adventure, Kids]","[Action, Adventure, Fantasy]"
2,"Fairies living in a fluffy forest, where both ...","[Fantasy, Kids]","[Comedy, Fantasy, Kids]"
3,A cyborg warrior from an ancient Antarctic kin...,"[Action, Mecha, Sci-Fi]","[Action, Adventure, Mecha, Sci-Fi, Shounen, Su..."
4,The 2018 LINE sticker set Poccolies is inspiri...,"[Kids, Slice of Life]","[Comedy, Kids]"
...,...,...,...
886,A series of four animated prequels to Street F...,[Action],"[Action, Drama, Seinen]"
887,"According to HMV, the DVD & Blu-ray volume 7 o...","[Comedy, Seinen, Sports]","[Action, Comedy, Seinen, Shounen, Super Power,..."
888,"Lucius ends up in a space station, where he ex...","[Comedy, Seinen]","[Adventure, Drama, Sci-Fi]"
889,"The ""Megumi and Taiyou"" commercial depicts the...","[Romance, Slice of Life]",[Drama]


In [13]:
# Print the metric tables of the three splits one after another.
# NOTE(review): `train_metrics` is only assigned by the commented-out train
# evaluation call in the evaluation cell, so this cell raises NameError on a
# fresh kernel unless that call is re-enabled.
print("\n📊 Evaluation Metrics:")
print(train_metrics)
print("\n📊 Evaluation Metrics:")
print(val_metrics)
print("\n📊 Evaluation Metrics:")
print(test_metrics)


📊 Evaluation Metrics:
                 Metric     Value
0              F1 Score  0.551047
1             Precision  0.515300
2                Recall  0.592124
3  Exact Match Accuracy  0.049719
4          Hamming Loss  0.139705
5         Jaccard Score  0.380307
6              Hit Rate  0.901404
7               ROC AUC  0.879735

📊 Evaluation Metrics:
                 Metric     Value
0              F1 Score  0.515501
1             Precision  0.485239
2                Recall  0.549789
3  Exact Match Accuracy  0.040359
4          Hamming Loss  0.150673
5         Jaccard Score  0.347256
6              Hit Rate  0.857623
7               ROC AUC  0.846807

📊 Evaluation Metrics:
                 Metric     Value
0              F1 Score  0.492473
1             Precision  0.458765
2                Recall  0.531528
3  Exact Match Accuracy  0.032548
4          Hamming Loss  0.158923
5         Jaccard Score  0.326676
6              Hit Rate  0.843996
7               ROC AUC  0.845878


In [12]:

from sklearn.metrics import classification_report

# Per-genre precision / recall / F1 for the validation and test predictions.
# zero_division=0 reports the same 0.0 scores as the default "warn" setting, but
# without an UndefinedMetricWarning for genres that are never predicted.
#print(classification_report(train_labels, train_pred, target_names=all_genres, zero_division=0))
print(classification_report(val_labels, val_pred, target_names=all_genres, zero_division=0))
print(classification_report(test_labels, test_pred, target_names=all_genres, zero_division=0))


               precision    recall  f1-score   support

       Action       0.67      0.72      0.70       278
    Adventure       0.52      0.68      0.59       205
       Comedy       0.59      0.83      0.69       364
        Drama       0.50      0.32      0.39       184
      Fantasy       0.55      0.77      0.64       214
   Historical       0.63      0.56      0.59        66
         Kids       0.56      0.44      0.49        75
        Mecha       0.53      0.74      0.61        69
        Music       0.82      0.59      0.68        46
      Mystery       0.52      0.50      0.51        60
      Romance       0.58      0.61      0.59       144
       School       0.61      0.73      0.66       121
       Sci-Fi       0.63      0.75      0.69       178
       Seinen       1.00      0.01      0.03        70
       Shoujo       0.58      0.19      0.29        57
      Shounen       0.51      0.26      0.34       142
Slice of Life       0.50      0.49      0.49       125
       Sp

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# average=None returns one ROC AUC per label column instead of a single aggregate.
roc_auc_per_genre = roc_auc_score(y_true=test_labels, y_score=test_prob, average=None)

# Pair each score with its genre name (all_genres is in label-index order).
for label, auc in zip(all_genres, roc_auc_per_genre):
    print("{}: ROC AUC = {:.4f}".format(label, auc))

Action: ROC AUC = 0.8736
Adventure: ROC AUC = 0.8442
Comedy: ROC AUC = 0.7933
Drama: ROC AUC = 0.7605
Fantasy: ROC AUC = 0.8726
Historical: ROC AUC = 0.9041
Kids: ROC AUC = 0.8712
Mecha: ROC AUC = 0.8997
Music: ROC AUC = 0.9267
Mystery: ROC AUC = 0.8861
Romance: ROC AUC = 0.8390
School: ROC AUC = 0.8887
Sci-Fi: ROC AUC = 0.9099
Seinen: ROC AUC = 0.6816
Shoujo: ROC AUC = 0.8106
Shounen: ROC AUC = 0.7651
Slice of Life: ROC AUC = 0.8637
Sports: ROC AUC = 0.9730
Super Power: ROC AUC = 0.8379
Supernatural: ROC AUC = 0.8990


In [15]:
# Single-column frame of the per-genre ROC AUC values, in label-index order.
df = pd.DataFrame(data=roc_auc_per_genre)
display(df)

# Export rounded to two decimals. Rows are written without an index, so the
# file holds the scores only, not the genre names.
df.to_csv(path_or_buf='transformers.csv', float_format='%.2f', index=False)

Unnamed: 0,0
0,0.873586
1,0.844208
2,0.793346
3,0.760531
4,0.872572
5,0.904114
6,0.871233
7,0.899673
8,0.926679
9,0.886082


In [40]:
# Training rows for which 'Super Power' is among the predicted genres.
# NOTE(review): `df_train_results` is only assigned by the commented-out train
# evaluation call, so this cell raises NameError on a fresh kernel.
display(df_train_results[df_train_results['predicted_genres'].apply(lambda x: 'Super Power' in x)])

Unnamed: 0,synopsis,true_genres,predicted_genres
6062,A retelling of Dragon Ball's origin with a dif...,"[Action, Adventure, Comedy, Fantasy, Sci-Fi, S...","[Action, Adventure, Comedy, Fantasy, Sci-Fi, S..."


In [24]:
# Inspect the label-index -> genre-name mapping.
id2label

{0: 'Action',
 1: 'Adventure',
 2: 'Comedy',
 3: 'Drama',
 4: 'Fantasy',
 5: 'Historical',
 6: 'Kids',
 7: 'Mecha',
 8: 'Music',
 9: 'Mystery',
 10: 'Romance',
 11: 'School',
 12: 'Sci-Fi',
 13: 'Seinen',
 14: 'Shoujo',
 15: 'Shounen',
 16: 'Slice of Life',
 17: 'Sports',
 18: 'Super Power',
 19: 'Supernatural'}

In [44]:

# Test rows side by side with the model's per-label probabilities; the
# integer-named columns are the label indices.
obs = pd.concat(objs=[df_test, pd.DataFrame(data=test_prob)], axis="columns")

# Keep only the samples whose true genres include 'Super Power'.
display(obs.loc[obs['genres'].map(lambda genres: 'Super Power' in genres)])


Unnamed: 0,synopsis,genres,0,1,2,3,4,5,6,7,...,10,11,12,13,14,15,16,17,18,19
45,G.G. Bonds enters the realm of dreams to stop ...,"[Kids, Sci-Fi, Super Power]",0.663281,0.707267,0.263706,0.072617,0.749556,0.024102,0.284564,0.131971,...,0.045903,0.029595,0.552349,0.03279,0.034693,0.240885,0.020868,0.020348,0.185164,0.098412
61,"The anime will be a ""surreal comedy"" that emph...","[Comedy, Super Power]",0.52103,0.167736,0.86567,0.051174,0.025231,0.025825,0.024388,0.044723,...,0.06515,0.039438,0.288733,0.077824,0.011326,0.396796,0.024678,0.021006,0.061839,0.050297
62,Attempting to make GG Bond the partner for Iro...,"[Kids, Sci-Fi, Super Power]",0.726634,0.623016,0.257222,0.218827,0.201862,0.027583,0.078414,0.395652,...,0.1489,0.056064,0.93329,0.047952,0.029621,0.316504,0.020557,0.020528,0.232219,0.055638
80,Will G.G. Bond be able to save Fairy World whe...,"[Kids, Sci-Fi, Super Power]",0.59346,0.619906,0.505294,0.055446,0.713423,0.023895,0.227496,0.057773,...,0.068822,0.046098,0.285189,0.031364,0.034101,0.338254,0.023091,0.017202,0.185768,0.090271
81,Nano Invaders follows the eye-popping adventur...,"[Action, Adventure, Shounen, Super Power]",0.1685,0.444846,0.682921,0.032812,0.330417,0.010815,0.135602,0.039647,...,0.04137,0.012427,0.400116,0.015774,0.029976,0.103777,0.03099,0.003614,0.029226,0.025301
115,Hirono Kenta is your average geeky junior high...,"[Action, Comedy, Romance, School, Super Power]",0.844306,0.590395,0.590975,0.075228,0.843451,0.02114,0.020945,0.034035,...,0.159471,0.101895,0.202523,0.030061,0.056374,0.551343,0.01149,0.010876,0.201774,0.329518
116,This gag comedy series focused on the adventur...,"[Comedy, Sci-Fi, Shounen, Super Power]",0.077084,0.153079,0.928644,0.013399,0.065989,0.016168,0.332006,0.018026,...,0.018825,0.026049,0.067804,0.06004,0.014092,0.169303,0.114385,0.024827,0.028477,0.024701
139,The Garadain Empire has exhausted the primary ...,"[Action, Mecha, Sci-Fi, Super Power]",0.82431,0.447577,0.080146,0.332158,0.054147,0.034697,0.029006,0.760134,...,0.08786,0.029959,0.95828,0.051405,0.013656,0.178779,0.014442,0.02388,0.078563,0.023833
141,Nanako is a young girl who unexpectedly acquir...,"[Comedy, Romance, Super Power]",0.09965,0.116986,0.49103,0.325557,0.081614,0.060507,0.023749,0.013527,...,0.245418,0.067136,0.127648,0.159204,0.123743,0.111246,0.127096,0.008137,0.065432,0.45136
171,After escaping a bus hijacking with the help o...,"[Comedy, Sci-Fi, Super Power, Supernatural]",0.170837,0.058122,0.813003,0.089844,0.100662,0.014151,0.00822,0.008742,...,0.226562,0.063457,0.130494,0.068523,0.056458,0.156328,0.041073,0.003264,0.044419,0.396869


In [20]:
# NOTE(review): `prob` is not assigned anywhere in the visible notebook, so
# this cell depends on leftover kernel state — confirm which variable was meant.
prob

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.189228,0.044054,0.386391,0.051000,0.277327,0.050243,0.033583,0.001554,0.009934,0.022480,0.050698,0.037957,0.008199,0.025480,0.026420,0.074520,0.035108,0.004645,0.020307,0.440925
1,0.324529,0.572769,0.106856,0.046828,0.686594,0.032926,0.204487,0.027330,0.016130,0.008468,0.013098,0.006150,0.143250,0.014191,0.010580,0.041499,0.014987,0.005692,0.025756,0.026076
2,0.049240,0.105725,0.968567,0.051954,0.056790,0.011804,0.041912,0.007780,0.005220,0.053840,0.064715,0.044282,0.106565,0.117988,0.018083,0.186150,0.193840,0.004472,0.032082,0.061187
3,0.128654,0.415124,0.036904,0.436235,0.178163,0.089529,0.183013,0.112099,0.048500,0.068455,0.028694,0.009538,0.543586,0.037757,0.022116,0.031354,0.048627,0.013473,0.024290,0.028815
4,0.446903,0.456485,0.399449,0.088408,0.101008,0.010899,0.162823,0.584253,0.030186,0.013591,0.035995,0.028534,0.890235,0.042499,0.011675,0.186851,0.032842,0.026075,0.065842,0.009373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7115,0.134180,0.112552,0.306784,0.441100,0.411875,0.123365,0.048619,0.032923,0.247250,0.035705,0.739579,0.311053,0.059391,0.083019,0.386229,0.095430,0.239557,0.042325,0.035795,0.166650
7116,0.018437,0.039653,0.105136,0.512189,0.351195,0.083268,0.051431,0.005113,0.167191,0.069928,0.256247,0.109433,0.037396,0.043489,0.324474,0.022790,0.340423,0.011536,0.012292,0.316496
7117,0.812504,0.509887,0.694502,0.120855,0.056722,0.052954,0.056984,0.275701,0.006911,0.049960,0.051577,0.031769,0.600018,0.141380,0.009649,0.541420,0.027861,0.019782,0.131327,0.041555
7118,0.661058,0.169973,0.741584,0.078496,0.056980,0.022723,0.074123,0.053872,0.016841,0.032788,0.033724,0.162324,0.114255,0.102010,0.012383,0.726588,0.047558,0.507971,0.124508,0.049842


In [19]:
# Inspect the training frame.
df_train

Unnamed: 0,synopsis,genres
0,Shuramaru is hated and feared by the villagers...,[Supernatural]
1,"Sometime in the future, the world was complete...","[Action, Adventure]"
2,"Set in 2014, the anime follows the adventures ...",[Comedy]
3,"This story is about Mick, a sleeping chironomi...",[Adventure]
4,The anime is based on MegaHouse's line of Zigu...,"[Action, Mecha]"
...,...,...
7115,Picture Drama episodes included in each DVD vo...,"[Action, Adventure, Comedy, Drama, Fantasy, Ro..."
7116,"On a hot summer day, Takashi Natsume and his f...","[Drama, Shoujo, Slice of Life, Supernatural]"
7117,"Due to the arrival of aliens called the ""Amant...","[Action, Comedy, Historical, Mecha, Sci-Fi, Sh..."
7118,The Konohagakure Grand Sports Festival has beg...,"[Action, Comedy, Fantasy, Shounen, Sports]"
