In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
import ast
import nltk
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
df_train = pd.read_csv('df_train.csv')
df_val = pd.read_csv('df_val.csv')
df_test = pd.read_csv('df_test.csv')

for df in [df_train, df_val, df_test]:
    df['genres'] = df['genres'].apply(lambda x: list(ast.literal_eval(x)))

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t.isalpha() and t not in stop_words]
    return ' '.join(tokens)

for df in [df_train, df_val, df_test]:
    df['clean_synopsis'] = df['synopsis'].apply(clean_text)


In [5]:
tokenized_texts = [text.split() for text in df_train['clean_synopsis']]
w2v_model = Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=2, workers=4)

word2idx = {word: i+1 for i, word in enumerate(w2v_model.wv.index_to_key)}
word2idx["<PAD>"] = 0
idx2word = {i: w for w, i in word2idx.items()}

embedding_matrix = np.zeros((len(word2idx), 100))
for word, idx in word2idx.items():
    if word in w2v_model.wv:
        embedding_matrix[idx] = w2v_model.wv[word]


In [6]:
def text_to_seq(text, word2idx, maxlen=300):
    seq = [word2idx.get(word, 0) for word in text.split()]
    return seq[:maxlen] + [0] * max(0, maxlen - len(seq))

maxlen = 300
df_train['seq'] = df_train['clean_synopsis'].apply(lambda x: text_to_seq(x, word2idx, maxlen))
df_val['seq'] = df_val['clean_synopsis'].apply(lambda x: text_to_seq(x, word2idx, maxlen))
df_test['seq'] = df_test['clean_synopsis'].apply(lambda x: text_to_seq(x, word2idx, maxlen))


In [7]:
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(df_train['genres'])
y_val = mlb.transform(df_val['genres'])
y_test = mlb.transform(df_test['genres'])
id2label = {i: label for i, label in enumerate(mlb.classes_)}


In [8]:
class AnimeDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.float)

train_dataset = AnimeDataset(df_train['seq'].tolist(), y_train)
val_dataset = AnimeDataset(df_val['seq'].tolist(), y_val)
test_dataset = AnimeDataset(df_test['seq'].tolist(), y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)


In [9]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        hidden = torch.cat((hidden[0], hidden[1]), dim=1)
        output = self.fc(hidden)
        return self.sigmoid(output)


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(len(word2idx), 100, 128, len(mlb.classes_), embedding_matrix).to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5):
    model.train()
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")


Epoch 1: Loss = 0.4034
Epoch 2: Loss = 0.3748
Epoch 3: Loss = 0.3513
Epoch 4: Loss = 0.3302
Epoch 5: Loss = 0.3686


In [13]:
def predict_genres_with_lstm(model, dataloader, df_source, mlb, id2label, threshold=0.5, device='cpu'):
    model.eval()
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x = batch_x.to(device)
            outputs = model(batch_x)
            all_probs.append(outputs.cpu().numpy())
            all_labels.append(batch_y.numpy())

    # Concatenate
    probabilities = np.vstack(all_probs)
    y_true = np.vstack(all_labels)

    # Apply threshold
    predictions = (probabilities >= threshold).astype(int)

    # Decode predicted genres using id2label
    predicted_genres = []
    for row in predictions:
        genres = [id2label[i] for i, val in enumerate(row) if val == 1]
        predicted_genres.append(genres)

    # === Calculate metrics ===
    from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, hamming_loss, jaccard_score, roc_auc_score

    f1 = f1_score(y_true, predictions, average='micro')
    precision = precision_score(y_true, predictions, average='micro')
    recall = recall_score(y_true, predictions, average='micro')
    accuracy = accuracy_score(y_true, predictions)
    hamming = hamming_loss(y_true, predictions)
    jaccard = jaccard_score(y_true, predictions, average='micro')
    hit_rate = (np.logical_and(y_true, predictions).sum(axis=1) > 0).mean()
    try:
        roc_auc = roc_auc_score(y_true, probabilities, average='micro')
    except ValueError:
        roc_auc = np.nan

    metrics_df = pd.DataFrame({
        'Metric': [
            'F1 Score', 'Precision', 'Recall', 'Exact Match Accuracy',
            'Hamming Loss', 'Jaccard Score', 'Hit Rate', 'ROC AUC'
        ],
        'Value': [
            f1, precision, recall, accuracy,
            hamming, jaccard, hit_rate, roc_auc
        ]
    })

    result_df = pd.DataFrame({
        "synopsis": df_source["synopsis"].values,
        "true_genres": df_source["genres"].values,
        "predicted_genres": predicted_genres
    })

    print("\n📊 Evaluation Metrics:")
    print(metrics_df)

    return result_df, metrics_df, probabilities, y_true, predictions

use_thresh = 0.5
id2label = {i: label for i, label in enumerate(mlb.classes_)}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df_train_results, train_metrics, train_prob, train_labels, train_pred = predict_genres_with_lstm(
    model, train_loader, df_train, mlb, id2label, threshold=use_thresh, device=device)

df_val_results, val_metrics, val_prob, val_labels, val_pred = predict_genres_with_lstm(
    model, val_loader, df_val, mlb, id2label, threshold=use_thresh, device=device)

df_test_results, test_metrics, test_prob, test_labels, test_pred = predict_genres_with_lstm(
    model, test_loader, df_test, mlb, id2label, threshold=use_thresh, device=device)



📊 Evaluation Metrics:
                 Metric     Value
0              F1 Score  0.176237
1             Precision  0.564499
2                Recall  0.104418
3  Exact Match Accuracy  0.007022
4          Hamming Loss  0.141341
5         Jaccard Score  0.096634
6              Hit Rate  0.274719
7               ROC AUC  0.763261

📊 Evaluation Metrics:
                 Metric     Value
0              F1 Score  0.199746
1             Precision  0.578269
2                Recall  0.120723
3  Exact Match Accuracy  0.011211
4          Hamming Loss  0.141031
5         Jaccard Score  0.110954
6              Hit Rate  0.316143
7               ROC AUC  0.773717

📊 Evaluation Metrics:
                 Metric     Value
0              F1 Score  0.209826
1             Precision  0.580702
2                Recall  0.128046
3  Exact Match Accuracy  0.014590
4          Hamming Loss  0.139899
5         Jaccard Score  0.117210
6              Hit Rate  0.337823
7               ROC AUC  0.771184


NameError: name 'df_train_results' is not defined