In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm
import optuna
from optuna.trial import TrialState

In [21]:
# Read the media catalogue CSV from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer="media_data.csv")
data.head(n=5)

Unnamed: 0,_id,id,genres,network,rating,release_date,runtime,source,status,summary,title,type
0,683c39e61a9de58bff15874f,tvmaze_1,"['Drama', 'Science-Fiction', 'Thriller']",CBS,6.5,2013-06-24,60,tvmaze,Ended,Under the Dome is the story of a small town th...,Under the Dome,tvshow
1,683c39e61a9de58bff158750,tvmaze_2,"['Action', 'Crime', 'Science-Fiction']",CBS,8.8,2011-09-22,60,tvmaze,Ended,You are being watched. The government has a se...,Person of Interest,tvshow
2,683c39e61a9de58bff158751,tvmaze_3,"['Drama', 'Horror', 'Romance']",CTV Sci-Fi Channel,7.4,2014-01-11,60,tvmaze,Ended,Based on the critically acclaimed series of no...,Bitten,tvshow
3,683c39e61a9de58bff158752,tvmaze_4,"['Drama', 'Action', 'Science-Fiction']",The CW,7.4,2012-10-10,60,tvmaze,Ended,"After a violent shipwreck, billionaire playboy...",Arrow,tvshow
4,683c39e61a9de58bff158753,tvmaze_5,"['Drama', 'Crime', 'Thriller']",HBO,8.1,2014-01-12,60,tvmaze,Running,Touch darkness and darkness touches you back. ...,True Detective,tvshow


In [22]:
def _first_genre(value):
    """Return the first-listed genre for one raw ``genres`` cell.

    Accepts a stringified Python list (``"['Drama', 'Crime']"``), a real list,
    or a bare genre name left behind by a previous run of this cell.

    Returns the first list element, the bare name unchanged, or ``None`` when
    the cell is missing, empty, or not list-like.
    """
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            # Not a Python literal, e.g. 'Drama' or 'Science-Fiction': the column
            # was already reduced to a single genre. Keeping the value makes the
            # cell safe to re-run instead of crashing inside literal_eval.
            return value or None
    if isinstance(value, list) and len(value) > 0:
        return value[0]
    return None


# Collapse the multi-genre column to a single label per row (the first genre).
data['genres'] = data['genres'].apply(_first_genre)

In [23]:
# Preview the frame again now that `genres` holds a single genre per row.
data.head(n=5)


Unnamed: 0,_id,id,genres,network,rating,release_date,runtime,source,status,summary,title,type
0,683c39e61a9de58bff15874f,tvmaze_1,Drama,CBS,6.5,2013-06-24,60,tvmaze,Ended,Under the Dome is the story of a small town th...,Under the Dome,tvshow
1,683c39e61a9de58bff158750,tvmaze_2,Action,CBS,8.8,2011-09-22,60,tvmaze,Ended,You are being watched. The government has a se...,Person of Interest,tvshow
2,683c39e61a9de58bff158751,tvmaze_3,Drama,CTV Sci-Fi Channel,7.4,2014-01-11,60,tvmaze,Ended,Based on the critically acclaimed series of no...,Bitten,tvshow
3,683c39e61a9de58bff158752,tvmaze_4,Drama,The CW,7.4,2012-10-10,60,tvmaze,Ended,"After a violent shipwreck, billionaire playboy...",Arrow,tvshow
4,683c39e61a9de58bff158753,tvmaze_5,Drama,HBO,8.1,2014-01-12,60,tvmaze,Running,Touch darkness and darkness touches you back. ...,True Detective,tvshow


In [24]:
# Model input is "<title> <summary>". Missing values are blanked on BOTH columns:
# a NaN title would otherwise turn the whole concatenation into NaN and silently
# discard the summary for that row.
texts = data['title'].fillna('') + " " + data['summary'].fillna('')

# Rows without a genre are kept and share the empty-string label.
labels = data['genres'].fillna('')

# Fixed seed so the 80/20 split is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)


In [25]:
# Peek at a few training texts (title followed by summary).
X_train.head(n=5)

1457    Digimon: Digital Monsters While at summer camp...
208     Hostages Hostages concerns the family of a doc...
1054    The Crimson Petal and the White Follow Sugar i...
451     Columbo This is the detective series that insp...
415     Angel "If you need help, then look no further....
dtype: object

In [26]:
# Peek at the matching training labels.
y_train.head(n=5)

1457    Action
208      Drama
1054     Drama
451      Drama
415      Drama
Name: genres, dtype: object

In [27]:
# Collect every genre that appears in either split so the encoder knows them all
# and transforming the test labels cannot hit an unseen class.
all_genres = pd.concat([y_train, y_test]).unique()

# fit() returns the encoder itself, so construction and fitting can be chained.
label_encoder = LabelEncoder().fit(all_genres)

# Map genre names to integer class ids for both splits.
y_train_encoded, y_test_encoded = (
    label_encoder.transform(split_labels) for split_labels in (y_train, y_test)
)

In [28]:
class MovieGenreDataset(Dataset):
    """Torch dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Sequence or ``pd.Series`` of raw texts.
        labels: Sequence, array or ``pd.Series`` of integer class ids, aligned
            position-by-position with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Every item is padded/truncated to exactly this many tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Fetch the ``idx``-th element by POSITION from a Series or a plain sequence."""
        # A Series coming out of train_test_split keeps its shuffled original index,
        # so `values[idx]` would be a label-based lookup (wrong row or KeyError).
        if isinstance(values, pd.Series):
            return values.iloc[idx]
        return values[idx]

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` (both 1-D) and ``labels`` (scalar) tensors."""
        text = str(self._item_at(self.texts, idx))
        # Labels are read the same positional way as texts so the two stay aligned.
        label = self._item_at(self.labels, idx)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # The tokenizer returns a leading batch axis of size 1; flatten drops it
            # so the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [29]:
# Training configuration shared by the cells below.
MAX_LEN = 256            # token length every text is padded/truncated to
BATCH_SIZE = 16          # examples per DataLoader batch
EPOCHS = 3               # full passes over the training loader
LEARNING_RATE = 2e-5     # step size handed to the AdamW optimizer

In [30]:
# One output logit per genre known to the label encoder.
num_genres = len(label_encoder.classes_)

# Load the pretrained 'roberta-base' tokenizer and a sequence-classification model
# whose head is sized to the genre count.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_genres)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Wrap each split (texts + encoded labels) in a tokenizing dataset.
train_dataset, test_dataset = (
    MovieGenreDataset(split_texts, split_labels, tokenizer, MAX_LEN)
    for split_texts, split_labels in ((X_train, y_train_encoded), (X_test, y_test_encoded))
)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)


In [32]:
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Move the model first so the optimizer is built over the on-device parameters.
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)



In [33]:
# Fine-tune for EPOCHS passes over train_loader; after each pass, score the model on
# test_loader and print the average losses plus a per-class classification report.
for epoch in range(EPOCHS):
    # ---- Training pass ----
    model.train()
    losses = []

    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{EPOCHS}', leave=False)
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels (not `labels`) so the notebook-level `labels` Series
        # defined in the split cell is not overwritten by a tensor.
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=batch_labels
        )

        loss = outputs.loss
        loss_value = loss.item()
        losses.append(loss_value)
        loss.backward()
        optimizer.step()

        progress_bar.set_postfix({'loss': loss_value})

    avg_train_loss = np.mean(losses)
    print(f'Epoch {epoch + 1} - Training loss: {avg_train_loss:.4f}')

    # ---- Evaluation pass ----
    model.eval()
    val_losses = []
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=batch_labels
            )

            val_losses.append(outputs.loss.item())
            # Predicted class = index of the largest logit in each row.
            preds = outputs.logits.argmax(dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch_labels.cpu().numpy())

    # The evaluation loss was collected but never shown; report it next to the
    # training loss so the gap between the two is visible per epoch.
    avg_val_loss = np.mean(val_losses)
    print(f'Epoch {epoch + 1} - Validation loss: {avg_val_loss:.4f}')

    # Convert to numpy arrays for sklearn
    true_labels = np.array(true_labels)
    predictions = np.array(predictions)

    # Restrict the report to classes that occur in the truth or the predictions,
    # keeping `labels` and `target_names` the same length.
    present_labels = np.unique(np.concatenate([true_labels, predictions]))
    target_names = [label_encoder.classes_[i] for i in present_labels]

    print(classification_report(
        true_labels,
        predictions,
        target_names=target_names,
        labels=present_labels,
        zero_division=0
    ))

                                                                      

Epoch 1 - Training loss: 1.8724
                 precision    recall  f1-score   support

                      0.42      0.85      0.56        39
         Action       0.00      0.00      0.00        23
      Adventure       0.00      0.00      0.00         7
          Anime       0.00      0.00      0.00         1
       Children       0.00      0.00      0.00         2
         Comedy       0.81      0.49      0.61        72
          Crime       0.00      0.00      0.00         5
            DIY       0.00      0.00      0.00         1
          Drama       0.64      0.90      0.75       136
         Family       0.00      0.00      0.00         2
           Food       0.00      0.00      0.00         6
        History       0.00      0.00      0.00         2
         Horror       0.00      0.00      0.00         2
          Music       0.00      0.00      0.00         4
        Mystery       0.00      0.00      0.00         2
         Nature       0.00      0.00      0.00         

                                                                      

Epoch 2 - Training loss: 1.1427
                 precision    recall  f1-score   support

                      0.48      0.74      0.58        39
         Action       0.56      0.61      0.58        23
      Adventure       0.00      0.00      0.00         7
          Anime       0.00      0.00      0.00         1
       Children       0.00      0.00      0.00         2
         Comedy       0.67      0.92      0.77        72
          Crime       0.00      0.00      0.00         5
            DIY       0.00      0.00      0.00         1
          Drama       0.81      0.77      0.79       136
         Family       0.00      0.00      0.00         2
           Food       0.00      0.00      0.00         6
        History       0.00      0.00      0.00         2
         Horror       0.00      0.00      0.00         2
          Music       0.00      0.00      0.00         4
        Mystery       0.00      0.00      0.00         2
         Nature       0.00      0.00      0.00         

                                                                      

Epoch 3 - Training loss: 0.8509
                 precision    recall  f1-score   support

                      0.55      0.77      0.64        39
         Action       0.57      0.52      0.55        23
      Adventure       0.00      0.00      0.00         7
          Anime       0.00      0.00      0.00         1
       Children       0.00      0.00      0.00         2
         Comedy       0.73      0.92      0.81        72
          Crime       0.00      0.00      0.00         5
            DIY       0.00      0.00      0.00         1
          Drama       0.77      0.81      0.79       136
         Family       0.00      0.00      0.00         2
           Food       1.00      0.83      0.91         6
        History       0.00      0.00      0.00         2
         Horror       0.00      0.00      0.00         2
          Music       0.00      0.00      0.00         4
        Mystery       0.00      0.00      0.00         2
         Nature       0.00      0.00      0.00         

In [None]:
def objective(trial):
    """Optuna objective: fine-tune a fresh RoBERTa classifier and return hold-out accuracy.

    Samples a learning rate, batch size and epoch count from ``trial``, trains a
    newly loaded 'roberta-base' classifier, and scores it.

    Scoring uses a validation subset carved out of the notebook-level training
    split (``X_train`` / ``y_train_encoded``), not the test split: choosing
    hyper-parameters by test accuracy would leak the test set into model
    selection and make the final test metrics optimistic.

    Args:
        trial: The ``optuna`` trial used to suggest hyper-parameters.

    Returns:
        Fraction of validation examples classified correctly.
    """
    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_epochs = trial.suggest_int("num_epochs", 2, 5)

    # Fixed seed: every trial sees the same fit/validation partition, so trial
    # scores are comparable with each other.
    fit_texts, val_texts, fit_labels, val_labels = train_test_split(
        X_train, y_train_encoded, test_size=0.2, random_state=42
    )

    # Fresh model per trial so no weights carry over between trials.
    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base',
        num_labels=len(label_encoder.classes_)
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Create data loaders with the batch size suggested for this trial
    fit_loader = DataLoader(
        MovieGenreDataset(fit_texts, fit_labels, tokenizer, MAX_LEN),
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = DataLoader(
        MovieGenreDataset(val_texts, val_labels, tokenizer, MAX_LEN),
        batch_size=batch_size,
    )

    # Training loop
    for _ in range(num_epochs):
        model.train()
        for batch in fit_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            outputs.loss.backward()
            optimizer.step()

    # Validation
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Only logits are needed here, so labels are not passed to the model.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            preds = outputs.logits.argmax(dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch['labels'].numpy())

    # Calculate accuracy
    accuracy = (np.array(predictions) == np.array(true_labels)).mean()
    return accuracy

# Search for the hyper-parameters that maximise the objective's return value.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

# Summarise the winning trial: its score followed by one line per parameter.
print("Best trial:")
trial = study.best_trial
param_lines = [f"    {key}: {value}" for key, value in trial.params.items()]
print("\n".join([f"  Value (accuracy): {trial.value:.4f}", "  Params: ", *param_lines]))

In [34]:
# Write the fine-tuned model and its tokenizer into one directory so they can be
# reloaded together later.
export_dir = './movie_genre_classifier'
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('./movie_genre_classifier\\tokenizer_config.json',
 './movie_genre_classifier\\special_tokens_map.json',
 './movie_genre_classifier\\vocab.json',
 './movie_genre_classifier\\merges.txt',
 './movie_genre_classifier\\added_tokens.json')

In [None]:
# Reload the saved tokenizer and classifier from disk, then place the model on
# the active device.
model_path = './movie_genre_classifier'
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)
model = model.to(device)

def predict_genre(text, model, tokenizer, label_encoder, max_len=256):
    """Predict the genre name for a single piece of text.

    Args:
        text: Raw text to classify.
        model: Classifier whose output exposes ``.logits``; it is switched to
            eval mode as a side effect.
        tokenizer: Tokenizer exposing ``encode_plus``.
        label_encoder: Fitted encoder; its ``inverse_transform`` maps class ids
            back to genre names.
        max_len: Token length the text is padded/truncated to.

    Returns:
        The decoded label of the highest-scoring class.
    """
    model.eval()

    # Run on whatever device the given model lives on instead of relying on a
    # notebook-global `device`, so the function works with any model passed in.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(target_device)
    attention_mask = encoding['attention_mask'].to(target_device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Index of the largest logit for the single example in the batch.
    prediction = outputs.logits.argmax(dim=1)
    return label_encoder.inverse_transform(prediction.cpu().numpy())[0]

In [41]:
# Smoke-test the reloaded classifier on a hand-written synopsis.
sample_text = (
    "Death Note is an anime series based around a manga of the same name whereby a human finds a death god's notebook. "
    "Any person's name written in this notebook will die. "
    "The main character who finds this noteboook is Light Yagami who faces off against an unfaced character named L who tries to challenge him."
)
predicted_genre = predict_genre(sample_text, model, tokenizer, label_encoder)
print(f"Predicted genre: {predicted_genre}")

Predicted genre: Action
