# Predicción del Género de una Película

## Solución con Fine-tuning del Modelo Transformer XLNet Large

In [None]:
import torch
import numpy as np
import pandas as pd

In [None]:
# Pick the compute device: the first CUDA GPU when PyTorch can see one,
# otherwise the CPU.
# On Apple Silicon (M1 and later) set device = "mps" by hand instead.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

In [None]:
# Download the training split (parquet) straight from GitHub and preview
# its first rows.
df_train = pd.read_parquet(
    path="https://github.com/amiune/amiune.github.io/raw/master/movie-genre-prediction/train.parquet",
    engine="pyarrow",
)
df_train.head()

In [None]:
# Show the (rows, columns) dimensions of the training frame.
df_train.shape

In [None]:
# Encode the genre column as a pandas categorical so that every genre gets
# an integer code (exposed through `.cat.codes`).
df_train["genre"] = df_train["genre"].astype("category")

# A categorical's codes are positions into `.cat.categories`, so both lookup
# tables can be built directly from the category index. This avoids walking
# every row of the frame, gives a deterministic (category) ordering, and uses
# plain Python ints rather than numpy int8 scalars for the ids.
id2label = dict(enumerate(df_train["genre"].cat.categories))
print(id2label)
label2id = {label: code for code, label in id2label.items()}
print(label2id)

In [None]:
# Number of target classes: one per entry in the genre frequency table.
num_classes = df_train["genre"].value_counts().size
print(num_classes)

In [None]:
# Model input text: the movie title followed by its synopsis, with the
# original casing kept (the tokenizer loaded later in this file is the
# 'xlnet-large-cased' checkpoint).
# Lower-cased alternative, kept for reference:
#df_train["text"] = df_train["movie_name"].str.lower() + ". " + df_train["synopsis"].str.lower()
df_train["text"] = df_train["movie_name"] + ". " + df_train["synopsis"]

In [None]:
# Preview the combined text of the first training example.
df_train["text"].iloc[0]

In [None]:
def read_columns(df, text_column, label_column):
    """Extract parallel lists of texts and integer label codes from a frame.

    Args:
        df: DataFrame holding both columns.
        text_column: name of the column with the input texts.
        label_column: name of a categorical column; its `.cat.codes` are
            returned as the labels.

    Returns:
        A `(texts, labels)` tuple of plain Python lists, one entry per row.
    """
    text_series, label_series = df[text_column], df[label_column]
    return text_series.to_list(), label_series.cat.codes.to_list()

In [None]:
# Train on the combined "text" column (title + synopsis). This is the same
# field that is built for the test set and fed to the model at prediction
# time; training on "synopsis" alone would make the training inputs differ
# from the inference inputs.
train_texts, train_labels = read_columns(df_train, "text", "genre")
print(len(train_texts), len(train_labels))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the examples for validation. The split is stratified on the
# labels and uses a fixed seed so it is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.05,
    stratify=train_labels,
    random_state=42,
)
print(len(train_texts), len(train_labels))
print(len(val_texts), len(val_labels))

# HF Transformers

In [None]:
# Install Hugging Face Transformers (with its PyTorch extras) and SentencePiece.
# NOTE(review): SentencePiece is presumably needed by the XLNet tokenizer
# loaded below - confirm.
!pip install transformers[torch] --quiet
!pip install sentencepiece

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that matches the 'xlnet-large-cased' checkpoint
# fine-tuned later in this file.
tokenizer = AutoTokenizer.from_pretrained('xlnet-large-cased')

In [None]:
# Tokenize the training and validation texts in one call each, with
# truncation and padding enabled.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
# Inspect the returned container type and the fields it provides.
print(type(train_encodings))
print(train_encodings.keys())

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: mapping that provides 'input_ids' and 'attention_mask',
            each indexable by example position.
        labels: sequence with one integer class id per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of CPU tensors.

        The dict has three keys, 'input_ids', 'attention_mask' and 'labels',
        each holding the tensor for position `idx`.
        """
        # Tensors are deliberately left on the CPU: device placement belongs
        # to the training loop (the Hugging Face Trainer moves every batch to
        # its own device). Creating GPU tensors here would prevent pinned
        # memory and multi-worker data loading.
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx]),
        }

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# Wrap the tokenized training and validation splits, then print both sizes
# and the first training item as a sanity check.
train_dataset = MyDataset(train_encodings, train_labels)
val_dataset = MyDataset(val_encodings, val_labels)
print(len(train_dataset),len(val_dataset))
print(train_dataset[0])

In [None]:
# Install the Hugging Face `evaluate` package, used below for the accuracy metric.
!pip install evaluate --quiet

In [None]:
import evaluate

# Accuracy metric object, loaded once and reused by compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: a pair that unpacks into `(logits, labels)`.

    Returns:
        Whatever `metric.compute` returns for the arg-max class predictions
        measured against the reference labels.
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Fine-tuning configuration. Argument reference:
# https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=64,
    learning_rate=0.0000045,
    weight_decay=0.65,
    # Pinned memory only works for CPU tensors, so it has to stay off while
    # the dataset hands out tensors that already live on the GPU; it can be
    # dropped (for faster training) once the dataset returns CPU tensors.
    dataloader_pin_memory=False,
    # Do not write checkpoints at all. This states the intent directly
    # instead of relying on an unreachably large `save_steps` value.
    save_strategy="no",
    # Evaluate on the validation set at the end of every epoch.
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; switch if the installed version rejects it.
    evaluation_strategy="epoch",
    output_dir="./results",
)

# Pretrained XLNet-large encoder with a classification head sized to the
# number of genres.
model = AutoModelForSequenceClassification.from_pretrained('xlnet-large-cased', num_labels=num_classes).to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop with the trainer configured earlier in this file.
trainer.train()

In [None]:
# Download the test split (parquet) straight from GitHub and preview its
# first rows.
df_test = pd.read_parquet(
    path="https://github.com/amiune/amiune.github.io/raw/master/movie-genre-prediction/test.parquet",
    engine="pyarrow",
)
df_test.head()

In [None]:
# Build the test inputs as title + ". " + synopsis and collect them in a
# plain list.
df_test["text"] = df_test["movie_name"] + ". " + df_test["synopsis"]
test_texts = df_test["text"].to_list()
# Preview the first test input.
test_texts[0]

In [None]:
# Predict a genre and a per-class probability row for every test text.
inference_batch_size = 32
test_preds = []
test_probs = np.zeros((len(test_texts), num_classes))

# Inference mode: switch off dropout and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    for start in range(0, len(test_texts), inference_batch_size):
        batch_texts = test_texts[start:start + inference_batch_size]
        encoding = tokenizer(
            batch_texts, truncation=True, padding=True, return_tensors="pt"
        ).to(device)
        # Feed only the inputs the model received during fine-tuning
        # (MyDataset yields 'input_ids' and 'attention_mask'), so that
        # inference inputs match the training inputs.
        outputs = model(
            input_ids=encoding["input_ids"],
            attention_mask=encoding["attention_mask"],
        )
        logits = outputs.logits.float().cpu()
        # Predicted genre = label of the highest-scoring class.
        test_preds.extend(id2label[k] for k in logits.argmax(dim=-1).tolist())
        # The model is trained with exactly one label per example, so class
        # probabilities come from a softmax over the logits (each row sums
        # to 1); independent sigmoids would not form a distribution.
        test_probs[start:start + len(batch_texts)] = torch.softmax(logits, dim=-1).numpy()

from google.colab import files

# Per-class probability table: the test ids followed by one column per
# genre, written to CSV and offered as a browser download.
tmp_dict = {
    "id": df_test["id"],
    **{id2label[j]: test_probs[:, j] for j in range(num_classes)},
}
df_probs = pd.DataFrame(tmp_dict)
df_probs.to_csv("xlnet_large_probs2.csv", index=False)
files.download('xlnet_large_probs2.csv')

# Submission file: one predicted genre per test id.
df_submission = pd.DataFrame({"id": df_test["id"], "genre": test_preds})
df_submission.to_csv("submission6.csv", index=False)
files.download('submission6.csv')

### Referencias:

- [Curso simple Coursera](https://www.coursera.org/)

- [Curso Avanzado Stanford](https://web.stanford.edu/class/cs224n/index.html#schedule)

- [Implementar Transformers para Clasificación](https://huggingface.co/transformers/v3.2.0/custom_datasets.html)