# Predicción del Género de una Película

## Solución con Finetuning de Transformer

In [None]:
import torch
import numpy as np
import pandas as pd

In [None]:
# Pick the best available accelerator: CUDA GPU first, then Apple Silicon
# (MPS, Mac M1 and later), and fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda:0"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(device)

cuda:0


In [None]:
df_train = pd.read_parquet('https://github.com/amiune/freecodingtour/raw/main/cursos/espanol/deeplearning/data/train.parquet', engine='pyarrow')

In [None]:
df_train.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [None]:
df_train.shape

(54000, 4)

In [None]:
df_train.genre = df_train.genre.astype('category')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54000 entries, 0 to 53999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   id          54000 non-null  int64   
 1   movie_name  54000 non-null  object  
 2   synopsis    54000 non-null  object  
 3   genre       54000 non-null  category
dtypes: category(1), int64(1), object(2)
memory usage: 1.3+ MB


In [None]:
df_train.genre.value_counts()

action       5400
adventure    5400
crime        5400
family       5400
fantasy      5400
horror       5400
mystery      5400
romance      5400
scifi        5400
thriller     5400
Name: genre, dtype: int64

In [None]:
# Build the class-id <-> genre mappings from the categorical's category list
# (one entry per genre) instead of zipping over every row of the frame.
# A category's position in `cat.categories` is exactly its `cat.codes` value,
# and enumerate() yields plain Python ints, so both dicts are JSON-serializable.
id2label = dict(enumerate(df_train.genre.cat.categories))
print(id2label)
label2id = {label: idx for idx, label in id2label.items()}
print(label2id)

{4: 'fantasy', 5: 'horror', 3: 'family', 8: 'scifi', 0: 'action', 2: 'crime', 1: 'adventure', 6: 'mystery', 7: 'romance', 9: 'thriller'}
{'fantasy': 4, 'horror': 5, 'family': 3, 'scifi': 8, 'action': 0, 'crime': 2, 'adventure': 1, 'mystery': 6, 'romance': 7, 'thriller': 9}


In [None]:
# Number of target classes: one row in value_counts() per genre.
genre_counts = df_train["genre"].value_counts()
num_classes = genre_counts.size
print(num_classes)

10


In [None]:
df_train.iloc[0,:]["synopsis"]

'A young scriptwriter starts bringing valuable objects back from his short nightmares of being chased by a demon. Selling them makes him rich.'

In [None]:
def read_columns(df, text_column, label_column):
    """Extract parallel lists of texts and integer label codes from a DataFrame.

    Args:
        df: DataFrame that holds both columns.
        text_column: Name of the column with the raw text.
        label_column: Name of a categorical column; its category codes
            (``.cat.codes``) are used as the labels.

    Returns:
        A ``(texts, labels)`` tuple of two lists with one entry per row.
    """
    text_values = df[text_column]
    label_codes = df[label_column].cat.codes
    return text_values.to_list(), label_codes.to_list()

In [None]:
train_texts, train_labels = read_columns(df_train.iloc[0:,:], "synopsis","genre")

In [None]:
print(len(train_texts),len(train_labels))

100 100


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for validation.
# - random_state fixes the shuffle so the split (and every metric computed on
#   it) is reproducible across kernel restarts.
# - stratify keeps the genre proportions identical in both splits.
# NOTE: this cell rebinds train_texts/train_labels; re-run the read_columns
# cell before re-running this one, otherwise the training split is split again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_labels,
)

In [None]:
print(len(train_texts),len(train_labels))

70 70


# HF Transformers

In [None]:
!pip install transformers[torch] --quiet
#!pip install accelerate -U --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Tokenize both splits up front. truncation=True cuts texts that exceed the
# model's maximum input length; padding=True pads every sequence in a call to
# the longest sequence of that call, so each split gets one fixed length.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [None]:
type(train_encodings)

transformers.tokenization_utils_base.BatchEncoding

In [None]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping with 'input_ids' and 'attention_mask', each an
            indexable collection holding one token-id / mask sequence per example.
        labels: Sequence of integer class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has three keys: 'input_ids', 'attention_mask' and 'labels';
        each holds the tensor for the example at index ``idx``.
        """
        # Tensors are intentionally left on the CPU: the Trainer moves every
        # batch to the training device itself, and creating CUDA tensors inside
        # __getitem__ is incompatible with pinned memory and DataLoader workers.
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx]),
        }

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

train_dataset = MyDataset(train_encodings, train_labels)
val_dataset = MyDataset(val_encodings, val_labels)

In [None]:
print(len(train_dataset),len(val_dataset))

70 30


In [None]:
train_dataset[0]

{'input_ids': tensor([  101,  4748, 16338,  2006,  1037,  7186,  1010,  2048, 28616,  8873,
          2102,  3459,  9497,  2015,  5998,  2000, 25372,  2019, 27776, 15265,
          7716, 11636,  2155,  1010,  2041,  1011,  2448,  1037, 12779,  1010,
          1998, 13676,  1037,  3521,  3066,  2006,  1037, 13675, 25508,  2075,
         15745,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0], device='cuda:0'),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        device='cuda:0'),
 'labels': tensor(3, device='cuda:0')}

In [None]:
!pip install evaluate --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import evaluate

# Accuracy metric from the `evaluate` library, loaded once and reused on every evaluation.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A (logits, labels) pair as handed over by the Trainer.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_ids)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    dataloader_pin_memory=False,     # pinning needs CPU tensors; keep disabled while the dataset yields tensors already on `device`
    evaluation_strategy = "epoch",   # evaluate on eval_dataset at the end of every epoch
    output_dir="./results"           # directory where checkpoints are written
)

# Store the genre names in the model config so that a saved / pushed model can
# translate class ids back to genres by itself (e.g. inside a `pipeline`).
# The int() casts turn the numpy integer codes into plain Python ints, which
# the config needs in order to be serialized to JSON.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_classes,
    id2label={int(idx): label for idx, label in id2label.items()},
    label2id={label: int(idx) for label, idx in label2id.items()},
).to(device)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # reports accuracy after each evaluation
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.we

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.286304,0.133333


TrainOutput(global_step=3, training_loss=2.2936102549235025, metrics={'train_runtime': 3.9048, 'train_samples_per_second': 17.926, 'train_steps_per_second': 0.768, 'total_flos': 1304161992000.0, 'train_loss': 2.2936102549235025, 'epoch': 1.0})

In [None]:
trainer.evaluate()

{'eval_loss': 1.9675090312957764,
 'eval_accuracy': 0.3664197530864198,
 'eval_runtime': 41.3284,
 'eval_samples_per_second': 391.983,
 'eval_steps_per_second': 6.146,
 'epoch': 4.0}

In [None]:
#from transformers import DistilBertForSequenceClassification
#model = DistilBertForSequenceClassification.from_pretrained("amiune/bert-clasificacion-peliculas", num_labels=num_classes).to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
encoding = tokenizer("This is a movie about a couple in love", truncation=True, padding=True, return_tensors="pt").to(device)
encoding

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3185, 2055, 1037, 3232, 1999, 2293,  102]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [None]:
# Inference only: put the model in eval mode (disables dropout) and skip
# building the autograd graph, which saves memory and time.
model.eval()
with torch.no_grad():
    outputs = model(**encoding)

In [None]:
logits = outputs.logits
logits.shape

torch.Size([1, 10])

In [None]:
# Compute probabilities. The model is a single-label classifier over the
# genres, so softmax (not an independent sigmoid per class) turns the logits
# into a proper distribution that sums to 1.
probs = torch.softmax(logits.detach().squeeze().cpu(), dim=-1)
# Pair every probability with its genre name.
predicted_labels = [(prob.item(), id2label[idx]) for idx, prob in enumerate(probs)]
# Show the genres from most to least likely.
sorted(predicted_labels, reverse=True)

[(0.5358123183250427, 'crime'),
 (0.5237465500831604, 'horror'),
 (0.5168167948722839, 'fantasy'),
 (0.5101510286331177, 'action'),
 (0.5030176639556885, 'scifi'),
 (0.5011835098266602, 'family'),
 (0.49890342354774475, 'mystery'),
 (0.48189038038253784, 'romance'),
 (0.4747811555862427, 'thriller'),
 (0.468729168176651, 'adventure')]

In [None]:
max(predicted_labels)[1]

'crime'

In [None]:
len(val_texts)

16200

## Cómo predecir nuevos valores en batch

In [None]:
# Tokenize several texts at once; padding=True pads them to a common length so
# they can be stacked into a single batch tensor.
test_encoding = tokenizer(
    ["This is a movie about a couple in love",
     "This is a very scary movie"],
    truncation=True, padding=True, return_tensors="pt",
).to(device)
# Inference only: eval mode disables dropout, no_grad skips the autograd graph.
model.eval()
with torch.no_grad():
    outputs = model(**test_encoding)
logits = outputs.logits
logits.shape

torch.Size([2, 10])

In [None]:
# Compute probabilities for the whole batch in one call. The model is a
# single-label classifier, so softmax over the class axis is the right
# normalization (each row sums to 1).
batch_probs = torch.softmax(logits.detach().cpu(), dim=-1)
for probs in batch_probs:
    # Pair every probability with its genre name.
    predicted_labels = [(prob.item(), id2label[idx]) for idx, prob in enumerate(probs)]
    print(sorted(predicted_labels, reverse=True))
    print(max(predicted_labels)[1])

[(0.9866400361061096, 'romance'), (0.8892194032669067, 'family'), (0.5927843451499939, 'fantasy'), (0.3652147054672241, 'thriller'), (0.31700220704078674, 'action'), (0.25062334537506104, 'crime'), (0.1991966962814331, 'mystery'), (0.18163041770458221, 'adventure'), (0.14182519912719727, 'scifi'), (0.0764179527759552, 'horror')]
romance
[(0.9810206890106201, 'horror'), (0.8305793404579163, 'thriller'), (0.5052431225776672, 'fantasy'), (0.48686283826828003, 'mystery'), (0.4394770562648773, 'action'), (0.43945422768592834, 'scifi'), (0.23916655778884888, 'adventure'), (0.18225450813770294, 'crime'), (0.09471964091062546, 'family'), (0.08791545033454895, 'romance')]
horror


In [None]:
#TODO: add id2label to the model
#from transformers import pipeline
#pipe = pipeline("text-classification", model="amiune/bert-clasificacion-peliculas", device=0)
#pipe("This is a very scary movie")

## Guardar mi modelo localmente en el servidor de Colab y luego descargarlo

In [None]:
trainer.save_model("./mi_modelo")

In [None]:
import shutil
from google.colab import files

# Download the whole saved-model directory as one zip: the weights alone cannot
# be reloaded with from_pretrained() without config.json, and the weights file
# name depends on the serialization format used by save_model().
archive_path = shutil.make_archive("mi_modelo", "zip", "./mi_modelo")
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Loguearse en HuggingFace y guardar mi modelo

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.push_to_hub("bert-clasificacion-peliculas")

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/amiune/bert-clasificacion-peliculas/commit/240f6725089d6d7fe605a74c1a239fb24842ad6c', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='240f6725089d6d7fe605a74c1a239fb24842ad6c', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
tokenizer.push_to_hub("bert-clasificacion-peliculas")

CommitInfo(commit_url='https://huggingface.co/amiune/bert-clasificacion-peliculas/commit/3405c8c8bad6f4987c589fad31bc60b6bb48d0ad', commit_message='Upload tokenizer', commit_description='', oid='3405c8c8bad6f4987c589fad31bc60b6bb48d0ad', pr_url=None, pr_revision=None, pr_num=None)

# Cargar el modelo desde HuggingFace y predecir la tabla de Test

In [None]:
import torch
import numpy as np
import pandas as pd

In [None]:
# Pick the best available accelerator: CUDA GPU first, then Apple Silicon
# (MPS, Mac M1 and later), and fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda:0"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(device)

cuda:0


In [None]:
!pip install transformers[torch] --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m88.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
from transformers import DistilBertForSequenceClassification
model = DistilBertForSequenceClassification.from_pretrained("amiune/bert-clasificacion-peliculas").to(device)

In [None]:
df_test = pd.read_parquet('https://github.com/amiune/freecodingtour/raw/main/cursos/espanol/deeplearning/data/test.parquet', engine='pyarrow')
df_test.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,16863,A Death Sentence,"12 y.o. Ida's dad'll die without a DKK1,500,00...",action
1,48456,Intermedio,A group of four teenage friends become trapped...,action
2,41383,30 Chua Phai Tet,A guy left his home for 12 years till he came ...,action
3,84007,Paranoiac,A man long believed dead returns to the family...,action
4,40269,Ordinary Happiness,"After a deadly accident, Paolo comes back on E...",action


In [None]:
test_texts = df_test["synopsis"].tolist()

In [None]:
# Class id -> genre name; the position of each name in the list is its class id.
id2label = dict(enumerate([
    'action', 'adventure', 'crime', 'family', 'fantasy',
    'horror', 'mystery', 'romance', 'scifi', 'thriller',
]))

In [None]:
# Predict the test set in mini-batches instead of one synopsis per forward
# pass, with autograd disabled since no gradients are needed for inference.
batch_size = 64
test_preds = []
model.eval()
with torch.no_grad():
    for start in range(0, len(test_texts), batch_size):
        batch_texts = test_texts[start:start + batch_size]
        # padding=True pads the batch to its longest text; the attention mask
        # makes the model ignore the padded positions.
        batch_encoding = tokenizer(batch_texts, truncation=True, padding=True, return_tensors="pt").to(device)
        logits = model(**batch_encoding).logits
        # Arg-max over the class axis gives one predicted class id per text.
        predicted_ids = logits.argmax(dim=-1).cpu().tolist()
        test_preds.extend(id2label[class_id] for class_id in predicted_ids)

In [None]:
df_submission = pd.DataFrame({"id":df_test["id"],"genre":test_preds})

In [None]:
df_submission.to_csv("submission4.csv", index=False)

In [None]:
from google.colab import files
files.download('submission4.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Referencias:

- [Curso simple Coursera](https://www.coursera.org/)

- [Curso Avanzado Stanford](https://web.stanford.edu/class/cs224n/index.html#schedule)

- [Implementar Transformers para Clasificacion](https://huggingface.co/transformers/v3.2.0/custom_datasets.html)

- https://huggingface.co/docs/transformers/notebooks