# NLP con Deep Learning: Transformers

- [Curso simple Coursera](https://www.coursera.org/)

- [Curso Avanzado Stanford](https://web.stanford.edu/class/cs224n/index.html#schedule)

- [Implementar Transformers para Clasificación](https://huggingface.co/transformers/v3.2.0/custom_datasets.html)

In [None]:
import torch
import numpy as np
import pandas as pd

In [None]:
# Pick the best available accelerator: CUDA GPU first, then Apple-Silicon MPS
# (Mac M1 and later), otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # `torch.backends.mps` is missing on older torch builds, hence the getattr guard.
    device = "mps"
else:
    device = "cpu"
print(device)

cuda:0


In [None]:
# Training split of the movie-genre dataset, read directly from GitHub as parquet.
TRAIN_PARQUET_URL = 'https://github.com/amiune/amiune.github.io/raw/master/movie-genre-prediction/train.parquet'
df_train = pd.read_parquet(TRAIN_PARQUET_URL, engine='pyarrow')
df_train.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [None]:
# Sanity check: (rows, columns) of the training frame.
df_train.shape

(54000, 4)

In [None]:
# Encode the genre as a pandas categorical; the integer category codes are the class ids.
df_train["genre"] = df_train["genre"].astype("category")

# Build both lookup tables from the category index (one entry per class) instead of
# zipping every row of the frame. Keys/values are plain Python ints and the dicts are
# ordered by class id.
id2label = dict(enumerate(df_train["genre"].cat.categories))
print(id2label)
label2id = {label: class_id for class_id, label in id2label.items()}
print(label2id)

{4: 'fantasy', 5: 'horror', 3: 'family', 8: 'scifi', 0: 'action', 2: 'crime', 1: 'adventure', 6: 'mystery', 7: 'romance', 9: 'thriller'}
{'fantasy': 4, 'horror': 5, 'family': 3, 'scifi': 8, 'action': 0, 'crime': 2, 'adventure': 1, 'mystery': 6, 'romance': 7, 'thriller': 9}


In [None]:
# Number of target classes = number of entries in the genre frequency table.
genre_counts = df_train["genre"].value_counts()
num_classes = genre_counts.shape[0]
print(num_classes)

10


In [None]:
# Combined "<movie_name>. <synopsis>" column. The original casing is kept
# (a lower-cased variant was tried and left disabled).
df_train["text"] = df_train["movie_name"].add(". ").add(df_train["synopsis"])

In [None]:
# Show the combined text of the first training row.
df_train["text"].iloc[0]

'Super Me. A young scriptwriter starts bringing valuable objects back from his short nightmares of being chased by a demon. Selling them makes him rich.'

In [None]:
def read_columns(df, text_column, label_column):
    """Extract parallel lists of texts and integer label codes from a frame.

    Args:
        df: DataFrame holding the data.
        text_column: name of the column with the input texts.
        label_column: name of a categorical column; its category codes
            are used as the labels.

    Returns:
        A ``(texts, labels)`` tuple of plain Python lists with one entry per row.
    """
    text_series, label_series = df[text_column], df[label_column]
    return text_series.to_list(), label_series.cat.codes.to_list()

In [None]:
# Train on the combined "title. synopsis" column: it is the same text format that is
# fed to the model for the test set, so training and inference inputs match.
# (The previous `df_train.iloc[0:,:]` slice selected every row and was a no-op.)
train_texts, train_labels = read_columns(df_train, "text", "genre")
print(len(train_texts), len(train_labels))

54000 54000


In [None]:
from sklearn.model_selection import train_test_split

# Hold out a stratified 5% of the labelled data for validation (fixed seed).
# NOTE: this cell rebinds train_texts/train_labels, so running it again without
# re-running the previous cell would split the already-reduced training set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.05,
    stratify=train_labels,
    random_state=42,
)
for split_texts, split_labels in ((train_texts, train_labels), (val_texts, val_labels)):
    print(len(split_texts), len(split_labels))

51300 51300
2700 2700


# HF Transformers

In [None]:
!pip install transformers[torch] --quiet
!pip install sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
from transformers import AutoTokenizer

# Tokenizer of the pretrained checkpoint that is fine-tuned below.
TOKENIZER_CHECKPOINT = 'xlnet-large-cased'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

In [None]:
# This tokenizer has no predefined maximum length, so `truncation=True` on its own
# is ignored (with a warning). An explicit cap makes truncation effective and bounds
# the padded sequence length if an unusually long text shows up.
MAX_LENGTH = 512
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
print(type(train_encodings))
print(train_encodings.keys())

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


<class 'transformers.tokenization_utils_base.BatchEncoding'>
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: mapping with at least the keys ``'input_ids'`` and
            ``'attention_mask'``, each indexable by example position.
        labels: sequence with one integer class id per example.

    Raises:
        ValueError: if the number of encoded examples and labels differ.
    """

    def __init__(self, encodings, labels):
        n_encoded = len(encodings['input_ids'])
        if n_encoded != len(labels):
            raise ValueError(
                f"encodings hold {n_encoded} examples but {len(labels)} labels were given"
            )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' tensors for example ``idx``.

        Tensors are deliberately left on the CPU: the Hugging Face Trainer moves each
        collated batch to the training device itself, and handing out CUDA tensors
        from a Dataset prevents pinned memory and DataLoader worker processes.
        """
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx]),
        }

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

# Wrap the tokenized splits as datasets, then print their sizes and the first
# training example as a quick sanity check.
train_dataset = MyDataset(encodings=train_encodings, labels=train_labels)
val_dataset = MyDataset(encodings=val_encodings, labels=val_labels)
print(len(train_dataset), len(val_dataset))
print(train_dataset[0])

51300 2700
{'input_ids': tensor([    5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5,     5,  9117,    56,    76,   203,
           55,    18,    89,   468,   263,   177,    19,    52, 11968,    22,
           67,   620,  5976,  3165,    17,    10, 11608,    11,    19,  1898,
           22,    39,   176,    70, 28919,   137,    18,   929,     9,     4,
            3], device='cuda:0'), 'attention_mask': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
!pip install evaluate --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import evaluate

# Accuracy metric object used by `compute_metrics` below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score an evaluation pass with the accuracy metric.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each
            example is the argmax over the last axis of ``logits``.

    Returns:
        Whatever ``metric.compute`` returns for those predictions and references.
    """
    scores, references = eval_pred
    predicted_ids = np.asarray(scores).argmax(axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Reference: https://huggingface.co/transformers/v4.2.2/main_classes/trainer.html#trainingarguments
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=64,
    learning_rate=0.0000045,
    weight_decay=0.65,
    dataloader_pin_memory=False,     # remove if possible for faster training
    # Checkpointing is switched off with the documented option rather than by
    # setting an unreachable `save_steps` value.
    save_strategy="no",
    evaluation_strategy = "epoch",   # evaluate on the validation set after every epoch
    output_dir="./results"
)

# Pretrained XLNet with a freshly initialised classification head of `num_classes` outputs.
model = AutoModelForSequenceClassification.from_pretrained('xlnet-large-cased', num_labels=num_classes).to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

Downloading pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

In [None]:
# Run fine-tuning with the Trainer configured above; the returned training
# summary is displayed as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,1.6656,1.638963,0.411852
2,1.4944,1.652795,0.419259
3,1.3953,1.703733,0.420741


TrainOutput(global_step=38475, training_loss=1.5559087107598277, metrics={'train_runtime': 16539.4536, 'train_samples_per_second': 9.305, 'train_steps_per_second': 2.326, 'total_flos': 3.064268332521e+16, 'train_loss': 1.5559087107598277, 'epoch': 3.0})

In [None]:
# Test split of the movie-genre dataset, read directly from GitHub as parquet.
TEST_PARQUET_URL = 'https://github.com/amiune/amiune.github.io/raw/master/movie-genre-prediction/test.parquet'
df_test = pd.read_parquet(TEST_PARQUET_URL, engine='pyarrow')
df_test.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,16863,A Death Sentence,"12 y.o. Ida's dad'll die without a DKK1,500,00...",action
1,48456,Intermedio,A group of four teenage friends become trapped...,action
2,41383,30 Chua Phai Tet,A guy left his home for 12 years till he came ...,action
3,84007,Paranoiac,A man long believed dead returns to the family...,action
4,40269,Ordinary Happiness,"After a deadly accident, Paolo comes back on E...",action


In [None]:
# Combined "<movie_name>. <synopsis>" text for every test row, as a plain list;
# the first entry is displayed as a sanity check.
df_test["text"] = df_test["movie_name"].add(". ").add(df_test["synopsis"])
test_texts = df_test["text"].tolist()
test_texts[0]

"A Death Sentence. 12 y.o. Ida's dad'll die without a DKK1,500,000 operation. Ida plans to steal the money from the bank, her mom installed alarm systems in. She'll need her climbing skills, her 2 friends and 3 go-karts."

In [None]:
from google.colab import files  # Colab-only helper used to download the generated CSVs

# Inference must run in eval mode (dropout disabled) and without gradient tracking
# (no autograd graph is built, which saves memory and time).
model.eval()
test_preds = []
test_probs = np.zeros((len(test_texts), num_classes))
with torch.no_grad():
    for i, text in enumerate(test_texts):
        encoding = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
        # One example per forward pass: drop the batch axis -> vector of class logits.
        logits = model(**encoding).logits.squeeze(0).float().cpu()
        # Predicted genre = class with the highest logit.
        test_preds.append(id2label[int(logits.argmax())])
        # The classifier is single-label (one genre per movie), so class probabilities
        # come from a softmax over the logits; an element-wise sigmoid would produce
        # scores that do not sum to 1.
        test_probs[i] = torch.softmax(logits, dim=-1).numpy()

# Per-class probabilities: one column per genre, keyed by the test ids.
prob_columns = {"id": df_test["id"]}
for j in range(num_classes):
    prob_columns[id2label[j]] = test_probs[:, j]
df_probs = pd.DataFrame(prob_columns)
df_probs.to_csv("xlnet_large_probs2.csv", index=False)
files.download('xlnet_large_probs2.csv')

# Submission file: predicted genre per test id.
df_submission = pd.DataFrame({"id": df_test["id"], "genre": test_preds})
df_submission.to_csv("submission6.csv", index=False)
files.download('submission6.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>