# **Modelo 2 (Generación de Poemas) GPT2**


---

In [1]:
pip install transformers -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 7.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 6.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 85.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling Py

In [2]:
# Basicas
import pandas as pd
import numpy as np
import os
import time
import datetime
# Pytorch
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
# Texto
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import get_scheduler
import random
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
from transformers import pipeline
# Sklearn
from sklearn.model_selection import train_test_split
# Funciones y variables
def format_time(elapsed):
    """Render a duration in seconds as the string form of a ``datetime.timedelta``.

    The duration is rounded to the nearest whole second first, so ``3661.4``
    becomes ``'1:01:01'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [3]:
# Model settings ---------------------------------------------------------------
RANDOM_SEED = 2022                   # Seed
modelo_gpt = "DeepESP/gpt2-spanish"  # Pretrained model
max_length = 800                     # Maximum poem length

# **Datos**

In [4]:
# Download the poems CSV and drop every row that has a missing value.
url = 'https://raw.githubusercontent.com/andreamorgar/poesIA/master/data/poems.csv'
poems_df = pd.read_csv(url).dropna()

In [5]:
# Drop the long poems
# Each training text is "\n<title>\n<content>".
poems_df['string'] = poems_df.apply(lambda row: f'\n{row["title"]}\n{row["content"]}', axis=1)
poems_df['length'] = poems_df['string'].map(len)
poems_filtered = poems_df[poems_df['length'] < max_length]
# train_test_split is used only as a seeded shuffle + subsample: the 90% "test"
# side is kept. The seed comes from RANDOM_SEED instead of a repeated literal.
_ , poems_filtered = train_test_split(poems_filtered, test_size=0.9, shuffle=True, random_state=RANDOM_SEED)
poems_filtered

Unnamed: 0,author,content,title,string,length
2544,Francisco de Figueroa,"\n\nPerdido ando, señora, entre la gente,\nsin...",SONETO XVII,"\nSONETO XVII\n\n\nPerdido ando, señora, entre...",558
908,Luis de Góngora,"\n\n¡Oh, de alto valor, de virtud rara\nSacro ...",A DON ANTONIO VENEGAS,"\nA DON ANTONIO VENEGAS\n\n\n¡Oh, de alto valo...",511
5029,José Gautier Benítez,\n\nCuando no reste ya ni un solo grano\nde mi...,A MIS AMIGOS,\nA MIS AMIGOS\n\n\nCuando no reste ya ni un s...,599
3942,Víctor Jiménez,"A la ausencia, al olvido, a la nostalgia\nmi c...",Tango para engañar a la tristeza,\nTango para engañar a la tristeza\nA la ausen...,272
3927,Gabriel García Márquez,"Si alguien llama a tu puerta, amiga mía,\ny al...",Si alguien llama a tu puerta,\nSi alguien llama a tu puerta\nSi alguien lla...,544
...,...,...,...,...,...
479,Jorge Teillier,\n\nSentados frente al fuego que envejece\nmir...,SENTADOS FRENTE AL FUEGO,\nSENTADOS FRENTE AL FUEGO\n\n\nSentados frent...,703
4052,Juan Ramón Jiménez,\n\n¡Qué miedo el azul del cielo!\n¡Negro!\n¡N...,TRASCIELO DEL CIELO AZUL,\nTRASCIELO DEL CIELO AZUL\n\n\n¡Qué miedo el ...,267
299,Víctor Botas,No me preguntes cómo pasa el tiempo\nLi Kiu Li...,Las rosas de Babilonia,\nLas rosas de Babilonia\nNo me preguntes cómo...,703
2065,Toni García Arias,Decías unas cosas que me asustaban.\nEn cubier...,Sobre la cubierta,\nSobre la cubierta\nDecías unas cosas que me ...,372


In [6]:
# Print one sample text (by position) from the filtered poems.
print(poems_filtered['string'].iloc[20])


Llegué a Valladolid; registré luego


Llegué a Valladolid; registré luego
Desde el bonete al clavo de la mula;
Guardo el registro, que será mi bula
Contra el cuidado del señor don Diego.

Busqué la Corte en él, y yo estoy ciego,
O en la ciudad no está, o se disimula.
Celebrando dïetas vi a la gula,
Que Platón para todos está en griego.

La lisonja hallé y la ceremonia
Con luto, idolatrados los caciques,
Amor sin fe, interés con sus virotes.

Todo se halla en esta Babilonia,
Como en botica, grandes alambiques,
Y más en ella títulos que botes.


# **Tokens para los datos (modelo DeepESP/gpt2-spanish)**

In [7]:
# Training texts
df = poems_filtered['string']
# Tokenizer of the pretrained model ---------------------------------------------
tokenizer = GPT2Tokenizer.from_pretrained(modelo_gpt)
# Register the begin/end/padding markers as special tokens.
special_tokens_dict = dict(
    bos_token='<BOS>',
    eos_token='<EOS>',
    pad_token='<PAD>',
)
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

Downloading:   0%|          | 0.00/821k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/487k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/262 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/115 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/914 [00:00<?, ?B/s]

In [8]:
# Tokenized dataset -------------------------------------------------------------
class DataTokens(Dataset):
  """Dataset of texts encoded as fixed-length token-id and attention-mask tensors.

  Each text is wrapped as ``<BOS>text<EOS>`` and passed to ``tokenizer`` with
  truncation and padding to ``max_length``. ``gpt2_type`` is accepted but not
  used by this class.
  """

  def __init__(self, data, tokenizer, gpt2_type="gpt2", max_length=max_length):
    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []
    for text in data:
      wrapped = '<BOS>' + text + '<EOS>'
      # The latest encoding stays available on the instance, as before.
      self.encodings_dict = self.tokenizer(
          wrapped,
          padding="max_length",
          truncation=True,
          max_length=max_length,
      )
      ids = self.encodings_dict['input_ids']
      mask = self.encodings_dict['attention_mask']
      self.input_ids.append(torch.tensor(ids))
      self.attn_masks.append(torch.tensor(mask))

  def __len__(self):
    """Number of encoded texts."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
    item = (self.input_ids[idx], self.attn_masks[idx])
    return item
# Data module -------------------------------------------------------------------
class DataModule():
  """Tokenize raw texts and expose train/validation DataLoaders.

  Args:
    dataset: iterable of raw strings. After ``setup()`` this attribute holds
      the tokenized ``DataTokens`` dataset instead.
    tokenizer: tokenizer forwarded to ``DataTokens``.
    gpt2_type: forwarded to ``DataTokens``.
    p: fraction of the samples assigned to the training split.
  """

  def __init__(self, dataset, tokenizer, gpt2_type="gpt2", p = 0.8):
    super().__init__()
    self.dataset = dataset
    self.tokenizer = tokenizer
    self.p = p
    self.gpt2_type = gpt2_type
    # Filled in by setup(); None means "not split yet".
    self.train_dataset = None
    self.val_dataset = None

  def train_val_split(self, split, dataset):
    """Return ``(train_size, val_size)`` for ``dataset`` given the train fraction ``split``."""
    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size
    return train_size, val_size

  def setup(self, stage=None):
    """Tokenize the texts and split them into train/validation subsets.

    Safe to call more than once: after the first call the existing split is
    kept. Without this guard a second call would feed the already tokenized
    dataset back into ``DataTokens`` and fail. ``stage`` is accepted but unused.
    """
    if self.train_dataset is not None and self.val_dataset is not None:
      return
    self.dataset = DataTokens(self.dataset, self.tokenizer, gpt2_type=self.gpt2_type)
    train_size, val_size = self.train_val_split(self.p, self.dataset)
    self.train_dataset, self.val_dataset = random_split(self.dataset, [train_size, val_size])

  def train_dataloader(self, batch_size = 32, shuffle = True):
    """Training DataLoader; batches are reshuffled every epoch unless ``shuffle=False``."""
    return torch.utils.data.DataLoader(self.train_dataset, batch_size=batch_size, shuffle=shuffle)

  def val_dataloader(self, batch_size = 32):
    """Validation DataLoader (fixed order)."""
    return torch.utils.data.DataLoader(self.val_dataset, batch_size=batch_size)

# **Reentrenamiento para el modelo (DeepESP/gpt2-spanish)**

In [9]:
# Fix the seeds -----------------------------------------------------------------
for seed_fn in (torch.cuda.manual_seed_all, random.seed, np.random.seed):
    seed_fn(RANDOM_SEED)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7fa9486e46f0>

In [10]:
# Model training ----------------------------------------------------------------
class Trainer_poet():
    """Fine-tune a causal language model on the loaders supplied by a data module.

    Args:
        dataset: object exposing ``setup()``, ``train_dataloader(batch_size)``
            and ``val_dataloader(batch_size)``; ``setup()`` is called here.
        model: model called as ``model(input_ids, labels=..., attention_mask=...)``
            whose first output is the loss.
        batch_size: batch size requested from both loaders.
        epochs: number of passes over the training loader.
        learning_rate: AdamW learning rate.
        eps: AdamW epsilon.
        warmup_steps: warm-up steps of the linear learning-rate schedule.
    """

    def __init__(self, dataset, model, batch_size=16, epochs=5, learning_rate = 1e-4, eps = 1e-8, warmup_steps=50):
        # DataLoaders
        self.data_loader = dataset
        self.data_loader.setup()
        self.train_dataloader = self.data_loader.train_dataloader(batch_size)
        self.val_dataloader = self.data_loader.val_dataloader(batch_size)
        # Model, optimizer and learning-rate schedule
        self.model = model
        self.epochs = epochs
        self.optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)
        total_steps = len(self.train_dataloader) * epochs
        self.scheduler = get_linear_schedule_with_warmup(optimizer=self.optimizer,num_warmup_steps=warmup_steps,num_training_steps=total_steps)

    @staticmethod
    def _masked_labels(input_ids, attention_mask):
        """Copy ``input_ids`` with masked-out (padding) positions set to -100 so the loss skips them."""
        return input_ids.masked_fill(attention_mask == 0, -100)

    def train(self):
        """Run the training/validation loop and return the trained model.

        Per epoch: one pass over the training loader, then one pass over the
        validation loader without gradients. The last epoch's summed losses
        are stored in ``self.total_train_loss`` and ``self.total_eval_loss``.
        """
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        model = self.model
        model.to(device)
        start_time = time.time()
        print('Inicio entrenamiento ....')
        train_dataloader = self.train_dataloader
        val_dataloader = self.val_dataloader
        # Defined up front so the attributes below exist even when epochs == 0.
        total_train_loss = 0
        total_eval_loss = 0
        for epoch_i in range(self.epochs):
            print(f'Epoch {epoch_i + 1} de {self.epochs}')
            t0 = time.time()
            # Validation leaves the model in eval mode; switch back every epoch.
            model.train()
            total_train_loss = 0
            for batch in train_dataloader:
                b_input_ids = batch[0].to(device)
                b_masks = batch[1].to(device)
                labels = self._masked_labels(b_input_ids, b_masks)
                # Order matters: clear old gradients, compute new ones, then step.
                self.optimizer.zero_grad()
                outputs = model(b_input_ids, labels=labels, attention_mask=b_masks)
                loss = outputs[0]
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()
                total_train_loss += loss.item()
            # max(..., 1) avoids a ZeroDivisionError on an empty loader.
            avg_train_loss = total_train_loss / max(len(train_dataloader), 1)
            training_time = format_time(time.time() - t0)
            print(f'Average Training Loss: {avg_train_loss}. Epoch Training Time: {training_time}')
            # Validation
            model.eval()
            total_eval_loss = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    b_input_ids = batch[0].to(device)
                    b_masks = batch[1].to(device)
                    labels = self._masked_labels(b_input_ids, b_masks)
                    outputs = model(b_input_ids, attention_mask=b_masks, labels=labels)
                    total_eval_loss += outputs[0].item()
            avg_val_loss = total_eval_loss / max(len(val_dataloader), 1)
            print(f'Average Validation Loss: {avg_val_loss}')
        self.total_train_loss = total_train_loss
        self.total_eval_loss = total_eval_loss
        self.model = model
        print(f'Total Training Time: {format_time(time.time()-start_time)}')
        return model

In [None]:
# from_pretrained is a classmethod: arguments given to a GPT2Config(...) instance
# created first are discarded, so the pretrained config is loaded directly.
configuration = GPT2Config.from_pretrained(modelo_gpt, output_hidden_states=True)
assert max_length <= configuration.n_positions, "max_length exceeds the model's positional embeddings"
model_gpt2_esp = GPT2LMHeadModel.from_pretrained(modelo_gpt, config=configuration)
# The tokenizer gained <BOS>/<EOS>/<PAD>, so the embedding matrix is resized to match.
model_gpt2_esp.resize_token_embeddings(len(tokenizer))

# Not named `Dataset`: that would shadow torch.utils.data.Dataset and break
# re-running the cell that defines DataTokens.
poems_data = DataModule(df, tokenizer, gpt2_type=modelo_gpt)
Trainer_model = Trainer_poet(poems_data, model_gpt2_esp, epochs=50, batch_size=8)

In [None]:
# Fine-tune, then pickle the whole model object to disk.
model = Trainer_model.train()
torch.save(obj=model, f='modelo_gpt2_poesia.pt')

# **Generación de Poesía**

In [None]:
# map_location='cpu' lets a checkpoint saved on a GPU runtime load on a CPU-only
# one; the generation cells below run the model on CPU anyway.
# NOTE: torch.load unpickles arbitrary objects - only load checkpoints you created yourself.
model = torch.load('modelo_gpt2_poesia.pt', map_location='cpu')

In [None]:
def generate(model,tokenizer,prompt,length=60,top_p=0.8,temperature=1.):
    """Continue ``prompt`` token by token with temperature and top-p (nucleus) sampling.

    Args:
        model: language model; ``model(input_ids)`` must return an object with
            a ``.logits`` attribute or a tuple whose first item is the logits.
        tokenizer: provides ``encode``, ``decode`` and ``eos_token_id``.
        prompt: text to continue.
        length: maximum number of tokens to sample.
        top_p: cumulative-probability cutoff; the most likely token is always kept.
        temperature: logits divisor; values <= 0 are treated as 1.0.

    Returns:
        The decoded text (prompt plus continuation) with special tokens skipped.

    Raises:
        ValueError: if ``prompt`` encodes to zero tokens.
    """
    prompt_ids = tokenizer.encode(prompt)
    if len(prompt_ids) == 0:
        raise ValueError("prompt must encode to at least one token")
    model.eval()
    # Build the input on the model's own device instead of assuming CPU.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device('cpu')
    # Stop on the tokenizer's own end-of-sequence id.
    eos_token_id = tokenizer.eos_token_id
    filter_value = -float("Inf")
    generated = torch.tensor([prompt_ids], dtype=torch.long, device=device)
    with torch.no_grad():
        for _ in range(length):
            # No labels: only the logits are needed, so no loss is computed.
            outputs = model(generated)
            logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
            logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift right so the first token crossing the threshold is kept too.
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = False
            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            logits[:, indices_to_remove] = filter_value
            next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
            if eos_token_id is not None and next_token.item() == eos_token_id:
                break
    # Index the batch dimension rather than squeeze(), which breaks on a single token.
    output_ids = generated[0].tolist()
    return tokenizer.decode(output_ids, skip_special_tokens=True)

In [None]:
# Sample a poem from a short prompt; the model is moved to CPU first.
Palabra = 'CIELO ESTRELLADO'
text = generate(
    model.to('cpu'),
    tokenizer,
    prompt=Palabra,
    length=100,
    top_p=0.8,
    temperature=0.7,
)
print(text)

In [None]:
def Generate2(model,Palabra,max_length=40,top_k=50,top_p=0.95):
    """Sample one continuation of ``Palabra`` through ``model.generate``.

    Uses the notebook-level ``tokenizer`` for encoding and decoding.

    Args:
        model: model exposing ``generate``.
        Palabra: prompt text.
        max_length: value passed to ``generate`` as ``max_length``.
        top_k: number of highest-scoring tokens kept for sampling.
        top_p: nucleus-sampling cumulative-probability cutoff.

    Returns:
        The decoded first sequence with special tokens skipped.
    """
    input_ids = tokenizer.encode(Palabra, return_tensors="pt")
    output = model.generate(
        input_ids,
        do_sample=True,
        top_k=top_k,
        max_length=max_length,
        top_p=top_p,
        # Only the first sequence is returned, so only one is sampled.
        num_return_sequences=1,
        # Stop and pad with the tokenizer's own special tokens rather than the
        # ids stored in the model's config.
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Same prompt, this time through the model.generate-based helper (on CPU).
Palabra = 'CIELO ESTRELLADO'
text = Generate2(model=model.to('cpu'), Palabra=Palabra)
print(text)