# **Modelo 2 (Generación de Poemas) GPT-2**


---

In [None]:
pip install transformers -U

In [None]:
# Basicas
import pandas as pd
import numpy as np
import os
import time
import datetime
# Pytorch
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
# Texto
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import get_scheduler
import random
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
from transformers import pipeline
# Sklearn
from sklearn.model_selection import train_test_split
# Helper functions and variables
AVAIL_GPUS = min(torch.cuda.device_count(), 1)  # 1 when at least one CUDA device is visible, else 0
def format_time(elapsed):
    """Format a duration given in seconds as a string.

    The value is rounded to the nearest whole second and rendered with
    ``str(datetime.timedelta)``, e.g. ``3661.4`` -> ``'1:01:01'``.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

# **Datos**

In [None]:
# Download the poems corpus and discard every row that has a missing value
url = 'https://raw.githubusercontent.com/andreamorgar/poesIA/master/data/poems.csv'
poems_df = pd.read_csv(url).dropna()

In [None]:
# Keep only short poems and draw a reproducible 90% subsample of them
MAX_POEM_LENGTH = 500  # upper bound, in characters, of "\n<title>\n<content>"
SAMPLE_SEED = 73       # same value as RANDOM_SEED (defined further down); makes the subsample repeatable

# Text fed to the model: the title on its own line followed by the poem body
poems_df['string'] = poems_df.apply(lambda row: f'\n{row["title"]}\n{row["content"]}', axis=1)
poems_df['length'] = poems_df['string'].map(len)
poems_short = poems_df[poems_df['length'] < MAX_POEM_LENGTH]

# train_test_split is used only as a sampler here: the 10% "train" part is
# discarded and the 90% "test" part becomes the working set. Without
# random_state a different subset was drawn on every run.
_, poems_filtered = train_test_split(
    poems_short,
    test_size=0.9,
    shuffle=True,
    random_state=SAMPLE_SEED,
)
poems_filtered

Unnamed: 0,author,content,title,string,length
3420,Amado Nervo,\n¿Versos autobiográficos ? Ahí están mis canc...,Autobiografía,\nAutobiografía\n\n¿Versos autobiográficos ? A...,468
4687,Oliverio Girondo,\n\nMenos rodante dado\ndeliquio sumo psíquico...,MENOS,\nMENOS\n\n\nMenos rodante dado\ndeliquio sumo...,314
2113,Genaro Ortega Gutiérrez,Obligados a abandonar\nmuchos sueños ya rotos ...,Vida íntima de la pleura,\nVida íntima de la pleura\nObligados a abando...,429
3046,Alejandra Pizarnik,\n\nEn el eco de mis muertes\naún hay miedo.\n...,EL MIEDO,\nEL MIEDO\n\n\nEn el eco de mis muertes\naún ...,276
3010,Juan Ramón Jiménez,"\n\nSólo lo hiciste un momento.\nMas quedaste,...",LA ACTITUD,\nLA ACTITUD\n\n\nSólo lo hiciste un momento.\...,96
...,...,...,...,...,...
1106,Mario Benedetti,De un tiempo a esta parte\nel infinito\nse ha ...,El infinito,\nEl infinito\nDe un tiempo a esta parte\nel i...,211
2451,Alfredo Buxán,"¿Qué bien echas en falta si respiras,\nsi cuel...",El resentido,\nEl resentido\n¿Qué bien echas en falta si re...,322
3268,Nicanor Parra,\n\n 1\n\nYa no me queda nada por deci...,TRES POESÍAS,\nTRES POESÍAS\n\n\n 1\n\nYa no me que...,369
2559,José Ángel Buesa,"\n\nUn gran amor, un gran amor lejano\nes algo...",EL GRAN AMOR,"\nEL GRAN AMOR\n\n\nUn gran amor, un gran amor...",432


In [None]:
# Show the poem stored at position 20 of the filtered set
print(poems_filtered['string'].iloc[20])


NO PUEDE


No puede conmigo
la tristeza
la arrastro hacia la vida
y se evapora.


# **Tokens para los datos (modelo DeepESP/gpt2-spanish)**

In [None]:
# Model configuration -----------------------------------------------------------\
RANDOM_SEED = 73  # Seed
modelo_gpt = "DeepESP/gpt2-spanish"  # Pre-trained model
max_length = 800  # Maximum poem length (used as the tokenizer's max_length)
df = poems_filtered.loc[:, 'string']  # Data

In [None]:
# Load the pre-trained tokenizer and register the begin/end/padding markers
special_tokens_dict = dict(bos_token='<BOS>', eos_token='<EOS>', pad_token='<PAD>')
tokenizer = GPT2Tokenizer.from_pretrained(modelo_gpt)
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
# Tokenized dataset ---------------------------------------------------------------\
class DataTokens(Dataset):
  """Dataset of tokenized poems.

  Every text in ``data`` is wrapped as ``'<BOS>' + text + '<EOS>'`` and passed
  to ``tokenizer`` with truncation and ``padding="max_length"``. Indexing
  returns the pair ``(input_ids, attention_mask)`` as tensors.

  Args:
    data: iterable of strings.
    tokenizer: callable returning a mapping with ``'input_ids'`` and
      ``'attention_mask'``.
    gpt2_type: accepted but not used by this class.
    max_length: value forwarded to the tokenizer's ``max_length``.
  """
  def __init__(self, data, tokenizer, gpt2_type="gpt2", max_length=max_length):
    self.tokenizer = tokenizer
    self.input_ids, self.attn_masks = [], []
    for poem in data:
      wrapped = '<BOS>' + poem + '<EOS>'
      encoded = self.tokenizer(wrapped, truncation=True, max_length=max_length, padding="max_length")
      # The most recent encoding stays available on the instance
      self.encodings_dict = encoded
      self.input_ids.append(torch.tensor(encoded['input_ids']))
      self.attn_masks.append(torch.tensor(encoded['attention_mask']))
  def __len__(self):
    """Number of tokenized texts."""
    return len(self.input_ids)
  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask)`` for the text at ``idx``."""
    ids = self.input_ids[idx]
    mask = self.attn_masks[idx]
    return ids, mask
# Data module: tokenizes the texts and serves train/validation loaders ----------\
class DataModule():
  """Tokenizes a collection of texts and exposes train/validation DataLoaders.

  Args:
    dataset: iterable of raw strings (it is replaced by its tokenized
      ``DataTokens`` version when ``setup`` runs).
    tokenizer: tokenizer handed to ``DataTokens``.
    gpt2_type: forwarded to ``DataTokens``.
    batch_size: batch size used by both loaders.
    p: fraction of the examples assigned to the training split.
  """
  def __init__(self, dataset, tokenizer, gpt2_type="gpt2", batch_size = 32, p = 0.8):
    super(DataModule,self).__init__()
    self.batch_size = batch_size
    self.dataset = dataset
    self.tokenizer = tokenizer
    # Kept on the instance: setup() previously read an undefined name (NameError)
    self.gpt2_type = gpt2_type
    self.p = p
  def train_val_split(self, split, dataset):
    """Return ``(train_size, val_size)`` for ``dataset`` given the train fraction ``split``."""
    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size
    return train_size, val_size
  def setup(self, stage=None):
    """Tokenize the texts (once) and randomly split them into train/validation subsets."""
    # Guard so that calling setup() again does not try to tokenize tensors
    if not isinstance(self.dataset, DataTokens):
      self.dataset = DataTokens(self.dataset, self.tokenizer, gpt2_type=self.gpt2_type)
    train_size, val_size = self.train_val_split(self.p, self.dataset)
    self.train_dataset, self.val_dataset = random_split(self.dataset, [train_size, val_size])
  def train_dataloader(self):
    """Training loader; batches are reshuffled on every pass."""
    return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
  def val_dataloader(self):
    """Validation loader; fixed order."""
    return DataLoader(self.val_dataset, batch_size=self.batch_size)

# **Reentrenamiento para el modelo (DeepESP/gpt2-spanish)**

In [None]:
# Seed every random number generator in use --------------------------------------\
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(RANDOM_SEED)

In [None]:
# Model fine-tuning -----------------------------------------------------------------\
class Trainer_poet():
    """Fine-tunes a causal language model on a collection of texts.

    Args:
      dataset: iterable of raw strings; tokenized and split 80/20 into
        train/validation by ``DataModule``.
      model: language model whose forward accepts ``input_ids``,
        ``attention_mask`` and ``labels`` and returns the loss first.
      batch_size: batch size for both loaders.
      epochs: number of passes over the training split.
      learning_rate, eps: AdamW hyper-parameters.
      warmup_steps: warm-up steps of the linear learning-rate schedule.
      tokenizer: tokenizer used to encode ``dataset``. When omitted, the
        notebook-level ``tokenizer`` variable is used.

    Raises:
      ValueError: if no tokenizer is given and none is defined at module level.
    """
    def __init__(self, dataset, model, batch_size=16, epochs=5, learning_rate = 1e-4, eps = 1e-8, warmup_steps=50, tokenizer=None):
      if tokenizer is None:
        # Backward-compatible default for callers that do not pass a tokenizer
        tokenizer = globals().get('tokenizer')
      if tokenizer is None:
        raise ValueError('Trainer_poet needs a tokenizer: pass tokenizer=... or define a module-level `tokenizer`.')
      self.tokenizer = tokenizer
      # DataLoaders
      self.data_loader = DataModule(dataset, tokenizer, batch_size = batch_size, p = 0.8)
      self.data_loader.setup()
      self.train_dataloader = self.data_loader.train_dataloader()
      self.val_dataloader = self.data_loader.val_dataloader()
      # Model
      self.model = model
      # Special tokens added to the tokenizer get ids beyond the pre-trained
      # vocabulary; grow the embedding matrix so those ids can be looked up.
      if model.get_input_embeddings().num_embeddings < len(tokenizer):
        model.resize_token_embeddings(len(tokenizer))
      self.epochs = epochs
      self.batch_size = batch_size
      # torch's AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
      # keeps the default that the transformers implementation used.
      self.optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=eps, weight_decay=0.0)
      total_steps = len(self.train_dataloader) * epochs
      self.scheduler = get_scheduler(name='linear',optimizer=self.optimizer,num_warmup_steps=warmup_steps,num_training_steps=total_steps)
    def _batch_loss(self, model, batch, device):
      """Return the language-modelling loss of one ``(input_ids, attention_mask)`` batch."""
      b_input_ids = batch[0].to(device)
      b_masks = batch[1].to(device)
      # Positions labelled -100 are ignored by the Hugging Face LM loss, so the
      # padding added by the tokenizer does not contribute to it.
      b_labels = b_input_ids.masked_fill(b_masks == 0, -100)
      outputs = model(b_input_ids,labels=b_labels,attention_mask=b_masks)
      return outputs[0]
    def train(self):
      """Run the training/validation loop and return the fine-tuned model.

      After the call, ``total_train_loss`` and ``total_eval_loss`` hold the
      summed batch losses of the last epoch.
      """
      device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
      model = self.model
      model.to(device)
      start_time = time.time()
      # Defined up front so they exist even when epochs == 0
      total_train_loss = 0
      total_eval_loss = 0
      # Training
      if torch.cuda.is_available():
        torch.cuda.empty_cache()
      print('Inicio entrenamiento ....')
      for epoch_i in range(self.epochs):
        print(f'Epoch {epoch_i + 1} de {self.epochs}')
        t0 = time.time()
        # Validation switches to eval mode, so training mode is restored every epoch
        model.train()
        total_train_loss = 0
        for batch in self.train_dataloader:
          self.optimizer.zero_grad()
          loss = self._batch_loss(model, batch, device)
          total_train_loss += loss.item()
          # Gradients must exist before the optimizer consumes them
          loss.backward()
          self.optimizer.step()
          self.scheduler.step()
        avg_train_loss = total_train_loss / max(len(self.train_dataloader), 1)
        training_time = format_time(time.time() - t0)
        print(f'Average Training Loss: {avg_train_loss}. Epoch Training Time: {training_time}')
        # Validation
        model.eval()
        total_eval_loss = 0
        with torch.no_grad():
          for batch in self.val_dataloader:
            loss = self._batch_loss(model, batch, device)
            total_eval_loss += loss.item()
        avg_val_loss = total_eval_loss / max(len(self.val_dataloader), 1)
        print(f'Average Validation Loss: {avg_val_loss}')
      self.total_train_loss = total_train_loss
      self.total_eval_loss = total_eval_loss
      self.model = model
      print(f'Total Training Time: {format_time(time.time()-start_time)}')
      return model

In [None]:
# Load the pre-trained model, fine-tune it on the poems and save the result
model_gpt2_esp = GPT2LMHeadModel.from_pretrained(modelo_gpt, pad_token_id=tokenizer.eos_token_id)
# <BOS>/<EOS>/<PAD> were added to the tokenizer, so their ids lie beyond the
# pre-trained embedding matrix; resize it before any of them is fed to the model.
model_gpt2_esp.resize_token_embeddings(len(tokenizer))
Trainer_model = Trainer_poet(df, model_gpt2_esp, epochs=50)
model = Trainer_model.train()
torch.save(model, 'modelo_gpt2_poesia.pt')

# **Generación de Poesía**

In [None]:
# The checkpoint is a fully pickled model object (written with torch.save(model, ...)),
# so it needs weights_only=False; only load files you created yourself, since
# unpickling can execute arbitrary code. map_location lets a checkpoint saved
# on a GPU be opened on a CPU-only machine.
model = torch.load('modelo_gpt2_poesia.pt', map_location='cpu', weights_only=False)

In [None]:
def generate(model,tokenizer,prompt,length=60,top_p=0.8,temperature=1.):
    """Continue ``prompt`` token by token using nucleus (top-p) sampling.

    Args:
      model: language model; called with the token ids generated so far and
        expected to return the logits (as ``.logits`` or as first element).
      tokenizer: object providing ``encode`` and ``decode``.
      prompt: text to continue; must encode to at least one token.
      length: maximum number of new tokens.
      top_p: cumulative-probability threshold of the nucleus.
      temperature: divisor applied to the logits; values <= 0 are treated as 1.

    Returns:
      The decoded text (prompt plus continuation) with special tokens removed.

    Raises:
      ValueError: if ``prompt`` encodes to no tokens.
    """
    model.eval()
    # Build the input on the same device as the model
    try:
        device = next(model.parameters()).device
    except (StopIteration, AttributeError):
        device = torch.device('cpu')
    prompt_ids = tokenizer.encode(prompt)
    if len(prompt_ids) == 0:
        raise ValueError('prompt must encode to at least one token')
    # Stop on the tokenizer's own end-of-sequence id as well as on the
    # "<|endoftext|>" marker that was the only one checked before.
    stop_ids = set(tokenizer.encode("<|endoftext|>"))
    eos_id = getattr(tokenizer, 'eos_token_id', None)
    if eos_id is not None:
        stop_ids.add(eos_id)
    filter_value = -float("Inf")
    scale = temperature if temperature > 0 else 1.0
    with torch.no_grad():
        generated = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
        for _ in range(length):
            outputs = model(generated)
            logits = outputs.logits if hasattr(outputs, 'logits') else outputs[0]
            # Only the distribution for the next position is needed
            logits = logits[:, -1, :] / scale
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift right so the first token that crosses top_p is kept, and
            # never remove the most likely token.
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0
            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            logits[:, indices_to_remove] = filter_value
            next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
            if next_token.item() in stop_ids:
                break
    # squeeze(0) keeps a 1-D sequence even when it holds a single token
    output_ids = generated.squeeze(0).tolist()
    return tokenizer.decode(output_ids, skip_special_tokens=True)

In [None]:
# Sample a poem from the prompt below with the hand-written nucleus sampler (on CPU)
Palabra = 'CIELO ESTRELLADO'
text = generate(
    model.to('cpu'),
    tokenizer,
    Palabra,
    length=100,
    top_p=0.8,
    temperature=0.7,
)
print(text)

In [None]:
def Generate2(model,Palabra):
  """Generate one text continuing ``Palabra`` with ``model.generate``.

  Uses the notebook-level ``tokenizer`` and samples with top-k 50 / top-p 0.95,
  up to 40 tokens in total (prompt included).

  Args:
    model: model exposing ``generate``.
    Palabra: prompt text.

  Returns:
    The decoded text with special tokens removed.
  """
  device = next(model.parameters()).device
  # The tokenizer call also yields the attention mask that generate() expects
  encoded = tokenizer(Palabra, return_tensors="pt").to(device)
  output = model.generate(
      encoded["input_ids"],
      attention_mask=encoded["attention_mask"],
      do_sample=True,
      top_k=50,
      max_length=40,
      top_p=0.95,
      # Only one sequence is returned to the caller, so only one is sampled
      num_return_sequences=1,
      # Use the end/padding markers registered on the tokenizer
      eos_token_id=tokenizer.eos_token_id,
      pad_token_id=tokenizer.pad_token_id,
  )
  return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Sample a poem from the same prompt through model.generate (on CPU)
Palabra = 'CIELO ESTRELLADO'
cpu_model = model.to('cpu')
text = Generate2(cpu_model, Palabra)
print(text)