# **Modelo 2 (Generación de Poemas) GPT-2**


---

In [None]:
pip install transformers -U

In [None]:
# Basicas
import pandas as pd
import numpy as np
import os
# Pytorch
import torch
from torch import nn
import pytorch_lightning as pl
from torch.utils.data import Dataset, TensorDataset, DataLoader
import torch.nn.functional as F
# Texto
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import random
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
from transformers import pipeline

# Use CUDA when PyTorch reports an available GPU, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 1 when at least one CUDA device is visible, 0 otherwise.
AVAIL_GPUS = min(1, torch.cuda.device_count())

# **Datos**

In [7]:
# Download the poem corpus (CSV) and discard every row that has a missing value.
url = 'https://raw.githubusercontent.com/andreamorgar/poesIA/master/data/poems.csv'
poems_df = pd.read_csv(url).dropna()

In [56]:
# Filter out long poems.
# Each training string is "\n<title>\n<content>"; `length` is its size in characters (not tokens).
poems_df['string'] = poems_df.apply(lambda row: f'\n{row["title"]}\n{row["content"]}', axis=1)
poems_df['length'] = poems_df.string.map(len)
MAX_POEM_LENGTH=1000
poems_filtered = poems_df[poems_df.length<MAX_POEM_LENGTH]
# Keep a shuffled random half of the short poems. This replaces the previous
# `train_test_split(..., test_size=0.5)` call, whose import was missing (NameError on a fresh
# kernel); the sample size matches the "test" half that call returned (ceil(n / 2)), and the
# fixed seed makes the subset reproducible across runs.
poems_filtered = poems_filtered.sample(n=(len(poems_filtered) + 1) // 2, random_state=42)
poems_filtered

Unnamed: 0,author,content,title,string,length
1995,Juan de Arguijo,\n\nDel gran Pompeyo el enemigo fuerte\nllega ...,A JULIO CÉSAR,\nA JULIO CÉSAR\n\n\n\nDel gran Pompeyo el ene...,518
1007,"Santa Teresa de Jesús, Sánchez de Cep","\n\nYa toda me entregué y dí,\ny de tal suerte...",YA TODA ME ENTREGUÉ,\nYA TODA ME ENTREGUÉ\n\n\n\nYa toda me entreg...,509
2870,Gabriela Mistral,\nHay países que yo recuerdo \ncomo recuerdo m...,Agua,\nAgua\n\n\nHay países que yo recuerdo \ncomo ...,980
2534,Bertolt Brecht,"No tenías ninguna,\nyo sólo una,\nque amaba.",Debilidades,"\nDebilidades\n\nNo tenías ninguna,\nyo sólo u...",56
4919,Pablo Neruda,Cien sonetos de amor\n\nAl golpe de la ola con...,Cien sonetos de amor,\nCien sonetos de amor\n\nCien sonetos de amor...,662
...,...,...,...,...,...
3896,Oliverio Girondo,\n\n¿Surgió de bajo tierra?\n¿Se desprendió de...,APARICIÓN URBANA,\nAPARICIÓN URBANA\n\n\n\n¿Surgió de bajo tier...,368
5039,Ángeles Carbajal,"Fue corto el viaje:\nun instante, una eternida...",La tierra prometida,\nLa tierra prometida\n\nFue corto el viaje:\n...,236
3087,Julia de Burgos,\nTengo el desesperante silencio de la angusti...,Silencio de angustia,\nSilencio de angustia\n\n\nTengo el desespera...,338
5062,Gustavo Adolfo Bécquer,\nLos invisibles átomos del aire \nen derredor...,Rima X,\nRima X\n\n\nLos invisibles átomos del aire \...,305


In [57]:
# Show the first training string (title followed by the poem body).
print(poems_filtered['string'].iloc[0])


A JULIO CÉSAR



Del gran Pompeyo el enemigo fuerte
llega en oscura noche al pobre techo,
do Amiclas con seguro y libre pecho
ni teme daño ni recela muerte.

Ya que llamar segunda vez advierte,
rogado deja el mal compuesto lecho,
y en frágil barca el peligroso estrecho
rompe, presagio de siniestra suerte.

Brama furioso el mar sintiendo el peso
que sostiene, y al tímido piloto
César anima, y dice: «Rema amigo,

»Rema; no temas infeliz suceso
por más que te contrasten Euro y Noto;
la fortuna de César va contigo».


# **Tokens para los datos (modelo DeepESP/gpt2-spanish)**

In [58]:
# Alias the filtered poems as `df`, the notebook-level name that DataTokens reads its texts from.
df = poems_filtered

In [89]:
class DataTokens(Dataset):
    """Torch dataset holding one tokenized tensor per text.

    Each text is cut to ``max_length`` *characters*, optionally prefixed with a
    ``<|control_code|>`` marker, suffixed with ``<|endoftext|>`` and encoded with the
    GPT-2 tokenizer named by ``gpt2_type``.

    Args:
        control_code: Control-code string placed in front of every sample, or ``None``
            for no prefix. For backward compatibility a non-string iterable passed here
            is treated as ``texts`` (earlier calls passed the text column in this slot,
            which embedded the column's repr into every sample).
        truncate: When true, keep only the first ``MAX_SAMPLES`` texts.
        gpt2_type: Name or path handed to ``GPT2Tokenizer.from_pretrained``.
        max_length: Maximum number of characters kept from each text.
        texts: Iterable of strings to tokenize. When omitted (and ``control_code`` is a
            string or ``None``) the notebook-level ``df['string']`` column is used.
    """

    # Cap applied when ``truncate`` is true.
    MAX_SAMPLES = 20000

    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024, texts=None):
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)

        if texts is None:
            if control_code is None or isinstance(control_code, str):
                # Legacy behaviour: fall back to the notebook-level frame.
                texts = df['string']
            else:
                # Legacy call style: the texts were passed as the first argument.
                texts, control_code = control_code, None

        prefix = "" if control_code is None else f"<|{control_code}|>"

        texts = list(texts)
        if truncate:
            # Cut before tokenizing so discarded texts are never encoded.
            texts = texts[:self.MAX_SAMPLES]

        self.lyrics = [
            torch.tensor(self.tokenizer.encode(f"{prefix}{text[:max_length]}<|endoftext|>"))
            for text in texts
        ]
        self.lyrics_count = len(self.lyrics)

    def __len__(self):
        """Return the number of tokenized samples."""
        return self.lyrics_count

    def __getitem__(self, item):
        """Return the 1-D token-id tensor stored at position ``item``."""
        return self.lyrics[item]


# No control code: the prompts used at generation time are plain titles, so the training
# samples must start with the poem text itself.
dataset = DataTokens(None, truncate=True, gpt2_type="DeepESP/gpt2-spanish", texts=df['string'])

In [178]:
# Checkpoint shared by the tokenizer and the model. It was referenced but never defined, which
# raised NameError on a fresh kernel; the value is the checkpoint this notebook fine-tunes.
pretrained_weights = "DeepESP/gpt2-spanish"
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_weights)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into the running ``packed_tensor``.

    Both tensors are indexed along dim 1 as the sequence axis.

    Returns a ``(tensor, fits, remainder)`` triple:
      * nothing packed yet       -> ``(new_tensor, True, None)``
      * combined length too long -> ``(packed_tensor, False, new_tensor)``
      * otherwise                -> ``(merged, True, None)`` where ``merged`` is
        ``new_tensor`` followed by ``packed_tensor`` without its first position.
    """
    # First sample of a pack: it becomes the pack as-is.
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.shape[1] + packed_tensor.shape[1]
    if combined_len <= max_seq_len:
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    # Does not fit: hand the new sample back untouched.
    return packed_tensor, False, new_tensor

# **Reentrenamiento para el modelo (DeepESP/gpt2-spanish)**

In [179]:
# Load the causal language model for the checkpoint named by `pretrained_weights`, passing the
# tokenizer's EOS id as pad_token_id.
# NOTE(review): relies on `pretrained_weights` and `tokenizer` being defined by earlier cells.
model = GPT2LMHeadModel.from_pretrained(pretrained_weights, pad_token_id=tokenizer.eos_token_id)
def train(dataset, model, batch_size=32, epochs=5, lr=2e-5, warmup_steps=200, max_seq_len=768):
    """Fine-tune ``model`` on ``dataset`` with sequence packing and gradient accumulation.

    Samples are drawn one at a time (shuffled) and merged by ``pack_tensor`` until the
    next one would push the pack past ``max_seq_len`` tokens. Each pack is one
    forward/backward pass, and gradients are accumulated over ``batch_size`` packs before
    every optimizer/scheduler step.

    Args:
        dataset: Dataset yielding 1-D token-id tensors.
        model: Causal LM called as ``model(input_ids, labels=input_ids)`` whose first
            output is the loss.
        batch_size: Number of packs accumulated per optimizer step.
        epochs: Number of passes over ``dataset``.
        lr: Peak learning rate for AdamW.
        warmup_steps: Optimizer steps of linear warmup. If this exceeds the number of
            optimizer steps actually taken, the peak learning rate is never reached.
        max_seq_len: Maximum token length of a pack.

    Returns:
        The model, on the training device and in train mode.
    """
    # Train on the GPU when there is one instead of failing on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    accum_steps = max(1, batch_size)
    last_idx = len(train_dataloader) - 1

    # The scheduler needs a positive horizon: with -1 the learning rate is 0 for every step
    # after warmup. The exact number of packs depends on the shuffle, so use the upper bound
    # of one pack per sample; the rate then decays linearly without hitting 0 early.
    total_steps = max(1, epochs * ((len(train_dataloader) + accum_steps - 1) // accum_steps))

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    def _optimizer_step():
        # Apply the accumulated gradients, advance the schedule, and reset the gradients.
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    loss = 0
    accumulating_batch_count = 0
    optimizer.zero_grad()

    for epoch in range(epochs):
        print(f"Training epoch {epoch}")
        print(loss)
        input_tensor = None
        for idx, entry in enumerate(tqdm(train_dataloader)):
            input_tensor, carry_on, remainder = pack_tensor(entry, input_tensor, max_seq_len)
            is_last = idx == last_idx
            if carry_on and not is_last:
                continue

            # The current pack is ready. On the final sample also flush the entry that did
            # not fit, so nothing is silently dropped.
            ready = [input_tensor]
            if is_last and remainder is not None:
                ready.append(remainder)
                remainder = None

            for packed in ready:
                packed = packed.to(device)
                outputs = model(packed, labels=packed)
                loss = outputs[0]
                loss.backward()
                # Count first, then test, so a step happens after every `accum_steps`
                # packs rather than immediately on the first one.
                accumulating_batch_count += 1
                if accumulating_batch_count % accum_steps == 0:
                    _optimizer_step()

            # The sample that did not fit starts the next pack.
            input_tensor = remainder

    # Apply gradients left over from an incomplete accumulation window.
    if accumulating_batch_count % accum_steps != 0:
        _optimizer_step()
    return model

In [None]:
# Fine-tune for 5 epochs; `model` is rebound to the model returned by train().
model = train(dataset, model, epochs=5)
# Persist the whole model object (not just a state dict) to 'modelo_gpt2_poesia.pt'.
torch.save(model, 'modelo_gpt2_poesia.pt')

# **Generación de Poesía**

In [19]:
# Reload the fine-tuned model object written by torch.save in the training cell.
# - map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
# - weights_only=False is needed because the file holds a full pickled model, not a state
#   dict, and recent PyTorch releases default to weights_only=True.
# SECURITY: unpickling can execute arbitrary code; only load checkpoints you created yourself.
model = torch.load('modelo_gpt2_poesia.pt', map_location=device, weights_only=False)

In [218]:
def generate(model,tokenizer,prompt,length=60,top_p=0.8,temperature=1.):
    """Sample a continuation of ``prompt`` with nucleus (top-p) sampling.

    Args:
        model: Causal LM whose first output for ``model(input_ids)`` is the logits tensor
            indexed as ``[batch, position, vocab]``.
        tokenizer: Object providing ``encode(text)`` and
            ``decode(ids, skip_special_tokens=True)``.
        prompt: Text the generation starts from; must encode to at least one token.
        length: Maximum number of new tokens to sample.
        top_p: Cumulative-probability threshold of the nucleus that is kept.
        temperature: Logit divisor; values <= 0 are treated as 1.0.

    Returns:
        The decoded text (prompt plus continuation) with special tokens removed.

    Raises:
        ValueError: If ``prompt`` encodes to zero tokens.
    """
    model.eval()
    # Build the input on the model's device so generation works on CPU and GPU alike.
    model_device = next(model.parameters()).device
    # Loop invariants: the end-of-text ids and the temperature divisor.
    eos_ids = tokenizer.encode("<|endoftext|>")
    scale = temperature if temperature > 0 else 1.0
    filter_value = -float("Inf")

    prompt_ids = tokenizer.encode(prompt)
    if len(prompt_ids) == 0:
        raise ValueError("prompt must encode to at least one token")

    with torch.no_grad():
        generated = torch.tensor(prompt_ids, dtype=torch.long, device=model_device).unsqueeze(0)
        for _ in range(length):
            # Only the logits of the last position are needed; no labels, so no loss is computed.
            logits = model(generated)[0][:, -1, :] / scale

            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift right so the first token that crosses the threshold is kept, and never
            # remove the most likely token.
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0
            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            logits[:, indices_to_remove] = filter_value

            next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
            if next_token.item() in eos_ids:
                break

    # Index the batch row instead of squeeze(): this stays 1-D for one-token sequences,
    # and tolist() also works for tensors that live on the GPU.
    return tokenizer.decode(generated[0].tolist(), skip_special_tokens=True)

In [None]:
# Sample up to 100 new tokens on the CPU, starting from the prompt below.
Palabra = 'CIELO ESTRELLADO'
text = generate(
    model.to('cpu'),
    tokenizer,
    Palabra,
    length=100,
    top_p=0.8,
    temperature=0.7,
)
print(text)

In [228]:
def Generate2(model,Palabra):
    """Generate text from ``Palabra`` with beam search via ``model.generate``.

    Uses the notebook-level ``tokenizer`` to encode the prompt and decode the result.

    Args:
        model: Model exposing ``generate`` (and ``parameters``/``eval``).
        Palabra: Prompt text.

    Returns:
        The decoded best beam (at most 40 tokens including the prompt), with special
        tokens removed.
    """
    # A model restored from a checkpoint saved during training is still in train mode;
    # switch to eval so dropout does not perturb the beam search.
    model.eval()
    # Encode on the model's device so this works whether the model is on CPU or GPU.
    model_device = next(model.parameters()).device
    input_ids = tokenizer.encode(Palabra, return_tensors="pt").to(model_device)
    # `temperature` was dropped: it only applies when sampling (do_sample=True) and is
    # ignored by plain beam search, so the output is unchanged.
    output = model.generate(
        input_ids,
        num_beams=5,
        max_length=40,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Generate on the CPU from the same prompt, this time through Generate2, and print the result.
Palabra = 'CIELO ESTRELLADO'
text = Generate2(model.to('cpu'),Palabra)
print(text)