# **Modelo 1 (Generacion de Poemas)**
Andrey Duvan Rincon Torres

---

In [None]:
pip install pytorch-lightning

In [None]:
pip install pyyaml==5.4.1

In [None]:
pip install plotly_express

In [None]:
pip install torchmetrics

In [None]:
# Basicas
import pandas as pd
import numpy as np
# Graficas
import plotly.express as plx
import plotly.graph_objects as go
# Pytorch
import torch
from torch import nn
import pytorch_lightning as pl
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torchmetrics.functional import accuracy
# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
# Texto
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
def poem_to_string(poem):
  return f'\n{poem["title"]}\n{poem["author"]}\n{poem["content"]}'
def poem_sequence_to_string(poem_sequence):
    poem_stringified = tokenizer.sequences_to_texts([poem_sequence])[0]
    print(poem_stringified)
CELoss = nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
AVAIL_GPUS = min(1, torch.cuda.device_count())

# **Datos**

In [7]:
url = 'https://raw.githubusercontent.com/andreamorgar/poesIA/master/data/poems.csv'
poems_df = pd.read_csv(url)
poems_df = poems_df.dropna()

In [10]:
# Filtrar poemas grandes
poems_df['string'] = poems_df.apply(lambda row: f'\n{row["title"]}\n\n{row["author"]}\n\n{row["content"]}', axis=1)
poems_df['length'] = poems_df.string.map(len)
MAX_POEM_LENGTH=1000
poems_filtered = poems_df[poems_df.length<MAX_POEM_LENGTH]
_ , poems_filtered = train_test_split(poems_filtered, test_size = 0.1 ,shuffle=True)
poems_filtered

Unnamed: 0,author,content,title,string,length
4607,Toni García Arias,Si Dios me diese la oportunidad\nde regresar a...,Pasado,\nPasado\n\nToni García Arias\n\nSi Dios me di...,720
486,José Lezama Lima,"\n\nSin dientes, pero con dientes\ncomo sierra...",RETRATO DE DON FRANCISCO DE QUEVEDO,\nRETRATO DE DON FRANCISCO DE QUEVEDO\n\nJosé ...,591
584,Pablo Neruda,"Cien sonetos de amor\n\nPensé morir, sentí de ...",Cien sonetos de amor,\nCien sonetos de amor\n\nPablo Neruda\n\nCien...,655
3795,Gerardo Diego,\n\nTú y tu desnudo sueño. No lo sabes.\nDuerm...,INSOMNIO,\nINSOMNIO\n\nGerardo Diego\n\n\n\nTú y tu des...,578
4035,Mario Benedetti,"\nCuando el presidente, cualquier presidente\n...",Ahora todo está claro,\nAhora todo está claro\n\nMario Benedetti\n\n...,415
...,...,...,...,...,...
827,Jaime Sabines,"\nNo hay más. Sólo mujer para alegrarnos,\nsól...","No hay más, sólo mujer","\nNo hay más, sólo mujer\n\nJaime Sabines\n\n\...",389
1540,Luciano Castañón,Mueve mi madre\nesta mi cuna.\nEl mar da mied...,Nana marinera,\nNana marinera\n\nLuciano Castañón\n\nMueve ...,603
625,Carmen Conde Abellán,\n\nAcércate.\nJunto a la noche te espero.\n\n...,OFRECIMIENTO,\nOFRECIMIENTO\n\nCarmen Conde Abellán\n\n\n\n...,190
1668,Mario Benedetti,\nCompañera \nusted sabe \npuede contar \nconm...,Hagamos un trato,\nHagamos un trato\n\nMario Benedetti\n\n\nCom...,767


In [11]:
print(list(poems_filtered['string'])[0])


Pasado

Toni García Arias

Si Dios me diese la oportunidad
de regresar a mi pasado,
no guardaría tantas lágrimas
ni tantos besos.
Salpicaría todas las mañanas con un verso nuevo
que llevarme a los labios,
me dejaría navegar salvaje
donde antes me atenazaba el miedo,
no amagaría aquel abrazo
que se perdió por siempre
en lo más profundo del reproche.
Invadiría más a menudo tus noches
y tus sábanas,
asaltaría tu sonrisa
para instalar mi bandera.
No te dejaría marchar jamás
de mis sueños, de mis miedos, de mis derrotas.

Si Dios me diese la oportunidad
de regresar a mi pasado,
correría hacia él con más fuerza
para que el tiempo,
el siempre tiempo,
no pudiese reconocerme,
para que yo, al fin,
no pudiese recordarme.


# **Vocabulary**

In [12]:
poems_string=poems_filtered.string
STOP_SIGN = '␣'
tokenizer = Tokenizer(char_level=True, filters='', lower=False, split='')
tokenizer.fit_on_texts([STOP_SIGN])
tokenizer.fit_on_texts(poems_string)
total_words = len(tokenizer.word_index) + 1
dataset_vectorized = tokenizer.texts_to_sequences(poems_string)
max_sequence_len = max([len(x) for x in dataset_vectorized])
data_train, data_val = train_test_split(dataset_vectorized, test_size = 0.1, random_state = 0,shuffle=True)

In [13]:
def collate_batch(batch):
    input_sequences = []
    for line in batch:
      token_list = line
      for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label_list = torch.tensor(label, dtype=torch.int64)
    text_list = torch.tensor(predictors, dtype=torch.int64)
    return text_list, label_list
x_train, y_train = collate_batch(data_train)
x_val, y_val = collate_batch(data_val)

## **Modelo**

In [38]:
class Model(nn.Module):
  # creamos la estructura de la red
  def __init__(self):
      super(Model,self).__init__()
      # Embeding de las palabras
      self.embedding = nn.Embedding(total_words, 100)
      # Red Bidireccional
      self.lstm_1 = nn.LSTM(100, 150, 1, batch_first=True, dropout = 0.2 , bidirectional = True)
      # red LSTM
      self.lstm_2 = nn.LSTM(300,100,1, batch_first=True)
      # red perceptron
      self.linear_1 = nn.Linear(100,1605)
      self.linear_2 = nn.Linear(1605,total_words)
      self.relu = nn.ReLU()
      self.sofmax = nn.Softmax()
  # definimos el comportamiento de las capas
  def forward(self, x):
      batch_size, channelsn = x.size()
      x = self.embedding(x)
      # layer LSTM bidirectional
      out, (h_n, c_n) = self.lstm_1(x)
      # layer LSTM
      out, (h_n, c_n) = self.lstm_2(out)
      out = torch.squeeze(h_n)
      # capa de salida
      out = self.relu(self.linear_1(out))
      out = self.sofmax(self.linear_2(out))
      return out

## **Datos**

In [40]:
# Clase de los datos
class DataModule(pl.LightningDataModule):
  # Definimos un tamaño de lote en la calse
  def __init__(self, batch_size = 32):
      super(DataModule,self).__init__()
      self.batch_size = batch_size
  # Definimos el tratamiento de los datos
  def setup(self, stage=None):
    self.train_dataset = TensorDataset(x_train,y_train)
    self.val_dataset = TensorDataset(x_train,y_train)
  # Iterable de entrenamiento
  def train_dataloader(self):
      return DataLoader(self.train_dataset, batch_size=self.batch_size)
  # Iterable de validacion
  def val_dataloader(self):
      return DataLoader(self.val_dataset, batch_size=self.batch_size)

## **Entrenamiento**

In [41]:
class Train(pl.LightningModule):
    # creamos la estructura de la red
    def __init__(self,model):
        super().__init__()
        self.model = model
   # Paso de entrenamiento
    def training_step(self, batch, batch_idx):
        loss,acc = self._shared_eval_step(batch, batch_idx)
        self.log("train_loss", loss, prog_bar=True)
        self.log("train_acc", acc, prog_bar=True)
        return loss
    # Paso de validacion
    def validation_step(self, batch, batch_idx):
        loss,acc = self._shared_eval_step(batch, batch_idx)
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)
        return loss
    # Funcion para evaluar el modelo y la perdida
    def _shared_eval_step(self,batch,batch_idx):
        x, y  = batch
        y_hat = self.model(x)
        loss = CELoss(y_hat, y.type(torch.LongTensor))
        acc = accuracy(y_hat, y.type(torch.LongTensor))
        return loss, acc
    # Configuracion del optimizador
    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters())

##  Ajustar el modelo

In [42]:
data_module = DataModule() # Ejecutamos modulo de datos
torch.manual_seed(0)
model = Model() # Ejecutamos modelo
trainer = pl.Trainer(max_epochs=200, progress_bar_refresh_rate=20) # Lamamos el entrenador
task = Train(model)
trainer.fit(task,data_module)

  "num_layers={}".format(dropout, num_layers))
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /content/lightning_logs

  | Name  | Type  | Params
--------------------------------
0 | model | Model | 804 K 
--------------------------------
804 K     Trainable params
0         Non-trainable params
804 K     Total params
3.218     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
# guardar modelo
PATH = './checkpoint.pt'
torch.save(model.state_dict(), PATH)

In [None]:
# Tablero de resultados
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

# **Generacion de texto**

In [None]:
seed_text = "Who are you, so too cruel?"
next_words = 100
  
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.forward(torch.tensor(token_list.astype(np.float32))).detach().numpy().transpose()[0]
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
print(seed_text)