# **Construcción automática de texto (PyTorch Lightning)**
Andrey Duvan Rincon Torres

---

In [None]:
pip install pytorch-lightning

In [None]:
pip install pyyaml==5.4.1

In [None]:
pip install plotly_express

In [None]:
pip install torchmetrics

In [7]:
# Required libraries
import torch
import pandas as pd
import plotly.express as plx
import numpy as np
import plotly.graph_objects as go
from torch import nn
import pytorch_lightning as pl
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torchmetrics.functional import accuracy
from sklearn.model_selection import train_test_split
from torchmetrics.functional import accuracy
CELoss = nn.CrossEntropyLoss()
# Para el texto
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences

# **Pre tratamiento de los datos**

In [None]:
# Descargar los datos
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt \
    -O /tmp/sonnets.txt
# Read the downloaded sonnets; the context manager closes the file handle
# as soon as the text is in memory (the bare open(...).read() leaked it).
with open('/tmp/sonnets.txt') as sonnets_file:
    data = sonnets_file.read()

# One lower-cased corpus entry per line of the file
corpus = data.lower().split("\n")

In [21]:
# Build the training data: every prefix (length >= 2) of every tokenized line
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# Vocabulary size is the number of indexed words plus one
# (presumably the extra slot is the padding id 0 -- confirm against Tokenizer docs)
total_words = 1 + len(tokenizer.word_index)
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Prefixes token_list[:2], token_list[:3], ..., token_list[:len(token_list)]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )
max_sequence_len = max(map(len, input_sequences))
# Left-pad every prefix so all rows share the same length
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)
# Predictors are all tokens but the last one; the label is the last token
predictors = input_sequences[:, :-1]
label = input_sequences[:, -1]

In [22]:
# One-hot encode the next-word labels.
# The categories are pinned to 0..total_words-1 instead of being inferred from
# the labels that happen to occur: inferred categories drop every token id
# that never appears as a label, so the one-hot width would no longer be
# total_words and column j would no longer correspond to token id j.
encoder_lab = OneHotEncoder(categories=[np.arange(total_words)])
label = encoder_lab.fit_transform(label.reshape(-1, 1)).toarray()

## **Modelo**

In [None]:
class Model(nn.Module):
  """Next-word predictor: embedding -> BiLSTM -> dropout -> LSTM -> MLP.

  Args:
      vocab_size: Number of token ids (embedding rows and output classes).
          Defaults to the notebook-level ``total_words`` when omitted.
      embedding_dim: Size of each word embedding vector.
      dense_units: Width of the hidden fully connected layer.
  """
  # Build the network layers
  def __init__(self, vocab_size=None, embedding_dim=100, dense_units=1605):
      super().__init__()
      if vocab_size is None:
          # Fall back to the vocabulary size computed earlier in the notebook
          vocab_size = total_words
      # Word embedding; it must be an attribute so its weights are registered
      self.embedding = nn.Embedding(vocab_size, embedding_dim)
      # Bidirectional LSTM. The `dropout` argument of nn.LSTM only acts
      # between stacked layers, so with one layer an explicit Dropout is used.
      self.lstm_1 = nn.LSTM(embedding_dim, 150, 1, batch_first=True, bidirectional=True)
      self.dropout = nn.Dropout(0.2)
      # Second LSTM; a bidirectional layer emits 2 * hidden_size features
      self.lstm_2 = nn.LSTM(2 * 150, 100, 1, batch_first=True)
      # Fully connected head
      self.linear_1 = nn.Linear(100, dense_units)
      self.linear_2 = nn.Linear(dense_units, vocab_size)
  # Define how the layers are chained
  def forward(self, x):
      """Return unnormalized class scores (logits) for the next token.

      Args:
          x: Tensor of token ids with shape (batch, seq_len). Floating
              point tensors holding integral ids are accepted and cast.

      Returns:
          Tensor of shape (batch, vocab_size). No softmax is applied, so the
          output can be fed directly to a cross-entropy loss.
      """
      # Embedding lookups require integer indices
      embedded = self.embedding(x.long())
      # Bidirectional LSTM layer
      out, _ = self.lstm_1(embedded)
      out = self.dropout(out)
      # LSTM layer
      out, _ = self.lstm_2(out)
      # Keep only the last timestep: one prediction per input sequence
      last_step = out[:, -1, :]
      # Output head
      hidden = torch.relu(self.linear_1(last_step))
      return self.linear_2(hidden)

## **Datos**

In [None]:
# Clase de los datos
class DataModule(pl.LightningDataModule):
  """Lightning data module wrapping the notebook-level `predictors` / `label` arrays.

  Args:
      batch_size: Number of samples per batch in both data loaders.
  """
  # Store the batch size on the instance
  def __init__(self, batch_size = 32):
      super().__init__()
      self.batch_size = batch_size
  # Split the data and wrap it in tensor datasets
  def setup(self, stage=None):
    x, y = predictors, label
    # Hold out 10% of the samples for validation
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.1, random_state = 0,shuffle=True)
    # Convert to tensors. Predictors are token ids, so they are stored as
    # int64 (the dtype an embedding lookup expects) instead of float32.
    self.train_dataset = TensorDataset(
        torch.as_tensor(x_train, dtype=torch.long),
        torch.as_tensor(y_train, dtype=torch.float32),
    )
    self.val_dataset = TensorDataset(
        torch.as_tensor(x_val, dtype=torch.long),
        torch.as_tensor(y_val, dtype=torch.float32),
    )
  # Training iterable
  def train_dataloader(self):
      # Reshuffle every epoch so batches differ between epochs
      return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
  # Validation iterable
  def val_dataloader(self):
      return DataLoader(self.val_dataset, batch_size=self.batch_size)
## **Entrenamiento**

In [None]:
class Train(pl.LightningModule):
    """Lightning task that trains a classifier with cross-entropy loss.

    Args:
        model: Network to optimize. It is assumed to return raw logits of
            shape (batch, n_classes) -- TODO confirm against the model used.
    """
    # Store the wrapped network
    def __init__(self,model):
        super().__init__()
        self.model = model
    # Training step
    def training_step(self, batch, batch_idx):
        loss,acc = self._shared_eval_step(batch, batch_idx)
        self.log("train_loss", loss, prog_bar=True)
        self.log("train_acc", acc, prog_bar=True)
        return loss
    # Validation step
    def validation_step(self, batch, batch_idx):
        loss,acc = self._shared_eval_step(batch, batch_idx)
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)
        return loss
    # Shared forward pass: returns (loss, accuracy) for one batch
    def _shared_eval_step(self,batch,batch_idx):
        x, y  = batch
        y_hat = self.model(x)
        # Accept either one-hot rows or plain class indices as targets
        target = y.argmax(dim=-1) if y.dim() > 1 else y.long()
        # Multi-class cross entropy; it applies log-softmax internally,
        # which is why the model output should be logits
        loss = nn.functional.cross_entropy(y_hat, target)
        # Fraction of samples whose top-scoring class matches the target
        acc = (y_hat.argmax(dim=-1) == target).float().mean()
        return loss, acc
    # Optimizer configuration
    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters())

##  Ajustar el modelo

In [None]:
data_module = DataModule()  # Instantiate the data module
torch.manual_seed(0)  # Seed before building the model so its initial weights are reproducible
model = Model()  # Instantiate the network
# `progress_bar_refresh_rate` was removed from pl.Trainer; the refresh rate is
# configured through the TQDMProgressBar callback instead.
trainer = pl.Trainer(
    max_epochs=100,
    callbacks=[pl.callbacks.TQDMProgressBar(refresh_rate=20)],
)
task = Train(model)
trainer.fit(task, data_module)

In [None]:
# Results dashboard: load the TensorBoard notebook extension and open it on
# the lightning_logs/ directory
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

# **Generacion de texto**