# Demo 
## Sistema de Recomendación en funcionamiento: Predicción de capacidades de adsorción

`entorno: pytorch`

*Autor: Celeste Castro Granados (celsgazu@ciencias.unam.mx)*

*Fecha de realización: 1 de mayo de 2024*

In [1]:
import pandas as pd
import numpy as np
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import matplotlib.pyplot as plt
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from livelossplot import PlotLosses
from tqdm import tqdm
import matplotlib.pyplot as pyplot

  from .autonotebook import tqdm as notebook_tqdm


### 1. Definición de variables

In [2]:
# --- Hardware ----------------------------------------------------------------
# Index of the CUDA device passed to `.cuda(...)` further down the notebook.
num_gpu = 2

# --- Input / output files ----------------------------------------------------
# CSV with the matrix to complete. It must already be free of NaN values and
# must carry an explicit 'Compuesto' column.
ruta_datos = (
    '/home/celeste/tesis/recommender/Recommender-Adsorbentes/'
    'test_50porciento_random.csv'
)
# Checkpoint holding the trained model weights.
ruta_modelo = (
    '/home/celeste/tesis/recommender/Recommender-Adsorbentes/'
    'model_final.pt'
)
# Destination of the predictions; only written when `guardar_csv` is True.
nombre_archivo_salida = 'predicciones.csv'
guardar_csv = False

# --- Model / loader settings -------------------------------------------------
# Layer widths handed to the autoencoder; the first entry is the feature size.
layer_sizes = [90, 75, 75, 85]
batch_size = 8

### 2. Clases y funciones

Preparación de la matriz de datos

In [3]:
# Definir la clase TestDataset
class TestDataset(Dataset):
    """Row-wise dataset over the property columns of a CSV file.

    The first CSV column (the compound identifier) is dropped; every remaining
    column is treated as a property value. Items are the rows of that matrix.

    Args:
        test_file: path (or buffer) accepted by ``pandas.read_csv``.
        transform: optional callable applied once to the whole 2-D NumPy
            matrix. Its result must be indexable as ``result[0][row]``
            (torchvision's ``ToTensor`` on a 2-D array is expected to return a
            ``(1, rows, cols)`` tensor). When omitted, the matrix is converted
            to a tensor with that same layout.
    """

    def __init__(self, test_file, transform=None):
        # Load the CSV and drop the first column (compound name): the values
        # to complete live in the property columns only.
        self.data = pd.read_csv(test_file)
        self.data = self.data.iloc[:, 1:]
        self.transform = transform

        if transform is not None:
            # Apply the user-supplied transform to the full matrix at once.
            self.data = self.transform(np.array(self.data))
        else:
            # Previously the DataFrame was kept as-is here, which made
            # __len__ and __getitem__ fail. Build the (1, rows, cols) layout
            # the accessors below rely on.
            self.data = torch.from_numpy(np.array(self.data)).unsqueeze(0)

    def __len__(self):
        # Number of rows (compounds) in the matrix.
        return len(self.data[0])

    def __getitem__(self, ind):
        # Property vector of the compound at position `ind`.
        user_vector = self.data.data[0][ind]
        return user_vector

In [4]:
def prepare_newdata(ruta_datos, batch_size, num_workers=0):
    """Wrap the CSV at ``ruta_datos`` in a ``DataLoader`` ready for the model.

    Args:
        ruta_datos: path of the CSV file handed to ``TestDataset``.
        batch_size: number of rows per batch.
        num_workers: worker processes used by the ``DataLoader`` (default 0).

    Returns:
        A non-shuffling ``DataLoader`` over ``TestDataset``, so batches keep
        the row order of the file.
    """
    to_tensor = transforms.Compose([transforms.ToTensor()])
    return DataLoader(
        dataset=TestDataset(ruta_datos, to_tensor),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )

Función de pérdida y RMSE

In [5]:
class MSEloss_with_Mask(nn.Module):
    """Mean squared error computed only over non-zero target entries.

    Entries whose target is exactly 0 are excluded from both the squared-error
    sum and the count used as the denominator.
    """

    def __init__(self):
        super(MSEloss_with_Mask, self).__init__()

    def forward(self, inputs, targets):
        """Return the masked MSE as a 0-dim tensor.

        Args:
            inputs: model outputs.
            targets: reference values, same shape as ``inputs``; zeros mark
                entries to ignore.
        """
        # 1.0 where a target value is present, 0.0 elsewhere.
        mask = (targets != 0).float()

        # Number of observed entries, clamped to at least 1 to avoid dividing
        # by zero. torch.clamp keeps the computation on the tensors' own
        # device, so no hard-coded GPU index is needed.
        number_ratings = torch.clamp(torch.sum(mask), min=1.0)

        # Sum of squared errors over the observed entries only.
        diff = targets - inputs
        error = torch.sum(mask * (diff * diff))

        return error / number_ratings
  
  
def calcular_rmse(mmse):
    """Return the square root of ``mmse`` (RMSE from a mean squared error)."""
    return np.sqrt(mmse)

Funciones de activación

In [6]:
def activation(input, type):
    """Apply the non-linearity named by ``type`` to ``input``.

    Args:
        input: tensor to transform.
        type: case-insensitive name, one of 'selu', 'elu', 'relu', 'relu6',
            'lrelu', 'tanh', 'sigmoid', 'swish' or 'identity'.

    Returns:
        The transformed tensor ('identity' returns ``input`` itself).

    Raises:
        ValueError: if ``type`` is not one of the supported names.
    """
    kind = type.lower()

    # Name -> callable lookup instead of an if/elif chain.
    dispatch = {
        'selu': F.selu,
        'elu': F.elu,
        'relu': F.relu,
        'relu6': F.relu6,
        'lrelu': F.leaky_relu,
        'tanh': F.tanh,
        'sigmoid': F.sigmoid,
        'swish': lambda t: F.sigmoid(t) * t,
        'identity': lambda t: t,
    }

    if kind not in dispatch:
        raise ValueError("Unknown non-Linearity Type")
    return dispatch[kind](input)

Construcción del Autoencoder

In [7]:
class AutoEncoder(nn.Module):
    """Fully connected autoencoder with optionally tied encoder/decoder weights."""

    def __init__(self, layer_sizes, nl_type='selu', is_constrained=True, dp_drop_prob=0.0, last_layer_activations=True):
        """
        Args:
            layer_sizes: size of every encoder layer, starting with the
                feature size (dimensionality of x). For example
                [10000, 1024, 512] yields
                  - 2 encoder layers: 10000x1024 and 1024x512 (the latent
                    representation z has 512 units);
                  - 2 decoder layers: 512x1024 and 1024x10000.
            nl_type: name of the non-linearity (default 'selu').
            is_constrained: when true, the decoder reuses the (transposed)
                encoder weights instead of owning its own.
            dp_drop_prob: dropout probability; a dropout layer is only
                created when it is greater than 0.
            last_layer_activations: when true, the non-linearity is also
                applied after the last decoder layer.
        """
        super(AutoEncoder, self).__init__()

        # Keep the configuration on the instance.
        self.layer_sizes = layer_sizes
        self.nl_type = nl_type
        self.is_constrained = is_constrained
        self.dp_drop_prob = dp_drop_prob
        self.last_layer_activations = last_layer_activations

        if dp_drop_prob > 0:
            self.drop = nn.Dropout(dp_drop_prob)

        # Index of the final decoder layer.
        self._last = len(layer_sizes) - 2

        n_layers = len(layer_sizes) - 1

        # Encoder weights: allocate every matrix first, then Xavier-initialise
        # them (Glorot & Bengio, uniform variant). Allocation and
        # initialisation are kept as two separate passes so the random number
        # stream is consumed in the same order as before.
        enc_weights = []
        for k in range(n_layers):
            enc_weights.append(nn.Parameter(torch.rand(layer_sizes[k + 1], layer_sizes[k])))
        self.encoder_weights = nn.ParameterList(enc_weights)
        for enc_w in self.encoder_weights:
            init.xavier_uniform_(enc_w)

        # Encoder biases start at zero.
        enc_bias = []
        for k in range(n_layers):
            enc_bias.append(nn.Parameter(torch.zeros(layer_sizes[k + 1])))
        self.encoder_bias = nn.ParameterList(enc_bias)

        # `reversed` returns an iterator, hence the explicit list.
        sizes_rev = list(reversed(layer_sizes))
        n_rev = len(sizes_rev) - 1

        # Independent decoder weights exist only for the unconstrained model.
        # The `== False` comparison is kept verbatim on purpose.
        if is_constrained == False:
            dec_weights = []
            for k in range(n_rev):
                dec_weights.append(nn.Parameter(torch.rand(sizes_rev[k + 1], sizes_rev[k])))
            self.decoder_weights = nn.ParameterList(dec_weights)
            for dec_w in self.decoder_weights:
                init.xavier_uniform_(dec_w)

        # Decoder biases start at zero (present in both modes).
        dec_bias = []
        for k in range(n_rev):
            dec_bias.append(nn.Parameter(torch.zeros(sizes_rev[k + 1])))
        self.decoder_bias = nn.ParameterList(dec_bias)

    def encode(self, x):
        """Run ``x`` through every encoder layer and return the latent tensor."""
        for w, b in zip(self.encoder_weights, self.encoder_bias):
            # Affine map followed by the configured non-linearity.
            x = activation(input=F.linear(input=x, weight=w, bias=b), type=self.nl_type)

        # Dropout is applied once, on the output of the last encoder layer.
        if self.dp_drop_prob > 0:
            x = self.drop(x)
        return x

    def build_latent_rep(self, x):
        """Switch the module to eval mode and return the encoding of ``x``."""
        self.eval()
        # TODO(author's open question): is `x.detach().numpy()` needed here?
        return self.encode(x)

    def decode(self, x):
        """Map a latent tensor back to the input space."""
        if self.is_constrained == True:
            # Tied weights: encoder matrices in reverse order, transposed.
            layer_weights = [w.t() for w in reversed(self.encoder_weights)]
        else:
            # Untied weights: the decoder's own matrices.
            layer_weights = self.decoder_weights

        for idx, (w, b) in enumerate(zip(layer_weights, self.decoder_bias)):
            x = F.linear(input=x, weight=w, bias=b)
            # Skip the non-linearity only on the last layer, and only when
            # last_layer_activations is disabled.
            if idx == self._last and not self.last_layer_activations:
                kind = 'identity'
            else:
                kind = self.nl_type
            x = activation(input=x, type=kind)
        return x

    def forward(self, x):
        """Encode then decode ``x``."""
        latent = self.encode(x)
        return self.decode(latent)
    

Función predict

In [8]:
def predict(model, criterion, test_dl):
    """Reconstruct every batch of ``test_dl`` with ``model`` and report the loss.

    Each batch is used both as input and as target. The per-batch loss values
    are averaged and printed together with the shape of the reconstructed
    matrix and the square root of that average.

    Args:
        model: ``nn.Module`` to evaluate; it is switched to eval mode.
        criterion: callable ``criterion(outputs, labels)`` returning a tensor
            that supports ``.item()``.
        test_dl: iterable yielding tensor batches (e.g. a ``DataLoader``).

    Returns:
        CPU tensor with all reconstructed batches concatenated along dim 0
        (an empty tensor when ``test_dl`` yields nothing).
    """
    # Run on the device the model already lives on rather than on a
    # hard-coded GPU index; fall back to CPU for parameter-less models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Evaluation mode only needs to be set once.
    model.eval()

    valid_loss = []
    reconstructed = []

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for data in test_dl:
            # The batch is both the input and the reconstruction target.
            inputs = data.to(device).float()
            labels = inputs

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            valid_loss.append(loss.item())

            # Keep the reconstructed batch on the CPU; concatenated once below.
            reconstructed.append(outputs.to('cpu'))

    if reconstructed:
        matrix_out_test = torch.cat(reconstructed, 0)
    else:
        matrix_out_test = torch.Tensor([])

    # NOTE(review): this is the mean of the per-batch losses, so the RMSE
    # below is the root of that mean. It may differ slightly from a global
    # RMSE when batches contribute different numbers of entries.
    mean_loss = np.mean(valid_loss) if valid_loss else float('nan')
    print("Loss (MMSE): ", mean_loss)
    print('Matrix shape: ', matrix_out_test.shape)
    print('RMSE: ', np.sqrt(mean_loss))

    return matrix_out_test

Reconstrucción de la salida en el formato adecuado

In [9]:
def _global_requerido(nombre):
    """Return the module-level variable ``nombre`` or raise ``NameError``."""
    try:
        return globals()[nombre]
    except KeyError:
        raise NameError(f"name '{nombre}' is not defined") from None


def reconstruir_salida(matriz_original, matriz_dl, model=None, criterion=None):
    """Run the model on ``matriz_dl`` and return predictions as a labelled DataFrame.

    Args:
        matriz_original: DataFrame with a 'Compuesto' column plus one column
            per property; it supplies the labels of the result.
        matriz_dl: batches for ``predict`` built from the same data.
        model: model to evaluate. When omitted, the module-level ``model`` is
            used, as in earlier versions of this function.
        criterion: loss callable. When omitted, the module-level ``criterion``
            is used.

    Returns:
        DataFrame whose first column is 'Compuesto', followed by the predicted
        value of every property column of ``matriz_original``.

    Raises:
        NameError: if ``model``/``criterion`` are omitted and not defined at
            module level.
    """
    # Prefer explicit arguments; fall back to notebook globals for callers
    # that still use the two-argument form.
    if model is None:
        model = _global_requerido('model')
    if criterion is None:
        criterion = _global_requerido('criterion')

    columnas = matriz_original.drop(['Compuesto'], axis=1).columns
    # Use the raw values: inserting the Series itself aligns on the index,
    # which yields NaN names when `matriz_original` has a non-default index.
    compuestos = matriz_original['Compuesto'].to_numpy()

    salida = predict(model, criterion, matriz_dl)
    salida = salida.detach().numpy()
    salida = pd.DataFrame(salida, columns=columnas)
    salida.insert(0, 'Compuesto', compuestos)

    return salida

### 3. Implementación

In [16]:
# Matrix to complete, loaded in two forms:
#   - as a DataFrame, which keeps the compound names and column labels;
#   - as a PyTorch DataLoader, which feeds the model.
test_original = pd.read_csv(ruta_datos)
test_dl = prepare_newdata(ruta_datos=ruta_datos, batch_size=batch_size)

In [15]:
# Preview the first ten rows of the input matrix.
test_original.head(n=10)

Unnamed: 0,Compuesto,0_acetylene273,0_acetylene298,0_argon298,0_butane298,0_carbon dioxide273,0_carbon dioxide298,0_carbon dioxide323,0_carbon monoxide298,0_ethane273,...,3_nitrogen273,3_nitrogen298,3_nitrogen323,3_nitrogen77,3_oxygen298,3_propane298,3_propane323,3_propene298,3_propene323,3_xenon298
0,[Zn12(SO3)2(BTB)6(HCO2)3].15DEF,0.0,0.947108,0.0,0.0,0.0,0.061256,0.0,0.222439,0.0,...,0.0,0.0,1.328918,12.008529,0.0,1.433456,0.557101,0.600753,0.0,0.0
1,[Zn17thb14(mu4-O)4(H2O)(Me2NH2)]*Me2NH2 1a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.029916,1.08621,...,11.867601,7.4433,0.0,8.771895,0.0,8.670429,15.977197,24.873333,0.0,0.0
2,[Zn17thb14(mu4-O)4(H2O)(Me2NH2)]*Me2NH2 2a,1.07509,0.784983,0.0,0.0,0.0,0.712871,0.0,0.2036,1.0,...,0.0,4.699669,0.0,3.052692,0.0,0.0,9.178073,8.817957,8.053491,9.396001
3,[Zn2(CN5H2)3(H2O)3]6H2O,6.189885,0.0,0.0,0.0,0.0,0.0,0.0,0.590219,0.0,...,0.684472,5.431553,4.832698,4.678391,25.974585,4.855744,0.0,14.810995,0.0,16.039534
4,[Zn2(CN5H2)3(H2O)3]6H2O Ac,0.487708,0.0,0.417787,0.0,0.0,0.0,0.0,0.164744,0.0,...,0.758638,2.966201,1.025537,1.067146,0.0,0.0,0.833559,1.44045,0.0,2.190897
5,[Zn2(CN5H2)3(H2O)3]6H2O TMC,0.481363,2.210656,0.0,0.0,1.001199,0.0,0.256793,0.398234,0.0,...,0.0,4.496917,3.229609,0.0,20.028681,3.117309,0.0,0.0,0.0,0.0
6,[Zn2(TRZ)2(DOBDC)]n,0.0,0.0,0.0,0.0,0.0,0.502399,0.442302,0.344331,0.0,...,1.20687,3.976531,0.0,2.020579,25.8629,1.921553,0.856242,0.0,1.575051,1.477235
7,[Zn2(adc)2(dabco)]n,0.0,2.623562,0.0,0.0,0.0,0.0,0.0,0.336583,0.149813,...,1.169693,4.342323,0.0,0.0,0.0,2.620548,0.801755,0.466369,1.516116,1.135246
8,[Zn2(bcta)(dipy)(mu2-OH)]*2DMF*H2O,0.0,0.0,2.942425,0.21095,0.0,0.321731,1.349548,0.0,0.447683,...,2.890525,1.906716,5.32414,0.0,0.0,0.0,2.756788,2.283262,3.321721,3.214656
9,[Zn2(bdc)2(dabco)]n,1.067338,0.384243,0.0,0.0,0.0,0.735235,0.0,0.372146,0.075903,...,2.358513,0.0,0.0,2.162565,0.0,0.0,2.01971,1.726046,2.609675,0.0


In [17]:
# Model: built with the configured layer sizes and moved to the selected GPU.
model= AutoEncoder(layer_sizes=layer_sizes, nl_type='relu', is_constrained=True, dp_drop_prob=0.0, last_layer_activations=False)
model = model.cuda(num_gpu)

# Loss function
criterion = MSEloss_with_Mask()
criterion = criterion.cuda(num_gpu)

# Model loading.
# map_location remaps the stored tensors onto the selected GPU, so the
# checkpoint loads even if it was saved from a device that is not available
# on this machine.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted
# source.
model.load_state_dict(torch.load(ruta_modelo, map_location=f'cuda:{num_gpu}'))

<All keys matched successfully>

Generación de predicciones

In [11]:
# Build the labelled prediction table for the input matrix.
pred = reconstruir_salida(test_original, test_dl)
if guardar_csv:
    # index=False keeps 'Compuesto' as the first column of the file. Writing
    # the index would add an unnamed leading column, and the file would no
    # longer match the layout this notebook expects as input.
    pred.to_csv(nombre_archivo_salida, index=False)

Loss (MMSE):  0.881027687054414
Matrix shape:  torch.Size([100, 90])
RMSE:  0.938630751176635


In [12]:
# Display the prediction table (the rendered output is truncated for long frames).
pred

Unnamed: 0,Compuesto,0_acetylene273,0_acetylene298,0_argon298,0_butane298,0_carbon dioxide273,0_carbon dioxide298,0_carbon dioxide323,0_carbon monoxide298,0_ethane273,...,3_nitrogen273,3_nitrogen298,3_nitrogen323,3_nitrogen77,3_oxygen298,3_propane298,3_propane323,3_propene298,3_propene323,3_xenon298
0,[Zn12(SO3)2(BTB)6(HCO2)3].15DEF,0.414407,1.506064,0.847652,-0.822543,-0.988835,-0.430057,-0.004914,0.366937,-0.309695,...,1.355122,2.312035,2.132678,11.026027,3.452049,1.274993,1.600893,1.286071,1.690701,1.740997
1,[Zn17thb14(mu4-O)4(H2O)(Me2NH2)]*Me2NH2 1a,10.981824,5.051598,3.834434,-0.203325,1.796480,1.818201,1.607555,1.047155,1.183068,...,9.889935,9.195485,7.408055,7.649546,1.281817,7.884098,16.106005,20.133865,16.381830,12.911913
2,[Zn17thb14(mu4-O)4(H2O)(Me2NH2)]*Me2NH2 2a,5.187232,2.503776,1.388817,0.342314,2.052322,1.730125,0.540168,0.287437,1.439028,...,3.737534,7.587290,4.052664,2.481123,-1.295412,3.105517,6.628393,12.629663,9.640047,11.310090
3,[Zn2(CN5H2)3(H2O)3]6H2O,6.715715,4.491516,3.082927,-0.231025,1.830375,0.652009,1.013347,0.911666,0.623311,...,5.151757,5.696745,5.410547,3.880809,26.363131,5.925551,8.335501,16.136581,9.267009,16.841850
4,[Zn2(CN5H2)3(H2O)3]6H2O Ac,1.076094,1.073566,0.711624,0.107028,0.973503,0.419721,0.420839,0.235244,0.049533,...,1.593197,2.798856,1.433514,1.101846,0.535202,1.505108,1.567734,2.367908,1.866475,2.815332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,{[Zn2(OH)(AZPY)(BDC)1.5]*H2O}n,3.136024,1.553625,0.859762,-0.279603,0.973128,0.425232,0.202417,0.262708,0.028673,...,2.521842,2.865423,2.008361,0.662754,0.552428,2.068458,3.860674,1.739702,4.005394,5.470305
96,{[Zn3(L)3(DPB)1.5]-6DMF-H20}n,-0.110232,0.254824,0.053545,-0.087090,-0.015071,0.275953,0.110918,-0.050147,0.009063,...,0.672211,1.997462,0.453148,0.756816,0.294518,0.248152,-0.067518,-0.469263,0.392806,0.807022
97,{[Zn3(ptp)3](DMF)2-(H2O)},0.567948,1.732720,0.973772,0.256937,1.265839,0.521370,0.510770,0.337247,-0.021136,...,1.427909,3.275464,2.166761,2.847440,24.648577,2.181261,0.988928,1.103114,1.327566,1.914241
98,{[Zn4(BDC)4(BPDA)4]*5DMF*3H2O},3.289324,1.696383,1.110681,0.008969,2.980494,0.584895,0.549574,0.497157,0.137655,...,2.608678,2.074322,1.732297,2.484764,1.724159,1.676423,3.208954,0.206074,2.779967,2.217655
