In [1]:
import numpy as np
import os
import torch
import pandas as pd
from models import model
from read_data import read_data
import matplotlib.pyplot as plt
import seaborn as sns

## Carga de datos

In [2]:
# Leemos los archivos raw
folder = os.path.join('..', 'Date')
dataframes1 = read_data.read_raw(folder)
folder = os.path.join('..', 'Date2')
dataframes2 = read_data.read_raw(folder)

In [3]:
# Convertimos todo en arrays de numpy con series del mismo largo
data_1 = read_data.read_and_perform(dataframes1, row_range=300, col_range=(3,12), split= True)
data_2 = read_data.read_and_perform(dataframes2, row_range=99, col_range=(2,5), split= False)
# concatenamos todas las series
data_total = np.vstack([data_1.T, data_2.T])

# separamos train y test
train, test = read_data.train_test_split(data_total)

[+] Se procesaron 297 series de longitud 99
[+] Se procesaron 32 series de longitud 99


In [4]:
# escalamos
scaler = read_data.scalings(train)
train = scaler.fit_transform(train)
print(f'[+] Train shape {train.shape}')
test = scaler.transform(test)
print(f'[+] Test shape {test.shape}')

[+] StandardScaler entrenado
[+] Train shape (310, 99)
[+] Test shape (19, 99)


## Entrenar modelo para un solo espacio latente

In [5]:
# Crear el directorio para guardar las imágenes si no existe
path_img = os.path.join("..","img","Drop-Optimo")
os.makedirs(path_img, exist_ok=True)

In [27]:
epochs = 1500
lr = 1e-3
lat = 5
drop = np.linspace(0.1,0.6,6)

for dr in drop:
    # Entrenamiento
    hist_train = []
    hist_test = []
    autoencoder = model.NNAutoencoder(99, lat, dr)
    optimizer = torch.optim.Adam(autoencoder.parameters(), lr = lr)
    criterio = torch.nn.MSELoss()

    for e in range(epochs):
        autoencoder.train()
        x = torch.FloatTensor(train)
        y_pred = autoencoder(x)
        loss = criterio(y_pred, x)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if e%100 == 0:
            print(e, "loss =",loss.item())
        hist_train.append(loss.item())
        
        with torch.no_grad():
            autoencoder.eval()
            x = torch.FloatTensor(test)
            y_pred = autoencoder(x)
            loss = criterio(y_pred, x)
            hist_test.append(loss.item())

    #guardo las img
    plt.semilogy(hist_train, label = 'train loss')
    plt.semilogy(hist_test, label = 'test loss')
    plt.title(f"Loss train -eval, drop = {dr}")
    plt.legend()
    plt.savefig(os.path.join(path_img,f"lat{lat}-drop{dr}.png"), bbox_inches='tight')
    plt.close()


0 loss = 1.0040003061294556
100 loss = 0.11730137467384338
200 loss = 0.08359379321336746
300 loss = 0.04032127559185028
400 loss = 0.033532217144966125
500 loss = 0.028023280203342438
600 loss = 0.02669672854244709
700 loss = 0.02513948269188404
800 loss = 0.020755019038915634
900 loss = 0.020684024319052696
1000 loss = 0.017455104738473892
1100 loss = 0.01727764494717121
1200 loss = 0.01715472899377346
1300 loss = 0.014518149197101593
1400 loss = 0.01809506118297577
0 loss = 1.0044764280319214
100 loss = 0.12969592213630676
200 loss = 0.10075286030769348
300 loss = 0.0756211131811142
400 loss = 0.04765097424387932
500 loss = 0.045192066580057144
600 loss = 0.04274669662117958
700 loss = 0.03829871863126755
800 loss = 0.036373257637023926
900 loss = 0.0296781063079834
1000 loss = 0.029788149520754814
1100 loss = 0.028872331604361534
1200 loss = 0.02516286075115204
1300 loss = 0.025525685399770737
1400 loss = 0.02508639171719551
0 loss = 1.0044333934783936
100 loss = 0.1389536857604980

## Early stopping

Es  una técnica que permite detener el entrenamiento cuando el valor de pérdida en el conjunto de validación (test) comienza a aumentar. Esto ayuda a prevenir el sobreajuste y permite guardar el mejor modelo basado en el mínimo valor de pérdida en el conjunto de validación

In [9]:
# Crear el directorio para guardar las imágenes si no existe
path_model = os.path.join("..","Save-Models")
os.makedirs(path_model, exist_ok=True)

In [29]:
epochs = 1500
lr = 1e-3
dr = 0.2
lat = 5

# Entrenamiento
hist_train = []
hist_test = []
best_test_loss = 100
best_epoch = 0
best_model = None
espera = 100
b = 0 #bandera

autoencoder = model.NNAutoencoder(99, lat, dr)
optimizer = torch.optim.Adam(autoencoder.parameters(), lr = lr)
criterio = torch.nn.MSELoss()

for e in range(epochs):
    autoencoder.train()
    x = torch.FloatTensor(train)
    y_pred = autoencoder(x)
    loss = criterio(y_pred, x)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    hist_train.append(loss.item())
    
    with torch.no_grad():
        autoencoder.eval()
        x = torch.FloatTensor(test)
        y_pred = autoencoder(x)
        test_loss = criterio(y_pred, x)
        hist_test.append(test_loss.item())

    if e%100 == 0:
            print(f'Epoch {e}, train Loss: {loss.item():.4f}, test Loss: {test_loss.item():.4f}')

    if test_loss < loss:
        #guardo el mejor modelo
        if test_loss < best_test_loss: #encuantra el primer minimo??
            best_test_loss = test_loss
            best_epoch = e
            best_model = autoencoder.state_dict().copy() #copia del mejor modelo

        if (e - best_epoch >= espera) and (b < 2):
            m_epoch = best_epoch
            m_test_loss = best_test_loss
            torch.save(best_model, os.path.join(path_model,f"model-lat{lat}.pth"))
            b += 1
            plt.plot(m_epoch,m_test_loss,'x', color = "red")

print(f'Best epoch was {m_epoch} with val loss {m_test_loss:.4f}')
#guardo las img
plt.semilogy(hist_train, label = 'train loss')
plt.semilogy(hist_test, label = 'test loss')
plt.plot(m_epoch,m_test_loss,'x', color = "red", label = "best model")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.title(f"dim_lat: {lat}, drop = {dr}")
plt.legend()
plt.savefig(os.path.join(path_img,f"BestModel-lat{lat}-drop{dr}.png"), bbox_inches='tight')
plt.close()

Epoch 0, train Loss: 1.0036, test Loss: 1.1227
Epoch 100, train Loss: 0.1295, test Loss: 0.0787
Epoch 200, train Loss: 0.0679, test Loss: 0.0394
Epoch 300, train Loss: 0.0479, test Loss: 0.0314
Epoch 400, train Loss: 0.0445, test Loss: 0.0354
Epoch 500, train Loss: 0.0396, test Loss: 0.0400
Epoch 600, train Loss: 0.0363, test Loss: 0.0379
Epoch 700, train Loss: 0.0321, test Loss: 0.0291
Epoch 800, train Loss: 0.0312, test Loss: 0.0381
Epoch 900, train Loss: 0.0288, test Loss: 0.0290
Epoch 1000, train Loss: 0.0290, test Loss: 0.0245
Epoch 1100, train Loss: 0.0268, test Loss: 0.0206
Epoch 1200, train Loss: 0.0257, test Loss: 0.0250
Epoch 1300, train Loss: 0.0243, test Loss: 0.0200
Epoch 1400, train Loss: 0.0271, test Loss: 0.0199
Best epoch was 279 with val loss 0.0246


In [None]:
MAES_list = []
for i in range(500):
    autoencoder.train()
    x = torch.FloatTensor(test)
    y_pred = autoencoder(x)
    sample = scaler.inverse_transform(x)
    sample_pred = scaler.inverse_transform(y_pred.detach().numpy())

    average_MAES = np.abs(sample-sample_pred).mean()
    MAES_list.append(average_MAES)
    print(f'DIM= {i} & Average MAE= {average_MAES:.3f}')

array = np.array(MAES_list)
mean = np.mean(array)
std = np.std(array) 

In [None]:
#curva
# Crear el histograma
#plt.figure(figsize=(10, 6))
sns.histplot(MAES_list, kde = True, bins=50, edgecolor='black')

# Añadir etiquetas y título
plt.xlabel('MAES')
plt.ylabel('f')
plt.title('Frecuencia de ocurrencia')

### Calculo MAE vs Dim Latent

In [None]:
latent = np.array(range(1,21)) #Espacio latente del 1 al 20
MAES_list = [] #se guardaran ls MAES de cada espacio latente
epochs = 1000
lr = 1e-3
drop = 0.0005

for lat in latent:
    # Modelo
    autoencoder = model.NNAutoencoder(99, lat, drop)
    optimizer = torch.optim.Adam(autoencoder.parameters(), lr = lr)
    criterio = torch.nn.MSELoss()
    # Entrenamiento
    hist_train = []
    hist_test = []
    for e in range(epochs):
        autoencoder.train()
        x = torch.FloatTensor(train)
        y_pred = autoencoder(x)
        loss = criterio(y_pred, x)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluacion
    with torch.no_grad():
        autoencoder.eval()
        x = torch.FloatTensor(test)
        y_pred = autoencoder(x)
        sample = scaler.inverse_transform(x)
        sample_pred = scaler.inverse_transform(y_pred.numpy())

    average_MAES = np.abs(sample-sample_pred).mean()
    MAES_list.append(average_MAES)
    print(f'DIM= {lat} & Average MAE= {average_MAES:.3f}')

In [None]:
fig, ax = plt.subplots(figsize = (10,6))
ax.errorbar(latent,MAES_list,0.05,fmt='o', linewidth=2, capsize=6)
ax.plot(latent,MAES_list)
ax.set_ylabel("MAES")
ax.set_xlabel("Laten Dim")
ax.set_xticks(np.arange(0, 22, 1))
ax.grid()

In [None]:
y = np.array(MAES_list)

In [None]:
# Crear el histograma
plt.figure(figsize=(10, 6))
plt.hist(y, bins=50, edgecolor='black')

# Añadir etiquetas y título
plt.xlabel('MAES')
plt.ylabel('f')
plt.title('Frecuencia de ocurrencia')