**Modelo de red neuronal Transformer**

Este archivo consta de los codigos y conclusiones de:
* 1.Carga de librerias y datos
* 2.Normalizar Datos.
* 3.Preparar datos para realizar aprendizaje supervizado.
* 4.Modelo Transformer
* 5.Evaluacion del modelo


#1.Carga de Librerias y Datos


*Se importan los módulos necesarios para trabajar*

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

/bin/bash: line 1: nvidia-smi: command not found


In [2]:
#Pandas es utilizado para leer los set de datos
import pandas as pd
#Numpy es utilizado para generar las series de datos a graficar
import numpy as np
#Seaborn es utilizado para generar los gráficos
import seaborn as sns
import matplotlib.pyplot as plt
#Se importan modulos estadisticos para generar test de hipotesis, entre otros
from sklearn.preprocessing import StandardScaler
#Módulos implementa funciones que evalúan el error de predicción para propósitos específicos
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_squared_error as mse
#Ignorar warnings
import warnings
warnings.filterwarnings("ignore")

#Dividir arreglos o matrices en subconjuntos aleatorios de tren y prueba
from sklearn.model_selection import train_test_split

#Biblioteca de Redes Neuronales
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential, model_from_json
from keras.layers import Dropout, LSTM, Dense, Activation,Input
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasRegressor

from hyperopt import Trials, STATUS_OK, tpe, hp, fmin, space_eval
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict, TimeSeriesSplit
import time, random

In [3]:
# Para acceder a los archivos del gdrive
from google.colab import drive
drive.mount('/content/gdrive/')


Mounted at /content/gdrive/


In [4]:
cd /content/gdrive/MyDrive/Tesis/Datos

/content/gdrive/MyDrive/Tesis/Datos


Se obtiene conjunto de datos

In [5]:
df=pd.read_csv('df.csv')
df=df.drop(['Year', 'Week', 'Day','Month','Size','Type'], axis=1)

In [6]:
df.set_index('Date', inplace=True)


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 380380 entries, 2010-02-05 to 2012-10-26
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         380380 non-null  int64  
 1   Dept          380380 non-null  int64  
 2   Weekly_Sales  380380 non-null  float64
 3   IsHoliday     380380 non-null  bool   
 4   Temperature   380380 non-null  float64
 5   Fuel_Price    380380 non-null  float64
 6   MarkDown1     380380 non-null  float64
 7   MarkDown2     380380 non-null  float64
 8   MarkDown3     380380 non-null  float64
 9   MarkDown4     380380 non-null  float64
 10  MarkDown5     380380 non-null  float64
 11  CPI           380380 non-null  float64
 12  Unemployment  380380 non-null  float64
dtypes: bool(1), float64(10), int64(2)
memory usage: 38.1+ MB


In [8]:
df.shape

(380380, 13)

In [9]:
#Setear semilla
np.random.seed(42)

#2. Obtener series de Tiempo

*Se obtiene lista de dataframe ordenados por Store y Dept*

In [10]:
series_time=[]
lista_Store=df.Store.unique()
lista_Store.sort()
lista_dept=df.Dept.unique()
lista_dept.sort()

for i in lista_Store:
  for j in lista_dept:
    #lista=[]
    test=df[(df.Store==i) & (df.Dept==j)]
    if(test.empty!=True):
        series_time.append(test)

In [11]:
len(series_time)

2660

#3.Normalizar base de datos

El **método de puntuación z** (a menudo llamado estandarización ) transforma los datos en una distribución con una media de 0 y una desviación estándar de 1 . Cada valor estandarizado se calcula restando la media de la característica correspondiente y luego dividiendo por la desviación estándar.

In [None]:
#Seleccion de caracteristicas
features =features = [feature for feature in df.columns if feature not in ('Store','Dept')]

#Se define escalado
std_scaler = StandardScaler()

series_time_scaled=[]



#Transformacion
for serie in series_time:
  for i in features:
    serie[i]=std_scaler.fit_transform(serie[i].values.reshape(-1,1))
  series_time_scaled.append(serie)

for i in features:
  df[i] = std_scaler.fit_transform(df[i].values.reshape(-1,1))

series_time_scaled[0]

Unnamed: 0_level_0,Store,Dept,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2010-02-05,1,1,0.245542,-0.274204,-1.830686,-1.521079,-0.524358,-0.193456,-0.092151,-0.336050,-0.579339,-1.130288,1.295958
2010-02-12,1,1,2.395781,3.646917,-2.098280,-1.577441,-0.524358,-0.193456,-0.092151,-0.336050,-0.579339,-1.096657,1.295958
2010-02-19,1,1,1.943233,-0.274204,-1.998285,-1.657288,-0.524358,-0.193456,-0.092151,-0.336050,-0.579339,-1.085823,1.295958
2010-02-26,1,1,-0.316684,-0.274204,-1.526473,-1.546911,-0.524358,-0.193456,-0.092151,-0.336050,-0.579339,-1.078788,1.295958
2010-03-05,1,1,-0.069800,-0.274204,-1.535627,-1.396612,-0.524358,-0.193456,-0.092151,-0.336050,-0.579339,-1.071754,1.295958
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-09-28,1,1,-0.363093,-0.274204,0.547388,1.048107,0.141879,-0.191745,-0.091796,0.020611,0.956274,1.611008,-1.836849
2012-10-05,1,1,-0.062002,-0.274204,0.017127,0.933034,0.943562,-0.193456,-0.088232,0.573847,0.594358,1.657095,-2.712884
2012-10-12,1,1,0.025529,-0.274204,-0.374406,0.895459,-0.145256,-0.193456,-0.090407,-0.184537,1.338915,1.703183,-2.712884
2012-10-19,1,1,0.170262,-0.274204,-0.023716,0.879020,-0.351664,-0.193456,-0.091091,-0.315864,0.169277,1.713430,-2.712884


In [None]:
len(series_time_scaled)

2660

In [None]:
series_time_scaled=random.sample(series_time_scaled,20)

In [None]:
df_series=pd.DataFrame()


for seriein series_time_scaled:
  df_series=pd.concat([df_series, serie])

SyntaxError: ignored

#4.Preparar datos para realizar aprendizaje supervizado.

La idea es modelar cada valor en función de los valores recientes anteriores, dado un retardo de tiempo dado. **Los valores futuros de una variable en una serie de tiempo dependen de sus propios rezagos y de los rezagos de otras variables.**

In [None]:
def time_delay_embedding(series: pd.Series, n_lags: int, horizon: int):
    """
    Incrustación de retardo de tiempo
    :param series: serie de tiempo como objeto de pandas
    :param n_lags: número de valores pasados para usar como variables explicativas
    :param horizon: horizonte de pronostico
    :return:pd.DataFrame con series temporales reconstruidas
    """
    assert isinstance(series, pd.Series)

    if series.name is None:
        name = 'Series'
    else:
        name = series.name

    n_lags_iter = list(range(n_lags, -horizon, -1))

    serie_time_delay = [series.shift(i) for i in n_lags_iter]
    serie_time_delay = pd.concat(serie_time_delay, axis=1).dropna()
    serie_time_delay.columns = [f'{name}(t-{j - 1})'
                 if j > 0 else f'{name}(t+{np.abs(j) + 1})'
                 for j in n_lags_iter]

    return serie_time_delay

In [None]:
series_predic=[]
series_target=[]
for serie in series_time_scaled:
  serie_split = []
  for columna in serie:
    col_df = time_delay_embedding(
        serie[columna],     #Serie de tiempo
        n_lags=1,           #Numero de retrasos
        horizon=1           # Horizonte de prediccion
          )
    serie_split.append(col_df)

  serie_df = pd.concat(serie_split, axis=1).dropna()
  predictor_variables = serie_df.columns.str.contains('\(t\-')
  target_variables = serie_df.columns.str.contains('Weekly_Sales\(t\+')

  predictor_variables = serie_df.iloc[:, predictor_variables]
  target_variables = serie_df.iloc[:, target_variables]
  series_predic.append(predictor_variables)
  series_target.append(target_variables)

In [None]:
#Ejemplo de variables de prediccion de una serie
series_predic[0].head(4)

In [None]:
#Ejemplo de variables objetivo de una serie
series_target[0].head(4)

In [None]:
#Se separa conjunto en entrenamiento y prueba; sin aleatoriedad
#Dejando un %20 de la data para test
predictor_variables=pd.DataFrame()
target_variables=pd.DataFrame()

for serie,target in zip(series_predic,series_target):
  predictor_variables=pd.concat([predictor_variables, serie])
  target_variables=pd.concat([target_variables, target])

print("Separacion de datos terminada!")

In [None]:
predictor_variables

In [None]:
target_variables

In [None]:
# Exportar el DataFrame como CSV
predictor_variables.to_csv('predictor_variables.csv')
target_variables.to_csv('target_variables.csv')
df_serie.to_csv('df_modelo.csv')