In [359]:
import numpy as np
import pandas as pd
import tensorflow as tf
from pandas_summary import DataFrameSummary
from matplotlib import pyplot as plt 
from pyarrow import feather

from zipfile import ZipFile


from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder  , StandardScaler


from tensorflow.keras.layers import Flatten , Embedding , Input , Concatenate , Dense, Dropout , BatchNormalization, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping,TensorBoard
from tensorflow.keras import initializers 

import datetime

## Cargamos las bases de datos pre-procesadas

In [363]:
# Extraemos la base de train, pq es muy pesada para el push de github (+100mb)
with ZipFile('all_preprocessed_train.zip', 'r') as zipObj:
       zipObj.extractall()

In [364]:
df=pd.read_feather('all_preprocessed_train.fth')
df_test=pd.read_feather('all_preprocessed_test.fth')

### Asignamos una lista con el nombre de las variables categoricas, de las variables continuas y de los outputs

In [3]:
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen', 'Promo2Weeks', 
            'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear', 'State', 
            'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_bool_fw', 'StateHoliday_bool_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw']

In [4]:
contin_vars = ['CompetitionDistance', 
   'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC', 'Precipitationmm',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday_bool', 'BeforeStateHoliday_bool', 'Promo', 'SchoolHoliday', 'StateHoliday_bool']

In [5]:
out_vars=['Sales','Customers']

### Hacemos un  poco de EDA sobre los distintos tipos de variables

In [6]:
#Variables Categoricas
DataFrameSummary(df[cat_vars]).summary().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,counts,uniques,missing,missing_perc,types
Store,1017209.0,558.429727,321.908651,1.0,280.0,558.0,838.0,1115.0,1017209,1115,0,0%,numeric
DayOfWeek,1017209.0,3.998341,1.997391,1.0,2.0,4.0,6.0,7.0,1017209,7,0,0%,numeric
Year,1017209.0,2013.832292,0.777396,2013.0,2013.0,2014.0,2014.0,2015.0,1017209,3,0,0%,numeric
Month,1017209.0,5.846762,3.326097,1.0,3.0,6.0,8.0,12.0,1017209,12,0,0%,numeric
Day,1017209.0,15.70279,8.787638,1.0,8.0,16.0,23.0,31.0,1017209,31,0,0%,numeric
StateHoliday,,,,,,,,,1017209,4,0,0%,categorical
CompetitionMonthsOpen,1017209.0,20.207503,7.959786,0.0,24.0,24.0,24.0,24.0,1017209,25,0,0%,numeric
Promo2Weeks,1017209.0,22.860599,6.723693,0.0,25.0,25.0,25.0,25.0,1017209,26,0,0%,numeric
StoreType,,,,,,,,,1017209,4,0,0%,categorical
Assortment,,,,,,,,,1017209,3,0,0%,categorical


In [90]:
#Variables continuas
DataFrameSummary(df[contin_vars]).summary().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,counts,uniques,missing,missing_perc,types
CompetitionDistance,1017209.0,5613.013481,8498.338271,20.0,710.0,2330.0,6910.0,75860.0,1017209,654,0,0%,numeric
Max_TemperatureC,1017209.0,14.3283,8.464778,-11.0,8.0,15.0,21.0,38.0,1017209,50,0,0%,numeric
Mean_TemperatureC,1017209.0,10.073856,7.239083,-13.0,4.0,10.0,16.0,31.0,1017209,45,0,0%,numeric
Min_TemperatureC,1017209.0,5.900282,6.45973,-15.0,1.0,6.0,11.0,24.0,1017209,40,0,0%,numeric
Precipitationmm,1017209.0,0.790271,2.502615,0.0,0.0,0.0,0.25,58.93,1017209,40,0,0%,numeric
Max_Humidity,1017209.0,93.283786,7.717332,44.0,88.0,94.0,100.0,100.0,1017209,52,0,0%,numeric
Mean_Humidity,1017209.0,73.952151,13.206449,30.0,64.0,75.0,84.0,100.0,1017209,71,0,0%,numeric
Min_Humidity,1017209.0,49.923426,19.626353,4.0,34.0,49.0,65.0,100.0,1017209,93,0,0%,numeric
Max_Wind_SpeedKm_h,1017209.0,22.672915,8.940418,3.0,16.0,21.0,27.0,101.0,1017209,42,0,0%,numeric
Mean_Wind_SpeedKm_h,1017209.0,11.870788,5.897191,2.0,8.0,11.0,14.0,53.0,1017209,27,0,0%,numeric


In [7]:
#Variables de output
DataFrameSummary(df[out_vars]).summary().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,counts,uniques,missing,missing_perc,types
Sales,1017209.0,5773.818972,3849.926175,0.0,3727.0,5744.0,7856.0,41551.0,1017209,21734,0,0%,numeric
Customers,1017209.0,633.145946,464.411734,0.0,405.0,609.0,837.0,7388.0,1017209,4086,0,0%,numeric


### Cambiamos el tipo de variable de las  categoricas
Como se puede ver en el EDA, algunas de las variable categoricas son numericas, por ende pasamos esas variables a categoricas

In [78]:
# Pasamos a categoricas
def a_cat(data_frame,variables_cat):
    for i in variables_cat:
        data_frame[i]= data_frame[i].astype('category').cat.as_ordered()
    return data_frame

a_cat(df,cat_vars)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,...,AfterStateHoliday_bool,BeforeStateHoliday_bool,AfterPromo,BeforePromo,SchoolHoliday_bw,StateHoliday_bool_bw,Promo_bw,SchoolHoliday_fw,StateHoliday_bool_fw,Promo_fw
0,1,5,2015-07-31,5263,555,1,1,0,1,2015,...,57,0,0,0,5.0,0.0,5.0,1.0,0.0,1.0
1,2,5,2015-07-31,6064,625,1,1,0,1,2015,...,67,0,0,0,5.0,0.0,5.0,1.0,0.0,1.0
2,3,5,2015-07-31,8314,821,1,1,0,1,2015,...,57,0,0,0,5.0,0.0,5.0,1.0,0.0,1.0
3,4,5,2015-07-31,13995,1498,1,1,0,1,2015,...,67,0,0,0,5.0,0.0,5.0,1.0,0.0,1.0
4,5,5,2015-07-31,4822,559,1,1,0,1,2015,...,57,0,0,0,5.0,0.0,5.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,1111,2,2013-01-01,0,0,0,0,a,1,2013,...,0,0,0,-6,1.0,1.0,0.0,4.0,1.0,1.0
1017205,1112,2,2013-01-01,0,0,0,0,a,1,2013,...,0,0,0,-6,1.0,1.0,0.0,4.0,1.0,1.0
1017206,1113,2,2013-01-01,0,0,0,0,a,1,2013,...,0,0,0,-6,1.0,1.0,0.0,4.0,1.0,1.0
1017207,1114,2,2013-01-01,0,0,0,0,a,1,2013,...,0,0,0,-6,1.0,1.0,0.0,4.0,1.0,1.0


In [79]:
# Lo hacemos tambien con test 
a_cat(df_test,cat_vars)

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,...,AfterStateHoliday_bool,BeforeStateHoliday_bool,AfterPromo,BeforePromo,SchoolHoliday_bw,StateHoliday_bool_bw,Promo_bw,SchoolHoliday_fw,StateHoliday_bool_fw,Promo_fw
0,1,1,4,2015-09-17,1.0,1,0,0,2015,9,...,0,0,0,0,0.0,0.0,4.0,0.0,0.0,1.0
1,2,3,4,2015-09-17,1.0,1,0,0,2015,9,...,0,0,0,0,0.0,0.0,4.0,0.0,0.0,1.0
2,3,7,4,2015-09-17,1.0,1,0,0,2015,9,...,0,0,0,0,0.0,0.0,4.0,0.0,0.0,1.0
3,4,8,4,2015-09-17,1.0,1,0,0,2015,9,...,0,0,0,0,0.0,0.0,4.0,0.0,0.0,1.0
4,5,9,4,2015-09-17,1.0,1,0,0,2015,9,...,0,0,0,0,0.0,0.0,4.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41083,41084,1111,6,2015-08-01,1.0,0,0,0,2015,8,...,0,0,0,-2,0.0,0.0,0.0,5.0,0.0,5.0
41084,41085,1112,6,2015-08-01,1.0,0,0,0,2015,8,...,0,0,0,-2,0.0,0.0,0.0,5.0,0.0,5.0
41085,41086,1113,6,2015-08-01,1.0,0,0,0,2015,8,...,0,0,0,-2,0.0,0.0,0.0,5.0,0.0,5.0
41086,41087,1114,6,2015-08-01,1.0,0,0,0,2015,8,...,0,0,0,-2,0.0,0.0,0.0,5.0,0.0,5.0


Dejamos en los dataframes solamente las variables de interes (cat_vars y contin_vars) y la variable 'Date'. En el caso de train  tenemos que mantener  la variable 'Sales' y 'Customer' para el Output del entrenamiento, en el caso de Test mantenemos el ID para hacer la submission.

In [80]:
df=df[cat_vars+contin_vars+out_vars+['Date']].copy()
df_test=df_test[cat_vars+contin_vars+['Date','Id']].copy()

In [12]:
#Creamos una lista que vamos a usar despues para hacer los embedding, con el nombre de la variable categorica, 
#el numero efectivo del input + 1 para las variables desconocidas y un ultimo numero que seria el tamaño del embbeding, el cual es 
# la division entera por 2 en el caso de que el resultado de la division sea menor a 50, sino seria 50, esto para que el tamaño del embbeding 
#no sea muy elevado en el caso de variables con demasiadas categorias (+100)
cat_emb=[(j,len(df[j].cat.categories)+1,min(50,(len(df[j].cat.categories)+1)//2)) for j in cat_vars]

In [None]:
# Ahora hacemos una lista con las tuplas que contengan los nombres de las variables y las transformaciones que se le van a a aplicar.
# Para el caso de las categoricas, se le va a aplicar el LabelEndcoder, para el caso de las continuas, se les va a aplicar una normaliazcion,
# restandole la media y dividiendo por el desvio estandar.


In [13]:
cat_map=[(j,LabelEncoder()) for j in cat_vars]
cont_map=[([j],StandardScaler()) for j in contin_vars]


In [14]:
#Aplicamos los modelos que vamos a usar despues para las variables.
cat_mapper = DataFrameMapper(cat_map)
cont_mapper= DataFrameMapper(cont_map)


# Definimos la metrica para evaluar el entrenamiento:

$$
\textrm{RMSE}=\sqrt{\frac{1}{n}\sum_{i=1}^{n} \left(\frac{\hat{y}_i - y_i}{y_i}\right)^2}
$$

Ahora, a traves del backend de keras, tenemos que definir la metrica Custom, que es lo que haremos a continuacion

In [236]:
def rmspe(y_true,y_pred):
    rms_pe=K.sqrt(K.mean(K.square(((y_pred-y_true)/y_true))))
    return(rms_pe)

Creamos las capas de input de variables categoricas y continuas, definimos los embedding para las categoricas, hacemos el flatten,ultimo concatenamos los flatten de los embedding con las densas de las continuas y definimos el modelo, que tiene dos capas densas de 1000 y 500 neuronas respectivamente, aplicando dropout , batch normalization y regularizacion L2 para los pesos de las capas densas.
Este modelo tiene dos salidas con activación lineal, una es la variable 'Sales' y la otra es 'Customers', en donde se predice tanto las ventas del dia como la cantidad de clientes, de manera que esto ayude a entrenar los pesos teniendo en consideración el efecto que tienen los clientes en el monto de las ventas, y considerando en parte que no tendría mucho sentido hacer unb modelo predictivo que contenga como feature de entrada la cantidad de clientes siendo que este dato es incierto al momento del forecasting.


In [259]:
lr=0.001
l2_lambda = 1e-3
init=initializers.GlorotUniform(seed=18)
bias=initializers.Zeros()

In [261]:
def get_embedding(cat_emb):
    nombre, size_in , size_emb = cat_emb
    inp= Input((1,),dtype='int64',name=nombre+'_in')
    u= Flatten(name=nombre+'_flt')(Embedding(size_in,size_emb, input_length=1)(inp))
    return inp,u
def get_continuas(nombre):
    inp=Input((1,),name=nombre+'_in')
    densa= Dense(1,name=nombre+'_d')(inp)
    return inp, densa

def get_modelo(cat_emb,contin_vars):
    continua=[get_continuas(i) for i  in contin_vars]
    cont_out= [i for j,i in continua]
    cont_inp= [j for j,i in continua]
    
    embedding=[get_embedding(i) for i in cat_emb]
    
    x= Concatenate(name='concatenar_todo')([j for i,j in embedding]+cont_out)
    x= Dropout(0.10)(x)
    
    x=Dense(1000,activation='relu',kernel_initializer=init, bias_initializer=bias,kernel_regularizer=l2(l2_lambda))(x)
    x = BatchNormalization()(x)
    x= Dropout(0.10)(x)

    x=Dense(500,activation='relu',kernel_initializer=init, bias_initializer=bias,kernel_regularizer=l2(l2_lambda))(x)
    x = BatchNormalization()(x)
    x= Dropout(0.10)(x)

    output_1= Dense(1,name='Sales',activation='linear')(x)
    output_2= Dense(1,name='Customers',activation='linear')(x)

    
    
    model=Model([i for i,j in embedding]+cont_inp, [output_1,output_2])
    
    return(model)


    

## Instanciamos el modelo

In [262]:

model=get_modelo(cat_emb,contin_vars)

In [263]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Store_in (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
DayOfWeek_in (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
Year_in (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
Month_in (InputLayer)           [(None, 1)]          0                                            
____________________________________________________________________________________________

# Preparamos los inputs y outputs
definimos df_train y df_val. Para validacion, al ser una serie de tiempo, vamos a utilizar los datos de train mas actuales para ir haciendo la  crossvaldiation, serán todos los datos de ventas que se hayan hecho despues de 01/7/2015

In [82]:
df['Date']=pd.to_datetime(df['Date'])
df_test['Date']=pd.to_datetime(df_test['Date'])

In [84]:
df_train=df[df['Date']<datetime.datetime(2015,7,1)]
df_val=df[df['Date']>=datetime.datetime(2015,7,1)]

In [352]:
print(f'Tamaño de validación: {len(df_val)}',f'\nTamaño de test: {len(df_train)}',f'\nProporción de val sobre test: {(len(df_train)/(len(df_val)+len(df_train)))*100}%')

Tamaño de validación: 34565 
Tamaño de test: 982644 
Proporción de val sobre test: 96.6019765849496%


In [86]:
# Seteamos la variable de la fecha como el índice para los dos DataFrame
df_train=df_train.set_index('Date')
df_val=df_val.set_index('Date')
df_test=df_test.set_index('Date')

# estandarizacion y label encoding
Hacemos el fit de los labelencoders y standardscales para las variables categoricas y continuas tomando como base el df de train.
Para el caso del output lo hacemos por separado, para poder hacer posteriormente la inverse_transform() una vez que se tengan los resultados de las predicciones.

In [198]:
categoricas=cat_mapper.fit(df)
continuas=cont_mapper.fit(df)
output= StandardScaler()
output.fit()
output.fit(df[out_vars])


## Preparamos los inputs y outputs

In [113]:
X_train=np.concatenate((categoricas.transform(df_train),continuas.transform(df_train)),axis=1)
X_train=np.hsplit(X_train, len(cat_vars+contin_vars))

In [120]:
X_val=np.concatenate((categoricas.transform(df_val),continuas.transform(df_val)),axis=1)
X_val=np.hsplit(X_val, len(cat_vars+contin_vars))


In [127]:
X_test=np.concatenate((categoricas.transform(df_test),continuas.transform(df_test)),axis=1)
X_test=np.hsplit(X_test, len(cat_vars+contin_vars))

In [207]:
y_train=output.transform(df_train[out_vars])
y_train=np.hsplit(y_train, len(out_vars))

In [215]:
y_val=output.transform(df_val[out_vars])
y_val=np.hsplit(y_val, len(out_vars))

# Definimos los checkpoint, el earlystoppoing y tensorboard

In [285]:
import os
wordir=os.getcwd()
checkpoint_filepath = wordir+'/model.{epoch:02d}-{val_Sales_rmspe:.2f}.hdf5'
checkpoint= ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_Sales_rmspe',
    mode='min',
    save_best_only=True)

earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_Sales_loss', patience=10,mode='min')

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")


callbacks=[checkpoint,earlystopping,tensorboard_callback]

# Definimos tamaño de epochs, Batch y compilamos el modelo con el optimizador Adam.

In [267]:

epochs=50
batch_size=265


In [286]:
model.compile(optimizer=Adam(learning_rate=lr),metrics=rmspe,loss='mse')


# Entrenamos el Modelo

In [287]:
model.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=epochs,batch_size=batch_size,verbose=1,callbacks=callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50


<tensorflow.python.keras.callbacks.History at 0x7f838cba01f0>

# Cargamos los pesos entrenados


In [288]:
model.load_weights('model.12-13.67.hdf5')

# Hacemos la prediccion

In [319]:
prediction=model.predict(X_test)

In [320]:
prediction=np.hstack(prediction)

In [325]:
prediction=pd.DataFrame(prediction,columns=['Sales','Customers'])


## Realizamos el incverse transform para pasar el forecast de ventas de una variable normalizada a una variable en unidades monetarias

In [329]:
prediction=output.inverse_transform(prediction[out_vars])

In [340]:
prediction_sales=np.hsplit(prediction,2)[0]

# Guardamos el sample_submission

In [344]:
sample_csv= pd.read_csv('sample_submission.csv')

In [347]:
sample_csv['Sales']=prediction_sales

In [349]:
sample_csv.to_csv(f'submision_primera.csv',index=False)