# Data Cleaning M5 Forecasting - Accuracy

In [1]:
#Use Python 3.6
!python --version

Python 3.6.10 :: Anaconda, Inc.


In [145]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
#from tqdm import tqdm as tqdm
import warnings
warnings.filterwarnings('ignore')
import pickle
import multiprocessing as mp
import os
import dask
import dask.dataframe as dd

In [3]:
df_cal=pd.read_csv('a/calendar.csv')
df_sat=pd.read_csv('a/sales_train_validation.csv')
df_sam=pd.read_csv('a/sample_submission.csv')
df_sep=pd.read_csv('a/sell_prices.csv')

## Summary of Steps of Cleaning

1. Extraction of days from sales_train_validation
2. Creation of dummies calendar with event's types, weekdays, months, snaps and wm_yr_wk (will remove later)
3. Creation of series for prices depending the week (wm_yr_wk)
4. Remove all wm_yr_wk
5. Integration of all the products/store



In [12]:
df_sat.shape

(30490, 1919)

In [13]:
df_sam.shape

(60980, 29)

## Submission File Structure

This file has the double of registers of products/stores due to the the first half is dedicated to the validation registers corresponding to the days 1 to 28 of preduction (days 1914 to 1941) corresponding to the validation sample, and the second part corresponding to the evaluation are the days 29 to 56 (days 1942 to 1969)

index | id | F1 | F2 | ... | F28
----- | -- | -- | -- | --- | ---
0  | HOBBIES_1_001_CA_1_validation | Yv0(F1) | Yv0(F2) | ... | Yv0(F28)
1  | HOBBIES_1_002_CA_1_validation | Yv1(F1) | Yv1(F2) | ... | Yv1(F28)
...
30489 | FOODS_3_827_WI_3_validation | Yv30489(F1) | Yv30489(F2) | ... | Yv30489(F28)
30490 | HOBBIES_1_001_CA_1_evaluation | Ye0(F1) | Ye0(F2) | ... | Ye0(F28)
30491 | HOBBIES_1_001_CA_1_evaluation | Ye1(F1) | Ye1(F2) | ... | Ye1(F28)
... 
60979 | FOODS_3_827_WI_3_evaluation | Ye30489(F1) | Ye30489(F2) | ... | Ye30489(F28)


## Step 1

In [8]:
df_sat_d=df_sat.drop(columns=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])
def day_data(item_index,fillval=None): #item index of product/store
    #Extraccion de dia(s) y transposición
    df_sell=df_sat_d.iloc[item_index].T
    df_sell.reset_index(drop=True,inplace=True,name='day_sell')
    
    #Completar la serie para igualar el numero de registros de dias
    for i in range(56):
        df_sell=df_sell.append(pd.Series([fillval]))
    
    return df_sell

## Step 2

In [9]:
#calendario dummies
df_cal.month=df_cal.month.apply(lambda x:str(x))
df_cal_dum=pd.get_dummies(df_cal[['event_type_1','event_type_2','weekday','month','snap_CA','snap_TX','snap_WI','wm_yr_wk']])

def add_cal(df_sell, df_data=df_cal_dum.copy()):
    #Integracion de las ventas al Set de datos
    df_data['day_sell']=df_sell.values
    #Cambio de tipo de datos
    df_data.day_sell=df_data.day_sell.apply(lambda x: int(x) if x!= None else None)
    return (df_data)


## Step 3

In [10]:
#Funcion para el calculo de wm_yr_wk
#Regresa indice del rango de los dias de la semana wm_yr_wk de la semana dada

def range_days_week(wm_yr_wk):
    dn=list(df_cal.wm_yr_wk.unique()).index(wm_yr_wk)*7
    return dn, dn+7


In [11]:
#Creacion de Serie de precios de venta segun semana Walmart

def sell_price_series(item_index, df_data, no_value=None):
    #Temporal variable for the query who has the item and store in the prices table
    temp = df_sep[(df_sep.store_id==df_sat['store_id'].iloc[item_index])&(df_sep.item_id==df_sat['item_id'].iloc[item_index])]
    
    #Create Clean Series
    series_length=1969
    output = pd.Series(list( no_value for i in range(series_length)))
    
    #Write the prices in the intervals corresponding every week who has values
    
    for i in range(len(temp.wm_yr_wk.unique())):
        output.iloc[range_days_week(temp.wm_yr_wk.unique()[i])[0]:range_days_week(temp.wm_yr_wk.unique()[i])[1]]=\
        temp.sell_price[temp.wm_yr_wk==temp.wm_yr_wk.unique()[i]].values[0]
        
    df_data['sell_price']=output.values
    
    
    return df_data

## Step 4

In [12]:
def remove_wm_yr_wk(df_data):
    df_data = df_data.drop(columns=['wm_yr_wk'])
    return df_data

## Step 5

In [13]:
def data_cleaning(i):
#def data_cleaning(nprod=len(df_sat)):
    #list_df_sell = []
    #for i in tqdm(range(nprod)):
    df_sell = day_data(i) #Step 1
    df_data = add_cal(df_sell) # Step 2
    df_data = sell_price_series(i,df_data) #Step 3
    df_data = remove_wm_yr_wk(df_data) #Step 4
    #list_df_sell.append(df_data) #Step 5
    #return list_df_sell
    return df_data

In [14]:
%%time
nprod=len(df_sat)
pool = mp.Pool(mp.cpu_count())
list_df_sell = pool.map(data_cleaning,[i for i in range(nprod)])
pool.close()


CPU times: user 38.2 s, sys: 12.6 s, total: 50.8 s
Wall time: 2h 14min 23s


In [15]:
#Saving the clean data

import pickle
pickle_file=open('list_df_sell.pickle','wb')
pickle.dump(list_df_sell,pickle_file)

In [4]:
len(df_sat)

30490

# Prepare Data Set for models

In [7]:
#Utilidades
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE

#Modelos
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import BayesianRidge as BR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import AdaBoostRegressor as ABR
from sklearn.ensemble import BaggingRegressor as BaR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.ensemble import StackingRegressor as SR
from sklearn.ensemble import VotingRegressor as VR
#from sklearn.ensemble import HistGradientBoostingRegressor as HGBR



In [64]:
X,y = df_data.drop(columns=['day_sell'])[:1912], df_data.day_sell[:1912]
X_train, X_test, y_train, y_test = TTS(X,y, test_size = 0.2, shuffle=False)

In [15]:
rfr=RFR()
rfr.fit(X_train,y_train)
y_pred=rfr.predict(X_test)
MSE(y_test,y_pred)

0.9578757556443264

In [16]:
br=BR()
br.fit(X_train,y_train)
y_pred=br.predict(X_test)
MSE(y_test,y_pred)

0.8799307717714824

In [17]:
dtr=DTR()
dtr.fit(X_train,y_train)
y_pred=dtr.predict(X_test)
MSE(y_test,y_pred)

1.1431077748767045

In [18]:
abr=ABR()
abr.fit(X_train,y_train)
y_pred=abr.predict(X_test)
MSE(y_test,y_pred)

0.8850245763618101

In [19]:
bar=BaR()
bar.fit(X_train,y_train)
y_pred=bar.predict(X_test)
MSE(y_test,y_pred)

1.000630169610951

In [20]:
bar=BaR()
bar.fit(X_train,y_train)
y_pred=bar.predict(X_test)
MSE(y_test,y_pred)

0.9811524125823093

In [21]:
gbr=GBR()
gbr.fit(X_train,y_train)
y_pred=gbr.predict(X_test)
MSE(y_test,y_pred)

0.8801007145867064

### H2O

In [23]:
import h2o
from h2o.automl import H2OAutoML

In [104]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,14 hours 17 mins
H2O cluster timezone:,America/Mexico_City
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.3
H2O cluster version age:,10 months and 19 days !!!
H2O cluster name:,H2O_from_python_al_af72yp
H2O cluster total nodes:,1
H2O cluster free memory:,1.997 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [121]:
train=X_train.copy()
train.insert(29,'y_',y_train)
test=X_test.copy()
test.insert(29,'y_',y_test)

In [122]:
train = h2o.H2OFrame(train)
test = h2o.H2OFrame(test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [123]:
X=train.columns
y='y_'
X.remove(y)

In [124]:
train[y]=train[y].asfactor()
test[y]=test[y].asfactor()

In [125]:
aml=H2OAutoML(max_models=50, seed=1)
aml.train(x=X,y=y, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [126]:
lb=aml.leaderboard
lb.head(rows=lb.nrows)

model_id,mean_per_class_error,logloss,rmse,mse
GBM_grid_1_AutoML_20200326_104416_model_9,0.803973,9.82611,0.534543,0.285736
GBM_grid_1_AutoML_20200326_104416_model_6,0.807997,4.49027,0.48464,0.234876
GBM_grid_1_AutoML_20200326_104416_model_4,0.808469,0.456458,0.385047,0.148261
GBM_3_AutoML_20200326_104416,0.811629,0.450446,0.38312,0.146781
GBM_2_AutoML_20200326_104416,0.812755,0.449522,0.383289,0.14691
GBM_4_AutoML_20200326_104416,0.814053,0.457232,0.384917,0.148161
GBM_grid_1_AutoML_20200326_104416_model_5,0.815456,10.3385,0.548498,0.30085
GBM_1_AutoML_20200326_104416,0.815595,0.476462,0.391142,0.152992
DRF_1_AutoML_20200326_104416,0.816474,1.76669,0.400623,0.160499
XRT_1_AutoML_20200326_104416,0.816474,1.76669,0.400623,0.160499




In [127]:
aml.leader

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_grid_1_AutoML_20200326_104416_model_9


ModelMetricsMultinomial: gbm
** Reported on train data. **

MSE: 0.25768476128188356
RMSE: 0.5076265963105987
LogLoss: 8.900116350290968
Mean Per-Class Error: 0.7263295036827042
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7
0.0,1.0,2.0,3.0,4.0,5.0,Error,Rate
1062.0,108.0,50.0,23.0,38.0,3.0,0.1728972,"222 / 1,284"
97.0,56.0,18.0,2.0,0.0,2.0,0.68,119 / 175
17.0,19.0,14.0,1.0,1.0,1.0,0.7358491,39 / 53
5.0,4.0,1.0,3.0,0.0,0.0,0.7692308,10 / 13
2.0,0.0,0.0,1.0,0.0,0.0,1.0,3 / 3
1.0,0.0,0.0,0.0,0.0,0.0,1.0,1 / 1
1184.0,187.0,83.0,30.0,39.0,6.0,0.2576848,"394 / 1,529"


Top-6 Hit Ratios: 


0,1
k,hit_ratio
1,0.7423152
2,0.8705036
3,0.8888162
4,0.8901243
5,0.8901243
6,1.0



ModelMetricsMultinomial: gbm
** Reported on cross-validation data. **

MSE: 0.2857360452864149
RMSE: 0.5345428376532744
LogLoss: 9.82610825718673
Mean Per-Class Error: 0.8039729002751397
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7
0.0,1.0,2.0,3.0,4.0,5.0,Error,Rate
1042.0,144.0,46.0,21.0,11.0,20.0,0.1884735,"242 / 1,284"
105.0,44.0,17.0,4.0,1.0,4.0,0.7485714,131 / 175
33.0,12.0,6.0,1.0,1.0,0.0,0.8867925,47 / 53
8.0,2.0,0.0,0.0,0.0,3.0,1.0,13 / 13
2.0,0.0,0.0,0.0,0.0,1.0,1.0,3 / 3
0.0,1.0,0.0,0.0,0.0,0.0,1.0,1 / 1
1190.0,203.0,69.0,26.0,13.0,28.0,0.2858077,"437 / 1,529"


Top-6 Hit Ratios: 


0,1
k,hit_ratio
1,0.7141923
2,0.8613473
3,0.8737737
4,0.8750817
5,0.8763897
6,0.9999999


Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.7141734,0.0219877,0.7352941,0.6993464,0.6862745,0.7647059,0.6852459
err,0.2858266,0.0219877,0.2647059,0.3006536,0.3137255,0.2352941,0.3147541
err_count,87.4,6.6873016,81.0,92.0,96.0,72.0,96.0
logloss,9.826792,0.7471041,9.142705,10.289531,10.732439,8.09806,10.871222
max_per_class_error,1.0,0.0,1.0,1.0,1.0,1.0,1.0
mean_per_class_accuracy,0.4289962,0.0594794,0.3704208,0.3644955,0.537445,0.5255242,0.3470952
mean_per_class_error,0.5710038,0.0594794,0.6295791,0.6355045,0.462555,0.4744758,0.6529048
mse,0.285755,0.0219434,0.2647082,0.3006287,0.3133900,0.2352941,0.3147541
r2,0.1104166,0.1177524,0.2740894,0.2588292,-0.0354475,0.1927600,-0.1381482


Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error
,2020-03-26 11:20:19,17.294 sec,0.0,0.8333333,1.7917595,0.1602354
,2020-03-26 11:20:19,17.335 sec,5.0,0.3818884,0.5014432,0.1805101
,2020-03-26 11:20:19,17.375 sec,10.0,0.4296791,2.7392197,0.2132112
,2020-03-26 11:20:19,17.416 sec,15.0,0.4664383,5.8668515,0.2256377
,2020-03-26 11:20:19,17.454 sec,20.0,0.5926297,12.0445253,0.3512099
,2020-03-26 11:20:19,17.491 sec,25.0,0.5140282,9.1067663,0.2642250
,2020-03-26 11:20:19,17.524 sec,30.0,0.4965545,8.5161012,0.2465664
,2020-03-26 11:20:19,17.540 sec,32.0,0.5076266,8.9001164,0.2576848


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
sell_price,491.2980042,1.0,0.2060171
month_12,256.0709534,0.5212131,0.1073788
weekday_Friday,225.7178650,0.4594317,0.0946508
snap_CA,137.6281433,0.2801317,0.0577119
snap_WI,131.7222748,0.2681107,0.0552354
---,---,---,---
event_type_1_Religious,1.2266078,0.0024967,0.0005144
event_type_1_Cultural,0.2263158,0.0004606,0.0000949
event_type_1_Sporting,0.1348659,0.0002745,0.0000566



See the whole table with table.as_data_frame()




In [128]:
pred=aml.leader.predict(test)
pred

gbm prediction progress: |████████████████████████████████████████████████| 100%


predict,p0,p1,p2,p3,p4,p5
0,1,0,0,0,0,0
4,0,0,0,0,1,0
0,1,0,0,0,0,0
0,1,0,0,0,0,0
0,1,0,0,0,0,0
0,1,0,0,0,0,0
0,1,0,0,0,0,0
0,1,0,0,0,0,0
2,0,0,1,0,0,0
0,1,0,0,0,0,0




## Applying Model

In [4]:
#Loading Pickle
file = open('list_df_sell.pickle', 'rb')
list_df_sell = pickle.load(file)
file.close()

In [96]:
def preparedataset(i):
    Xy=list_df_sell[i].copy()
    
    #Stracting X_test
    X_test=Xy.drop(columns=['day_sell'])[1913:]
    
    #Remove Nones for all the sample  from X and y in the sell price
    Xy.sell_price=Xy.sell_price.astype(float)
    Xy=Xy[(Xy.sell_price>0) & (Xy.day_sell >=0)]
    
    #Prepare X_train and y_train
    X_train,y_train = Xy.drop(columns=['day_sell']), Xy.day_sell
    return X_train,y_train

In [143]:
#from sklearn.ensemble import RandomForestRegressor as RFR
#from sklearn.linear_model import ElasticNet as EN
#from sklearn.ensemble import GradientBoostingRegressor as GBR

def prediction(i, database_size=30490):
    
    #Xy_train = preparedataset(i)
    #X_train = Xy_train[0]
    #y_train = Xy_train[1]
    
    #Applying model

    y_pred=xgbmodel(i)


    #Write predictions in sumbit validation (first 28)
    df_sub.iloc[i,1:] = y_pred[:28]

    #Write predictions in submit validation (second 28) row + 30490
    df_sub.iloc[i+database_size,1:]=y_pred[28:]


In [None]:
#Copy to the sample (df_sam) = submit (df_sub)
df_sub=df_sam.copy()

#for i in tqdm(range(10)):
for i in tqdm(range(len(df_sat))):
    prediction(i)

HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))

100%|██████████| 5/5 [00:22<00:00,  4.46s/it, best loss: 0.7218927072926197]
100%|██████████| 5/5 [00:14<00:00,  2.84s/it, best loss: 0.7340460215522177]
100%|██████████| 5/5 [00:16<00:00,  3.31s/it, best loss: 0.743791348184722]
100%|██████████| 5/5 [00:14<00:00,  2.91s/it, best loss: 0.7113565223530882]
100%|██████████| 5/5 [00:07<00:00,  1.45s/it, best loss: 0.7166393367156354]
100%|██████████| 5/5 [00:15<00:00,  3.18s/it, best loss: 0.7214185974742348]
100%|██████████| 5/5 [00:16<00:00,  3.31s/it, best loss: 0.7207616867916945]
100%|██████████| 5/5 [00:07<00:00,  1.55s/it, best loss: 0.7375261307444854]
100%|██████████| 5/5 [00:08<00:00,  1.75s/it, best loss: 0.7096242063869472]
100%|██████████| 5/5 [00:15<00:00,  3.07s/it, best loss: 0.727752515088044] 
100%|██████████| 5/5 [01:47<00:00, 21.43s/it, best loss: 0.7106315747723608]
100%|██████████| 5/5 [00:17<00:00,  3.41s/it, best loss: 0.746550646666634] 
100%|██████████| 5/5 [00:13<00:00,  2.73s/it, best loss: 0.7334479266545756]


100%|██████████| 5/5 [00:08<00:00,  1.79s/it, best loss: 0.7275397655417688]
100%|██████████| 5/5 [00:04<00:00,  1.14it/s, best loss: 0.7182878632222962]
100%|██████████| 5/5 [00:14<00:00,  2.83s/it, best loss: 0.7282619708046174]
100%|██████████| 5/5 [00:11<00:00,  2.35s/it, best loss: 0.7103382931390395]
100%|██████████| 5/5 [00:14<00:00,  2.91s/it, best loss: 0.7212467640117082]
100%|██████████| 5/5 [00:12<00:00,  2.48s/it, best loss: 0.694003129205088] 
100%|██████████| 5/5 [00:12<00:00,  2.56s/it, best loss: 0.7320376553952468]
100%|██████████| 5/5 [00:08<00:00,  1.67s/it, best loss: 0.7266260829075266]
100%|██████████| 5/5 [00:13<00:00,  2.74s/it, best loss: 0.7414803659940502]
100%|██████████| 5/5 [00:09<00:00,  1.95s/it, best loss: 0.7266643065214146]
100%|██████████| 5/5 [00:08<00:00,  1.69s/it, best loss: 0.7267823809116732]
100%|██████████| 5/5 [00:12<00:00,  2.59s/it, best loss: 0.7403783212418082]
100%|██████████| 5/5 [00:15<00:00,  3.05s/it, best loss: 0.7323266075487735]

100%|██████████| 5/5 [00:05<00:00,  1.02s/it, best loss: 0.7084177503941711]
100%|██████████| 5/5 [00:07<00:00,  1.56s/it, best loss: 0.7215235114028374]
100%|██████████| 5/5 [00:17<00:00,  3.50s/it, best loss: 0.7332446114062335]
100%|██████████| 5/5 [00:09<00:00,  1.83s/it, best loss: 0.7297436837489365]
100%|██████████| 5/5 [00:12<00:00,  2.44s/it, best loss: 0.7135440383809222]
100%|██████████| 5/5 [00:14<00:00,  2.92s/it, best loss: 0.7162996722594076]
100%|██████████| 5/5 [00:12<00:00,  2.47s/it, best loss: 0.7185448078819516]
100%|██████████| 5/5 [00:12<00:00,  2.43s/it, best loss: 0.7448790966832042]
100%|██████████| 5/5 [00:12<00:00,  2.57s/it, best loss: 0.7127714182885775]
100%|██████████| 5/5 [00:12<00:00,  2.47s/it, best loss: 0.7184971523351087]
100%|██████████| 5/5 [00:12<00:00,  2.41s/it, best loss: 0.7200850821890901]
100%|██████████| 5/5 [00:11<00:00,  2.40s/it, best loss: 0.7498428863268112]
100%|██████████| 5/5 [00:12<00:00,  2.54s/it, best loss: 0.7499021727544897]

In [None]:
df_sub.to_csv('M5_AV_05_XGBoost_n5.csv', index=False)

## Test Zone

In [149]:
df_sub.iloc[30490:30505]

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
30490,HOBBIES_1_001_CA_1_evaluation,0.060448,0.710953,0.0,0.223788,0.228602,0.168158,0.214708,0.0,0.710953,...,0.747942,0.0,0.0,0.130675,0.381387,0.0,0.931677,1.003537,0.753696,2.09153
30491,HOBBIES_1_002_CA_1_evaluation,0.278447,0.325633,0.009176,0.367228,0.378045,0.432572,0.02126,0.01063,0.325633,...,0.567867,0.794892,0.420917,0.188256,0.244426,1.325752,0.16287,0.247418,0.006402,0.623214
30492,HOBBIES_1_003_CA_1_evaluation,0.454166,0.229238,0.252124,0.273253,0.193736,0.666428,0.016962,1.49898,0.229238,...,0.384402,1.239972,0.005978,0.289158,0.0,0.036501,0.018479,0.47369,0.520014,0.222852
30493,HOBBIES_1_004_CA_1_evaluation,2.097975,1.589313,0.978691,0.857186,1.933128,3.18536,3.249015,1.696931,1.589313,...,1.364504,0.672822,2.513978,1.846013,1.789156,1.431149,1.015532,1.690391,2.817038,1.784263
30494,HOBBIES_1_005_CA_1_evaluation,1.145534,1.551736,1.492959,1.898079,1.955451,4.176614,3.136689,1.315667,1.551736,...,1.098753,1.825846,2.090707,2.023895,0.905503,0.805829,0.606214,0.715246,1.065588,0.298136
30495,HOBBIES_1_006_CA_1_evaluation,0.708417,0.603343,0.930946,0.456842,0.699314,0.946712,0.506094,4.045746,0.603343,...,0.175462,0.151209,0.735043,1.678147,0.270828,0.0,0.057916,0.056985,0.0,0.0
30496,HOBBIES_1_007_CA_1_evaluation,0.267318,0.192824,0.299227,0.305762,0.864935,0.269041,0.406385,0.004599,0.192824,...,0.0,0.79884,0.576164,0.923692,0.940162,0.992437,0.470614,0.209512,0.315186,0.0
30497,HOBBIES_1_008_CA_1_evaluation,11.68504,6.305268,10.34616,7.839515,8.388208,9.848098,14.043884,6.004989,6.305268,...,5.934118,1.479713,2.478768,16.999746,6.482188,2.025238,2.881044,11.714595,6.625737,6.525025
30498,HOBBIES_1_009_CA_1_evaluation,2.852529,1.015794,2.048379,1.202897,1.091626,1.593028,1.620458,0.795144,1.015794,...,2.088094,2.788052,2.07183,2.115031,0.106516,0.469396,2.326511,3.42982,2.319166,2.301038
30499,HOBBIES_1_010_CA_1_evaluation,1.349583,0.715811,0.587149,0.481595,0.569269,0.510581,0.869681,1.02115,0.715811,...,0.705148,1.006472,0.743819,0.810768,0.131817,0.147788,0.505147,1.103369,1.006675,1.579789


### XGboost + HyperOpt

In [12]:
import xgboost as xgb
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from hyperopt.pyll import scope as ho_scope
from hyperopt.pyll.stochastic import sample as ho_sample

In [76]:
space={
    'n_estimators':hp.quniform('n_estimators', 10, 2000, 25),
    'learning_rate':hp.uniform('learning_rate', 0.00001, 1.0),
    'max_depth':hp.quniform('x_max_depth', 8, 32, 1),
    'min_child_weight':hp.quniform('x_min_child', 1, 10, 1),
    'subsample':hp.uniform('x_subsample', 0.7, 1),
    'gamma':hp.uniform('x_gamma', 0.1, 0.5),
    'reg_lambda':hp.uniform('x_reg_lambda', 0, 1)
}

In [77]:
def objetivo(space):
    
    modelo=xgb.XGBRegressor(
        n_estimators=int(space['n_estimators']),
        learning_rate=space['learning_rate'],
        max_depth=int(space['max_depth']),
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'],
        gamma=space['gamma'],
        reg_lambda=space['reg_lambda'],
        objective='reg:squarederror'
    )
    
    eval_set=[(X_train, y_train), (X_test, y_test)]
    modelo.fit(X_train, y_train, eval_set=eval_set, eval_metric='rmse', verbose=False)
    y_pred=modelo.predict(X_test)
    rmse=MSE(y_test, y_pred)**0.5
    
    return {'loss':rmse, 'status':STATUS_OK}

In [138]:
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE

def xgbmodel(i,ShowMSE=False, max_evals =5):
    trials_reg=Trials()
    
    Xy=list_df_sell[i].copy()

    #Stracting X_test
    X_test=Xy.drop(columns=['day_sell'])[1913:]

    #Remove Nones for all the sample  from X and y in the sell price
    Xy.sell_price=Xy.sell_price.astype(float)
    Xy=Xy[(Xy.sell_price>0) & (Xy.day_sell >=0)]

    #Prepare X_train and y_train
    X,y = Xy.drop(columns=['day_sell']), Xy.day_sell
    
    

    X_train, X_test, y_train, y_test = TTS(X,y, test_size = 0.2, shuffle=False)
    best=fmin(fn=objetivo, space=space, algo=tpe.suggest, max_evals=max_evals, trials=Trials())

    #Train with complete data set and founded hyperparameters

    modelo=xgb.XGBRegressor(
        n_estimators=int(best['n_estimators']),
        learning_rate=best['learning_rate'],
        x_max_depth=int(best['x_max_depth']),
        x_min_child=best['x_min_child'],
        x_subsample=best['x_subsample'],
        x_gamma=best['x_gamma'],
        x_reg_lambda=best['x_reg_lambda'],
        objective='reg:squarederror'
        )

    #Checking MSE
    
    if ShowMSE==True:
        modelo.fit(X_train, y_train)
        y_pred=modelo.predict(X_test)
        print(MSE(y_test, y_pred))
    else:
        pass
    
    
    #Defining new X_train and y_train to train with the all dataset
    Xy=list_df_sell[i].copy()

    #Stracting X_test
    X_test=Xy.drop(columns=['day_sell'])[1913:]
    X_test.sell_price=X_test.sell_price.astype(float)

    #Remove Nones for all the sample  from X and y in the sell price
    Xy.sell_price=Xy.sell_price.astype(float)
    Xy=Xy[(Xy.sell_price>0) & (Xy.day_sell >=0)]

    #Prepare X_train and y_train
    X,y = Xy.drop(columns=['day_sell']), Xy.day_sell
    
    
    #Final Train
    m=xgb.XGBRegressor()
    m.fit(X_train,y_train)
    y_pred=m.predict(X_test)
    
    y_pred=np.array(list((map(lambda x: 0 if x<0 else x,y_pred))))
    
    return y_pred

In [141]:
y_pred=xgbmodel(1,True)
y_pred

100%|██████████| 5/5 [00:09<00:00,  1.84s/it, best loss: 0.722018377467855]
0.5619736255642137


array([0.3056742 , 0.23767391, 0.13566807, 0.12867454, 0.22564512,
       0.18748155, 1.0384649 , 0.        , 0.07177058, 0.10478401,
       0.83349669, 0.40323573, 0.31879982, 1.36095858, 0.01037201,
       0.52990621, 1.70528877, 0.05309653, 0.01444811, 0.28841984,
       0.18280241, 0.27844653, 0.32563314, 0.00917578, 0.36722845,
       0.37804535, 0.4325718 , 0.02126017, 0.27844653, 0.32563314,
       0.00917578, 0.36722845, 0.37804535, 0.4325718 , 0.02126017,
       0.01063013, 0.32563314, 0.        , 0.12742424, 1.43955946,
       0.        , 0.12698987, 0.51505345, 0.4972491 , 0.00436535,
       0.0424206 , 0.56786722, 0.7948916 , 0.4209168 , 0.18825561,
       0.2444258 , 1.32575178, 0.16287038, 0.24741837, 0.00640163,
       0.62321407])