# Predict Future Sales - Baseline
## Final project for "How to win a data science competition" Coursera course
https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data  
>Student: Rafael Caneiro de Oliveira  
>Email: rafael.caneiro@gmail.com  
>Date: 04/08/2020

## Load the data

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path

# The project root is the parent of the directory the notebook runs from;
# the raw CSV files are read from <root>/data/raw.
notebook_dir = Path.cwd()
PATH = notebook_dir.parent
DATA_PATH = PATH.joinpath("data", "raw")

In [2]:
import h2o
# Connect this session to an H2O instance before any frame is imported
# (the logged output shows it checking http://localhost:54321).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,14 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.1
H2O_cluster_version_age:,13 days
H2O_cluster_name:,rco
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,5.325 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,12


In [8]:
def read_raw_csv(file_name):
    """Import one CSV from DATA_PATH into an H2OFrame, addressed by file URI."""
    csv_uri = Path(DATA_PATH, file_name).as_uri()
    return h2o.import_file(csv_uri)


# Raw tables, loaded in a fixed order so the parse logs stay comparable.
sales_train_df = read_raw_csv("sales_train.csv")
test_df = read_raw_csv("test.csv")
items_df = read_raw_csv("items.csv")
shops_df = read_raw_csv("shops.csv")
categories_df = read_raw_csv("item_categories.csv")

print(sales_train_df.shape)

# Each lookup table is left-joined onto the sales rows on its own key:
# every sales row is kept (all_x=True) and lookup rows without a matching
# sale are not added (all_y=False). The order matters only in that
# item_category_id arrives with the items table.
lookup_joins = (
    (items_df, "item_id"),
    (categories_df, "item_category_id"),
    (shops_df, "shop_id"),
)

train_df = sales_train_df
for lookup_df, join_key in lookup_joins:
    train_df = train_df.merge(
        lookup_df,
        all_x=True,
        all_y=False,
        by_x=[join_key],
        by_y=[join_key],
    )

print(train_df.shape)

train_df.head(5)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
(2935849, 6)
(2935849, 10)


shop_id,item_category_id,item_id,date,date_block_num,item_price,item_cnt_day,item_name,item_category_name,shop_name
0,0,16255,28.02.2013,1,93,1,Наушники PHILIPS SBC HC8680,PC - Гарнитуры/Наушники,"!Якутск Орджоникидзе, 56 фран"
0,1,5740,20.02.2013,1,283,1,Pelican. Геймпад Nerf Wireless Controller PS2 (assorted colors),Аксессуары - PS2,"!Якутск Орджоникидзе, 56 фран"
0,2,5570,28.02.2013,1,93,1,PS Move Controller (Контроллер движений),Аксессуары - PS3,"!Якутск Орджоникидзе, 56 фран"
0,2,5572,04.01.2013,0,1322,3,PS Move Motion Controller (Контроллер движений PS Move : CECH-ZCM1R BX: SCEE),Аксессуары - PS3,"!Якутск Орджоникидзе, 56 фран"
0,2,5572,05.01.2013,0,1322,2,PS Move Motion Controller (Контроллер движений PS Move : CECH-ZCM1R BX: SCEE),Аксессуары - PS3,"!Якутск Орджоникидзе, 56 фран"




In [9]:
# Collapse the daily sales rows into one row per (shop, item, month block),
# summing the units sold; the summed column comes back as "sum_item_cnt_day".
#
# The aggregation reads from `sales_train_df` instead of `train_df` because
# this cell reassigns `train_df`: grouping `train_df` itself made the cell
# fail on a second run, once `item_cnt_day` had been aggregated away.
# Only columns of the raw sales table are used, and the lookup joins that
# built `train_df` kept the row count unchanged, so the result is the same.
group_keys = ["shop_id", "item_id", "date_block_num"]

grouped = sales_train_df.group_by(group_keys)
grouped.sum("item_cnt_day")

train_df = grouped.get_frame()

train_df.head(5)

shop_id,item_id,date_block_num,sum_item_cnt_day
0,30,1,31
0,31,1,11
0,32,0,6
0,32,1,10
0,33,0,3
0,33,1,3
0,35,0,1
0,35,1,14
0,36,1,1
0,40,1,1




In [22]:
# Give every test row a month index so the test frame carries the same
# predictor columns as the training frame.
# NOTE(review): 34 is presumably the block right after the last training
# month - confirm against the data.
TEST_DATE_BLOCK_NUM = 34
test_df["date_block_num"] = TEST_DATE_BLOCK_NUM
test_df.head(5)

ID,shop_id,item_id,date_block_num
0,5,5037,34
1,5,5320,34
2,5,5233,34
3,5,5232,34
4,5,5268,34




## Model

In [18]:
from h2o.automl import H2OAutoML

# Target column (the monthly sum produced by the aggregation step) and the
# columns offered to the models as predictors.
y = "sum_item_cnt_day"
x = [
    "shop_id",
    "item_id",
    "date_block_num",
]

# Fixed seed plus a five-minute overall time budget for the AutoML search.
automl_settings = {"seed": 42, "max_runtime_secs": 300}
aml = H2OAutoML(**automl_settings)

In [19]:
# Run the AutoML search on the aggregated training frame.
aml.train(x=x, y=y, training_frame=train_df)

AutoML progress: |████████████████████████████████████████████████████████| 100%


## Evaluation

In [20]:
# Show the models AutoML produced together with their error metrics
# (the rendered table lists deviance, rmse, mse, mae and rmsle per model).
aml.leaderboard

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_AutoML_20200823_172157,34.3722,5.86278,34.3722,1.38908,
StackedEnsemble_BestOfFamily_AutoML_20200823_172157,35.5556,5.96285,35.5556,1.41175,
XGBoost_grid__1_AutoML_20200823_172157_model_1,40.9846,6.40192,40.9846,1.46532,
DRF_1_AutoML_20200823_172157,44.9271,6.70277,44.9271,1.48288,
XRT_1_AutoML_20200823_172157,51.3611,7.16666,51.3611,1.53262,
XGBoost_3_AutoML_20200823_172157,53.7454,7.33113,53.7454,1.52853,
XGBoost_1_AutoML_20200823_172157,54.1992,7.36201,54.1992,1.35269,
GBM_4_AutoML_20200823_172157,54.4278,7.37752,54.4278,1.6153,
GBM_grid__1_AutoML_20200823_172157_model_2,56.6017,7.52341,56.6017,1.50821,
GBM_3_AutoML_20200823_172157,57.4492,7.57952,57.4492,1.63904,




In [23]:
# Score the test rows using only the predictor columns the models were
# trained on; the result is a one-column frame named "predict".
test_features = test_df[x]
predictions = aml.predict(test_features)
predictions.head(5)

stackedensemble prediction progress: |████████████████████████████████████| 100%


predict
1.36885
1.26437
1.26437
1.26437
1.26437




In [24]:
# Pull the predictions into pandas and display the "predict" column.
predictions_pd = predictions.as_data_frame()
predictions_pd["predict"]

0         1.368850
1         1.264367
2         1.264367
3         1.264367
4         1.264367
            ...   
214195    0.957166
214196    1.159826
214197    1.097405
214198    0.791951
214199    0.625960
Name: predict, Length: 214200, dtype: float64

In [26]:
# Bounds applied to the rounded predictions before submission.
PREDICTION_FLOOR, PREDICTION_CAP = 0, 20

# Attach the prediction column to the test rows, then round it.
# NOTE(review): rounding before submitting may cost accuracy if the
# competition scores continuous values - confirm it is intended.
submission = test_df.concat(predictions)
submission["predict"] = submission["predict"].round()

# Clamp from above first, then from below.
above_cap = submission["predict"] > PREDICTION_CAP
submission[above_cap, "predict"] = PREDICTION_CAP

below_floor = submission["predict"] < PREDICTION_FLOOR
submission[below_floor, "predict"] = PREDICTION_FLOOR

submission.summary()

Unnamed: 0,ID,shop_id,item_id,date_block_num,predict
type,int,int,int,int,int
mins,0.0,2.0,30.0,34.0,0.0
mean,107099.5,31.64285714285714,11019.39862745098,34.0,1.664743230625584
maxs,214199.0,59.0,22167.0,34.0,20.0
sigma,61834.35816760776,17.56193348989602,6252.644589940324,0.0,1.6699202549371641
zeros,1,0,0,0,4961
missing,0,0,0,0,0
0,0.0,5.0,5037.0,34.0,1.0
1,1.0,5.0,5320.0,34.0,1.0
2,2.0,5.0,5233.0,34.0,1.0


In [27]:
# Rename the prediction column to the header used in the submission file.
# The call updates `submission` in place, which is why its return value is
# not needed before the column is selected below.
submission.rename({"predict": "item_cnt_month"})

# Write the file into the current working directory. The upload cell reads
# "submission.csv" relative to that same directory, so building the path
# from Path.cwd() (instead of hard-coding a "notebooks" folder under the
# project root) keeps the exported file and the uploaded file identical
# wherever the notebook is run from.
file = str(Path.cwd() / "submission.csv")
h2o.export_file(submission[["ID", "item_cnt_month"]],
                file,
                force=True)  # overwrite the file left by a previous run

Export File progress: |███████████████████████████████████████████████████| 100%


In [28]:
# Upload submission.csv from the current working directory with the Kaggle
# CLI; the -m text is the message attached to the submission.
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f submission.csv -m "h2o_automl"

100%|███████████████████████████████████████| 1.73M/1.73M [00:06<00:00, 280kB/s]
Successfully submitted to Predict Future Sales