# Predict Future Sale - Baseline
## Final project for "How to win a data science competition" Coursera course
https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data  
>Student: Rafael Caneiro de Oliveira  
>Email: rafael.caneiro@gmail.com  
>Date: 04/08/2020

## Load the data

In [51]:
import numpy as np
import pandas as pd

from pathlib import Path

PATH = Path.cwd().parent
DATA_PATH = Path(PATH, "./data/raw/") 

In [52]:
# Load the data
train_df = pd.read_csv(Path(DATA_PATH,"sales_train.csv"))
test_df = pd.read_csv(Path(DATA_PATH,"test.csv"))
items_df = pd.read_csv(Path(DATA_PATH,"items.csv"))

# # Merge item category data for train
# train_df = pd.merge(train_df,
#                     items_df[["item_id", "item_category_id"]],
#                     how="inner",
#                     on="item_id")

# # Merge item category data for test
# test_df = pd.merge(test_df,
#                    items_df[["item_id", "item_category_id"]],
#                    how="inner",
#                    on="item_id")

# Dates engineering on train data
train_df["date"] = pd.to_datetime(train_df["date"], format="%d.%m.%Y")
train_df["quarter"] = train_df.date.dt.quarter
train_df["year"] = train_df.date.dt.year
train_df["month"] = train_df.date.dt.month

# Dates engineering on test data
test_df["date_block_num"] = 34
test_df["quarter"] = 3
test_df["year"] = 2015
test_df["month"] = 11


## Outliers
https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba

In [53]:
cols = ["item_price", "item_cnt_day"]
for col in cols:
    upperbound = np.percentile(train_df[col], 99)
    train_df.loc[train_df[col] > upperbound, "is_outlier"] = 1
    
train_df.is_outlier.fillna(0, inplace=True)

## Train / Validation Datasets

In [54]:
import h2o
h2o.init()
# h2o.shutdown()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,16 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.1
H2O_cluster_version_age:,"21 days, 9 hours and 40 minutes"
H2O_cluster_name:,rco
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.103 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [55]:
def create_grouped(df, returned=False):
    if returned:
        df_grouped = df \
            .groupby(["year", "month", "shop_id", 
                      "item_id", "date_block_num"]) \
            .agg({"item_cnt_day":"sum"}) \
            .reset_index()        
    else:
        df_grouped = df[(df.item_cnt_day>=0)] \
            .groupby(["year", "month", "shop_id", 
                      "item_id", "date_block_num"]) \
            .agg({"item_cnt_day":"sum"}) \
            .reset_index()
    return df_grouped

train_grouped = h2o.H2OFrame(create_grouped(train_df[(train_df.month.between(1,11)) &
                                                     (train_df.date_block_num<=33)]))
train_grouped["fold"] = train_grouped["year"] - 2013

# valid_grouped = h2o.H2OFrame(create_grouped(train_df[(train_df.month==11) &
#                                         (train_df.year.between(2013, 2014))]))

# valid_grouped["fold"] = valid_grouped["year"] - 2013


# test_grouped = h2o.H2OFrame(create_grouped(train_df[(train_df.month==11) &
#                                                     (train_df.year==2015)]))

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [63]:
htrain = h2o.H2OFrame(train_df[train_df.date_block_num<=33])

Parse progress: |█████████████████████████████████████████████████████████| 100%


## Model

In [70]:
from h2o.automl import H2OAutoML

# Define label and predictors
y = "item_cnt_day"
x = ["shop_id", "item_id", "date_block_num", "year", "month"]

aml = H2OAutoML(seed=42, max_runtime_secs=3600, max_models=50, nfolds=5)

In [71]:
aml.train(x=x, y=y,
          training_frame=htrain)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [65]:
aml.leaderboard.head()
#  8.97 >> train jan-set / leaderboard out / fold_column = year
# 13.62 >> train full -> leaderboard out/15
# 12.78 >> train jan-set / leaderboard out/15 / auto fold
# 13.xx >> train jan-set / leaderboard out/15 / fold_column = year

AutoML progress: |████████████████████████████████████████████████████████| 100%


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_AutoML_20200901_122851,5.8459,2.41783,5.8459,0.320701,
StackedEnsemble_BestOfFamily_AutoML_20200901_122851,5.88053,2.42498,5.88053,0.332113,
XGBoost_grid__1_AutoML_20200901_122851_model_1,5.88481,2.42586,5.88481,0.327396,
XGBoost_2_AutoML_20200901_122851,6.22743,2.49548,6.22743,0.419702,
XGBoost_1_AutoML_20200901_122851,6.2587,2.50174,6.2587,0.31697,
DRF_1_AutoML_20200901_122851,6.43823,2.53737,6.43823,0.397386,
GBM_2_AutoML_20200901_122851,6.45538,2.54074,6.45538,0.398299,
XGBoost_3_AutoML_20200901_122851,6.45959,2.54157,6.45959,0.30191,
GBM_4_AutoML_20200901_122851,6.4619,2.54203,6.4619,0.408307,
GBM_1_AutoML_20200901_122851,6.47583,2.54476,6.47583,0.402129,




## Prediction

In [66]:
htest = h2o.H2OFrame(test_df)
predictions = aml.predict(htest[x])
predictions[predictions["predict"]>20, "predict"] = 20
predictions[predictions["predict"]<0, "predict"] = 0

Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


## Submission

In [67]:
submission = htest.cbind(predictions).as_data_frame()
submission.rename({"predict":"item_cnt_month"}, inplace=True, axis=1)

In [68]:
submission[["ID", "item_cnt_month"]].to_csv("submission.csv", index=False)

In [69]:
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f submission.csv -m "h2o_auto_ml_base"

100%|██████████████████████████████████████| 5.16M/5.16M [00:03<00:00, 1.71MB/s]
Successfully submitted to Predict Future Sales