# Predict Future Sales - Baseline
## Final project for "How to win a data science competition" Coursera course
https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data  
>Student: Rafael Caneiro de Oliveira  
>Email: rafael.caneiro@gmail.com  
>Date: 04/08/2020

## Load the data

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path

# Project root: the parent of the current working directory.
PATH = Path.cwd().parent
# Directory the raw CSV files are read from: <project root>/data/raw.
DATA_PATH = PATH / "data" / "raw"

In [2]:
# Read the raw tables from DATA_PATH into pandas DataFrames.
train_df = pd.read_csv(DATA_PATH / "sales_train.csv")
test_df = pd.read_csv(DATA_PATH / "test.csv")
items_df = pd.read_csv(DATA_PATH / "items.csv")

# # Merge item category data for train
# train_df = pd.merge(train_df,
#                     items_df[["item_id", "item_category_id"]],
#                     how="inner",
#                     on="item_id")

# # Merge item category data for test
# test_df = pd.merge(test_df,
#                    items_df[["item_id", "item_category_id"]],
#                    how="inner",
#                    on="item_id")

# Dates engineering on train data: parse the "dd.mm.yyyy" strings, then copy
# each calendar component of the parsed date into its own column.
train_df["date"] = pd.to_datetime(train_df["date"], format="%d.%m.%Y")
for part in ("quarter", "year", "month"):
    train_df[part] = getattr(train_df["date"].dt, part)

# Dates engineering on test data: every test row receives the same constant
# calendar features (presumably the forecast month, November 2015 — confirm
# against the competition description).
test_df["date_block_num"] = 34
# Month 11 belongs to calendar quarter 4, matching what `dt.quarter` yields for
# November dates in the train frame. This was previously hard-coded as 3,
# which disagreed with the month set below.
test_df["quarter"] = 4
test_df["year"] = 2015
test_df["month"] = 11


## Outliers
https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba

In [3]:
# Flag rows whose price or daily count lies above that column's 99th
# percentile. The flag column is created up front with 0.0 so that:
#   * it exists even when no row exceeds either bound,
#   * re-running the cell resets flags left over from a previous run,
#   * no chained `train_df.is_outlier.fillna(0, inplace=True)` is needed — that
#     pattern mutates a temporary Series and does not write back to the frame
#     under pandas copy-on-write.
# The column stays float (0.0 / 1.0), as it was with the NaN-then-fill approach.
cols = ["item_price", "item_cnt_day"]
train_df["is_outlier"] = 0.0
for col in cols:
    upperbound = np.percentile(train_df[col], 99)
    train_df.loc[train_df[col] > upperbound, "is_outlier"] = 1.0

## Train / Validation Datasets

In [5]:
import h2o
# Attach to the H2O cluster used for modelling (the captured output shows it
# connecting to an instance at http://localhost:54321).
h2o.init()
# h2o.shutdown()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,35 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.1
H2O_cluster_version_age:,19 days
H2O_cluster_name:,rco
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.103 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [36]:
def create_grouped(df, returned=False):
    """Sum ``item_cnt_day`` per (year, month, shop, item, date block).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``year``, ``month``, ``shop_id``,
        ``item_id``, ``date_block_num`` and ``item_cnt_day``.
    returned : bool, default False
        When False, rows with a negative ``item_cnt_day`` are dropped before
        summing (presumably these are product returns — confirm). When True,
        every row takes part in the sum.

    Returns
    -------
    pandas.DataFrame
        One row per distinct key combination; the keys are regular columns and
        ``item_cnt_day`` holds the summed count.
    """
    group_keys = ["year", "month", "shop_id", "item_id", "date_block_num"]
    # Only the row selection differs between the two modes.
    rows = df if returned else df[df.item_cnt_day >= 0]
    totals = rows.groupby(group_keys).agg({"item_cnt_day": "sum"})
    return totals.reset_index()

# Training frame: grouped totals for rows whose month is 1 through 9,
# converted to an H2OFrame.
is_jan_to_sep = train_df["month"].between(1, 9)
train_grouped = h2o.H2OFrame(create_grouped(train_df[is_jan_to_sep]))

# Fold id derived from the calendar year (2013 -> 0, 2014 -> 1, ...); the
# modelling cell passes this column as the cross-validation fold column.
train_grouped["fold"] = train_grouped["year"] - 2013

# valid_grouped = create_grouped(train_df[train_df.month==10])
# Hold-out frame: grouped totals for the rows whose month is 10.
is_october = train_df["month"] == 10
test_grouped = h2o.H2OFrame(create_grouped(train_df[is_october]))

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [44]:
# Row count per (year, month) in the training frame, first 50 groups.
(
    train_grouped
    .group_by(["year", "month"])
    .count()
    .get_frame()
    .head(50)
)

year,month,nrow
2013,1,63170
2013,2,59911
2013,3,63951
2013,4,54590
2013,5,53276
2013,6,56173
2013,7,58004
2013,8,57988
2013,9,51543
2014,1,53270




In [45]:
# Row count per (year, month) in the hold-out frame, first 50 groups.
(
    test_grouped
    .group_by(["year", "month"])
    .count()
    .get_frame()
    .head(50)
)

year,month,nrow
2013,10,51057
2014,10,42581
2015,10,31521




## Model

In [37]:
from h2o.automl import H2OAutoML

# Target column and the predictor columns handed to AutoML.
y = "item_cnt_day"
x = [
    "shop_id",
    "item_id",
    "date_block_num",
    "year",
    "month",
]

# AutoML run with a fixed seed and a 300-second runtime cap. The training log
# in this notebook reports that `nfolds` is ignored when a fold column is
# passed to `train`, so it only matters if that column is dropped.
aml = H2OAutoML(
    seed=42,
    max_runtime_secs=300,
    nfolds=5,
)

In [38]:
# Fit AutoML with the "fold" column driving cross-validation, and score the
# leaderboard on the separate hold-out frame.
aml.train(
    x=x,
    y=y,
    training_frame=train_grouped,
    fold_column="fold",
    leaderboard_frame=test_grouped,
)

aml.leaderboard.head()
# Experiment log (presumably the leaderboard RMSE of the best model per setup):
#  8.97 >> train Jan-Sep / leaderboard Oct / fold_column = year
# 13.62 >> train full -> leaderboard Oct/15
# 12.78 >> train Jan-Sep / leaderboard Oct/15 / auto fold
# 13.xx >> train Jan-Sep / leaderboard Oct/15 / fold_column = year

AutoML progress: |
17:43:45.928: Fold column fold will be used for cross-validation. nfolds parameter will be ignored.

████████████████████████████████████████████████████████| 100%


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
GBM_grid__1_AutoML_20200829_174345_model_1,80.5604,8.97554,80.5604,1.55121,
StackedEnsemble_BestOfFamily_AutoML_20200829_174345,84.8061,9.20902,84.8061,1.63311,
StackedEnsemble_AllModels_AutoML_20200829_174345,85.1202,9.22606,85.1202,1.66971,
XGBoost_grid__1_AutoML_20200829_174345_model_1,86.2967,9.2896,86.2967,1.71867,
XGBoost_1_AutoML_20200829_174345,95.1731,9.75567,95.1731,1.6164,
GBM_4_AutoML_20200829_174345,96.2128,9.80881,96.2128,1.77847,
GBM_3_AutoML_20200829_174345,96.8074,9.83908,96.8074,1.77998,0.521814
GBM_2_AutoML_20200829_174345,98.9471,9.94722,98.9471,1.78893,0.525857
XGBoost_3_AutoML_20200829_174345,100.195,10.0098,100.195,1.82486,
XGBoost_2_AutoML_20200829_174345,101.864,10.0928,101.864,1.44475,




## Submission

In [23]:
# `submission` is not assigned in any cell shown in this notebook, so on a
# fresh kernel (Restart & Run All) this cell failed with NameError. Build it
# here from the AutoML leader's predictions on the test rows.
# NOTE(review): the code that produced the earlier upload is not shown —
# confirm that scoring with `aml.leader` is the intended baseline.
# NOTE(review): assumes test.csv provides the `ID` column — verify.
test_frame = h2o.H2OFrame(test_df[x])
predictions = aml.leader.predict(test_frame).as_data_frame()

submission = test_df[["ID"]].copy()
# Assigned positionally: the prediction frame has one row per test row, in the
# same order, but carries its own index.
# NOTE(review): the competition reportedly evaluates against targets clipped to
# [0, 20]; consider clipping the predictions before export — confirm first.
submission["item_cnt_month"] = predictions["predict"].to_numpy()

submission[["ID", "item_cnt_month"]].to_csv("submission.csv",
                                          index=False)

In [24]:
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f submission.csv -m "Baseline3"

100%|███████████████████████████████████████| 2.14M/2.14M [00:03<00:00, 678kB/s]
Successfully submitted to Predict Future Sales