# Predict Future Sales - Baseline
## Final project for "How to win a data science competition" Coursera course
https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data  
>Student: Rafael Caneiro de Oliveira  
>Email: rafael.caneiro@gmail.com  
>Date: 04/08/2020

## Load

In [1]:
import numpy as np
import pandas as pd
from itertools import product

from pathlib import Path

# Project root is assumed to be the parent of the directory the notebook runs in.
PATH = Path.cwd().parent
# Directory holding the raw competition CSV files.
DATA_PATH = PATH / "./data/raw/"

In [2]:
# Read every raw competition table from DATA_PATH in one pass.
sales, items, categories, shops, test = (
    pd.read_csv(DATA_PATH / file_name)
    for file_name in (
        "sales_train.csv",
        "items.csv",
        "item_categories.csv",
        "shops.csv",
        "test.csv",
    )
)

# Tag the test rows with month index 34 so they carry the same key column as the training grid.
test = test.assign(date_block_num=34)

## Aggregate

In [3]:
# Key columns identifying one (shop, item, month) observation.
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month, build the cartesian product of the shops and the items
# that appear in that month's sales.
monthly_grids = []
for block_num in sales['date_block_num'].unique():
    month_sales = sales[sales['date_block_num'] == block_num]
    shop_ids = month_sales['shop_id'].unique()
    item_ids = month_sales['item_id'].unique()
    combos = list(product(shop_ids, item_ids, [block_num]))
    monthly_grids.append(np.array(combos, dtype='int32'))

# Stack the per-month grids into a single frame keyed by index_cols.
grid = pd.DataFrame(np.vstack(monthly_grids), columns=index_cols, dtype=np.int32)

# Sum the daily counts per (shop_id, item_id, month) and call the result "target".
gb = (
    sales
    .groupby(index_cols, as_index=False)
    .agg({'item_cnt_day': 'sum'})
    .rename(columns={"item_cnt_day": "target"})
)

# Left-join the totals onto the grid; combinations without any sales row get
# a target of 0. Finally order the rows by month, shop and item.
train = (
    pd.merge(grid, gb, how='left', on=index_cols)
    .fillna(0)
    .sort_values(['date_block_num', 'shop_id', 'item_id'])
)

## Join

In [4]:
# Attach item, category and shop attributes to every training row.
# Left joins keep all rows of `train`.
train = (
    train
    .merge(items, on="item_id", how="left")
    .merge(categories, on="item_category_id", how="left")
    .merge(shops, on="shop_id", how="left")
)
train.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,item_name,item_category_id,item_category_name,shop_name
0,0,19,0,0.0,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран"
1,0,27,0,0.0,"007 Legends [PS3, русская версия]",19,Игры - PS3,"!Якутск Орджоникидзе, 56 фран"
2,0,28,0,0.0,"007 Legends [PС, Jewel, русская версия]",30,Игры PC - Стандартные издания,"!Якутск Орджоникидзе, 56 фран"
3,0,29,0,0.0,"007 Legends [Xbox 360, русская версия]",23,Игры - XBOX 360,"!Якутск Орджоникидзе, 56 фран"
4,0,32,0,6.0,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран"


In [5]:
# Attach the same item, category and shop attributes to the test rows.
# Left joins keep all rows of `test`.
test = (
    test
    .merge(items, on="item_id", how="left")
    .merge(categories, on="item_category_id", how="left")
    .merge(shops, on="shop_id", how="left")
)
test.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_name,item_category_id,item_category_name,shop_name
0,0,5,5037,34,"NHL 15 [PS3, русские субтитры]",19,Игры - PS3,"Вологда ТРЦ ""Мармелад"""
1,1,5,5320,34,ONE DIRECTION Made In The A.M.,55,Музыка - CD локального производства,"Вологда ТРЦ ""Мармелад"""
2,2,5,5233,34,"Need for Speed Rivals (Essentials) [PS3, русск...",19,Игры - PS3,"Вологда ТРЦ ""Мармелад"""
3,3,5,5232,34,"Need for Speed Rivals (Classics) [Xbox 360, ру...",23,Игры - XBOX 360,"Вологда ТРЦ ""Мармелад"""
4,4,5,5268,34,"Need for Speed [PS4, русская версия]",20,Игры - PS4,"Вологда ТРЦ ""Мармелад"""


## Train / Validation Split

In [6]:
def get_index(start, end, df, col):
    """Return the index labels of the rows whose ``col`` value is in ``[start, end]``.

    Parameters
    ----------
    start, end : int
        Inclusive bounds of the integer range to select.
    df : pandas.DataFrame
        Frame to select rows from.
    col : str
        Name of the column compared against the range.

    Returns
    -------
    numpy.ndarray
        Index labels of the matching rows, in their order of appearance in
        ``df``. These are labels, not positions: the two coincide only when
        ``df`` has a default ``RangeIndex``.
    """
    wanted = list(range(start, end + 1))
    mask = df[col].isin(wanted).to_numpy()
    # `.to_numpy()` always yields an ndarray; `Index.ravel()` returns an Index
    # on recent pandas versions.
    return df.index[mask].to_numpy()

# Three time-based folds. Each one trains on the months of a year that come
# strictly before its validation month, so no validation row is ever part of
# the matching training set.

# 2013-jan >> 2013-oct
train_1 = get_index(0, 9, train, "date_block_num")

# 2013-nov
val_1 = get_index(10, 10, train, "date_block_num")

# 2014-jan >> 2014-oct
train_2 = get_index(12, 21, train, "date_block_num")

# 2014-nov
val_2 = get_index(22, 22, train, "date_block_num")

# 2015-jan >> 2015-sep
train_3 = get_index(24, 32, train, "date_block_num")

# 2015-oct
val_3 = get_index(33, 33, train, "date_block_num")

# (train indices, validation indices) pairs, passed as `cv` to the search below.
folds = [(train_1, val_1), (train_2, val_2), (train_3, val_3)]

## Features / Target

In [7]:
# Name of the column the model is trained to predict.
target = "target"

# Columns that must not be fed to the model: the target itself and the
# free-text name columns.
remove_cols = [target, "item_name", "item_category_name", "shop_name"]

# Every remaining column of `train`, in its original order, is a feature.
features = list(train.columns[~train.columns.isin(remove_cols)])

## Model

In [8]:
from xgboost import XGBRegressor

# Baseline estimator with a fixed seed; every other hyperparameter is left at
# the library default and tuned by the search further down.
xgb = XGBRegressor(random_state=42)

## Randomized Hyperparameter Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each XGBoost hyperparameter, on a roughly logarithmic scale.
parameters = {"learning_rate":[0.01, 0.03, 0.1, 0.3, 1],
              "min_child_weight":[1, 3, 10, 30, 100],
              "max_depth":[1, 3, 10, 30, 100],
              "gamma":[0, 0.03, 0.1, 0.3, 1],
              "subsample":[0.01, 0.03, 0.1, 0.3, 1],
              "colsample_bytree":[0.01, 0.03, 0.1, 0.3, 1],
              "n_estimators":[10, 30, 100, 300, 1000]}

# Evaluate 10 randomly sampled candidates on the predefined time-based folds.
# `random_state` pins which candidates are drawn, so a Restart & Run All
# evaluates the same ones every time.
clf = RandomizedSearchCV(estimator=xgb,
                         param_distributions=parameters,
                         n_iter=10,
                         scoring="neg_mean_squared_error",
                         cv=folds,
                         verbose=2,
                         n_jobs=16,
                         pre_dispatch="2*n_jobs",
                         random_state=42)

clf.fit(X=train[features], y=train[target])

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  15 out of  30 | elapsed:  2.7min remaining:  2.7min


## Predictions

In [None]:
# Score the test rows with the fitted search object, using the same feature
# columns the model was trained on. With scikit-learn's default `refit=True`,
# `clf.predict` delegates to the best estimator found.
pred = clf.predict(test[features])

# Bound the predictions to [0, 20] — presumably the target range the
# competition is scored on; confirm against the competition rules.
pred = np.clip(pred, 0, 20)

## Submission

In [None]:
# Pair each prediction with the ID of the test row it was computed from.
# `pred` was produced from `test` row by row, so taking the IDs from `test`
# keeps the two aligned even if the test rows are not numbered 0..n-1 in order.
submission = pd.DataFrame({"ID": test["ID"].to_numpy(),
                           "item_cnt_month": pred})
submission.head()

In [None]:
# Write the two submission columns to disk, without the DataFrame index.
submission.to_csv("submission.csv",
                  columns=["ID", "item_cnt_month"],
                  index=False)

In [None]:
# Upload submission.csv to the competition through the Kaggle CLI; -m sets the submission message.
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f submission.csv -m "Model_Baseline"