# Predict Future Sale - LightGBM - Baseline
## Final project for "How to win a data science competition" Coursera course
https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data  
>Student: Rafael Caneiro de Oliveira  
>Email: rafael.caneiro@gmail.com  
>Date: 15/12/2020

## Load

In [59]:
import numpy as np
import pandas as pd
from itertools import product

from pathlib import Path
import matplotlib.pyplot as plt
%matplotlib inline

PATH = Path.cwd().parent
DATA_PATH = Path(PATH, "./data/raw/") 
MODEL_PATH = Path(PATH, "./models/") 

seed = 42

In [60]:
sales = pd.read_csv(Path(DATA_PATH,"sales_train.csv"))
items = pd.read_csv(Path(DATA_PATH,"items.csv"))
categories = pd.read_csv(Path(DATA_PATH,"item_categories.csv"))
shops = pd.read_csv(Path(DATA_PATH,"shops.csv"))
test = pd.read_csv(Path(DATA_PATH,"test.csv"))
test["date_block_num"]=34

## Aggregate

In [61]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#turn the grid into pandas dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

#get aggregated values for (shop_id, item_id, month)
gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':'sum'})

#fix column names
gb.rename({"item_cnt_day":"target"}, inplace=True, axis=1)

#join aggregated data to the grid
train = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

#sort the data
train.sort_values(['date_block_num','shop_id','item_id'],inplace=True)


## Join

In [62]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [63]:
train = train.merge(items, on="item_id", how="left")
train = train.merge(categories, on="item_category_id", how="left")
train = train.merge(shops, on="shop_id", how="left")
train.drop(columns=["item_name", "shop_name", "item_category_name"], inplace=True)
train = reduce_mem_usage(train)
train.sort_values("date_block_num", inplace=True)
train.head() 

Memory usage after optimization is: 156.12 MB
Decreased by 58.3%


Unnamed: 0,shop_id,item_id,date_block_num,target,item_category_id
0,0,19,0,0.0,40
243455,38,33,0,1.0,37
243454,38,32,0,4.0,40
243453,38,29,0,0.0,23
243452,38,28,0,0.0,30


In [64]:
test = test.merge(items, on="item_id", how="left")
test = test.merge(categories, on="item_category_id", how="left")
test = test.merge(shops, on="shop_id", how="left")
test.drop(columns=["item_name", "shop_name", "item_category_name"], inplace=True)
test = reduce_mem_usage(test)
test.head()

Memory usage after optimization is: 3.47 MB
Decreased by 64.6%


Unnamed: 0,ID,shop_id,item_id,date_block_num,item_category_id
0,0,5,5037,34,19
1,1,5,5320,34,55
2,2,5,5233,34,19
3,3,5,5232,34,23
4,4,5,5268,34,20


## Test / Validation Split

In [65]:
sales["date"] = pd.to_datetime(sales["date"], format="%d.%m.%Y")
sales["period"] = sales.date.dt.year.astype("str") + "#" + sales.date.dt.month.astype("str")
sales.groupby(["period", "date_block_num"])["item_id"].count()

period   date_block_num
2013#1   0                 115690
2013#10  9                  94202
2013#11  10                 96736
2013#12  11                143246
2013#2   1                 108613
2013#3   2                 121347
2013#4   3                  94109
2013#5   4                  91759
2013#6   5                 100403
2013#7   6                 100548
2013#8   7                 104772
2013#9   8                  96137
2014#1   12                 99349
2014#10  21                 79361
2014#11  22                 86428
2014#12  23                130786
2014#2   13                 89830
2014#3   14                 92733
2014#4   15                 77906
2014#5   16                 78529
2014#6   17                 82408
2014#7   18                 78760
2014#8   19                 86614
2014#9   20                 73157
2015#1   24                 88522
2015#10  33                 53514
2015#2   25                 71808
2015#3   26                 69977
2015#4   27             

In [82]:
from sklearn.model_selection import TimeSeriesSplit

def get_index(start, end, df, col):
    ls = [x for x in range(start , end + 1)]
    idx = df[df[col].isin(ls)].index.ravel()
    return idx

# 2013-jan >> 2013-oct
train_1 = get_index(0, 9, train, "date_block_num")

# 2013-nov
val_1 = get_index(10, 10, train, "date_block_num")

# 2014-jan >> 2014-oct
train_2 = get_index(12, 21, train, "date_block_num")

# 2014-nov
val_2 = get_index(22, 22, train, "date_block_num")

# 2015-jan >> 2015-sep
train_3 = get_index(24, 32, train, "date_block_num")

# 2015-oct
val_3 = get_index(33, 33, train, "date_block_num")

folds = [(train_1, val_1), (train_2, val_2), (train_3, val_3)]

## Features / Target

In [83]:
remove_cols = ["target", "item_name", "item_category_name", "shop_name"]
features = [col for col in train.columns if col not in remove_cols]
target="target"
print(features)
print(target)

['shop_id', 'item_id', 'date_block_num', 'item_category_id']
target


## Model

In [84]:
import lightgbm
print(lightgbm.__version__)

3.1.1


In [85]:
from sklearn.metrics import mean_squared_error, make_scorer

def cliped_rmse(y, p):
    return -mean_squared_error(y, np.clip(p, 0, 100), squared=False)

cliped_rmse_scorer = make_scorer(cliped_rmse)

In [78]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score

model = LGBMRegressor(learning_rate=0.01,
                      n_estimators=1000,
                      max_depth=10,
                      num_leaves=512,
                      subsample=0.8,
                      colsample_bytree=0.8,
                      random_state=seed)

-cross_val_score(estimator=model,
                 X=train[features],
                 y=train[target],
                 scoring="neg_root_mean_squared_error",
                 cv=folds).mean()

3.6663028931564328

In [86]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

model = XGBRegressor(random_state=seed)

-cross_val_score(estimator=model,
                 X=train[features],
                 y=train[target],
                 scoring="neg_root_mean_squared_error",
                 cv=folds).mean()

3.553106149037679

In [87]:
model.fit(X=train[features],
          y=train[target],)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='approx', validate_parameters=1, verbosity=None)

In [91]:
from joblib import dump, load

model_name = "XGBoost_Baseline"

if model is not None:
    dump(model, Path(MODEL_PATH, model_name + ".joblib"))
else:
    model = load(Path(MODEL_PATH, model_name + ".joblib"))
    

## Predictions

In [92]:
pred = model.predict(test[features])
pred = np.clip(pred, 0, 20)

## Submission

In [94]:
submission = pd.DataFrame({"ID":range(pred.shape[0]),
                           "item_cnt_month":pred})

sub_file_name = "submission_" + model_name + ".csv"
sub_file_path = Path(MODEL_PATH, sub_file_name)

submission[["ID", "item_cnt_month"]].to_csv(sub_file_path, index=False)

!kaggle competitions submit -c competitive-data-science-predict-future-sales -f $sub_file_path -m $model_name

100%|██████████████████████████████████████| 3.32M/3.32M [00:03<00:00, 1.05MB/s]
Successfully submitted to Predict Future Sales