## Imports


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error, median_absolute_error, r2_score, explained_variance_score

In [None]:
# Training data (train.csv), fetched from the shared Google Drive link.
TRAIN_CSV_URL = "https://drive.google.com/uc?id=1h-vNpZ0RJnsuLxCTs0qc80jwGNrGeX49"
train_df = pd.read_csv(TRAIN_CSV_URL)
train_df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [None]:
# Test data (test.csv), fetched from the shared Google Drive link.
TEST_CSV_URL = "https://drive.google.com/uc?id=1i01pfTSwINi_SBEZhZCG_oZ8C2s6zqWa"
test_df = pd.read_csv(TEST_CSV_URL)
test_df.head()

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1


In [None]:
# Submission template (sample_submission.csv), fetched from the shared Google Drive link.
SAMPLE_SUBMISSION_CSV_URL = "https://drive.google.com/uc?id=1VUhb4Jv2BbeE3jLef3vuTMhWDnf3z_JD"
sample_df = pd.read_csv(SAMPLE_SUBMISSION_CSV_URL)
sample_df.head()

Unnamed: 0,id,sales
0,0,52
1,1,52
2,2,52
3,3,52
4,4,52


## Exploration

In [None]:
# Number of training rows per store.
train_df['store'].value_counts()

1     91300
2     91300
3     91300
4     91300
5     91300
6     91300
7     91300
8     91300
9     91300
10    91300
Name: store, dtype: int64

In [None]:
# Column names, dtypes and non-null counts of the training frame.
# NOTE(review): the recorded output lists year/month/day and no `date` column; those
# columns are only created in the Data Preparation section further down, so this cell
# appears to have been executed out of order. Re-run top to bottom to refresh it.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   store   913000 non-null  int64
 1   item    913000 non-null  int64
 2   sales   913000 non-null  int64
 3   year    913000 non-null  int64
 4   month   913000 non-null  int64
 5   day     913000 non-null  int64
dtypes: int64(6)
memory usage: 41.8 MB


In [None]:
# Total number of missing cells across the whole training frame.
train_df.isna().sum().sum()

0

In [None]:
# Number of test rows per store.
test_df['store'].value_counts()

1     4500
2     4500
3     4500
4     4500
5     4500
6     4500
7     4500
8     4500
9     4500
10    4500
Name: store, dtype: int64

In [None]:
# Total number of missing cells across the whole test frame.
test_df.isna().sum().sum()

0

## Data Preparation

In [None]:
# Training data: replace the raw date with calendar features for forecasting.
# This cell mutates train_df in place, so it can only be run once per load.

import datetime as dt

order_dates = pd.to_datetime(train_df['date'])
train_df['year'] = order_dates.dt.year
train_df['month'] = order_dates.dt.month
# `day` holds the day of the week (Monday=0 ... Sunday=6), not the day of the month.
train_df['day'] = order_dates.dt.dayofweek

del train_df['date']
train_df.head()

Unnamed: 0,store,item,sales,year,month,day
0,1,1,13,2013,1,1
1,1,1,11,2013,1,2
2,1,1,14,2013,1,3
3,1,1,13,2013,1,4
4,1,1,10,2013,1,5


In [None]:
# Number of training rows per calendar year.
train_df['year'].value_counts()

2016    183000
2013    182500
2014    182500
2015    182500
2017    182500
Name: year, dtype: int64

In [None]:
# Number of training rows per month of the year.
train_df['month'].value_counts()

1     77500
3     77500
5     77500
7     77500
8     77500
10    77500
12    77500
4     75000
6     75000
9     75000
11    75000
2     70500
Name: month, dtype: int64

In [None]:
# Number of training rows per day of the week (0 = Monday).
train_df['day'].value_counts()

1    130500
2    130500
3    130500
4    130500
5    130500
6    130500
0    130000
Name: day, dtype: int64

In [None]:
# Number of training rows per item, ordered by item id.
train_df['item'].value_counts().sort_index()

1     18260
2     18260
3     18260
4     18260
5     18260
6     18260
7     18260
8     18260
9     18260
10    18260
11    18260
12    18260
13    18260
14    18260
15    18260
16    18260
17    18260
18    18260
19    18260
20    18260
21    18260
22    18260
23    18260
24    18260
25    18260
26    18260
27    18260
28    18260
29    18260
30    18260
31    18260
32    18260
33    18260
34    18260
35    18260
36    18260
37    18260
38    18260
39    18260
40    18260
41    18260
42    18260
43    18260
44    18260
45    18260
46    18260
47    18260
48    18260
49    18260
50    18260
Name: item, dtype: int64

In [None]:
# Frequency of each distinct daily sales value, ordered by sales value.
train_df['sales'].value_counts().sort_index()

0        1
1        4
2       25
3       91
4      215
      ... 
208      1
209      1
210      1
214      1
231      1
Name: sales, Length: 213, dtype: int64

In [None]:
# Separate the regression target from the features.
# `pop` removes the 'sales' column from train_df in place, so re-running this cell
# without reloading train_df raises a KeyError.
sales = train_df.pop('sales') # extract sales col for target class
sales.head()

0    13
1    11
2    14
3    13
4    10
Name: sales, dtype: int64

In [None]:
# Test data: replace the raw date with the same calendar features as the training set.
# This cell mutates test_df in place, so it can only be run once per load.
forecast_dates = pd.to_datetime(test_df['date'])

test_df['year'] = forecast_dates.dt.year
test_df['month'] = forecast_dates.dt.month
# `day` holds the day of the week (Monday=0 ... Sunday=6), not the day of the month.
test_df['day'] = forecast_dates.dt.dayofweek

del test_df['date']
test_df.head()

Unnamed: 0,id,store,item,year,month,day
0,0,1,1,2018,1,0
1,1,1,1,2018,1,1
2,2,1,1,2018,1,2
3,3,1,1,2018,1,3
4,4,1,1,2018,1,4


In [None]:
# The row id is not a feature: remove it so the test columns match the training columns.
test_df.drop(columns=['id'], inplace=True)

In [None]:
# Number of test rows per calendar year.
test_df['year'].value_counts()

2018    45000
Name: year, dtype: int64

In [None]:
# Number of test rows per item, ordered by item id.
test_df['item'].value_counts().sort_index()

1     900
2     900
3     900
4     900
5     900
6     900
7     900
8     900
9     900
10    900
11    900
12    900
13    900
14    900
15    900
16    900
17    900
18    900
19    900
20    900
21    900
22    900
23    900
24    900
25    900
26    900
27    900
28    900
29    900
30    900
31    900
32    900
33    900
34    900
35    900
36    900
37    900
38    900
39    900
40    900
41    900
42    900
43    900
44    900
45    900
46    900
47    900
48    900
49    900
50    900
Name: item, dtype: int64

# Baseline ML Models

In [None]:
def _with_date_features(frame):
    """Return a copy of `frame` whose `date` column is replaced by calendar features.

    Adds `year`, `month` and `day` (day of the week, Monday=0) after the existing
    columns and drops `date`. The input frame is not modified.
    """
    dates = pd.to_datetime(frame['date'])
    return (
        frame
        .assign(year=dates.dt.year, month=dates.dt.month, day=dates.dt.dayofweek)
        .drop(columns='date')
    )


# Reload the raw training data: the frame loaded earlier was mutated during exploration.
train_df = pd.read_csv("https://drive.google.com/uc?id=1h-vNpZ0RJnsuLxCTs0qc80jwGNrGeX49")

# Hold out the first three months of 2017 for validation; every other row is training data.
# A boolean mask and its negation give the two disjoint parts directly, and
# `_with_date_features` returns new frames, so no slice of train_df is mutated.
# NOTE(review): the training part still contains rows dated after the validation
# window (April-December 2017), so validation scores are not a strict out-of-time estimate.
order_dates = pd.to_datetime(train_df['date'])
in_val_window = (order_dates >= "2017-01-01") & (order_dates < "2017-04-01")

# x labels
val = _with_date_features(train_df.loc[in_val_window])
train = _with_date_features(train_df.loc[~in_val_window])

# y labels
sales_train = train.pop('sales') # extract sales col for target class
sales_val = val.pop('sales') # extract sales col for target class

# test df: same features as train/val, without the submission id
test_df = pd.read_csv("https://drive.google.com/uc?id=1i01pfTSwINi_SBEZhZCG_oZ8C2s6zqWa")
test_df = _with_date_features(test_df).drop(columns='id')

In [None]:
# Shapes of the training, validation and test feature matrices, in that order.
print(*(frame.shape for frame in (train, val, test_df)))

(868000, 5) (45000, 5) (45000, 5)


In [None]:
def regression_report(y_true, y_pred):
    """Summarise regression quality as a two-column table.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same length as `y_true`.

    Returns
    -------
    pandas.DataFrame
        One row per metric with columns 'Metric Name' and 'Score' (rounded to
        3 decimals): SMAPE (in percent), MAE, median absolute error, mean
        squared error, max error, R2 and explained variance.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # SMAPE term is 2*|error| / (|pred| + |true|). When both values are 0 the
    # forecast is exact, so that term counts as 0 instead of the NaN that 0/0
    # would produce and spread to the whole score.
    denominator = np.abs(y_pred) + np.abs(y_true)
    smape_terms = np.divide(
        2 * np.abs(y_true - y_pred),
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    smape = 100 / len(y_pred) * np.sum(smape_terms)

    metrics = [('SMAPE', smape),
               ('MAE', mean_absolute_error(y_true, y_pred)),
               ('Median Absolute Error', median_absolute_error(y_true, y_pred)),
               ('Mean Squared Error', mean_squared_error(y_true, y_pred)),
               ('Max Error', max_error(y_true, y_pred)),
               ('R2 Score', r2_score(y_true, y_pred)),
               ('Explained Variance Score', explained_variance_score(y_true, y_pred))]

    # Build the table in one step: DataFrame.append was removed in pandas 2.0.
    regression_report_df = pd.DataFrame(metrics, columns=['Metric Name', 'Score'])
    regression_report_df['Score'] = regression_report_df['Score'].round(3)

    return regression_report_df

## Catboost

In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error as mse

# Gradient-boosted trees trained on RMSE with a fixed seed.
# NOTE(review): the validation split drives early stopping here and is also the
# split scored in the report below, so those metrics may be somewhat optimistic.
model = CatBoostRegressor(
    loss_function='RMSE',
    n_estimators=200,
    eta=0.3,
    max_depth=10,
    random_seed=1,
)
model.fit(
    train.values,
    sales_train.values,
    eval_set=(val.values, sales_val.values),
    early_stopping_rounds=20,
    verbose=True,
)

0:	learn: 25.5508294	test: 22.0294744	best: 22.0294744 (0)	total: 147ms	remaining: 29.3s
1:	learn: 23.5569327	test: 20.0034748	best: 20.0034748 (1)	total: 307ms	remaining: 30.4s
2:	learn: 21.9270142	test: 18.7698309	best: 18.7698309 (2)	total: 446ms	remaining: 29.3s
3:	learn: 20.8556084	test: 17.8652701	best: 17.8652701 (3)	total: 586ms	remaining: 28.7s
4:	learn: 19.8948580	test: 17.1182411	best: 17.1182411 (4)	total: 744ms	remaining: 29s
5:	learn: 18.9128838	test: 16.1631390	best: 16.1631390 (5)	total: 875ms	remaining: 28.3s
6:	learn: 18.1926212	test: 15.4094859	best: 15.4094859 (6)	total: 1.02s	remaining: 28.1s
7:	learn: 17.6572749	test: 14.9678126	best: 14.9678126 (7)	total: 1.17s	remaining: 28s
8:	learn: 16.4071182	test: 13.7564503	best: 13.7564503 (8)	total: 1.32s	remaining: 28.1s
9:	learn: 15.3101600	test: 12.6974150	best: 12.6974150 (9)	total: 1.46s	remaining: 27.7s
10:	learn: 14.5357008	test: 11.9557628	best: 11.9557628 (10)	total: 1.6s	remaining: 27.5s
11:	learn: 14.0640542	te

<catboost.core.CatBoostRegressor at 0x7f67c0dbfe10>

In [None]:
# Score the fitted CatBoost model on the held-out validation months.
y_pred = model.predict(val.values)
regression_report(y_true=sales_val.values, y_pred=y_pred)

Unnamed: 0,Metric Name,Score
0,SMAPE,13.471
1,MAE,5.309
2,Median Absolute Error,4.244
3,Mean Squared Error,47.227
4,Max Error,36.377
5,R2 Score,0.917
6,Explained Variance Score,0.917


In [None]:
# Predict the real test set and write the CatBoost submission file.
y_pred = model.predict(test_df.values)

# Start from the submission template so the id column and layout are preserved.
sample_df = pd.read_csv("https://drive.google.com/uc?id=1VUhb4Jv2BbeE3jLef3vuTMhWDnf3z_JD").assign(sales=y_pred)
sample_df.to_csv("submission_cb.csv", index=False)
sample_df.head()

Unnamed: 0,id,sales
0,0,12.024849
1,1,13.822182
2,2,13.872539
3,3,14.49142
4,4,15.831704


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fully grown regression tree with default settings.
# random_state is pinned because scikit-learn permutes features at each split,
# so equally good splits can otherwise be chosen differently between runs.
model = DecisionTreeRegressor(random_state=1)
model.fit(train.values, sales_train.values)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, random_state=None,
                      splitter='best')

In [None]:
# Score the fitted decision tree on the held-out validation months.
y_pred = model.predict(val.values)
regression_report(y_true=sales_val.values, y_pred=y_pred)

Unnamed: 0,Metric Name,Score
0,SMAPE,14.972
1,MAE,5.914
2,Median Absolute Error,4.75
3,Mean Squared Error,59.487
4,Max Error,46.25
5,R2 Score,0.895
6,Explained Variance Score,0.9


In [None]:
# Predict the real test set and write the decision-tree submission file.
y_pred = model.predict(test_df.values)

# Start from the submission template so the id column and layout are preserved.
sample_df = pd.read_csv("https://drive.google.com/uc?id=1VUhb4Jv2BbeE3jLef3vuTMhWDnf3z_JD").assign(sales=y_pred)
sample_df.to_csv("submission_dt.csv", index=False)
sample_df.head()

Unnamed: 0,id,sales
0,0,10.25
1,1,15.0
2,2,13.0
3,3,14.75
4,4,15.4


## LightGBM

In [None]:
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

# LightGBM with default hyper-parameters.
# Early stopping and per-iteration logging are passed as callbacks: the `verbose`
# and `early_stopping_rounds` arguments of fit() are deprecated in LightGBM 3.3
# and removed in 4.0.
model = LGBMRegressor()
model.fit(
    train.values,
    sales_train.values,
    eval_set=[(val.values, sales_val.values)],
    callbacks=[early_stopping(stopping_rounds=20), log_evaluation(period=1)],
)

[1]	valid_0's l2: 584.19
[2]	valid_0's l2: 561.64
[3]	valid_0's l2: 524.281
[4]	valid_0's l2: 506.43
[5]	valid_0's l2: 473.732
[6]	valid_0's l2: 450.229
[7]	valid_0's l2: 428.166
[8]	valid_0's l2: 416.79
[9]	valid_0's l2: 394.807
[10]	valid_0's l2: 374.03
[11]	valid_0's l2: 363.919
[12]	valid_0's l2: 361.482
[13]	valid_0's l2: 342.619
[14]	valid_0's l2: 330.144
[15]	valid_0's l2: 300.307
[16]	valid_0's l2: 297.294
[17]	valid_0's l2: 268.072
[18]	valid_0's l2: 257.175
[19]	valid_0's l2: 254.052
[20]	valid_0's l2: 247.8
[21]	valid_0's l2: 225.654
[22]	valid_0's l2: 227.765
[23]	valid_0's l2: 223.403
[24]	valid_0's l2: 217.933
[25]	valid_0's l2: 214.066
[26]	valid_0's l2: 204.363
[27]	valid_0's l2: 202.691
[28]	valid_0's l2: 198.104
[29]	valid_0's l2: 199.247
[30]	valid_0's l2: 196.216
[31]	valid_0's l2: 181.023
[32]	valid_0's l2: 176.559
[33]	valid_0's l2: 174.482
[34]	valid_0's l2: 163.458
[35]	valid_0's l2: 158.113
[36]	valid_0's l2: 154.468
[37]	valid_0's l2: 144.422
[38]	valid_0's l2

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
# Score the fitted LightGBM model on the held-out validation months.
y_pred = model.predict(val.values)
regression_report(y_true=sales_val.values, y_pred=y_pred)

Unnamed: 0,Metric Name,Score
0,SMAPE,16.073
1,MAE,6.261
2,Median Absolute Error,5.224
3,Mean Squared Error,63.384
4,Max Error,49.278
5,R2 Score,0.888
6,Explained Variance Score,0.892


In [None]:
# Predict the real test set and write the LightGBM submission file.
y_pred = model.predict(test_df.values)

# Start from the submission template so the id column and layout are preserved.
sample_df = pd.read_csv("https://drive.google.com/uc?id=1VUhb4Jv2BbeE3jLef3vuTMhWDnf3z_JD").assign(sales=y_pred)
sample_df.to_csv("submission_lgbm.csv", index=False)
sample_df.head()

Unnamed: 0,id,sales
0,0,16.561062
1,1,17.444016
2,2,17.444016
3,3,18.268907
4,4,17.977213


## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the integer-coded store/item/calendar columns.
model = LinearRegression().fit(train.values, sales_train.values)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False,
                 positive=False)

In [None]:
# Score the fitted linear model on the held-out validation months.
y_pred = model.predict(val.values)
regression_report(y_true=sales_val.values, y_pred=y_pred)

Unnamed: 0,Metric Name,Score
0,SMAPE,48.924
1,MAE,22.671
2,Median Absolute Error,21.663
3,Mean Squared Error,717.116
4,Max Error,90.323
5,R2 Score,-0.266
6,Explained Variance Score,0.066


In [None]:
# Predict the real test set and write the linear-regression submission file.
y_pred = model.predict(test_df.values)

# Start from the submission template so the id column and layout are preserved.
sample_df = pd.read_csv("https://drive.google.com/uc?id=1VUhb4Jv2BbeE3jLef3vuTMhWDnf3z_JD").assign(sales=y_pred)
sample_df.to_csv("submission_lr.csv", index=False)
sample_df.head()

Unnamed: 0,id,sales
0,0,57.005051
1,1,60.219296
2,2,63.43354
3,3,66.647785
4,4,69.862029


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 200 bootstrapped trees, trained on all available cores.
# random_state is pinned so the bootstrap samples, and therefore the scores and
# the submission file, are reproducible between runs.
model = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=1)
model.fit(train.values, sales_train.values)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
# Score the fitted random forest on the held-out validation months.
y_pred = model.predict(val.values)
regression_report(y_true=sales_val.values, y_pred=y_pred)

Unnamed: 0,Metric Name,Score
0,SMAPE,14.945
1,MAE,5.905
2,Median Absolute Error,4.69
3,Mean Squared Error,59.301
4,Max Error,46.071
5,R2 Score,0.895
6,Explained Variance Score,0.9


In [None]:
# Predict the real test set and write the random-forest submission file.
y_pred = model.predict(test_df.values)

# Start from the submission template so the id column and layout are preserved.
sample_df = pd.read_csv("https://drive.google.com/uc?id=1VUhb4Jv2BbeE3jLef3vuTMhWDnf3z_JD").assign(sales=y_pred)
sample_df.to_csv("submission_rf.csv", index=False)
sample_df.head()

Unnamed: 0,id,sales
0,0,10.180202
1,1,15.008175
2,2,13.084671
3,3,14.706119
4,4,15.293183


## XGBoost

In [None]:
import xgboost

# XGBoost with 200 boosting rounds and a fixed seed; other settings are library defaults.
model = xgboost.XGBRegressor(n_estimators=200, seed=21)
model.fit(X=train.values, y=sales_train.values, verbose=True)



XGBRegressor(n_estimators=200, seed=21)

In [None]:
# Score the fitted XGBoost model on the held-out validation months.
y_pred = model.predict(val.values)
regression_report(y_true=sales_val.values, y_pred=y_pred)

Unnamed: 0,Metric Name,Score
0,SMAPE,20.025
1,MAE,7.52
2,Median Absolute Error,6.27
3,Mean Squared Error,91.563
4,Max Error,58.065
5,R2 Score,0.838
6,Explained Variance Score,0.846


In [None]:
# Predict the real test set and write the XGBoost submission file.
y_pred = model.predict(test_df.values)

# Start from the submission template so the id column and layout are preserved.
sample_df = pd.read_csv("https://drive.google.com/uc?id=1VUhb4Jv2BbeE3jLef3vuTMhWDnf3z_JD").assign(sales=y_pred)
sample_df.to_csv("submission_xg.csv", index=False)
sample_df.head()

Unnamed: 0,id,sales
0,0,7.431525
1,1,12.538452
2,2,12.538452
3,3,14.524738
4,4,17.086143


## MLP (Neural Network)

In [None]:
from sklearn.neural_network import MLPRegressor

# Three hidden layers of 32 ReLU units with a fixed seed.
# NOTE(review): training is capped at 10 iterations and the inputs are not
# scaled, so the network has probably not converged; confirm before comparing
# it with the tree-based models.
model = MLPRegressor(
    hidden_layer_sizes=(32, 32, 32),
    activation="relu",
    max_iter=10,
    random_state=1,
)
model = model.fit(train.values, sales_train.values)



In [None]:
# Score the fitted MLP on the held-out validation months.
y_pred = model.predict(val.values)
regression_report(y_true=sales_val.values, y_pred=y_pred)

Unnamed: 0,Metric Name,Score
0,SMAPE,38.664
1,MAE,15.954
2,Median Absolute Error,12.84
3,Mean Squared Error,426.312
4,Max Error,105.151
5,R2 Score,0.247
6,Explained Variance Score,0.3


In [None]:
# Predict the real test set and write the MLP submission file.
y_pred = model.predict(test_df.values)

# Start from the submission template so the id column and layout are preserved.
sample_df = pd.read_csv("https://drive.google.com/uc?id=1VUhb4Jv2BbeE3jLef3vuTMhWDnf3z_JD").assign(sales=y_pred)
sample_df.to_csv("submission_mlp.csv", index=False)
sample_df.head()

Unnamed: 0,id,sales
0,0,1.600727
1,1,1.533796
2,2,2.700584
3,3,4.0516
4,4,6.807449
