In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df_train = pd.read_csv('train_kaggle.csv')
df_test = pd.read_csv('test_kaggle.csv')
result = df_test['id']
result = pd.DataFrame(result)

In [3]:
df_train

Unnamed: 0,Store_id,SKU_id,Date,Promo,Demand,Regular_Price,Promo_Price
0,1,1,01.01.2015,,22,163.78,
1,1,1,02.01.2015,,41,163.78,
2,1,1,03.01.2015,,35,163.78,
3,1,1,04.01.2015,,72,163.78,
4,1,1,05.01.2015,,25,163.78,
...,...,...,...,...,...,...,...
86911,106,2,18.05.2016,,9,138.50,
86912,106,2,19.05.2016,,3,138.50,
86913,106,2,20.05.2016,,13,138.50,
86914,106,2,21.05.2016,,3,138.50,


In [4]:
df_test.describe()

Unnamed: 0,id,Store_id,SKU_id,Promo,Demand,Regular_Price,Promo_Price
count,5970.0,5970.0,5970.0,1514.0,0.0,5970.0,1514.0
mean,2984.5,53.849246,1.5,1.0,,135.00439,119.452272
std,1723.534885,30.85339,0.500042,0.0,,3.490976,5.494316
min,0.0,1.0,1.0,1.0,,128.98,114.17
25%,1492.25,27.0,1.0,1.0,,131.7,114.17
50%,2984.5,54.0,1.5,1.0,,134.96,119.6
75%,4476.75,80.0,2.0,1.0,,138.5,125.04
max,5969.0,110.0,2.0,1.0,,145.56,127.76


### NaN reconstructing

In [5]:
df_train['Date'] = pd.to_datetime(df_train['Date'])
df_train['Year'] = df_train['Date'].apply(lambda time: time.year)
df_train['Month'] = df_train['Date'].apply(lambda time: time.month)
df_train['Day'] = df_train['Date'].apply(lambda time: time.day)
df_train.drop(['Date'], axis=1, inplace=True)
df_train['Promo'].fillna(0, inplace=True)
df_train['Promo_Price'].fillna(df_train['Regular_Price'], inplace=True)

df_test['Date'] = pd.to_datetime(df_test['Date'])
df_test['Year'] = df_test['Date'].apply(lambda time: time.year)
df_test['Month'] = df_test['Date'].apply(lambda time: time.month)
df_test['Day'] = df_test['Date'].apply(lambda time: time.day)
df_test.drop(['Demand', 'id', 'Date'], axis=1, inplace=True)
df_test['Promo'].fillna(0, inplace=True)
df_test['Promo_Price'].fillna(df_test['Regular_Price'], inplace=True)

### Test and val split

In [6]:
df_train

Unnamed: 0,Store_id,SKU_id,Promo,Demand,Regular_Price,Promo_Price,Year,Month,Day
0,1,1,0.0,22,163.78,163.78,2015,1,1
1,1,1,0.0,41,163.78,163.78,2015,2,1
2,1,1,0.0,35,163.78,163.78,2015,3,1
3,1,1,0.0,72,163.78,163.78,2015,4,1
4,1,1,0.0,25,163.78,163.78,2015,5,1
...,...,...,...,...,...,...,...,...,...
86911,106,2,0.0,9,138.50,138.50,2016,5,18
86912,106,2,0.0,3,138.50,138.50,2016,5,19
86913,106,2,0.0,13,138.50,138.50,2016,5,20
86914,106,2,0.0,3,138.50,138.50,2016,5,21


In [7]:
df_test.describe()

Unnamed: 0,Store_id,SKU_id,Promo,Regular_Price,Promo_Price,Year,Month,Day
count,5970.0,5970.0,5970.0,5970.0,5970.0,5970.0,5970.0,5970.0
mean,53.849246,1.5,0.253601,135.00439,130.727687,2016.0,5.913903,15.175209
std,30.85339,0.500042,0.435109,3.490976,7.792553,0.0,2.360029,9.141712
min,1.0,1.0,0.0,128.98,114.17,2016.0,1.0,6.0
25%,27.0,1.0,0.0,131.7,127.76,2016.0,5.0,6.0
50%,54.0,1.5,0.0,134.96,131.7,2016.0,6.0,14.0
75%,80.0,2.0,1.0,138.5,138.5,2016.0,6.0,24.0
max,110.0,2.0,1.0,145.56,145.56,2016.0,12.0,31.0


In [8]:
X_train = df_train.drop(['Demand'], axis=1)
y_train = df_train['Demand']

X_test = df_test

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

### LR

In [9]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print('MAE: ', mean_absolute_error(y_val, lr.predict(X_val)), 'RMSE: ', np.sqrt(mean_squared_error(y_val, lr.predict(X_val))))

MAE:  136.6038475309344 RMSE:  285.7253936472334


### Random forest

In [10]:
from sklearn.ensemble import RandomForestRegressor

num = [10, 20, 50, 100, 300]

for i in num:
    rf = RandomForestRegressor(n_estimators=i, max_depth=10)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print(i, 'MAE: ', mean_absolute_error(y_val, rf.predict(X_val)), 'RMSE: ', np.sqrt(mean_squared_error(y_val, rf.predict(X_val))))

10 MAE:  84.24326763673713 RMSE:  221.55850581508807
20 MAE:  84.04279012077706 RMSE:  219.95753371498597
50 MAE:  83.60708195342973 RMSE:  219.43612387604796
100 MAE:  83.54101566671655 RMSE:  219.29771926129163
500 MAE:  83.49421501990916 RMSE:  219.04056245868955


### Gradient Boosting

In [11]:
from sklearn.ensemble import GradientBoostingRegressor

for i in num:
    gb = GradientBoostingRegressor(n_estimators=i, max_depth=10)
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    print(i, 'MAE: ', mean_absolute_error(y_val, gb.predict(X_val)), 'RMSE: ', np.sqrt(mean_squared_error(y_val, gb.predict(X_val))))

10 MAE:  111.89847780192999 RMSE:  241.85184485987278
20 MAE:  83.31872583391332 RMSE:  208.20314091300963
50 MAE:  65.42788518655746 RMSE:  184.35265157302962
100 MAE:  62.5773959022916 RMSE:  179.26642961285975
500 MAE:  64.59242206012756 RMSE:  181.2117654936575


In [12]:
from sklearn.ensemble import AdaBoostRegressor

for i in num:
    ada = AdaBoostRegressor(n_estimators=i)
    ada.fit(X_train, y_train)
    y_pred = ada.predict(X_test)
    print(i, 'MAE: ', mean_absolute_error(y_val, ada.predict(X_val)), 'RMSE: ', np.sqrt(mean_squared_error(y_val, ada.predict(X_val))))

10 MAE:  131.99156844859863 RMSE:  302.14116787761094
20 MAE:  130.64443067562038 RMSE:  298.98825206685666
50 MAE:  138.29512245515173 RMSE:  315.7523757057866
100 MAE:  148.09411248482425 RMSE:  357.6624122955727
500 MAE:  127.81820483815001 RMSE:  292.63172968484724


In [13]:
from sklearn.ensemble import BaggingRegressor

for i in num:
    bag = BaggingRegressor(n_estimators=i)
    bag.fit(X_train, y_train)
    y_pred = bag.predict(X_test)
    print(i, 'MAE: ', mean_absolute_error(y_val, bag.predict(X_val)), 'RMSE: ', np.sqrt(mean_squared_error(y_val, bag.predict(X_val))))

10 MAE:  67.93791992636906 RMSE:  199.93453275898491
20 MAE:  66.21889093419236 RMSE:  193.76996285085403
50 MAE:  65.20360446387481 RMSE:  189.6819643244734
100 MAE:  65.31693856419695 RMSE:  190.94807931270782
500 MAE:  64.62492326277037 RMSE:  190.07366415124457


In [14]:
from sklearn.ensemble import ExtraTreesRegressor

for i in num:
    et = ExtraTreesRegressor(n_estimators=i)
    et.fit(X_train, y_train)
    y_pred = et.predict(X_test)
    print(i, 'MAE: ', mean_absolute_error(y_val, et.predict(X_val)), 'RMSE: ', np.sqrt(mean_squared_error(y_val, et.predict(X_val))))

10 MAE:  82.83040727105384 RMSE:  236.44598500578658
20 MAE:  82.30548205246204 RMSE:  234.9641471823989
50 MAE:  81.69391586132845 RMSE:  233.40101125454203
100 MAE:  81.26289277222625 RMSE:  232.35957730476142
500 MAE:  81.04650552106884 RMSE:  232.03972494339794


In [15]:
from sklearn.ensemble import VotingRegressor

voting = VotingRegressor([('lr', lr), ('rf', rf), ('gb', gb), ('ada', ada), ('bag', bag), ('et', et)])
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
print('MAE: ', mean_absolute_error(y_val, voting.predict(X_val)), 'RMSE: ', np.sqrt(mean_squared_error(y_val, voting.predict(X_val))))

### Any

### Final submission

In [None]:
# gb = GradientBoostingRegressor(n_estimators=100, max_depth=10)
# gb.fit(X_train, y_train)
# y_pred = gb.predict(X_test)

# result['Demand'] = y_pred
# result.to_csv('submission.csv', index=False)

# result

Unnamed: 0,id,Demand
0,0,590.082373
1,1,141.536053
2,2,89.348128
3,3,89.348128
4,4,89.348128
...,...,...
5965,5965,49.498894
5966,5966,49.498894
5967,5967,49.498894
5968,5968,49.498894
