In [54]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

In [55]:
train = pd.read_csv('../data/v3/training_seg2.csv')
test = pd.read_csv('../data/v3/test_seg2.csv')

In [56]:
train.shape, test.shape

((12474, 14), (93, 14))

In [57]:
train.head()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type,cos_day,sin_day,cos_mon,sin_mon,day_of_week,week_num
0,2017-04-01,2,0.0,1,2017,4,,,0.978148,0.207912,-0.5,0.866025,5,13
1,2017-04-02,2,0.0,2,2017,4,,,0.913545,0.406737,-0.5,0.866025,6,13
2,2017-04-03,2,0.0,3,2017,4,,,0.809017,0.587785,-0.5,0.866025,0,14
3,2017-04-04,2,0.0,4,2017,4,Rama Navami,G,0.669131,0.743145,-0.5,0.866025,1,14
4,2017-04-05,2,0.0,5,2017,4,,,0.5,0.866025,-0.5,0.866025,2,14


In [58]:
train['log_case'] = np.log1p(train['case_count'])

# Feature Engineering

In [59]:
a1 = [0] + [i for i in train['log_case'].iloc[:-1].tolist()]
a2 = [0] + [0] + [i for i in train['log_case'].iloc[:-2].tolist()]
a3 = [0] + [0] + [0] + [i for i in train['log_case'].iloc[:-3].tolist()]
a4 = [0] + [0] + [0] + [0] + [i for i in train['log_case'].iloc[:-4].tolist()]

In [60]:
train['case_count_prev'] = a1
train['case_count_prev_2'] = a2
train['case_count_prev_3'] = a3
train['case_count_prev_4'] = a4

# Modelling

In [61]:
train.Type.fillna(value="Nope", inplace=True)
test.Type.fillna(value="Nope", inplace=True)

In [62]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [63]:
la = LabelEncoder()
on = OneHotEncoder()
on2 = OneHotEncoder()

In [64]:
train['Type_Mat'] = la.fit_transform(train['Type'])

In [65]:
type_mat = on.fit_transform(train['Type_Mat'].values.reshape(-1,1))
day_mat = on2.fit_transform(train['day_of_week'].values.reshape(-1,1))
type_mat = type_mat.todense()
day_mat = day_mat.todense()
mat = np.append(type_mat, day_mat, axis=1)

In [66]:
mat = np.append(type_mat, day_mat, axis=1)
mat = np.append(mat, train['day'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['week_num'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['month'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['year'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['sin_mon'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['cos_mon'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['sin_day'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['cos_day'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['case_count_prev'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['case_count_prev_2'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['case_count_prev_3'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['case_count_prev_4'].values.reshape(-1, 1), axis=1)

In [67]:
mat.shape

(12474, 26)

In [68]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import xgboost
from xgboost import XGBRegressor

In [69]:
def instantiate_models():
    rf = RandomForestRegressor(n_estimators=100)
    et = ExtraTreesRegressor(n_estimators=100)
    knn = KNeighborsRegressor()
    LGB = lgb.LGBMRegressor()
    xgb = XGBRegressor()
    return rf, knn, et, LGB, xgb

In [70]:
def training(X_train, y_train, models):
    rf, et, knn, LGB, xgb = models
    LGB.fit(X_train, y_train)
    et.fit(X_train, y_train)
    knn.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    xgb.fit(X_train, y_train)
    return  rf, et, knn, LGB, xgb

In [71]:
def predict(X_test, models):
    rf, et, knn, LGB, xgb = models
    lgb_pred = abs(np.round(np.expm1(LGB.predict(X_test))))
    xgb_pred = abs(np.round(np.expm1(xgb.predict(X_test))))
    rf_pred = abs(np.round(np.expm1(rf.predict(X_test))))
    et_pred = abs(np.round(np.expm1(et.predict(X_test))))
    knn_pred = abs(np.round(np.expm1(knn.predict(X_test))))
    return lgb_pred, xgb_pred, rf_pred, et_pred, knn_pred

In [72]:
def MAPE(y_test, predictions):
    lgb_pred, xgb_pred, rf_pred, et_pred, knn_pred = predictions
    return [mean_absolute_error(np.expm1(y_test), lgb_pred) * (100/len(y_test)),
            mean_absolute_error(np.expm1(y_test), xgb_pred) * (100/len(y_test)),
            mean_absolute_error(np.expm1(y_test), rf_pred) * (100/len(y_test)),
            mean_absolute_error(np.expm1(y_test), et_pred) * (100/len(y_test)),
            mean_absolute_error(np.expm1(y_test), knn_pred) * (100/len(y_test)),
           ]
    

In [73]:
def export_csv(y_test, prediction, train, seed):
    lgb_pred, xgb_pred, rf_pred, et_pred, knn_pred = prediction
    xtest = train.iloc[y_test.index].reset_index(drop=True)
    xtest['log_case'] = y_test.values
    xtest['lgb_pred'] = lgb_pred
    xtest['xgb_pred'] = xgb_pred
    xtest['knn_pred'] = knn_pred
    xtest['et_pred'] = et_pred
    xtest['rf_pred'] = rf_pred
    xtest[['log_case', 'case_count_prev', 'case_count_prev_2',
       'case_count_prev_3', 'case_count_prev_4', 
       'knn_pred', 'et_pred', 'rf_pred', 'lgb_pred', 'xgb_pred', 'case_count']].to_csv(f'stack_v{seed}.csv')

In [74]:
for seed in range(10):
    X_train, X_test, y_train, y_test = train_test_split(mat, train['log_case'], test_size=0.3, random_state=seed)
    inst_models = instantiate_models()
    trained_models = training(X_train, y_train, inst_models)
    prediction =  predict(X_test, trained_models)
    err = MAPE(y_test, prediction)
    export_csv(y_test, prediction, train, seed)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


In [75]:
err

[3.3072974976747047,
 3.4975966179704296,
 3.442728858407276,
 4.691760892485101,
 3.4331857083440607]

# Loading Stacked Data

In [76]:
pk = []
for i in range(10):
    pk.append(pd.read_csv(f'stack_v{i}.csv'))

In [77]:
df = pd.concat(pk, ignore_index=True, sort=False)
df.drop(columns='Unnamed: 0', inplace=True)

In [78]:
df.head()

Unnamed: 0,log_case,case_count_prev,case_count_prev_2,case_count_prev_3,case_count_prev_4,knn_pred,et_pred,rf_pred,lgb_pred,xgb_pred,case_count
0,6.232448,6.059123,6.042633,5.638355,6.095825,400.0,507.0,387.0,458.0,458.0,508.0
1,6.280396,5.247024,6.306275,6.357842,6.539586,377.0,478.0,362.0,329.0,301.0,533.0
2,4.077537,4.394449,2.302585,2.302585,1.609438,84.0,32.0,76.0,59.0,70.0,58.0
3,7.113956,7.201171,7.561642,5.680173,6.948897,1561.0,923.0,1419.0,1387.0,1301.0,1228.0
4,4.418841,4.912655,5.375278,5.648974,5.568345,153.0,184.0,141.0,129.0,128.0,82.0


In [79]:
df.corr()

Unnamed: 0,log_case,case_count_prev,case_count_prev_2,case_count_prev_3,case_count_prev_4,knn_pred,et_pred,rf_pred,lgb_pred,xgb_pred,case_count
log_case,1.0,0.966637,0.942833,0.924051,0.906837,0.696521,0.728457,0.696026,0.702251,0.700502,0.704678
case_count_prev,0.966637,1.0,0.966652,0.944168,0.923503,0.69867,0.722563,0.701541,0.703896,0.70977,0.668099
case_count_prev_2,0.942833,0.966652,1.0,0.966735,0.941801,0.667634,0.70644,0.669879,0.671792,0.675827,0.640385
case_count_prev_3,0.924051,0.944168,0.966735,1.0,0.96611,0.639242,0.685233,0.639766,0.642716,0.646467,0.613681
case_count_prev_4,0.906837,0.923503,0.941801,0.96611,1.0,0.612399,0.660474,0.613602,0.617484,0.620504,0.591582
knn_pred,0.696521,0.69867,0.667634,0.639242,0.612399,1.0,0.913091,0.99083,0.985128,0.980433,0.924328
et_pred,0.728457,0.722563,0.70644,0.685233,0.660474,0.913091,1.0,0.906274,0.913785,0.910472,0.86256
rf_pred,0.696026,0.701541,0.669879,0.639766,0.613602,0.99083,0.906274,1.0,0.98674,0.984148,0.923554
lgb_pred,0.702251,0.703896,0.671792,0.642716,0.617484,0.985128,0.913785,0.98674,1.0,0.988846,0.931099
xgb_pred,0.700502,0.70977,0.675827,0.646467,0.620504,0.980433,0.910472,0.984148,0.988846,1.0,0.921821


In [80]:
training = df[['log_case', 'case_count_prev', 'case_count_prev_2', 'case_count_prev_3',
               'case_count_prev_4', 'knn_pred', 'et_pred', 'rf_pred', 'lgb_pred',
               'xgb_pred']]
label = df['case_count']

In [81]:
X_train, X_test, y_train, y_test = train_test_split(training, label, test_size=0.3, random_state=42)

In [82]:
rfs = RandomForestRegressor(n_estimators=100)

In [83]:
rfs.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [84]:
pred = rfs.predict(X_test)

In [85]:
xtest = X_test.loc[y_test.index].reset_index(drop=True)

In [86]:
mean_absolute_error(y_test, pred) * (100/len(y_test))

0.0003515730427177913

# Testing

In [87]:
test['Type_Mat'] = la.fit_transform(test['Type'])
type_mat_test = on.transform(test['Type_Mat'].values.reshape(-1,1))
day_mat_test = on2.transform(test['day_of_week'].values.reshape(-1,1))
type_mat_test = type_mat_test.todense()
day_mat_test = day_mat_test.todense()
mat_test = np.append(type_mat_test, day_mat_test, axis=1)

In [88]:
mat_test = np.append(mat_test, test['day'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['week_num'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['month'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['year'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['sin_mon'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['cos_mon'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['sin_day'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['cos_day'].values.reshape(-1, 1), axis=1)

In [89]:
mat.shape, mat_test.shape

((12474, 26), (93, 22))

In [90]:
train.tail()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type,cos_day,sin_day,cos_mon,sin_mon,day_of_week,week_num,log_case,case_count_prev,case_count_prev_2,case_count_prev_3,case_count_prev_4,Type_Mat
12469,2019-07-19,2,1886.0,19,2019,7,,Nope,-0.758758,-0.651372,-0.866025,-0.5,4,29,7.542744,7.786967,7.698936,6.779922,7.337588,3
12470,2019-07-20,2,1480.0,20,2019,7,,Nope,-0.612106,-0.790776,-0.866025,-0.5,5,29,7.300473,7.542744,7.786967,7.698936,6.779922,3
12471,2019-07-21,2,1028.0,21,2019,7,,Nope,-0.440394,-0.897805,-0.866025,-0.5,6,29,6.936343,7.300473,7.542744,7.786967,7.698936,3
12472,2019-07-22,2,1946.0,22,2019,7,,Nope,-0.250653,-0.968077,-0.866025,-0.5,0,30,7.574045,6.936343,7.300473,7.542744,7.786967,3
12473,2019-07-23,2,1984.0,23,2019,7,,Nope,-0.050649,-0.998717,-0.866025,-0.5,1,30,7.593374,7.574045,6.936343,7.300473,7.542744,3


In [91]:
def func(model):
    a = []
    for i in range(len(mat_test)):
        if i ==0:
            k = np.append(np.array(mat_test[i]), np.log1p(np.array([[1984.0, 1946.0, 1028.0, 1480.0]])), axis=1)
    #         a.append(np.round(np.expm1(rf.predict(k))))
        if i==1:
            k = np.append(np.array(mat_test[i]),  np.log1p(np.array([[a[0],1984,1946,1028]])), axis=1)
    #         a.append(np.round(np.expm1(rf.predict(k))))
        if i==2:
            k = np.append(np.array(mat_test[i]),  np.log1p(np.array([[a[1],a[0],1984,1946]])), axis=1)
    #         a.append(np.round(np.expm1(rf.predict(k))))
        if i==3:
            k = np.append(np.array(mat_test[i]),  np.log1p(np.array([[a[2],a[1],a[0],1984]])), axis=1)
    #         a.append(np.round(np.expm1(rf.predict(k))))
        if i>3:
            k = np.append(np.array(mat_test[i]),  np.log1p(np.array([[a[i-1],a[i-2],a[i-3],a[i-4]]])), axis=1)

        a.append((np.expm1(model.predict(k)))[0])
    return a

In [92]:
rf, et, knn, LGB, xgb = trained_models
a_rf = func(rf)
a_et = func(et)
a_knn = func(knn)
a_LGB = func(LGB)
a_xgb = func(xgb)

In [106]:
train.tail()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type,cos_day,sin_day,cos_mon,sin_mon,day_of_week,week_num,log_case,case_count_prev,case_count_prev_2,case_count_prev_3,case_count_prev_4,Type_Mat
12469,2019-07-19,2,1886.0,19,2019,7,,Nope,-0.758758,-0.651372,-0.866025,-0.5,4,29,7.542744,7.786967,7.698936,6.779922,7.337588,3
12470,2019-07-20,2,1480.0,20,2019,7,,Nope,-0.612106,-0.790776,-0.866025,-0.5,5,29,7.300473,7.542744,7.786967,7.698936,6.779922,3
12471,2019-07-21,2,1028.0,21,2019,7,,Nope,-0.440394,-0.897805,-0.866025,-0.5,6,29,6.936343,7.300473,7.542744,7.786967,7.698936,3
12472,2019-07-22,2,1946.0,22,2019,7,,Nope,-0.250653,-0.968077,-0.866025,-0.5,0,30,7.574045,6.936343,7.300473,7.542744,7.786967,3
12473,2019-07-23,2,1984.0,23,2019,7,,Nope,-0.050649,-0.998717,-0.866025,-0.5,1,30,7.593374,7.574045,6.936343,7.300473,7.542744,3


In [93]:
np.log1p([1480., 1028., 1946., 1984.])

array([7.30047281, 6.93634274, 7.57404501, 7.59337419])

In [94]:
a=[np.mean(x) for x in zip(a_rf, a_et, a_knn, a_LGB, a_xgb)]

In [95]:
test['log_case'] = np.log1p(a)

In [96]:
a1 = [7.59337419] + [i for i in test['log_case'].iloc[:-1].tolist()]
a2 = [7.57404501] + [7.59337419] + [i for i in test['log_case'].iloc[:-2].tolist()]
a3 = [6.93634274] + [7.57404501] + [7.59337419] + [i for i in test['log_case'].iloc[:-3].tolist()]
a4 = [7.30047281] + [6.93634274] + [7.57404501] + [7.59337419] + [i for i in test['log_case'].iloc[:-4].tolist()]

In [97]:
test['case_count_prev'] = a1
test['case_count_prev_2'] = a2
test['case_count_prev_3'] = a3
test['case_count_prev_4'] = a4

In [98]:
test['knn_pred'] = a_knn
test['et_pred'] = a_et
test['rf_pred'] = a_rf
test['lgb_pred'] = a_LGB
test['xgb_pred'] = a_xgb

In [99]:
df_test = test[['log_case', 'case_count_prev', 'case_count_prev_2', 'case_count_prev_3',
                'case_count_prev_4', 'knn_pred', 'et_pred', 'rf_pred', 'lgb_pred',
                'xgb_pred']]
df_test.head()

Unnamed: 0,log_case,case_count_prev,case_count_prev_2,case_count_prev_3,case_count_prev_4,knn_pred,et_pred,rf_pred,lgb_pred,xgb_pred
0,7.430936,7.593374,7.574045,6.936343,7.300473,1572.411646,1610.962019,1808.046583,1710.958276,1729.550537
1,7.322112,7.430936,7.593374,7.574045,6.936343,1413.655971,1783.80529,1602.576429,1297.203004,1464.740479
2,7.22372,7.322112,7.430936,7.593374,7.574045,1319.183247,2019.405533,1470.830285,905.046689,1138.441284
3,6.981066,7.22372,7.322112,7.430936,7.593374,1037.099378,1619.169211,1124.78999,744.769977,849.495483
4,6.290602,6.981066,7.22372,7.322112,7.430936,787.955351,529.654298,570.878881,349.572341,454.330353


In [103]:
test['case_count'] = np.round(rfs.predict(df_test))

In [104]:
test.head()

Unnamed: 0,id,application_date,segment,day,year,month,Holiday,Type,cos_mon,sin_mon,...,case_count_prev,case_count_prev_2,case_count_prev_3,case_count_prev_4,knn_pred,et_pred,rf_pred,lgb_pred,xgb_pred,case_count
0,88,2019-07-24,2,24,2019,7,,Nope,-0.866025,-0.5,...,7.593374,7.574045,6.936343,7.300473,1572.411646,1610.962019,1808.046583,1710.958276,1729.550537,1685.0
1,89,2019-07-25,2,25,2019,7,,Nope,-0.866025,-0.5,...,7.430936,7.593374,7.574045,6.936343,1413.655971,1783.80529,1602.576429,1297.203004,1464.740479,1512.0
2,90,2019-07-26,2,26,2019,7,,Nope,-0.866025,-0.5,...,7.322112,7.430936,7.593374,7.574045,1319.183247,2019.405533,1470.830285,905.046689,1138.441284,1370.0
3,91,2019-07-27,2,27,2019,7,,Nope,-0.866025,-0.5,...,7.22372,7.322112,7.430936,7.593374,1037.099378,1619.169211,1124.78999,744.769977,849.495483,1075.0
4,92,2019-07-28,2,28,2019,7,,Nope,-0.866025,-0.5,...,6.981066,7.22372,7.322112,7.430936,787.955351,529.654298,570.878881,349.572341,454.330353,538.0


In [105]:
test[['id', 'application_date', 'segment', 'case_count']].to_csv('stacked_V1_seg_2.csv', index=False)