In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

In [2]:
train = pd.read_csv('../data/v3/training_seg1.csv')
test = pd.read_csv('../data/v3/test_seg1.csv')

In [3]:
train.shape, test.shape

((62562, 14), (87, 14))

In [4]:
train.head()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type,cos_day,sin_day,cos_mon,sin_mon,day_of_week,week_num
0,2017-04-01,1,40.0,1,2017,4,,,0.978148,0.207912,-0.5,0.866025,5,13
1,2017-04-03,1,5.0,3,2017,4,,,0.809017,0.587785,-0.5,0.866025,0,14
2,2017-04-04,1,4.0,4,2017,4,Rama Navami,G,0.669131,0.743145,-0.5,0.866025,1,14
3,2017-04-07,1,76.0,7,2017,4,,,0.104528,0.994522,-0.5,0.866025,4,14
4,2017-04-13,1,81.0,13,2017,4,Maundy Thursday,C,-0.913545,0.406737,-0.5,0.866025,3,15


In [5]:
train['log_case'] = np.log1p(train['case_count'])

# Feature Engineering

In [6]:
a1 = [0] + [i for i in train['log_case'].iloc[:-1].tolist()]
a2 = [0] + [0] + [i for i in train['log_case'].iloc[:-2].tolist()]
a3 = [0] + [0] + [0] + [i for i in train['log_case'].iloc[:-3].tolist()]
a4 = [0] + [0] + [0] + [0] + [i for i in train['log_case'].iloc[:-4].tolist()]

In [7]:
train['case_count_prev'] = a1
train['case_count_prev_2'] = a2
train['case_count_prev_3'] = a3
train['case_count_prev_4'] = a4

# Modelling

In [8]:
train.Type.fillna(value="Nope", inplace=True)
test.Type.fillna(value="Nope", inplace=True)

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [10]:
la = LabelEncoder()
on = OneHotEncoder()
on2 = OneHotEncoder()

In [11]:
train['Type_Mat'] = la.fit_transform(train['Type'])

In [12]:
type_mat = on.fit_transform(train['Type_Mat'].values.reshape(-1,1))
day_mat = on2.fit_transform(train['day_of_week'].values.reshape(-1,1))
type_mat = type_mat.todense()
day_mat = day_mat.todense()
mat = np.append(type_mat, day_mat, axis=1)

In [13]:
mat = np.append(type_mat, day_mat, axis=1)
mat = np.append(mat, train['day'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['week_num'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['month'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['year'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['sin_mon'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['cos_mon'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['sin_day'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['cos_day'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['case_count_prev'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['case_count_prev_2'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['case_count_prev_3'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['case_count_prev_4'].values.reshape(-1, 1), axis=1)

In [14]:
mat.shape

(62562, 26)

In [2]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import xgboost
from xgboost import XGBRegressor

  from numpy.core.umath_tests import inner1d


In [16]:
def instantiate_models():
    rf = RandomForestRegressor(n_estimators=100)
    et = ExtraTreesRegressor(n_estimators=100)
    knn = KNeighborsRegressor()
    LGB = lgb.LGBMRegressor()
    xgb = XGBRegressor()
    return rf, knn, et, LGB, xgb

In [17]:
def training(X_train, y_train, models):
    rf, et, knn, LGB, xgb = models
    LGB.fit(X_train, y_train)
    et.fit(X_train, y_train)
    knn.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    xgb.fit(X_train, y_train)
    return  rf, et, knn, LGB, xgb

In [18]:
def predict(X_test, models):
    rf, et, knn, LGB, xgb = models
    lgb_pred = abs(np.round(np.expm1(LGB.predict(X_test))))
    xgb_pred = abs(np.round(np.expm1(xgb.predict(X_test))))
    rf_pred = abs(np.round(np.expm1(rf.predict(X_test))))
    et_pred = abs(np.round(np.expm1(et.predict(X_test))))
    knn_pred = abs(np.round(np.expm1(knn.predict(X_test))))
    return lgb_pred, xgb_pred, rf_pred, et_pred, knn_pred

In [19]:
def MAPE(y_test, predictions):
    lgb_pred, xgb_pred, rf_pred, et_pred, knn_pred = predictions
    return [mean_absolute_error(np.expm1(y_test), lgb_pred) * (100/len(y_test)),
            mean_absolute_error(np.expm1(y_test), xgb_pred) * (100/len(y_test)),
            mean_absolute_error(np.expm1(y_test), rf_pred) * (100/len(y_test)),
            mean_absolute_error(np.expm1(y_test), et_pred) * (100/len(y_test)),
            mean_absolute_error(np.expm1(y_test), knn_pred) * (100/len(y_test)),
           ]
    

In [20]:
def export_csv(y_test, prediction, train, seed):
    lgb_pred, xgb_pred, rf_pred, et_pred, knn_pred = prediction
    xtest = train.iloc[y_test.index].reset_index(drop=True)
    xtest['log_case'] = y_test.values
    xtest['lgb_pred'] = lgb_pred
    xtest['xgb_pred'] = xgb_pred
    xtest['knn_pred'] = knn_pred
    xtest['et_pred'] = et_pred
    xtest['rf_pred'] = rf_pred
    xtest[['log_case', 'case_count_prev', 'case_count_prev_2',
       'case_count_prev_3', 'case_count_prev_4', 
       'knn_pred', 'et_pred', 'rf_pred', 'lgb_pred', 'xgb_pred', 'case_count']].to_csv(f'stack_v{seed}.csv')

In [21]:
for seed in range(10):
    X_train, X_test, y_train, y_test = train_test_split(mat, train['log_case'], test_size=0.3, random_state=seed)
    inst_models = instantiate_models()
    trained_models = training(X_train, y_train, inst_models)
    prediction =  predict(X_test, trained_models)
    err = MAPE(y_test, prediction)
    export_csv(y_test, prediction, train, seed)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


In [22]:
err

[0.031821981441387266,
 0.036787699154469104,
 0.031874213308946124,
 0.03425047941402862,
 0.030860801530766153]

# Loading Stacked Data

In [19]:
pk = []
for i in range(4):
    pk.append(pd.read_csv(f'../notebooks_Day_#1/stack_seg_1/stack_v{i}.csv'))

In [20]:
df = pd.concat(pk, ignore_index=True, sort=False)
df.drop(columns='Unnamed: 0', inplace=True)

In [21]:
df.head()

Unnamed: 0,log_case,case_count_prev,case_count_prev_2,case_count_prev_3,case_count_prev_4,knn_pred,et_pred,rf_pred,lgb_pred,xgb_pred,case_count
0,1.609438,2.639057,2.302585,2.397895,2.397895,1.0,1.0,1.0,2.0,4.0,4.0
1,1.791759,3.496508,3.78419,3.401197,3.912023,19.0,10.0,18.0,14.0,16.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.484907,2.564949,2.079442,1.609438,2.197225,11.0,14.0,10.0,11.0,11.0,11.0
4,3.871201,3.610918,3.931826,2.079442,1.098612,34.0,28.0,34.0,37.0,37.0,47.0


In [22]:
df.corr()

Unnamed: 0,log_case,case_count_prev,case_count_prev_2,case_count_prev_3,case_count_prev_4,knn_pred,et_pred,rf_pred,lgb_pred,xgb_pred,case_count
log_case,1.0,0.781775,0.735564,0.71111,0.695802,0.817128,0.817117,0.814294,0.817506,0.783138,0.854872
case_count_prev,0.781775,1.0,0.78345,0.735737,0.713768,0.704363,0.688721,0.707452,0.725134,0.756082,0.642709
case_count_prev_2,0.735564,0.78345,1.0,0.780926,0.735052,0.657855,0.652943,0.664488,0.681992,0.69708,0.608408
case_count_prev_3,0.71111,0.735737,0.780926,1.0,0.780007,0.630917,0.63096,0.638425,0.649334,0.660471,0.585398
case_count_prev_4,0.695802,0.713768,0.735052,0.780007,1.0,0.617571,0.615965,0.623663,0.634778,0.64822,0.571144
knn_pred,0.817128,0.704363,0.657855,0.630917,0.617571,1.0,0.941692,0.980662,0.963562,0.925318,0.878297
et_pred,0.817117,0.688721,0.652943,0.63096,0.615965,0.941692,1.0,0.925371,0.927594,0.890657,0.849413
rf_pred,0.814294,0.707452,0.664488,0.638425,0.623663,0.980662,0.925371,1.0,0.965457,0.93144,0.870197
lgb_pred,0.817506,0.725134,0.681992,0.649334,0.634778,0.963562,0.927594,0.965457,1.0,0.964121,0.870699
xgb_pred,0.783138,0.756082,0.69708,0.660471,0.64822,0.925318,0.890657,0.93144,0.964121,1.0,0.829156


In [23]:
training = df[['log_case', 'case_count_prev', 'case_count_prev_2', 'case_count_prev_3',
               'case_count_prev_4', 'knn_pred', 'et_pred', 'rf_pred', 'lgb_pred',
               'xgb_pred']]
label = df['case_count']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(training, label, test_size=0.3, random_state=42)

In [25]:
rfs = RandomForestRegressor(n_estimators=100)

In [26]:
rfs.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [27]:
pred = rfs.predict(X_test)

In [28]:
xtest = X_test.loc[y_test.index].reset_index(drop=True)

In [29]:
xtest['pred'] = pred

In [30]:
mean_absolute_error(y_test, pred) * (100/len(y_test))

0.0

In [31]:
X_train.shape

(52553, 10)

# Testing

In [32]:
test['Type_Mat'] = la.fit_transform(test['Type'])
type_mat_test = on.transform(test['Type_Mat'].values.reshape(-1,1))
day_mat_test = on2.transform(test['day_of_week'].values.reshape(-1,1))
type_mat_test = type_mat_test.todense()
day_mat_test = day_mat_test.todense()
mat_test = np.append(type_mat_test, day_mat_test, axis=1)

NameError: name 'la' is not defined

In [35]:
mat_test = np.append(mat_test, test['day'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['week_num'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['month'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['year'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['sin_mon'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['cos_mon'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['sin_day'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['cos_day'].values.reshape(-1, 1), axis=1)

In [36]:
mat.shape, mat_test.shape

((62562, 26), (87, 22))

In [37]:
train.tail()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type,cos_day,sin_day,cos_mon,sin_mon,day_of_week,week_num,log_case,case_count_prev,case_count_prev_2,case_count_prev_3,case_count_prev_4,Type_Mat
62557,2019-07-01,1,6.0,1,2019,7,,Nope,0.97953,0.201299,-0.866025,-0.5,0,27,1.94591,0.0,1.386294,1.386294,1.386294,3
62558,2019-07-02,1,3.0,2,2019,7,,Nope,0.918958,0.394356,-0.866025,-0.5,1,27,1.386294,1.94591,0.0,1.386294,1.386294,3
62559,2019-07-03,1,3.0,3,2019,7,,Nope,0.820763,0.571268,-0.866025,-0.5,2,27,1.386294,1.386294,1.94591,0.0,1.386294,3
62560,2019-07-04,1,5.0,4,2019,7,Rath Yatra,R,0.688967,0.724793,-0.866025,-0.5,3,27,1.791759,1.386294,1.386294,1.94591,0.0,5
62561,2019-07-05,1,0.0,5,2019,7,,Nope,0.528964,0.848644,-0.866025,-0.5,4,27,0.0,1.791759,1.386294,1.386294,1.94591,3


In [40]:
[i for i in reversed(train['case_count'].iloc[-4:].tolist())]

[0.0, 5.0, 3.0, 3.0]

In [41]:
def func(model):
    a = []
    for i in range(len(mat_test)):
        if i ==0:
            k = np.append(np.array(mat_test[i]), np.log1p(np.array([[0.0, 5.0, 3.0, 3.0]])), axis=1)
    #         a.append(np.round(np.expm1(rf.predict(k))))
        if i==1:
            k = np.append(np.array(mat_test[i]),  np.log1p(np.array([[a[0],0.0,5.0,3.0]])), axis=1)
    #         a.append(np.round(np.expm1(rf.predict(k))))
        if i==2:
            k = np.append(np.array(mat_test[i]),  np.log1p(np.array([[a[1],a[0],0.0,5.0]])), axis=1)
    #         a.append(np.round(np.expm1(rf.predict(k))))
        if i==3:
            k = np.append(np.array(mat_test[i]),  np.log1p(np.array([[a[2],a[1],a[0],0.0]])), axis=1)
    #         a.append(np.round(np.expm1(rf.predict(k))))
        if i>3:
            k = np.append(np.array(mat_test[i]),  np.log1p(np.array([[a[i-1],a[i-2],a[i-3],a[i-4]]])), axis=1)

        a.append((np.expm1(model.predict(k)))[0])
    return a

In [42]:
rf, et, knn, LGB, xgb = trained_models
a_rf = func(rf)
a_et = func(et)
a_knn = func(knn)
a_LGB = func(LGB)
a_xgb = func(xgb)

In [43]:
np.log1p([0.0, 5.0, 3.0, 3.0])

array([0.        , 1.79175947, 1.38629436, 1.38629436])

In [44]:
a=[np.mean(x) for x in zip(a_rf, a_et, a_knn, a_LGB, a_xgb)]

In [45]:
test['log_case'] = np.log1p(a)

In [46]:
a1 = [0] + [i for i in test['log_case'].iloc[:-1].tolist()]
a2 = [1.79175947] + [0] + [i for i in test['log_case'].iloc[:-2].tolist()]
a3 = [1.38629436] + [1.79175947] + [0] + [i for i in test['log_case'].iloc[:-3].tolist()]
a4 = [1.38629436] + [1.38629436] + [1.79175947] + [0] + [i for i in test['log_case'].iloc[:-4].tolist()]

In [47]:
test['case_count_prev'] = a1
test['case_count_prev_2'] = a2
test['case_count_prev_3'] = a3
test['case_count_prev_4'] = a4

In [48]:
test['knn_pred'] = a_knn
test['et_pred'] = a_et
test['rf_pred'] = a_rf
test['lgb_pred'] = a_LGB
test['xgb_pred'] = a_xgb

In [49]:
df_test = test[['log_case', 'case_count_prev', 'case_count_prev_2', 'case_count_prev_3',
                'case_count_prev_4', 'knn_pred', 'et_pred', 'rf_pred', 'lgb_pred',
                'xgb_pred']]
df_test.head()

Unnamed: 0,log_case,case_count_prev,case_count_prev_2,case_count_prev_3,case_count_prev_4,knn_pred,et_pred,rf_pred,lgb_pred,xgb_pred
0,1.040164,0.0,1.791759,1.386294,1.386294,1.995133,0.820564,2.784826,1.735212,1.81267
1,0.766437,1.040164,0.0,1.791759,1.386294,1.374712,0.584893,1.253003,1.097321,1.45049
2,1.081785,0.766437,1.040164,0.0,1.791759,3.822518,1.491462,3.390112,0.235943,0.809667
3,0.881175,1.081785,0.766437,1.040164,0.0,3.0197,0.0,1.577598,1.065499,1.405874
4,1.130816,0.881175,1.081785,0.766437,1.040164,3.461769,0.515717,2.534735,1.658348,2.320352


In [50]:
test['case_count'] = rfs.predict(df_test)

In [51]:
test[['id', 'application_date', 'segment', 'case_count']].to_csv('stacked_V1_seg_1.csv', index=False)