In [1]:
import pandas as pd 
import numpy as np

In [38]:
train = pd.read_csv('../data/v3/training_seg2.csv')


In [39]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import xgboost
from xgboost import XGBRegressor

In [40]:
def instantiate_models():
    rf = RandomForestRegressor(n_estimators=100)
    et = ExtraTreesRegressor(n_estimators=100)
    knn = KNeighborsRegressor()
    LGB = lgb.LGBMRegressor()
    xgb = XGBRegressor()
    return rf, knn, et, LGB, xgb

In [41]:
def training(X_train, y_train, models):
    rf, et, knn, LGB, xgb = models
    LGB.fit(X_train, y_train)
    et.fit(X_train, y_train)
    knn.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    xgb.fit(X_train, y_train)
    return  rf, et, knn, LGB, xgb

In [42]:
def predict(X_test, models):
    rf, et, knn, LGB, xgb = models
    lgb_pred = abs(np.round((LGB.predict(X_test))))
    xgb_pred = abs(np.round((xgb.predict(X_test))))
    rf_pred = abs(np.round((rf.predict(X_test))))
    et_pred = abs(np.round((et.predict(X_test))))
    knn_pred = abs(np.round((knn.predict(X_test))))
    return lgb_pred, xgb_pred, rf_pred, et_pred, knn_pred

In [43]:
def MAPE(y_test, predictions):
    lgb_pred, xgb_pred, rf_pred, et_pred, knn_pred = predictions
    return [mean_absolute_error((y_test), lgb_pred) * (100/len(y_test)),
            mean_absolute_error((y_test), xgb_pred) * (100/len(y_test)),
            mean_absolute_error((y_test), rf_pred) * (100/len(y_test)),
            mean_absolute_error((y_test), et_pred) * (100/len(y_test)),
            mean_absolute_error((y_test), knn_pred) * (100/len(y_test)),
           ]
    

In [44]:
best_pred = pd.read_csv('pred_best.csv')
test = pd.read_csv('../data/v3/test_seg2.csv')
best_pred = best_pred[best_pred['segment']==2].reset_index(drop=True)

In [45]:
test.head()

Unnamed: 0,id,application_date,segment,day,year,month,Holiday,Type,cos_mon,sin_mon,cos_day,sin_day,day_of_week,week_num
0,88,2019-07-24,2,24,2019,7,,,-0.866025,-0.5,0.151428,-0.988468,2,30
1,89,2019-07-25,2,25,2019,7,,,-0.866025,-0.5,0.347305,-0.937752,3,30
2,90,2019-07-26,2,26,2019,7,,,-0.866025,-0.5,0.528964,-0.848644,4,30
3,91,2019-07-27,2,27,2019,7,,,-0.866025,-0.5,0.688967,-0.724793,5,30
4,92,2019-07-28,2,28,2019,7,,,-0.866025,-0.5,0.820763,-0.571268,6,30


In [46]:
best_pred.head()

Unnamed: 0,id,application_date,segment,case_count
0,88,7/24/2019,2,1701
1,89,7/25/2019,2,1360
2,90,7/26/2019,2,1591
3,91,7/27/2019,2,1605
4,92,7/28/2019,2,1204


In [47]:
test_p = pd.merge(test, best_pred[['id','case_count']], left_on="id", right_on="id")

In [48]:
test_p.head()

Unnamed: 0,id,application_date,segment,day,year,month,Holiday,Type,cos_mon,sin_mon,cos_day,sin_day,day_of_week,week_num,case_count
0,88,2019-07-24,2,24,2019,7,,,-0.866025,-0.5,0.151428,-0.988468,2,30,1701
1,89,2019-07-25,2,25,2019,7,,,-0.866025,-0.5,0.347305,-0.937752,3,30,1360
2,90,2019-07-26,2,26,2019,7,,,-0.866025,-0.5,0.528964,-0.848644,4,30,1591
3,91,2019-07-27,2,27,2019,7,,,-0.866025,-0.5,0.688967,-0.724793,5,30,1605
4,92,2019-07-28,2,28,2019,7,,,-0.866025,-0.5,0.820763,-0.571268,6,30,1204


In [49]:
df = train[['application_date', 'case_count']].append(test_p[['application_date', 'case_count']], ignore_index=True)
df

Unnamed: 0,application_date,case_count
0,2017-04-01,0.0
1,2017-04-02,0.0
2,2017-04-03,0.0
3,2017-04-04,0.0
4,2017-04-05,0.0
...,...,...
12562,2019-10-20,1225.0
12563,2019-10-21,1504.0
12564,2019-10-22,1517.0
12565,2019-10-23,1549.0


In [50]:
window = df.expanding()
dataframe = pd.concat([df['application_date'], df['case_count'],
                       window['case_count'].min(), window['case_count'].mean(),
                       window['case_count'].max(), window['case_count'].std(),
                       df['case_count'].shift(1),
                       df['case_count'].shift(2),  df['case_count'].shift(3)
                       ], axis=1)
dataframe.columns = ['application_date','case_count', 'min', 'mean', 'max', 'std', 'shift1','shift2', 'shift3']
dataframe

Unnamed: 0,application_date,case_count,min,mean,max,std,shift1,shift2,shift3
0,2017-04-01,0.0,0.0,0.000000,0.0,,,,
1,2017-04-02,0.0,0.0,0.000000,0.0,0.000000,0.0,,
2,2017-04-03,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,
3,2017-04-04,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
4,2017-04-05,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
12562,2019-10-20,1225.0,0.0,514.619597,2505.0,610.043519,1540.0,1489.0,1854.0
12563,2019-10-21,1504.0,0.0,514.698344,2505.0,610.083096,1225.0,1540.0,1489.0
12564,2019-10-22,1517.0,0.0,514.778114,2505.0,610.124341,1504.0,1225.0,1540.0
12565,2019-10-23,1549.0,0.0,514.860417,2505.0,610.169817,1517.0,1504.0,1225.0


In [51]:
test_p = dataframe[dataframe['application_date']>='2019-07-24'].reset_index(drop=True)

In [52]:
train_p = dataframe[dataframe['application_date']<'2019-07-24'].reset_index(drop=True)

In [53]:
train_p.drop(columns=['application_date', 'case_count'], inplace=True)
test_p.drop(columns=['application_date', 'case_count'], inplace=True)

In [54]:
train = pd.merge(train, train_p, left_index=True, right_index=True)
train.drop(columns='Holiday', inplace=True)

In [55]:
test = pd.merge(test, test_p, left_index=True, right_index=True)
test.drop(columns='Holiday', inplace=True)

In [56]:
train.head()

Unnamed: 0,application_date,segment,case_count,day,year,month,Type,cos_day,sin_day,cos_mon,sin_mon,day_of_week,week_num,min,mean,max,std,shift1,shift2,shift3
0,2017-04-01,2,0.0,1,2017,4,,0.978148,0.207912,-0.5,0.866025,5,13,0.0,0.0,0.0,,,,
1,2017-04-02,2,0.0,2,2017,4,,0.913545,0.406737,-0.5,0.866025,6,13,0.0,0.0,0.0,0.0,0.0,,
2,2017-04-03,2,0.0,3,2017,4,,0.809017,0.587785,-0.5,0.866025,0,14,0.0,0.0,0.0,0.0,0.0,0.0,
3,2017-04-04,2,0.0,4,2017,4,G,0.669131,0.743145,-0.5,0.866025,1,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2017-04-05,2,0.0,5,2017,4,,0.5,0.866025,-0.5,0.866025,2,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
test.head()

Unnamed: 0,id,application_date,segment,day,year,month,Type,cos_mon,sin_mon,cos_day,sin_day,day_of_week,week_num,min,mean,max,std,shift1,shift2,shift3
0,88,2019-07-24,2,24,2019,7,,-0.866025,-0.5,0.151428,-0.988468,2,30,0.0,507.949659,2505.0,605.950481,1984.0,1946.0,1028.0
1,89,2019-07-25,2,25,2019,7,,-0.866025,-0.5,0.347305,-0.937752,3,30,0.0,508.017954,2505.0,605.97421,1701.0,1984.0,1946.0
2,90,2019-07-26,2,26,2019,7,,-0.866025,-0.5,0.528964,-0.848644,4,30,0.0,508.104753,2505.0,606.027484,1360.0,1701.0,1984.0
3,91,2019-07-27,2,27,2019,7,,-0.866025,-0.5,0.688967,-0.724793,5,30,0.0,508.192659,2505.0,606.08275,1591.0,1360.0,1701.0
4,92,2019-07-28,2,28,2019,7,,-0.866025,-0.5,0.820763,-0.571268,6,30,0.0,508.248417,2505.0,606.09047,1605.0,1591.0,1360.0


In [58]:
train.Type.fillna(value="Nope", inplace=True)
test.Type.fillna(value="Nope", inplace=True)

In [59]:
train.dropna(inplace=True)

In [60]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
la = LabelEncoder()
on = OneHotEncoder()
on2 = OneHotEncoder()

In [61]:
train['Type_Mat'] = la.fit_transform(train['Type'])

In [62]:
type_mat = on.fit_transform(train['Type_Mat'].values.reshape(-1,1))
day_mat = on2.fit_transform(train['day_of_week'].values.reshape(-1,1))
type_mat = type_mat.todense()
day_mat = day_mat.todense()
mat = np.append(type_mat, day_mat, axis=1)

In [63]:
mat = np.append(type_mat, day_mat, axis=1)
mat = np.append(mat, train['day'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['week_num'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['month'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['year'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['sin_mon'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['cos_mon'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['sin_day'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['cos_day'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['min'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['mean'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['max'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['shift1'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['shift2'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['shift3'].values.reshape(-1, 1), axis=1)

In [64]:
X_train, X_test, y_train, y_test = train_test_split(mat, train['case_count'], test_size=0.3, random_state=42)
inst_models = instantiate_models()
trained_models = training(X_train, y_train, inst_models)
prediction =  predict(X_test, trained_models)
err = MAPE(y_test, prediction)
# export_csv(y_test, prediction, train, seed)

  if getattr(data, 'base', None) is not None and \


In [65]:
err

[3.245784129249472,
 3.5410657648127875,
 3.330497186086777,
 4.491448851796,
 3.2406636384593566]

# Test Data Transformation

In [66]:
test['Type_Mat'] = la.transform(test['Type'])

In [67]:
type_mat = on.transform(test['Type_Mat'].values.reshape(-1,1))
day_mat = on2.transform(test['day_of_week'].values.reshape(-1,1))
type_mat = type_mat.todense()
day_mat = day_mat.todense()
mat_test = np.append(type_mat, day_mat, axis=1)

In [68]:
mat_test = np.append(type_mat, day_mat, axis=1)
mat_test = np.append(mat_test, test['day'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['week_num'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['month'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['year'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['sin_mon'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['cos_mon'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['sin_day'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['cos_day'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['min'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['mean'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['max'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['shift1'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['shift2'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['shift3'].values.reshape(-1, 1), axis=1)

In [69]:
mat_test.shape, mat.shape

((93, 28), (12471, 28))

In [70]:
test_prediction = predict(mat_test, trained_models)

In [71]:
lgb_pred, xgb_pred, rf_pred, et_pred, knn_pred = test_prediction

In [72]:
test['case_count'] = knn_pred

In [73]:
test[['id', 'application_date', 'segment', 'case_count']].to_csv('sft_123_rf_pred_seg2.csv', index=False)