In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

In [59]:
train = pd.read_csv('../data/v3/training_seg2.csv')
test = pd.read_csv('../data/v3/test_seg2.csv')

In [60]:
train.shape, test.shape

((12474, 14), (93, 14))

In [61]:
train.head()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type,cos_day,sin_day,cos_mon,sin_mon,day_of_week,week_num
0,2017-04-01,2,0.0,1,2017,4,,,0.978148,0.207912,-0.5,0.866025,5,13
1,2017-04-02,2,0.0,2,2017,4,,,0.913545,0.406737,-0.5,0.866025,6,13
2,2017-04-03,2,0.0,3,2017,4,,,0.809017,0.587785,-0.5,0.866025,0,14
3,2017-04-04,2,0.0,4,2017,4,Rama Navami,G,0.669131,0.743145,-0.5,0.866025,1,14
4,2017-04-05,2,0.0,5,2017,4,,,0.5,0.866025,-0.5,0.866025,2,14


In [64]:
train['log_case'] = np.log1p(train['case_count'])

# Feature Engineering

In [65]:
a1 = [0] + [i for i in train['log_case'].iloc[:-1].tolist()]
a2 = [0] + [0] + [i for i in train['log_case'].iloc[:-2].tolist()]
a3 = [0] + [0] + [0] + [i for i in train['log_case'].iloc[:-3].tolist()]
a4 = [0] + [0] + [0] + [0] + [i for i in train['log_case'].iloc[:-4].tolist()]

In [66]:
train['case_count_prev'] = a1
train['case_count_prev_2'] = a2
train['case_count_prev_3'] = a3
train['case_count_prev_4'] = a4

# Modelling

In [67]:
train.Type.fillna(value="Nope", inplace=True)
test.Type.fillna(value="Nope", inplace=True)

In [68]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [69]:
la = LabelEncoder()
on = OneHotEncoder()
on2 = OneHotEncoder()

In [70]:
train['Type_Mat'] = la.fit_transform(train['Type'])

In [71]:
type_mat = on.fit_transform(train['Type_Mat'].values.reshape(-1,1))
day_mat = on2.fit_transform(train['day_of_week'].values.reshape(-1,1))
type_mat = type_mat.todense()
day_mat = day_mat.todense()
mat = np.append(type_mat, day_mat, axis=1)

In [72]:
mat = np.append(mat, train['day'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['week_num'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['month'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['year'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['sin_mon'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['cos_mon'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['sin_day'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, train['cos_day'].values.reshape(-1, 1), axis=1)
mat = np.append(mat, np.log1p(train['case_count_prev'].values.reshape(-1, 1)), axis=1)
mat = np.append(mat, np.log1p(train['case_count_prev_2'].values.reshape(-1, 1)), axis=1)
mat = np.append(mat, np.log1p(train['case_count_prev_3'].values.reshape(-1, 1)), axis=1)
mat = np.append(mat, np.log1p(train['case_count_prev_4'].values.reshape(-1, 1)), axis=1)

In [73]:
mat.shape

(12474, 26)

In [74]:
from sklearn.linear_model import LinearRegression, HuberRegressor, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
# import lightgbm as lgb

In [75]:
X_train, X_test, y_train, y_test = train_test_split(mat,train['log_case'], test_size=0.33, random_state=42)

In [76]:
rf = RandomForestRegressor(n_estimators=100)
ls = Lasso()
hu = HuberRegressor()

In [77]:
rf.fit(X_train, y_train)
ls.fit(X_train, y_train)
hu.fit(X_train, y_train)

HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, max_iter=100,
        tol=1e-05, warm_start=False)

In [78]:
rf_pred = np.round(np.expm1(rf.predict(X_test)))
ls_pred = np.round(np.expm1(ls.predict(X_test)))
et_pred = np.round(np.expm1(hu.predict(X_test)))

In [79]:
[mean_absolute_error(np.expm1(y_test), rf_pred) * (100/len(y_test)), 
mean_absolute_error(np.expm1(y_test), ls_pred) * (100/len(y_test)),
mean_absolute_error(np.expm1(y_test), et_pred) * (100/len(y_test))]

[3.1624769044434973, 9.022236337197691, 5.697638464044975]

In [80]:
xtest = train.iloc[y_test.index].reset_index(drop=True)
# xtest['true'] = np.expm1(y_test.values)
xtest['pred_rf'] = rf_pred
xtest['pred_ls'] = ls_pred
xtest['pred_hu'] = et_pred

In [81]:
xtest[['case_count', 'case_count_prev', 'case_count_prev_2', 'case_count_prev_3', 'case_count_prev_4', 'pred_rf', 'pred_ls', 'pred_hu']]

Unnamed: 0,case_count,case_count_prev,case_count_prev_2,case_count_prev_3,case_count_prev_4,pred_rf,pred_ls,pred_hu
0,297.0,5.932245,5.455321,6.408529,6.371612,282.0,209.0,318.0
1,0.0,0.000000,0.000000,0.000000,0.000000,0.0,8.0,0.0
2,186.0,5.247024,4.595120,3.091042,4.330733,240.0,176.0,187.0
3,623.0,6.403574,6.165418,6.507278,6.563856,614.0,233.0,828.0
4,1530.0,7.108244,7.293698,7.109062,7.071573,1314.0,270.0,768.0
...,...,...,...,...,...,...,...,...
4112,1397.0,7.209340,5.874931,6.746412,7.089243,1337.0,276.0,904.0
4113,1144.0,7.351158,7.585789,7.029973,7.112327,1267.0,284.0,730.0
4114,551.0,6.583409,6.419995,6.313548,6.206576,597.0,242.0,689.0
4115,680.0,6.306275,5.673323,6.522093,6.738152,567.0,228.0,640.0


In [82]:
pd.DataFrame(rf.feature_importances_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,6e-06,0.000409,4.2e-05,0.000724,0.000213,0.000472,0.000256,0.000652,0.000493,0.00035,...,0.001186,0.001479,0.001931,0.001572,0.003274,0.006774,0.937826,0.011849,0.008538,0.006


# Testing

In [83]:
test['Type_Mat'] = la.fit_transform(test['Type'])
type_mat_test = on.transform(test['Type_Mat'].values.reshape(-1,1))
day_mat_test = on2.transform(test['day_of_week'].values.reshape(-1,1))
type_mat_test = type_mat_test.todense()
day_mat_test = day_mat_test.todense()
mat_test = np.append(type_mat_test, day_mat_test, axis=1)

In [84]:
mat_test = np.append(mat_test, test['day'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['week_num'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['month'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['year'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['sin_mon'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['cos_mon'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['sin_day'].values.reshape(-1, 1), axis=1)
mat_test = np.append(mat_test, test['cos_day'].values.reshape(-1, 1), axis=1)

In [85]:
mat.shape, mat_test.shape

((12474, 26), (93, 22))

In [86]:
a = []
for i in range(len(mat_test)):
    print(i)
    if i ==0:
        k = np.append(np.array(mat_test[i]), np.log1p(np.array([[1480., 1028., 1946., 1984.]])), axis=1)
#         a.append(np.round(np.expm1(rf.predict(k))))
    if i==1:
        k = np.append(np.array(mat_test[i]), np.array([[a[0],0,0,0]]), axis=1)
#         a.append(np.round(np.expm1(rf.predict(k))))
    if i==2:
        k = np.append(np.array(mat_test[i]), np.array([[a[1],a[0],0,0]]), axis=1)
#         a.append(np.round(np.expm1(rf.predict(k))))
    if i==3:
        k = np.append(np.array(mat_test[i]), np.array([[a[2],a[1],a[0],0]]), axis=1)
#         a.append(np.round(np.expm1(rf.predict(k))))
    if i>3:
        k = np.append(np.array(mat_test[i]), np.array([[a[i-1],a[i-2],a[i-3],a[i-4]]]), axis=1)
    
    a.append(np.round(np.expm1(rf.predict(k)))[0])
        
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92


In [87]:
test['case_count'] = a

In [88]:
test[['id','application_date','segment','case_count']].to_csv('pred_prev_seg_2.csv', index=False)