In [23]:
import pandas as pd 
import numpy as np 

In [2]:
train = pd.read_csv('data/v1/training_v1.csv')
test = pd.read_csv('data/v1/test_v1.csv')

In [3]:
train.head()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type
0,2017-04-01,1,40.0,1,2017,4,,
1,2017-04-03,1,5.0,3,2017,4,,
2,2017-04-04,1,4.0,4,2017,4,Rama Navami,Gazetted Holiday
3,2017-04-05,1,113.0,5,2017,4,,
4,2017-04-07,1,76.0,7,2017,4,,


In [4]:
test.head()

Unnamed: 0,id,application_date,segment,day,year,month,Holiday,Type
0,1,2019-07-06,1,6,2019,7,,
1,2,2019-07-07,1,7,2019,7,,
2,3,2019-07-08,1,8,2019,7,,
3,4,2019-07-09,1,9,2019,7,,
4,5,2019-07-10,1,10,2019,7,,


In [5]:
train.Type.unique()

array([nan, 'Gazetted Holiday', 'Observance, Christian',
       'Restricted Holiday', 'Observance', 'Season',
       'Muslim, Common local holiday'], dtype=object)

In [6]:
def encode_holidays(df):
    if df == 'Gazetted Holiday':
        return 'G'
    elif df == 'Observance':
        return 'O'
    elif df == 'Observance, Christian':
        return 'C'
    elif df == 'Muslim, Common local holiday':
        return 'M'
    elif df == 'Restricted Holiday':
        return 'R'
    elif df == 'Season':
        return 'S'
    else:
        return 'NA'

In [8]:
train['Type'] = train['Type'].apply(encode_holidays)

In [9]:
test['Type'] = test['Type'].apply(encode_holidays)

In [10]:
train.head()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type
0,2017-04-01,1,40.0,1,2017,4,,
1,2017-04-03,1,5.0,3,2017,4,,
2,2017-04-04,1,4.0,4,2017,4,Rama Navami,G
3,2017-04-05,1,113.0,5,2017,4,,
4,2017-04-07,1,76.0,7,2017,4,,


In [11]:
test.head()

Unnamed: 0,id,application_date,segment,day,year,month,Holiday,Type
0,1,2019-07-06,1,6,2019,7,,
1,2,2019-07-07,1,7,2019,7,,
2,3,2019-07-08,1,8,2019,7,,
3,4,2019-07-09,1,9,2019,7,,
4,5,2019-07-10,1,10,2019,7,,


In [13]:
train.sample(30)

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type
18093,2019-04-19,1,37.0,19,2019,4,Good Friday,G
11955,2018-05-08,1,88.0,8,2018,5,,
19980,2017-09-17,1,42.0,17,2017,9,,
82359,2019-06-15,2,1147.0,15,2019,6,,
20729,2017-06-24,1,1.0,24,2017,6,,
2307,2019-01-23,1,242.0,23,2019,1,,
27809,2018-10-13,1,67.0,13,2018,10,,
21036,2018-05-03,1,8.0,3,2018,5,,
60704,2018-05-23,1,0.0,23,2018,5,,
9744,2019-01-26,1,19.0,26,2019,1,Republic Day,G


In [19]:
month_31 = [1,3,5,7,8,10,12]

In [43]:
def fourier_trans_28(df):
    return np.sin(2*np.pi*df/28), np.cos(2*np.pi*df/28)

In [44]:
def fourier_trans_29(df):
    return np.sin(2*np.pi*df/29), np.cos(2*np.pi*df/29)

In [45]:
def fourier_trans_30(df):
    return np.sin(2*np.pi*df/30), np.cos(2*np.pi*df/30)

In [46]:
def fourier_trans_31(df):
    return np.sin(2*np.pi*df/31), np.cos(2*np.pi*df/31)

In [55]:
def fourier_trans_month(df):
    return np.sin(2*np.pi*df/12), np.cos(2*np.pi*df/12)

In [49]:
def transform(df):
    if df.month in month_31:
        k = fourier_trans_31(df.day)
    elif (df.month == 2) & (df.year==2018):
        k =  fourier_trans_29(df.day)
    elif (df.month == 2) & (df.year!=2018):
        k =  fourier_trans_28(df.day)
    else:
        k = fourier_trans_30(df.day)
    return k

In [50]:
train['four_day'] = train.apply(transform, axis=1)

In [51]:
train.head()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type,four_day,cos_day,sin_day
0,2017-04-01,1,40.0,1,2017,4,,,"(0.20791169081775931, 0.9781476007338057)",0.978148,0.207912
1,2017-04-03,1,5.0,3,2017,4,,,"(0.5877852522924731, 0.8090169943749475)",0.809017,0.587785
2,2017-04-04,1,4.0,4,2017,4,Rama Navami,G,"(0.7431448254773941, 0.6691306063588582)",0.669131,0.743145
3,2017-04-05,1,113.0,5,2017,4,,,"(0.8660254037844386, 0.5000000000000001)",0.5,0.866025
4,2017-04-07,1,76.0,7,2017,4,,,"(0.9945218953682733, 0.10452846326765346)",0.104528,0.994522


In [52]:
def split_sin(df):
    return df[0]
def split_cos(df):
    return df[1]

In [53]:
train['cos_day'] = train['four_day'].apply(split_cos)
train['sin_day'] = train['four_day'].apply(split_sin)

In [54]:
train.head()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type,four_day,cos_day,sin_day
0,2017-04-01,1,40.0,1,2017,4,,,"(0.20791169081775931, 0.9781476007338057)",0.978148,0.207912
1,2017-04-03,1,5.0,3,2017,4,,,"(0.5877852522924731, 0.8090169943749475)",0.809017,0.587785
2,2017-04-04,1,4.0,4,2017,4,Rama Navami,G,"(0.7431448254773941, 0.6691306063588582)",0.669131,0.743145
3,2017-04-05,1,113.0,5,2017,4,,,"(0.8660254037844386, 0.5000000000000001)",0.5,0.866025
4,2017-04-07,1,76.0,7,2017,4,,,"(0.9945218953682733, 0.10452846326765346)",0.104528,0.994522


In [58]:
train['four_month'] = train['month'].apply(fourier_trans_month)

In [59]:
train['cos_mon'] = train['four_month'].apply(split_cos)
train['sin_mon'] = train['four_month'].apply(split_sin)

In [60]:
train.head()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type,four_day,cos_day,sin_day,four_month,cos_mon,sin_mon
0,2017-04-01,1,40.0,1,2017,4,,,"(0.20791169081775931, 0.9781476007338057)",0.978148,0.207912,"(0.8660254037844387, -0.4999999999999998)",-0.5,0.866025
1,2017-04-03,1,5.0,3,2017,4,,,"(0.5877852522924731, 0.8090169943749475)",0.809017,0.587785,"(0.8660254037844387, -0.4999999999999998)",-0.5,0.866025
2,2017-04-04,1,4.0,4,2017,4,Rama Navami,G,"(0.7431448254773941, 0.6691306063588582)",0.669131,0.743145,"(0.8660254037844387, -0.4999999999999998)",-0.5,0.866025
3,2017-04-05,1,113.0,5,2017,4,,,"(0.8660254037844386, 0.5000000000000001)",0.5,0.866025,"(0.8660254037844387, -0.4999999999999998)",-0.5,0.866025
4,2017-04-07,1,76.0,7,2017,4,,,"(0.9945218953682733, 0.10452846326765346)",0.104528,0.994522,"(0.8660254037844387, -0.4999999999999998)",-0.5,0.866025


In [76]:
train = train.drop(columns=['four_day', 'four_month'])

In [64]:
train.head()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type,cos_day,sin_day,cos_mon,sin_mon
0,2017-04-01,1,40.0,1,2017,4,,,0.978148,0.207912,-0.5,0.866025
1,2017-04-03,1,5.0,3,2017,4,,,0.809017,0.587785,-0.5,0.866025
2,2017-04-04,1,4.0,4,2017,4,Rama Navami,G,0.669131,0.743145,-0.5,0.866025
3,2017-04-05,1,113.0,5,2017,4,,,0.5,0.866025,-0.5,0.866025
4,2017-04-07,1,76.0,7,2017,4,,,0.104528,0.994522,-0.5,0.866025


In [65]:
train.corr()

Unnamed: 0,segment,case_count,day,year,month,cos_day,sin_day,cos_mon,sin_mon
segment,1.0,0.452733,-0.011913,0.002483,-0.000964,-0.001195,0.010964,-0.011529,-0.004998
case_count,0.452733,1.0,0.044342,0.045343,-0.002371,-0.085307,-0.065642,0.008247,-0.003544
day,-0.011913,0.044342,1.0,-0.043502,0.008517,0.086428,-0.774307,0.008026,-0.000683
year,0.002483,0.045343,-0.043502,1.0,-0.488008,0.009933,0.024139,-0.020401,0.444034
month,-0.000964,-0.002371,0.008517,-0.488008,1.0,0.020221,0.009533,0.231888,-0.764486
cos_day,-0.001195,-0.085307,0.086428,0.009933,0.020221,1.0,-0.002604,-0.001513,-0.013553
sin_day,0.010964,-0.065642,-0.774307,0.024139,0.009533,-0.002604,1.0,-0.0038,-0.015626
cos_mon,-0.011529,0.008247,0.008026,-0.020401,0.231888,-0.001513,-0.0038,1.0,-0.049926
sin_mon,-0.004998,-0.003544,-0.000683,0.444034,-0.764486,-0.013553,-0.015626,-0.049926,1.0


In [67]:
test['four_day'] = test.apply(transform, axis=1)

In [69]:
test['four_month'] = test['month'].apply(fourier_trans_month)

In [70]:
test['cos_mon'] = test['four_month'].apply(split_cos)
test['sin_mon'] = test['four_month'].apply(split_sin)

In [71]:
test['cos_day'] = test['four_day'].apply(split_cos)
test['sin_day'] = test['four_day'].apply(split_sin)

In [74]:
test = test.drop(columns=['four_day', 'four_month'])

In [77]:
train.head()

Unnamed: 0,application_date,segment,case_count,day,year,month,Holiday,Type,cos_day,sin_day,cos_mon,sin_mon
0,2017-04-01,1,40.0,1,2017,4,,,0.978148,0.207912,-0.5,0.866025
1,2017-04-03,1,5.0,3,2017,4,,,0.809017,0.587785,-0.5,0.866025
2,2017-04-04,1,4.0,4,2017,4,Rama Navami,G,0.669131,0.743145,-0.5,0.866025
3,2017-04-05,1,113.0,5,2017,4,,,0.5,0.866025,-0.5,0.866025
4,2017-04-07,1,76.0,7,2017,4,,,0.104528,0.994522,-0.5,0.866025


In [78]:
test.head()

Unnamed: 0,id,application_date,segment,day,year,month,Holiday,Type,cos_mon,sin_mon,cos_day,sin_day
0,1,2019-07-06,1,6,2019,7,,,-0.866025,-0.5,0.347305,0.937752
1,2,2019-07-07,1,7,2019,7,,,-0.866025,-0.5,0.151428,0.988468
2,3,2019-07-08,1,8,2019,7,,,-0.866025,-0.5,-0.050649,0.998717
3,4,2019-07-09,1,9,2019,7,,,-0.866025,-0.5,-0.250653,0.968077
4,5,2019-07-10,1,10,2019,7,,,-0.866025,-0.5,-0.440394,0.897805


In [79]:
train.to_csv('training_v2.csv', index=False)
test.to_csv('test_v2.csv', index=False)