In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import collections


### Lag BI, CI, HI & Weather Data

In [84]:
# Read the weekly table that carries both the BI/CI/HI columns and the
# weather columns used in this section.
path_dt = './data/full_house_again.csv'
df = pd.read_csv(filepath_or_buffer=path_dt)

In [85]:
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0.1,Unnamed: 0,County_EN,YYYY_WW,BI,CI,HI,Temperature,T Max,T Min,RH,Precp
0,0,Hualien,2012_01,0.5,6.249750000000001,0.5,16.614286,18.657143,14.714286,75.714286,0.571429
1,1,Hualien,2012_02,0.0,0.0,0.0,18.942857,21.557143,16.8,75.428571,0.357143
2,2,Hualien,2012_03,0.0,0.0,0.0,19.3,21.914286,17.1,76.428571,2.4
3,3,Hualien,2012_04,XXX,XXX,XXX,16.142857,18.6,13.771429,79.714286,5.0
4,4,Hualien,2012_05,0.0,0.0,0.0,17.957143,20.414286,16.057143,76.571429,1.0


In [86]:
# This table is only read for its County / ID columns, which are turned
# into county <-> id lookups further down.
cty_id_path = './data/county_id.csv'
cty_id_df = pd.read_csv(filepath_or_buffer=cty_id_path)

In [87]:
# Preview the county-id table.
cty_id_df.head()

Unnamed: 0,County,ID,Time,Denv,BI,CI,HI,Tave,Tmax,Tmin,RH,Precp,Pop,Pop_Den
0,Hualien,1,1,0,0.5,6.24975,0.5,16.61,18.657143,14.714286,75.71,0.57,336704.9965,72.774987
1,Hualien,1,2,0,0.0,0.0,0.0,18.94,21.557143,16.8,75.43,0.36,336676.9953,72.749983
2,Hualien,1,3,0,0.0,0.0,0.0,19.3,21.914286,17.1,76.43,2.4,336648.9965,72.724987
3,Hualien,1,4,0,0.0,0.0,0.0,0.0,18.6,13.771429,79.71,5.0,336621.0,72.7
4,Hualien,1,5,0,0.0,0.0,0.0,17.96,20.414286,16.057143,76.57,1.0,336586.9948,72.7


In [88]:
# Cast both key columns in one call (County -> str, ID -> int), then show
# the resulting dtypes.
cty_id_df = cty_id_df.astype({'County': str, 'ID': int})
cty_id_df.dtypes

County      object
ID           int64
Time         int64
Denv         int64
BI         float64
CI         float64
HI         float64
Tave       float64
Tmax       float64
Tmin       float64
RH         float64
Precp      float64
Pop        float64
Pop_Den    float64
dtype: object

In [89]:
# Lookup tables built from the (County, ID) pair of every row:
# county name -> id, and id -> county name. When a key repeats, the value
# from the last row wins.
cty_id_dict = collections.defaultdict(None, zip(cty_id_df.County, cty_id_df.ID))
id_cty_dict = collections.defaultdict(None, zip(cty_id_df.ID, cty_id_df.County))

In [91]:
# Column groups: Y holds the BI/CI/HI columns, X the weather columns.
Y = ['BI', 'CI', 'HI']
X = ['Temperature', 'T Max', 'T Min', 'RH', 'Precp']
# Full selection: county and week keys first, then Y, then X.
YX = ['County_EN', 'YYYY_WW'] + Y + X

In [92]:
# Restrict the frame to the columns listed in YX and preview the result.
yx = df.loc[:, YX]
yx.head()

Unnamed: 0,County_EN,YYYY_WW,BI,CI,HI,Temperature,T Max,T Min,RH,Precp
0,Hualien,2012_01,0.5,6.249750000000001,0.5,16.614286,18.657143,14.714286,75.714286,0.571429
1,Hualien,2012_02,0.0,0.0,0.0,18.942857,21.557143,16.8,75.428571,0.357143
2,Hualien,2012_03,0.0,0.0,0.0,19.3,21.914286,17.1,76.428571,2.4
3,Hualien,2012_04,XXX,XXX,XXX,16.142857,18.6,13.771429,79.714286,5.0
4,Hualien,2012_05,0.0,0.0,0.0,17.957143,20.414286,16.057143,76.571429,1.0


In [93]:
# split by cty: every row is stored as a plain list under the value found
# in its first column (the county name)
cty_yx = collections.defaultdict(list)

for _, record in yx.iterrows():
    values = record.tolist()
    cty_yx[values[0]].append(values)
    

In [94]:
# convert list to df: one DataFrame per county, labelled with the YX columns
cty_yx_df = collections.defaultdict()
cty_yx_df.update(
    (county, pd.DataFrame(rows, columns = YX))
    for county, rows in cty_yx.items()
)


In [95]:
# Spot-check the frame built for one county.
cty_yx_df['Taipei'].head()

Unnamed: 0,County_EN,YYYY_WW,BI,CI,HI,Temperature,T Max,T Min,RH,Precp
0,Taipei,2012_01,1.3805873015873016,2.489971111111111,1.1681873015873017,14.5,16.557143,13.028571,85.428571,10.2
1,Taipei,2012_02,1.1770250915750915,2.9363352694924125,0.9870742804814234,17.542857,19.757143,16.257143,85.428571,1.0
2,Taipei,2012_03,1.2979166666666666,1.20925,1.051125,17.685714,20.828571,15.857143,82.857143,4.385714
3,Taipei,2012_04,2.3205,12.091333333333331,0.9871666666666666,13.771429,15.914286,12.1,89.571429,9.885714
4,Taipei,2012_05,1.7666666666666668,1.984,1.3666666666666667,15.4,17.257143,13.785714,87.714286,3.0


In [97]:
def lag_df(df, cty_id_dict, lag = 1, y_cols = ['County_EN', 'YYYY_WW', 'BI', 'CI', 'HI'], x_cols = ['Temperature', 'T Max', 'T Min', 'RH', 'Precp']):
    """Pair each row's y columns with the x columns found `lag` rows earlier.

    The y columns are taken from row `lag` onward and the x columns from the
    first ``len(df) - lag`` rows. Both pieces are renumbered from 0 and joined
    side by side, so output row ``i`` holds y from input row ``i + lag`` and
    x from input row ``i``. Rows are used in the order given; nothing is
    sorted here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame for a single county. Must contain 'County_EN' and every column
        named in `y_cols` and `x_cols`.
    cty_id_dict : mapping
        County name -> id. Looked up with the first 'County_EN' value of `df`.
    lag : int, default 1
        Non-negative number of rows to shift by. ``0`` pairs y and x from the
        same row.
    y_cols, x_cols : list of str
        Columns to take from the shifted and unshifted part respectively.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID', 'Time', 'BI', 'CI', 'HI', 'Temperature', 'T Max',
        'T Min', 'RH', 'Precp' (fixed, so `y_cols` / `x_cols` must cover the
        data columns among them). 'Time' counts the output rows from 1.

    Raises
    ------
    ValueError
        If `lag` is negative.
    IndexError
        If `df` has no rows (there is no county name to look up).
    KeyError
        If a requested column or the county name is missing.
    """
    if lag < 0:
        raise ValueError('lag must be a non-negative number of rows, got %r' % (lag,))

    # every row belongs to the same county, so the first name identifies it
    idx = cty_id_dict[df['County_EN'].iloc[0]]

    # Number of (y, x) pairs that exist for this lag. Slicing x with an
    # explicit stop instead of `[:-lag]` keeps lag = 0 working: `[:-0]` is
    # `[:0]`, i.e. an empty frame.
    n_out = max(df.shape[0] - lag, 0)

    # drop = True discards the old row labels instead of adding them back
    # as an extra 'index' column on both halves
    yy = df[y_cols].iloc[lag:, :].reset_index(drop = True)
    xx = df[x_cols].iloc[:n_out, :].reset_index(drop = True)

    yyxx = pd.concat([yy, xx], axis = 1)
    yyxx['ID'] = idx
    yyxx['Time'] = np.arange(1, n_out + 1, 1)

    used_YX = ['ID', 'Time', 'BI', 'CI', 'HI', 'Temperature', 'T Max', 'T Min', 'RH', 'Precp']
    return yyxx[used_YX]



In [101]:
# Inspect the county name -> id mapping.
cty_id_dict

defaultdict(None,
            {'Changhua': 16,
             'Chiayi': 5,
             'Hsinchu': 11,
             'Hualien': 1,
             'Kaohsiung': 14,
             'Miaoli': 7,
             'Nantou': 8,
             'New Taipei': 6,
             'Penghu': 9,
             'Pingtung': 2,
             'Taichung': 13,
             'Tainan': 10,
             'Taipei': 4,
             'Taitung': 15,
             'Taoyuan': 3,
             'Yilan': 12,
             'Yunlin': 17})

In [102]:

# Inspect the id -> county name mapping.
id_cty_dict

defaultdict(None,
            {1: 'Hualien',
             2: 'Pingtung',
             3: 'Taoyuan',
             4: 'Taipei',
             5: 'Chiayi',
             6: 'New Taipei',
             7: 'Miaoli',
             8: 'Nantou',
             9: 'Penghu',
             10: 'Tainan',
             11: 'Hsinchu',
             12: 'Yilan',
             13: 'Taichung',
             14: 'Kaohsiung',
             15: 'Taitung',
             16: 'Changhua',
             17: 'Yunlin'})

In [104]:
# County ids in ascending order; this fixes the order in which counties
# are stacked in the output files.
sorted_id = sorted(id_cty_dict)
sorted_id

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [105]:
lags = 5 # exclusive upper bound: produces lag 1, 2, 3, 4

# For every lag, build the lagged frame of each county (ascending id order),
# stack them and write one CSV per lag.
for lag in range(1, lags, 1):

    print('lag = ', lag)

    # The per-county frames are passed straight to lag_df rather than being
    # bound to `df`, so the notebook-level `df` (the loaded table) is not
    # replaced by a single county's frame.
    lagged_frames = [
        lag_df(cty_yx_df[id_cty_dict[idx]], cty_id_dict, lag = lag)
        for idx in sorted_id
    ]

    fn = pd.concat(lagged_frames, axis = 0, ignore_index = True)

    fn.to_csv('./data/lag_%s_week_bi_temp.csv' %str(lag), index = False)

lag =  1
lag =  2
lag =  3
lag =  4


### Lag Dengue Fever & Pop Data

In [107]:
# Load the table for the dengue fever / population section and preview it.
path_dt = './data/denguefever_bi_pop.csv'
df = pd.read_csv(filepath_or_buffer=path_dt)
df.head()

Unnamed: 0,ID,Time,Denv,BI,CI,HI,Pop,PopDen
0,1,1,0,0.5,6.24975,0.5,336705,72.77
1,1,2,0,0.0,0.0,0.0,336677,72.75
2,1,3,0,0.0,0.0,0.0,336649,72.72
3,1,4,0,0.0,0.0,0.0,336621,72.7
4,1,5,0,0.0,0.0,0.0,336587,72.7


In [108]:
# Column order used when the per-ID row lists are turned back into frames.
YX = [
    'ID', 'Time', 'Denv',
    'BI', 'CI', 'HI',
    'Pop', 'PopDen',
]

In [109]:
# Store ID as text (the per-ID frames are keyed by string below), then show
# the resulting dtypes.
df = df.astype({'ID': str})
df.dtypes

ID         object
Time        int64
Denv        int64
BI        float64
CI        float64
HI        float64
Pop         int64
PopDen    float64
dtype: object

In [110]:
# split by cty: every row is stored as a plain list under the value found
# in its first column (the ID)
cty_yx = collections.defaultdict(list)

for _, record in df.iterrows():
    values = record.tolist()
    cty_yx[values[0]].append(values)

# convert list to df: one DataFrame per ID, keyed by the id's string form
cty_yx_df = collections.defaultdict()
cty_yx_df.update(
    (str(key), pd.DataFrame(rows, columns = YX))
    for key, rows in cty_yx.items()
)

In [111]:
# Spot-check the frame built for ID '1'.
cty_yx_df['1'].head()

Unnamed: 0,ID,Time,Denv,BI,CI,HI,Pop,PopDen
0,1,1,0,0.5,6.24975,0.5,336705,72.77
1,1,2,0,0.0,0.0,0.0,336677,72.75
2,1,3,0,0.0,0.0,0.0,336649,72.72
3,1,4,0,0.0,0.0,0.0,336621,72.7
4,1,5,0,0.0,0.0,0.0,336587,72.7


In [112]:
def lag_df_2(df, lag = 1, used_YX = '', y_cols = ['County_EN', 'YYYY_WW', 'BI', 'CI', 'HI'], x_cols = ['Temperature', 'T Max', 'T Min', 'RH', 'Precp']):
    """Pair each row's y columns with the x columns found `lag` rows earlier.

    The y columns are taken from row `lag` onward and the x columns from the
    first ``len(df) - lag`` rows. Both pieces are renumbered from 0 and joined
    side by side, so output row ``i`` holds y from input row ``i + lag`` and
    x from input row ``i``. Rows are used in the order given; nothing is
    sorted here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame for a single unit. Must contain every column named in `y_cols`
        and `x_cols`.
    lag : int, default 1
        Non-negative number of rows to shift by. ``0`` pairs y and x from the
        same row.
    used_YX : list of str or str, default ''
        Columns (and their order) to return. With the default empty string,
        all joined columns are returned, unless the frame really has a column
        named ''.
    y_cols, x_cols : list of str
        Columns to take from the shifted and unshifted part respectively.

    Returns
    -------
    pandas.DataFrame
        The joined frame restricted to `used_YX`. A 'Time' column counting
        the output rows from 1 is always set; it overwrites any 'Time' column
        that came in through `y_cols` or `x_cols`.

    Raises
    ------
    ValueError
        If `lag` is negative.
    KeyError
        If a requested column is missing.
    """
    if lag < 0:
        raise ValueError('lag must be a non-negative number of rows, got %r' % (lag,))

    # Number of (y, x) pairs that exist for this lag. Slicing x with an
    # explicit stop instead of `[:-lag]` keeps lag = 0 working: `[:-0]` is
    # `[:0]`, i.e. an empty frame.
    n_out = max(df.shape[0] - lag, 0)

    # drop = True discards the old row labels instead of adding them back
    # as an extra 'index' column on both halves
    yy = df[y_cols].iloc[lag:, :].reset_index(drop = True)
    xx = df[x_cols].iloc[:n_out, :].reset_index(drop = True)

    yyxx = pd.concat([yy, xx], axis = 1)
    yyxx['Time'] = np.arange(1, n_out + 1, 1)

    # The default '' used to fail with a KeyError; treat it as "no explicit
    # selection" and return every column.
    if isinstance(used_YX, str) and used_YX == '' and '' not in yyxx.columns:
        return yyxx

    return yyxx[used_YX]

In [113]:
lags = 5 # exclusive upper bound: produces lag 1, 2, 3, 4

y_cols = ['ID', 'Time', 'Denv']
x_cols = ['BI', 'CI', 'HI', 'Pop', 'PopDen']

YX = ['ID', 'Time', 'Denv', 'BI', 'CI', 'HI', 'Pop', 'PopDen']

# The frames are keyed by the string form of the id. A plain sort would be
# lexicographic ('1', '10', ..., '17', '2', ...), so all-digit keys are
# ordered by their integer value; any other key sorts after them as text.
id_order = sorted(
    cty_yx_df.keys(),
    key = lambda k: (0, int(k), '') if str(k).isdigit() else (1, 0, str(k)),
)

# For every lag, build the lagged frame of each id, stack them and write
# one CSV per lag.
for lag in range(1, lags, 1):

    print('lag = ', lag)

    # The per-id frames are passed straight to lag_df_2 rather than being
    # bound to `df`, so the notebook-level `df` (the loaded table) is not
    # replaced by a single id's frame.
    lagged_frames = [
        lag_df_2(cty_yx_df[cty], lag = lag, used_YX = YX, y_cols = y_cols, x_cols = x_cols)
        for cty in id_order
    ]

    fn = pd.concat(lagged_frames, axis = 0, ignore_index = True)

    fn.to_csv('./data/lag_%s_week_denv_bi_pop.csv' %str(lag), index = False)

lag =  1
lag =  2
lag =  3
lag =  4
