In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import collections


### Lag BI, CI, HI & Weather Data

In [6]:
path_dt = './data/full_house_again.csv'
df = pd.read_csv(path_dt)

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,County_EN,YYYY_WW,BI,CI,HI,Temperature,T Max,T Min,RH,Precp
0,0,Hualien,2012_01,0.5,6.249750000000001,0.5,16.614286,18.657143,14.714286,75.714286,0.571429
1,1,Hualien,2012_02,0.0,0.0,0.0,18.942857,21.557143,16.8,75.428571,0.357143
2,2,Hualien,2012_03,0.0,0.0,0.0,19.3,21.914286,17.1,76.428571,2.4
3,3,Hualien,2012_04,XXX,XXX,XXX,16.142857,18.6,13.771429,79.714286,5.0
4,4,Hualien,2012_05,0.0,0.0,0.0,17.957143,20.414286,16.057143,76.571429,1.0


In [8]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Temperature,T Max,T Min,RH,Precp
count,6966.0,6965.0,6966.0,6966.0,6928.0,6965.0
mean,3482.5,23.427062,27.444352,20.557642,77.04266,5.79728
std,2011.05532,4.785375,4.946195,4.804164,6.782163,10.546052
min,0.0,9.728571,12.314286,6.2,46.857143,0.0
25%,1741.25,19.457143,23.757143,16.614286,72.571429,0.083333
50%,3482.5,23.771429,28.014286,20.892857,77.0,1.785714
75%,5223.75,27.8,31.642857,24.985714,81.571429,6.928571
max,6965.0,32.557143,45.857143,29.1,98.857143,123.142857


In [9]:
# read this file to generate county-id pairs
cty_id_path = './data/county_id.csv'
cty_id_df = pd.read_csv(cty_id_path)

In [10]:
cty_id_df.head()

Unnamed: 0,County,ID,Time,Denv,BI,CI,HI,Tave,Tmax,Tmin,RH,Precp,Pop,Pop_Den
0,Hualien,1,1,0,0.5,6.24975,0.5,16.61,18.657143,14.714286,75.71,0.57,336704.9965,72.774987
1,Hualien,1,2,0,0.0,0.0,0.0,18.94,21.557143,16.8,75.43,0.36,336676.9953,72.749983
2,Hualien,1,3,0,0.0,0.0,0.0,19.3,21.914286,17.1,76.43,2.4,336648.9965,72.724987
3,Hualien,1,4,0,0.0,0.0,0.0,0.0,18.6,13.771429,79.71,5.0,336621.0,72.7
4,Hualien,1,5,0,0.0,0.0,0.0,17.96,20.414286,16.057143,76.57,1.0,336586.9948,72.7


In [11]:
cty_id_df.describe()

Unnamed: 0,ID,Time,Denv,BI,CI,HI,Tave,Tmax,Tmin,RH,Precp,Pop,Pop_Den
count,6579.0,6579.0,6579.0,6579.0,6579.0,6579.0,6579.0,6579.0,6579.0,6579.0,6579.0,6579.0,6579.0
mean,9.0,194.0,9.653747,1.509442,4.813578,1.255711,23.457128,27.549113,20.553886,77.06885,5.558838,1308175.0,1225.365216
std,4.899352,111.725396,118.818132,1.933646,4.640524,1.38022,4.776905,4.892508,4.800198,6.786237,10.472907,1123545.0,2233.554968
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,12.314286,6.2,46.86,0.0,97253.0,61.975025
25%,5.0,97.0,0.0,0.38,1.501919,0.348869,19.535,23.871429,16.657143,72.57,0.07,502433.6,269.475012
50%,9.0,194.0,0.0,0.95,3.570063,0.844376,23.81,28.142857,20.885714,77.0,1.57,700204.5,542.449993
75%,13.0,291.0,1.0,1.94,6.851858,1.686381,27.825,31.671429,24.971429,81.57,6.42,2175340.0,1205.749973
max,17.0,387.0,3416.0,30.44,76.19,16.071,32.56,45.857143,29.1,98.86,123.14,4003297.0,9956.100003


In [12]:
cty_id_df['County'] = cty_id_df.County.astype(str)
cty_id_df['ID'] = cty_id_df.ID.astype(int)
cty_id_df.dtypes

County      object
ID           int64
Time         int64
Denv         int64
BI         float64
CI         float64
HI         float64
Tave       float64
Tmax       float64
Tmin       float64
RH         float64
Precp      float64
Pop        float64
Pop_Den    float64
dtype: object

In [13]:
cty_id_dict = collections.defaultdict()
id_cty_dict = collections.defaultdict()

for cty, idx in zip(cty_id_df.County, cty_id_df.ID):
    cty_id_dict[cty] = idx
    id_cty_dict[idx] = cty

In [14]:
cty_id_df.columns

Index(['County', 'ID', 'Time', 'Denv', 'BI', 'CI', 'HI', 'Tave', 'Tmax',
       'Tmin', 'RH', 'Precp', 'Pop', 'Pop_Den'],
      dtype='object')

In [16]:
# dataset 1
X = ['Tave', 'Tmax', 'Tmin', 'RH', 'Precp']

Y = ['BI', 'CI', 'HI']

YX = ['ID', 'Time', 'BI', 'CI', 'HI', 'Tave', 'Tmax', 'Tmin', 'RH', 'Precp']

In [37]:
# dataset 2. Lag Dengue Fever & Pop Data

X = ['BI', 'CI', 'HI', 'Pop', 'Pop_Den']

Y = ['Denv']

YX = ['ID', 'Time', 'Denv', 'BI', 'CI', 'HI', 'Pop', 'Pop_Den']

In [38]:
yx = cty_id_df[YX]
yx.head()

Unnamed: 0,ID,Time,Denv,BI,CI,HI,Pop,Pop_Den
0,1,1,0,0.5,6.24975,0.5,336704.9965,72.774987
1,1,2,0,0.0,0.0,0.0,336676.9953,72.749983
2,1,3,0,0.0,0.0,0.0,336648.9965,72.724987
3,1,4,0,0.0,0.0,0.0,336621.0,72.7
4,1,5,0,0.0,0.0,0.0,336586.9948,72.7


In [44]:
yx.dtypes

ID           int64
Time         int64
Denv         int64
BI         float64
CI         float64
HI         float64
Pop        float64
Pop_Den    float64
dtype: object

In [39]:
# split by cty
cty_yx = collections.defaultdict(list)

for i in range(yx.shape[0]):

    
    row = yx.iloc[i, :].tolist()
    
    cnty = row[0]
    
    cty_yx[cnty].append(row)
    

In [40]:
# convert list to df
cty_yx_df = collections.defaultdict()

for k, v in cty_yx.items():

    x = pd.DataFrame(cty_yx[k], columns = YX)

    cty_yx_df[k] = x


In [41]:
cty_yx_df[1].head()

Unnamed: 0,ID,Time,Denv,BI,CI,HI,Pop,Pop_Den
0,1.0,1.0,0.0,0.5,6.24975,0.5,336704.9965,72.774987
1,1.0,2.0,0.0,0.0,0.0,0.0,336676.9953,72.749983
2,1.0,3.0,0.0,0.0,0.0,0.0,336648.9965,72.724987
3,1.0,4.0,0.0,0.0,0.0,0.0,336621.0,72.7
4,1.0,5.0,0.0,0.0,0.0,0.0,336586.9948,72.7


In [42]:
def lag_df_3(df, lag = 1, yx = ['d'], y_cols = ['x'] , x_cols = ['u']):
    

    # split y, x
    YY = df[y_cols]
    XX = df[x_cols]

    n_rows = YY.shape[0]
    
    # slice with df.iloc[:, :]
    
    yy = YY.iloc[lag:, :]
    yy = yy.reset_index()
    xx = XX.iloc[:-lag, :]
    xx = xx.reset_index()
    
    yyxx = pd.concat([yy, xx], axis = 1)
    
    idx = df.ID[0]
    yyxx['ID'] = idx
    
    ti = pd.Series(np.arange(1, (yyxx.shape[0] + 1), 1))
    yyxx['Time'] = ti
    
    yyxx = yyxx[yx]
    
    
    return yyxx


In [43]:
lags = 5 # lag 1, 2, 3, 4

for lag in range(1, lags, 1):
    
    print('lag = ', lag)
    
    tmp = []

    #for idx in sorted_id:
    for idx in sorted(cty_yx_df.keys()):
        
        #cty = id_cty_dict[idx]
        
        df = cty_yx_df[idx]
        
        df2 = lag_df_3(df, lag = lag, yx = YX, y_cols = Y, x_cols = X)
        
        #print('\n', df2.tail())
        
        tmp.append(df2)
        

    fn = pd.concat(tmp, axis = 0, ignore_index = True)
    
    #print(fn.tail())
    print('\n', fn.describe())
    
    fn.to_csv('./data/lag_%s_week_dengue_bi.csv' %str(lag), index = False)

lag =  1

                 ID         Time         Denv           BI           CI  \
count  6562.000000  6562.000000  6562.000000  6562.000000  6562.000000   
mean      9.000000   193.500000     9.677080     1.505030     4.795492   
std       4.899353   111.436719   118.971057     1.924215     4.551983   
min       1.000000     1.000000     0.000000     0.000000     0.000000   
25%       5.000000    97.000000     0.000000     0.380000     1.498042   
50%       9.000000   193.500000     0.000000     0.940000     3.556310   
75%      13.000000   290.000000     1.000000     1.930000     6.821138   
max      17.000000   386.000000  3416.000000    30.440000    44.444000   

                HI           Pop      Pop_Den  
count  6562.000000  6.562000e+03  6562.000000  
mean      1.253799  1.308158e+06  1225.360142  
std       1.379109  1.123504e+06  2233.603782  
min       0.000000  9.725300e+04    61.975025  
25%       0.347596  5.025101e+05   269.512497  
50%       0.842968  7.002432e+05  