In [65]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model

In [66]:
## ---- Notebook Variables ---- ##
nb_variables = {'window': 4}

In [67]:
# ---- Testing ---- #
x_test = pd.read_csv('../dengue_features_test.csv', index_col = 0)
x_test = x_test.loc['sj']
x_test = x_test.set_index('week_start_date')
x_test.index = pd.to_datetime(x_test.index)
x_test.sort_index(inplace = True)

In [68]:
## ----- Further Data Engineering (Same as Training Set)----- ##

In [69]:
x_test.drop(['ndvi_ne', 'ndvi_nw'], axis = 1, inplace = True)
x_test.drop('reanalysis_sat_precip_amt_mm', axis = 1, inplace = True)
x_test[['reanalysis_air_temp_k', 'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
        'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k']] += -273.15

In [70]:
min_temp = pd.Series(np.mean(x_test[[col for col in x_test.columns if '_min_' in col]].values, axis = 1))
min_temp.index = x_test.index
x_test['min_temp'] = min_temp
x_test.drop([col for col in x_test.columns if '_min_' in col], axis = 1, inplace = True)

In [71]:
max_temp = pd.Series(np.mean(x_test[[col for col in x_test.columns if '_max_' in col]].values, axis = 1))
max_temp.index = x_test.index
x_test['max_temp'] = max_temp
x_test.drop([col for col in x_test.columns if '_max_' in col], axis = 1, inplace = True)

In [72]:
reanalysis_avg_temp_c = pd.Series(np.mean(x_test[['reanalysis_avg_temp_k', 'reanalysis_air_temp_k']], axis = 1))
reanalysis_avg_temp_c.index = x_test.index
x_test['reanalysis_avg_temp_c'] = reanalysis_avg_temp_c
x_test.drop(['reanalysis_avg_temp_k', 'reanalysis_air_temp_k'], axis = 1, inplace = True)

In [73]:
cols = x_test.columns.tolist()
cols = cols[-3:] + cols[:-3]

In [74]:
df = pd.read_csv('df_for_testing.csv', index_col = 0)
df.index = pd.to_datetime(df.index)
df.sort_index(inplace = True)

In [75]:
# Importan line for later. WE KEEP IT!
dates_needed = df.index[df.index < x_test.index[0]][-nb_variables['window']:]
rows_needed = df.loc[dates_needed].drop('total_cases', axis = 1)

In [76]:
rows_needed

Unnamed: 0_level_0,year,weekofyear,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_dew_point_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_precip_mm,min_temp,max_temp,reanalysis_avg_temp_c
week_start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2008-04-01,2008,14,0.119371,0.066386,3.82,20.085714,3.67,74.6,14.662857,2.714286,26.242857,6.814286,0.5,22.775,28.625,25.005
2008-04-08,2008,15,0.137757,0.141214,16.96,19.582857,35.0,75.027143,14.184286,2.185714,25.0,5.714286,30.7,22.175,27.825,24.362143
2008-04-15,2008,16,0.2039,0.209843,0.0,19.124286,4.82,72.285714,13.858571,2.785714,25.314286,6.242857,11.2,22.225,27.975,24.554286
2008-04-22,2008,17,0.077314,0.090586,0.0,21.13,2.17,76.96,15.671429,3.957143,27.042857,7.514286,0.3,23.025,29.725,25.532857


In [77]:
x_test = pd.concat([rows_needed, x_test], axis = 0)
x_test.shape

(264, 16)

In [78]:
perc = np.round(100 * x_test.isna().sum().sum() / (len(x_test) * len(x_test.columns)), 2)
print(f'- Percentage of missing values across the entire dataset {perc}% \n')

for col in x_test.columns[:-1]: # exclude the label variable
    x_test[col] = x_test[col].fillna(x_test.rolling(window = nb_variables['window'],
                                                    min_periods = 1).mean()[col])
    
new_perc = np.round(100 * x_test.isna().sum().sum() / (len(x_test) * len(x_test.columns)), 2)
print(f'- New percentage of missing values across the entire dataset {new_perc}%')

- Percentage of missing values across the entire dataset 0.62% 

- New percentage of missing values across the entire dataset 0.05%


In [79]:
x_test = x_test.fillna(0)
new_perc = np.round(100 * x_test.isna().sum().sum() / (len(x_test) * len(x_test.columns)), 2)
print(f'- Percentage of missing values across the entire dataset {new_perc}%')

- Percentage of missing values across the entire dataset 0.0%


In [80]:
x_test.shape

(264, 16)

In [81]:
window = nb_variables['window']
xin_test = []
for i in range(window, len(x_test)):
    xin_test.append(x_test.values[i - window : i, :])
    
xin_test = np.array(xin_test)

In [82]:
xin_test.shape

(260, 4, 16)

In [83]:
model = load_model('../models/sj_lstm.keras')
predictions = model.predict(xin_test)
predictions = np.round(predictions).astype('int32')
predictions[predictions < 0] = 0
predictions.shape



(260, 1)

In [84]:
submission = pd.read_csv('../dengue_features_test.csv', index_col = 0)
submission = submission.loc['sj']
submission = submission.loc[:, ['year', 'weekofyear']]
submission.head()

Unnamed: 0_level_0,year,weekofyear
city,Unnamed: 1_level_1,Unnamed: 2_level_1
sj,2008,18
sj,2008,19
sj,2008,20
sj,2008,21
sj,2008,22


In [85]:
predictions = pd.Series(predictions.reshape(-1))
predictions.index = submission.index
submission = pd.concat([submission, predictions], axis = 1)
submission.columns = ['year', 'weekofyear', 'total_cases']

In [86]:
submission.head()

Unnamed: 0_level_0,year,weekofyear,total_cases
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sj,2008,18,26
sj,2008,19,27
sj,2008,20,26
sj,2008,21,24
sj,2008,22,26


In [87]:
submission.to_csv('San_Juan_LSTM_predictions.csv')