In [35]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model

In [36]:
## ---- Notebook Variables ---- ##
# 'window' is the number of consecutive rows per model input sample; it is
# also reused below as the rolling-window length for missing-value imputation
# and as the number of history rows borrowed from the training data.
nb_variables = dict(window = 4)

In [37]:
# ---- Testing ---- #
# Read the test features, keep only the rows whose first-column label is 'iq',
# and index them chronologically by week_start_date.
x_test = (
    pd.read_csv('../dengue_features_test.csv', index_col = 0)
    .loc['iq']
    .set_index('week_start_date')
)
x_test.index = pd.to_datetime(x_test.index)
x_test = x_test.sort_index()

In [38]:
## ---- Further Data Engineering (same as the training set) ---- ##

In [39]:
# Remove the three features that are not used downstream.
x_test = x_test.drop(columns = ['ndvi_ne', 'ndvi_nw', 'reanalysis_sat_precip_amt_mm'])

# Shift the Kelvin-valued reanalysis temperatures by -273.15 (Kelvin -> Celsius).
# The column names keep their '_k' suffix even though the values no longer are Kelvin.
kelvin_cols = ['reanalysis_air_temp_k', 'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
               'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k']
x_test[kelvin_cols] = x_test[kelvin_cols] - 273.15

In [40]:
# Collapse every column whose name contains '_min_' into a single row-wise mean.
# The mean is taken on the raw NumPy values, so a NaN in any source column
# makes the combined value NaN for that row.
min_source_cols = [name for name in x_test.columns if '_min_' in name]
min_temp = pd.Series(x_test[min_source_cols].values.mean(axis = 1), index = x_test.index)
x_test['min_temp'] = min_temp
# 'min_temp' itself does not contain '_min_', so only the source columns are dropped.
x_test = x_test.drop(columns = min_source_cols)

In [41]:
# Collapse every column whose name contains '_max_' into a single row-wise mean.
# The mean is taken on the raw NumPy values, so a NaN in any source column
# makes the combined value NaN for that row.
max_source_cols = [name for name in x_test.columns if '_max_' in name]
max_temp = pd.Series(x_test[max_source_cols].values.mean(axis = 1), index = x_test.index)
x_test['max_temp'] = max_temp
# 'max_temp' itself does not contain '_max_', so only the source columns are dropped.
x_test = x_test.drop(columns = max_source_cols)

In [42]:
# Merge the two reanalysis temperature columns into one row-wise mean.
# Unlike the min/max cells, this mean is computed on the DataFrame itself
# (pandas mean, which skips NaN by default), so a row with one missing value
# still gets the other column's value.
avg_source_cols = ['reanalysis_avg_temp_k', 'reanalysis_air_temp_k']
reanalysis_avg_temp_c = x_test[avg_source_cols].mean(axis = 1)
x_test['reanalysis_avg_temp_c'] = reanalysis_avg_temp_c
x_test = x_test.drop(columns = avg_source_cols)

In [43]:
# Column order with the last three columns rotated to the front.
# NOTE(review): this list is not applied to x_test in any cell visible here;
# the later concat takes its column order from rows_needed instead.
cols = list(x_test.columns)
cols = [*cols[-3:], *cols[:-3]]

In [44]:
# Load the frame saved for testing (it includes a 'total_cases' column) and
# put it in chronological order on a datetime index.
df = pd.read_csv('df_for_testing.csv', index_col = 0)
df.index = pd.to_datetime(df.index)
df = df.sort_index()

In [45]:
# Important line for later. WE KEEP IT!
# The first test sample needs `window` rows of history, so take the last
# `window` dates in df that fall strictly before the first test date and
# borrow their feature rows (without the 'total_cases' column).
first_test_date = x_test.index[0]
earlier_dates = df.index[df.index < first_test_date]
dates_needed = earlier_dates[-nb_variables['window']:]
rows_needed = df.loc[dates_needed].drop(columns = 'total_cases')

In [46]:
# Display the history rows that will be prepended to the test features.
rows_needed

Unnamed: 0_level_0,year,weekofyear,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_dew_point_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_precip_mm,min_temp,max_temp,reanalysis_avg_temp_c
week_start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2010-06-04,2010,22,0.136043,0.225657,86.47,23.302857,207.1,91.6,18.07,7.471429,27.433333,10.5,36.6,20.225,35.025,25.711429
2010-06-11,2010,23,0.250357,0.233714,58.94,22.351429,50.6,94.28,17.008571,7.5,24.4,6.9,7.4,19.225,32.275,23.945714
2010-06-18,2010,24,0.278886,0.325486,59.67,22.174286,62.33,94.66,16.815714,7.871429,25.433333,8.733333,16.0,19.875,32.075,23.783571
2010-06-25,2010,25,0.274214,0.315757,63.22,22.657143,36.9,89.082857,17.355714,11.014286,27.475,9.9,20.4,20.675,34.175,25.816429


In [47]:
# Prepend the borrowed history rows to the test features (row-wise concat).
# Re-running this cell on its own would prepend the history rows again.
frames = [rows_needed, x_test]
x_test = pd.concat(frames)
x_test.shape

(160, 16)

In [48]:
# Share of missing cells before imputation.
perc = np.round(100 * x_test.isna().sum().sum() / (len(x_test) * len(x_test.columns)), 2)
print(f'- Percentage of missing values across the entire dataset {perc}% \n')

# Trailing rolling mean of every column, computed once up front. With
# min_periods = 1, a missing cell is replaced by the mean of whatever
# non-missing values exist in the window ending at that row.
rolling_means = x_test.rolling(window = nb_variables['window'], min_periods = 1).mean()

# x_test carries no label column here ('total_cases' was dropped from the
# borrowed history rows), so every column is a feature and all of them are
# imputed. Skipping the last column would leave a real feature to the
# zero fill that follows.
for col in x_test.columns:
    x_test[col] = x_test[col].fillna(rolling_means[col])

# Share of missing cells that the rolling mean could not fill
# (i.e. the whole window was missing).
new_perc = np.round(100 * x_test.isna().sum().sum() / (len(x_test) * len(x_test.columns)), 2)
print(f'- New percentage of missing values across the entire dataset {new_perc}%')

- Percentage of missing values across the entire dataset 1.21% 

- New percentage of missing values across the entire dataset 0.55%


In [49]:
# Whatever is still missing after the rolling-mean pass is set to 0.
x_test = x_test.fillna(value = 0)

# Recompute the missing-cell percentage to confirm nothing is left.
missing_cells = x_test.isna().sum().sum()
total_cells = len(x_test) * len(x_test.columns)
new_perc = np.round(100 * missing_cells / total_cells, 2)
print(f'- Percentage of missing values across the entire dataset {new_perc}%')

- Percentage of missing values across the entire dataset 0.0%


In [50]:
# Sanity check: the imputation steps must not change the frame's shape.
x_test.shape

(160, 16)

In [51]:
# Build one model input per row from index `window` onward: each sample is
# the block of `window` rows immediately preceding that row (the row itself
# is not included), with all columns.
window = nb_variables['window']
feature_matrix = x_test.values
xin_test = np.array([feature_matrix[end - window : end, :]
                     for end in range(window, len(x_test))])

In [52]:
# Expected layout: (number of samples, window, number of columns).
xin_test.shape

(156, 4, 16)

In [53]:
# Load the saved model and predict one value per input window.
model = load_model('../models/iq_lstm.keras')
raw_predictions = model.predict(xin_test)

# Round to whole numbers, cast to int32, and clamp negatives to zero.
predictions = np.maximum(np.round(raw_predictions).astype('int32'), 0)
predictions.shape



(156, 1)

In [54]:
# Start the submission frame from the raw test file: rows labelled 'iq',
# keeping only the 'year' and 'weekofyear' columns, in file order.
submission = (
    pd.read_csv('../dengue_features_test.csv', index_col = 0)
    .loc['iq', ['year', 'weekofyear']]
)
submission.head()

Unnamed: 0_level_0,year,weekofyear
city,Unnamed: 1_level_1,Unnamed: 2_level_1
iq,2010,26
iq,2010,27
iq,2010,28
iq,2010,29
iq,2010,30


In [55]:
# Flatten the model output to 1-D. np.asarray accepts both the raw ndarray and
# the Series this cell produces, so the cell can be re-run safely.
flat_predictions = np.asarray(predictions).reshape(-1)

# The predictions are attached by position, so the counts must match exactly.
if len(flat_predictions) != len(submission):
    raise ValueError(
        f'Got {len(flat_predictions)} predictions for {len(submission)} submission rows'
    )

# Keep `predictions` available as a Series labelled like the submission rows.
predictions = pd.Series(flat_predictions, index = submission.index)

# Assign the values positionally instead of aligning on the index: every row
# of `submission` carries the same index label, so label alignment is fragile.
# Selecting the two key columns first keeps the result at exactly three
# columns even when the cell is executed more than once.
submission = submission[['year', 'weekofyear']].assign(total_cases = flat_predictions)

In [56]:
# Preview the first rows of the finished submission frame.
submission.head()

Unnamed: 0_level_0,year,weekofyear,total_cases
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
iq,2010,26,23
iq,2010,27,24
iq,2010,28,24
iq,2010,29,25
iq,2010,30,24


In [57]:
# Write the submission next to the notebook; the index is written too
# (to_csv keeps the index by default), so it becomes the first CSV column.
submission.to_csv('Iquitos_LSTM_predictions.csv')