In [425]:
import pandas as pd 
import seaborn as sns
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


#Data reading
#Load train and test csv file
dengue_features_train = pd.read_csv('dataset/dengue_features_train.csv')
dengue_labels_train = pd.read_csv('dataset/dengue_labels_train.csv')
dengue_test = pd.read_csv("dataset/dengue_features_test.csv")

In [372]:
dengue_features_train.head()

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


In [373]:

#Let's fusion the 2 dataframes:
#Merging the Train dataframe with the labels data frame 
dengue_train = pd.merge(dengue_labels_train, dengue_features_train, on=['city','year','weekofyear'])

dengue_train_sj = dengue_train[dengue_train.city == 'sj'].copy()
dengue_train_iq = dengue_train[dengue_train.city == 'iq'].copy()
dengue_test_sj = dengue_test[dengue_test.city == 'sj'].copy()
dengue_test_iq = dengue_test[dengue_test.city == 'iq'].copy()


In [374]:
dengue_train_sj.fillna(method='ffill', inplace=True)
dengue_train_iq.fillna(method='ffill', inplace=True)

dengue_test_sj.fillna(method='ffill', inplace=True)
dengue_test_iq.fillna(method='ffill', inplace=True)

In [375]:
#Check duplicate rows
np.sum(dengue_train_sj.duplicated())

0

In [376]:
dengue_train_sj.head()

Unnamed: 0,city,year,weekofyear,total_cases,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,4,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,sj,1990,19,5,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,4,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,3,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,6,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


In [377]:
dengue_train_sj.drop("week_start_date",axis=1,inplace=True)
dengue_test_sj.drop("week_start_date",axis=1,inplace=True)
dengue_train_iq.drop("week_start_date",axis=1,inplace=True)
dengue_test_iq.drop("week_start_date",axis=1,inplace=True)

In [378]:
dengue_train_sj.drop("city",axis=1,inplace=True)
dengue_test_sj.drop("city",axis=1,inplace=True)
dengue_train_iq.drop("city",axis=1,inplace=True)
dengue_test_iq.drop("city",axis=1,inplace=True)

## SAN Juan

In [379]:
dengue_train_sj.head()

Unnamed: 0,year,weekofyear,total_cases,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,1990,18,4,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,1990,19,5,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,1990,20,4,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,1990,21,3,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,1990,22,6,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


In [380]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dengue_train_sj,dengue_train_sj["total_cases"], test_size=0.2)

In [381]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error as MAE

In [382]:
classifier = MLPRegressor(max_iter=10000)
parameters = [{'hidden_layer_sizes': [(100, ), (20, 30), (13, 13, 13)],'activation':['relu']}]

In [383]:
clf = GridSearchCV(estimator=classifier,param_grid=parameters)
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=10000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'hidden_layer_sizes': [(100,), (20, 30), (13, 13, 13)], 'activation': ['relu']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [384]:
clf.best_params_

{'activation': 'relu', 'hidden_layer_sizes': (13, 13, 13)}

In [385]:
y_true, y_pred = y_test, clf.predict(X_test).astype(int)

print("Mean Absolute Error(MAE): %f" %MAE(y_true, y_pred))

Mean Absolute Error(MAE): 27.159574


In [387]:
clf = MLPRegressor(max_iter=10000, hidden_layer_sizes=(13, 13, 13),activation='relu')
clf.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test).astype(int)
print("Mean Absolute Error(MAE): %f" %MAE(y_true, y_pred))

Mean Absolute Error(MAE): 1.446809


In [388]:
y_train = dengue_train_sj['total_cases']
X_train = dengue_train_sj.drop(labels=['total_cases'], axis=1)
X_test = dengue_test_sj

In [389]:
clf = MLPRegressor(max_iter=10000, hidden_layer_sizes=(100,),activation='relu')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test).astype(int)

In [390]:

sub_df_sj = pd.DataFrame(y_pred, columns=["total_cases"])
sub_df_sj.insert(0, 'city', 'sj')
sub_df_sj.insert(1, 'year', dengue_test_sj['year'])
sub_df_sj.insert(2, 'weekofyear', dengue_test_sj['weekofyear'])
sub_df_sj.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,29
1,sj,2008,19,29
2,sj,2008,20,28
3,sj,2008,21,29
4,sj,2008,22,24


## Iqutos

In [439]:
X_train, X_test, y_train, y_test = train_test_split(dengue_train_iq,dengue_train_iq["total_cases"], test_size=0.2)

classifier = MLPRegressor(max_iter=10000)
parameters = [{'hidden_layer_sizes': [(100, ), (20, 30), (13, 13, 13)],'activation':['relu']}]

clf = GridSearchCV(estimator=classifier,param_grid=parameters)
clf.fit(X_train, y_train)

clf.best_params_


{'activation': 'relu', 'hidden_layer_sizes': (13, 13, 13)}

In [440]:
y_true, y_pred = y_test, clf.predict(X_test).astype(int)
print("Mean Absolute Error(MAE): %f" %MAE(y_true, y_pred))

Mean Absolute Error(MAE): 11.480769


In [448]:
clf = MLPRegressor(max_iter=10000, hidden_layer_sizes=(13, 13, 13),activation='relu')
clf.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test).astype(int)

print(accuracy_score(y_true,y_pred))

print("Mean Absolute Error(MAE): %f" %MAE(y_true, y_pred))


0.057692307692307696
Mean Absolute Error(MAE): 6.951923


In [449]:

y_train = dengue_train_iq['total_cases']
X_train = dengue_train_iq.drop(labels=['total_cases'], axis=1)
X_test = dengue_test_iq

clf = MLPRegressor(max_iter=10000, hidden_layer_sizes=(20, 30),activation='relu')
clf.fit(X_train, y_train)


MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=10000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [450]:
y_pred = clf.predict(X_test).astype(int)

In [451]:

sub_df_iq = pd.DataFrame(y_pred, columns=["total_cases"])
sub_df_iq.insert(0, 'city', 'iq')
sub_df_iq.insert(1, 'year', dengue_test_sj['year'])
sub_df_iq.insert(2, 'weekofyear', dengue_test_sj['weekofyear'])
sub_df_iq.head()

sub_df_iq

Unnamed: 0,city,year,weekofyear,total_cases
0,iq,2008,18,40
1,iq,2008,19,45
2,iq,2008,20,40
3,iq,2008,21,40
4,iq,2008,22,41
5,iq,2008,23,52
6,iq,2008,24,46
7,iq,2008,25,46
8,iq,2008,26,48
9,iq,2008,27,41


## Creating Submission Format

In [452]:
sub = sub_df_sj.append(sub_df_iq, ignore_index=True)
sub.to_csv('Submission_pca.csv', index = False)