In [1]:
import tensorflow as tf
import numpy as np
import csv
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import Imputer
from sklearn import model_selection
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor



In [3]:
# Read from .csv files
X_train = pd.read_csv('data/dengue_features_train.csv')
y_train = pd.read_csv('data/dengue_labels_train.csv')
# Drop columns year and start date
X_train = X_train.drop(X_train.columns[[1, 3]], axis=1)

# Separate the cities into two files
X_train_sj = X_train.loc[X_train.city == 'sj']
X_train_iq = X_train.loc[X_train.city == 'iq']
y_train_sj = y_train.loc[y_train.city == 'sj']
y_train_iq = y_train.loc[y_train.city == 'iq']
del y_train_sj['city']
del y_train_iq['city']
del X_train_sj['city']
del X_train_iq['city']

y_train_sj = np.array(y_train_sj.drop(y_train_sj.columns[:2], axis =1))
y_train_iq = np.array(y_train_iq.drop(y_train_iq.columns[:2], axis=1))
# print(X_test_sj)

In [4]:
# Replace all missing values with a mean value
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp1 = Imputer(strategy='mean')

X_train_arr_sj = imp.fit_transform(X_train_sj)
X_train_arr_iq = imp1.fit_transform(X_train_iq)
y_train_arr_sj = imp.fit_transform(y_train_sj)
y_train_arr_iq = imp1.fit_transform(y_train_iq)

# print(X_test_arr_sj)

In [6]:
type(X_train_arr_sj)

numpy.ndarray

In [216]:
# This is for training only
# divide the training sets into train, test, validation
X_train_sj1, X_test_sj1, y_train_sj1, y_test_sj1 = model_selection.train_test_split(X_train_arr_sj, y_train_arr_sj, 
                                                                    test_size=0.2, random_state=42)
X_train_sj1, X_val_sj1, y_train_sj1, y_val_sj1   = model_selection.train_test_split(X_train_sj1, y_train_sj1, 
                                                                    test_size=0.5, random_state=42)

feature_columns_sj = tf.contrib.learn.infer_real_valued_columns_from_input(X_train_sj1)
regressor_sj = xgb.XGBRegressor(n_estimators = 550, # number of boosted trees
                             learning_rate = 0.00402047, # step size shrinkage used in update to prevent overfitting
                             max_depth = 15, # maximum depth of a tree
                             subsample = 0.9815, # subsample ratio of the training set (Stochastic gradient boosting)
                             colsample_bytree = 0.701) # subsample features

X_train_iq1, X_test_iq1, y_train_iq1, y_test_iq1 = model_selection.train_test_split(X_train_arr_iq, y_train_arr_iq, 
                                                                    test_size=0.2, random_state=42)
X_train_iq1, X_val_iq1, y_train_iq1, y_val_iq1  = model_selection.train_test_split(X_train_arr_iq, y_train_arr_iq, 
                                                                    test_size=0.5, random_state=42)

feature_columns_iq = tf.contrib.learn.infer_real_valued_columns_from_input(X_train_iq1)
regressor_iq = xgb.XGBRegressor(n_estimators = 490, # number of boosted trees
                             learning_rate = 0.00202047, # step size shrinkage used in update to prevent overfitting
                             max_depth = 10, # maximum depth of a tree
                             subsample = 0.6815, # subsample ratio of the training set (Stochastic gradient boosting)
                             colsample_bytree = 0.701) # subsample features



In [217]:
# Fit and predict.
regressor_sj.fit(X_train_sj1, y_train_sj1.ravel()) # ravel to make a column-vector into a 1d array
regressor_iq.fit(X_train_iq1, y_train_iq1.ravel())

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.701,
       gamma=0, learning_rate=0.00202047, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=490, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.6815)

In [218]:
predictions_sj = list(regressor_sj.predict(X_val_sj1))
score_sj = mean_absolute_error(y_val_sj1, predictions_sj)

predictions_iq = list(regressor_iq.predict(X_val_iq1))
score_iq = mean_absolute_error(y_val_iq1, predictions_iq)

In [219]:
print('Accuracy_sj: {0:f}'.format(score_sj), 'Accuracy_iq: {0:f}'.format(score_iq))

Accuracy_sj: 18.752811 Accuracy_iq: 6.183146


In [220]:
predictions_sj = list(regressor_sj.predict(X_test_sj1))
score_sj = mean_absolute_error(y_test_sj1, predictions_sj)

predictions_iq = list(regressor_iq.predict(X_test_iq1))
score_iq = mean_absolute_error(y_test_iq1, predictions_iq)

print('Accuracy_sj: {0:f}'.format(score_sj), 'Accuracy_iq: {0:f}'.format(score_iq))

Accuracy_sj: 19.475421 Accuracy_iq: 5.511176
