In [1]:
# Implementation of an XGBoost regressor


In [2]:
#import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.cross_validation import KFold



#### Preprocess Data

This function loads the data, fills missing (NaN) values and separates the rows of each city; all features are kept.
It handles both cases: loading only the features (test data) and loading the features together with their labels (training data).

In [3]:
# make function to preprocess data
def preprocess_data(data_path, labels_path=None):
    """Load a feature CSV and split it into San Juan and Iquitos frames.

    Missing values are forward-filled within each city separately, so the
    first rows of one city are never filled with the last observations of
    the other city. Gaps at the very start of a city (no earlier value in
    that city) are left as NaN.

    Parameters
    ----------
    data_path : str
        Path of the features CSV. It must contain a ``city`` column whose
        values include ``'sj'`` and ``'iq'``.
    labels_path : str, optional
        Path of the labels CSV (also with a ``city`` column). When it is
        omitted (or empty) only the feature frames are returned, which is
        how the test data is loaded.

    Returns
    -------
    tuple
        ``(sj_features, iq_features)`` when no labels path is given,
        otherwise ``(sj_features, iq_features, sj_labels, iq_labels)``.
        Labels are returned as loaded; they are not joined to the features
        and no missing-value filling is applied to them.
    """
    df = pd.read_csv(data_path)

    # Separate the cities first and fill afterwards: filling the combined
    # frame would carry the last values of one city into the leading gaps
    # of the next one.
    sj_features = df[df.city == 'sj'].ffill()
    iq_features = df[df.city == 'iq'].ffill()

    if not labels_path:
        return sj_features, iq_features

    labels = pd.read_csv(labels_path)
    sj_labels = labels[labels.city == 'sj']
    iq_labels = labels[labels.city == 'iq']
    return sj_features, iq_features, sj_labels, iq_labels

In [4]:
# Load the training features together with their labels, already split by city.
sj_features, iq_features, sj_labels, iq_labels = preprocess_data(
    data_path='data/dengue_features_train.csv',
    labels_path="data/dengue_labels_train.csv",
)

In [5]:
# Load the final test features; no labels path is passed, so only the two
# per-city feature frames come back.
sj_test_final, iq_test_final = preprocess_data(data_path="data/dengue_features_test.csv")

Since the data is already divided by city, I remove the city column. I also remove the date column: the other columns (year and week of year) already represent it, so it is redundant, and its string format is awkward to work with in Python.

In [6]:
# Drop the columns at positions 0 and 3 (city and date) from every feature
# frame: the frames are already split by city, and the date is covered by
# the remaining columns.
sj_features = sj_features.drop(sj_features.columns[[0, 3]], axis="columns")
iq_features = iq_features.drop(iq_features.columns[[0, 3]], axis="columns")
sj_test_final = sj_test_final.drop(sj_test_final.columns[[0, 3]], axis="columns")
iq_test_final = iq_test_final.drop(iq_test_final.columns[[0, 3]], axis="columns")

# Keep only the target column of the label tables (drops city, year, weekofyear).
sj_labels = sj_labels["total_cases"]
iq_labels = iq_labels["total_cases"]

I don't split the data into training and validation sets manually, since k-fold cross-validation does it for me.

In [24]:
# Disabled alternative: sequential hold-out split.
# The first 800 rows (sj) / 400 rows (iq) would be used for training and the
# remaining rows for testing, so the split follows the row order of the frames
# (presumably chronological -- confirm against the source CSV).
# The code is wrapped in a string literal so that it does not run.
'''
sj_train = sj_features.head(800)
sj_train_target = sj_labels.head(800)
sj_test = sj_features.tail(sj_features.shape[0] - 800)
sj_test_target = sj_labels.tail(sj_labels.shape[0] - 800)



iq_train = iq_features.head(400)
iq_train_target = iq_labels.head(400)
iq_test = iq_features.tail(iq_features.shape[0] - 400)
iq_test_target = iq_labels.tail(iq_labels.shape[0] - 400)
'''



In [8]:
# Disabled alternative: random split of each city's data into a training set
# (80%) and a validation set (20%), with a fixed random_state for repeatability.
# The code is wrapped in a string literal so that it does not run.
'''
sj_train, sj_test, sj_train_target, sj_test_target = train_test_split(sj_features, sj_labels, test_size=0.2, random_state=41)

iq_train, iq_test, iq_train_target, iq_test_target = train_test_split(iq_features, iq_labels, test_size=0.2, random_state=41)
'''


In [7]:
# Preview the first five rows of the San Juan features.
sj_features.head(5)

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,1990,18,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,1990,19,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,1990,20,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,1990,21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,1990,22,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


In [8]:
# Look at two individual rows (positions 80 and 90), all columns.
sj_features.iloc[[80, 90], :]

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
80,1991,46,0.0954,0.1114,0.194943,0.198057,78.73,298.685714,298.785714,295.615714,...,72.7,83.514286,78.73,17.061429,1.971429,26.657143,6.742857,31.1,22.2,28.2
90,1992,4,0.122533,0.13105,0.188843,0.168871,0.0,297.091429,297.335714,293.021429,...,5.11,78.291429,0.0,14.482857,2.357143,24.857143,7.442857,30.6,20.0,1.3


-------------------------------------------------------------------------------------------------------------------

Before actually building my models, I experimented with the following model to test hyperparameters
-----------------------------------------------------------------------------------------------------------

In [9]:
# Buffer for the cross-validated predictions: one slot per San Juan row.
sj_predictions = np.zeros(len(sj_features))
sj_predictions.shape[0]

936

In [20]:
# 10 folds over the row positions of the San Juan features.
kf = KFold(n=len(sj_features), n_folds=10)



In [21]:
for trainIndex, testIndex in kf:
    # Slice the features and targets of this fold by row position.
    trainFold = sj_features.iloc[trainIndex]
    testFold = sj_features.iloc[testIndex]
    trainFoldTarget = sj_labels.iloc[trainIndex]
    testFoldTarget = sj_labels.iloc[testIndex]

    # A fresh model is trained for every fold.
    xgbr = xgb.XGBRegressor(
        n_estimators=550,      # number of boosted trees
        learning_rate=0.002,   # step size shrinkage used in update to prevent overfitting
        max_depth=7,
        subsample=.6815,
        colsample_bytree=.701,
    )
    xgbr.fit(trainFold, trainFoldTarget)

    # Predict the held-out rows and store them at their positions.
    xgbpred = xgbr.predict(testFold)
    sj_predictions[testIndex] = xgbpred

    # Print the MAE of this fold
    print(metrics.mean_absolute_error(testFoldTarget, xgbpred))


10.2802037381
46.7331123352
18.9131377403
7.06084261549
54.4977999139
43.6185969393
5.96552631703
7.10489999487
22.100781177
69.013759613
13.2160291672
9.82777933364
20.9971737405
9.91921404575
6.95114263575
7.37277461113
13.8583940527
15.9421124666
10.4877642497
30.3295153328


In [22]:
# MAE over all cross-validated predictions. Every entry of sj_predictions was
# produced by a model fitted on the other folds, so this is an out-of-fold
# error estimate, not an error measured on the training data.
print(metrics.mean_absolute_error(y_true=sj_labels, y_pred=sj_predictions))

21.2247206569


-------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------

In [12]:
def xboostRegressor(city_feat, city_labels):
    '''
    Train an XGBoost regressor for one city using 15-fold cross-validation.

    A new model is fitted on the training part of every fold and used to
    predict the held-out part. The out-of-fold predictions are collected in
    a local buffer; no global state is read or modified.

    Parameters
    ----------
    city_feat : pandas.DataFrame
        Feature rows of a single city (indexed by position with ``iloc``).
    city_labels : pandas.Series
        Target values aligned row-by-row with ``city_feat``.

    Returns
    -------
    xgboost.XGBRegressor
        The model fitted in the LAST fold. NOTE(review): that model has not
        seen the rows of the final fold; consider refitting on all rows if a
        final production model is wanted.
    '''
    # Local import: the file-level ``sklearn.cross_validation`` module was
    # removed in scikit-learn 0.20, while ``sklearn.model_selection`` (already
    # used by this notebook for train_test_split) provides the current KFold.
    from sklearn.model_selection import KFold

    predictions = np.zeros(city_feat.shape[0])

    # Unshuffled folds: contiguous blocks of rows, in order.
    kf = KFold(n_splits=15)

    for trainIndex, testIndex in kf.split(city_feat):
        trainFold, testFold = city_feat.iloc[trainIndex], city_feat.iloc[testIndex]
        trainFoldTarget, testFoldTarget = city_labels.iloc[trainIndex], city_labels.iloc[testIndex]

        xgbr = xgb.XGBRegressor(n_estimators = 100, # number of boosted trees
                                learning_rate = 0.01, # step size shrinkage used in update to prevent overfitting
                                max_depth = 7,
                                subsample = .6815,
                                colsample_bytree = .701)

        xgbr.fit(trainFold, trainFoldTarget)
        xgbpred = xgbr.predict(testFold)
        # Write into the local buffer sized for THIS city. Writing into a
        # module-level array would corrupt it and could index out of range
        # when the cities have different numbers of rows.
        predictions[testIndex] = xgbpred
    return xgbr

In [13]:
# Fit one model per city.
sj_model = xboostRegressor(city_feat=sj_features, city_labels=sj_labels)
iq_model = xboostRegressor(city_feat=iq_features, city_labels=iq_labels)

In [111]:

# Predict the final test set with the model of the matching city.
final_sj_predictions, final_iq_predictions = (
    sj_model.predict(sj_test_final),
    iq_model.predict(iq_test_final),
)

In [114]:
# Use the test label file as the submission template; its first three columns
# become the index.
submission = pd.read_csv("data/dengue_labels_test.csv",
                         index_col=[0, 1, 2])

# Case counts are non-negative integers: round to the nearest integer (a plain
# integer cast truncates toward zero and biases the counts downward) and clip
# any negative prediction to 0. Predictions are concatenated San Juan first,
# then Iquitos.
submission["total_cases"] = np.clip(
    np.rint(np.concatenate([final_sj_predictions, final_iq_predictions])),
    0,
    None,
).astype(np.int64)
submission.to_csv("submission/submission_xgboost.csv")


In [None]:
#submission