In [1]:
# Implementation of an XGBoost regressor


In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20.
# KFold now lives in sklearn.model_selection, with a different constructor
# (n_splits instead of n / n_folds) -- migrate this import together with the
# KFold call sites.
from sklearn.cross_validation import KFold

In [3]:
# make function to preprocess data
def preprocess_data(data_path, labels_path=None):
    """Load the feature table and split it into one frame per city.

    Missing values are forward-filled separately inside each city, so the
    first rows of one city never inherit values from the last rows of the
    other. Gaps with no earlier observation in the same city stay NaN.

    Parameters
    ----------
    data_path : str, path object or file-like
        CSV with a ``city`` column; rows tagged ``'sj'`` and ``'iq'`` are kept.
    labels_path : str, path object or file-like, optional
        CSV of labels with a ``city`` column. When given (truthy), the labels
        are split by city as well. Labels are not filled.

    Returns
    -------
    tuple
        ``(sj_features, iq_features)`` when ``labels_path`` is falsy, otherwise
        ``(sj_features, iq_features, sj_labels, iq_labels)``.
    """
    features = pd.read_csv(data_path)

    # Split by city first, then fill within each city. ffill() returns a new
    # frame, so the loaded table is never mutated in place.
    sj_features = features[features.city == 'sj'].ffill()
    iq_features = features[features.city == 'iq'].ffill()

    if not labels_path:
        return sj_features, iq_features

    labels = pd.read_csv(labels_path)
    sj_labels = labels[labels.city == 'sj']
    iq_labels = labels[labels.city == 'iq']
    return sj_features, iq_features, sj_labels, iq_labels

In [4]:
# Load the training features and labels, already split per city.
(sj_features, iq_features,
 sj_labels, iq_labels) = preprocess_data(
    data_path='data/dengue_features_train.csv',
    labels_path="data/dengue_labels_train.csv",
)

In [5]:
# Load the final (unlabelled) test features, split per city.
sj_test_final, iq_test_final = preprocess_data(
    data_path="data/dengue_features_test.csv",
)

In [6]:
# Remove the columns at positions 0 and 3 from every feature frame. Per the
# original note these are city and date; city is redundant because the frames
# are already split by city.
# NOTE(review): the drop is positional and reassigns the same names, so
# re-running this cell removes two further columns, and the label lines below
# fail on a second run because the labels are Series by then.
sj_features = sj_features.drop(sj_features.columns[[0, 3]], axis='columns')
iq_features = iq_features.drop(iq_features.columns[[0, 3]], axis='columns')
sj_test_final = sj_test_final.drop(sj_test_final.columns[[0, 3]], axis='columns')
iq_test_final = iq_test_final.drop(iq_test_final.columns[[0, 3]], axis='columns')

# Keep only the target column of each label table.
sj_labels = sj_labels['total_cases']
iq_labels = iq_labels['total_cases']


sj_labels.head(n=5)

0    4
1    5
2    4
3    3
4    6
Name: total_cases, dtype: int64

In [7]:
# Sequential hold-out split: the rows are taken to be in time order (see the
# year / weekofyear columns), so the first N rows train and the rest test.
SJ_TRAIN_ROWS = 800
IQ_TRAIN_ROWS = 400

# Slice train and test at a single position so they can never overlap.
# tail(len - N) returns overlapping rows when a frame has fewer than N rows,
# because a negative n makes tail() skip only the first |n| rows.
sj_train = sj_features.iloc[:SJ_TRAIN_ROWS]
sj_train_target = sj_labels.iloc[:SJ_TRAIN_ROWS]
sj_test = sj_features.iloc[SJ_TRAIN_ROWS:]
sj_test_target = sj_labels.iloc[SJ_TRAIN_ROWS:]



iq_train = iq_features.iloc[:IQ_TRAIN_ROWS]
iq_train_target = iq_labels.iloc[:IQ_TRAIN_ROWS]
iq_test = iq_features.iloc[IQ_TRAIN_ROWS:]
iq_test_target = iq_labels.iloc[IQ_TRAIN_ROWS:]


In [8]:
# Random 80/20 split of each city into a training set and a validation set,
# seeded so the split is reproducible.
# NOTE(review): this reassigns the same *_train / *_test names as the
# sequential split in the previous cell, so only one of the two splits is in
# effect downstream -- confirm which one is intended.
(sj_train, sj_test,
 sj_train_target, sj_test_target) = train_test_split(
    sj_features, sj_labels,
    test_size=0.2,
    random_state=41,
)

(iq_train, iq_test,
 iq_train_target, iq_test_target) = train_test_split(
    iq_features, iq_labels,
    test_size=0.2,
    random_state=41,
)

In [9]:
# Preview the first five San Juan training rows.
sj_train.head(n=5)

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
75,1991,41,0.093,0.093,0.145083,0.168167,32.1,299.562857,299.671429,295.887143,...,55.1,80.515714,32.1,17.344286,2.257143,27.657143,6.614286,31.7,22.8,8.3
119,1992,33,0.16395,0.12665,0.215057,0.219271,65.55,299.215714,299.221429,296.007143,...,81.8,82.75,65.55,17.415714,1.957143,27.671429,7.557143,31.7,22.8,41.9
882,2007,16,0.0689,0.044267,0.108357,0.085029,0.0,299.335714,299.557143,294.507143,...,2.22,74.832857,0.0,15.854286,3.0,27.728571,8.014286,33.3,22.8,1.8
319,1996,25,0.07125,0.0646,0.1461,0.156171,47.93,298.49,298.564286,295.435714,...,73.7,83.322857,47.93,16.81,1.928571,26.442857,5.8,30.6,22.8,84.6
693,2003,35,0.0726,0.1,0.204371,0.160357,98.68,300.668571,300.728571,297.048571,...,50.86,80.797143,98.68,18.571429,2.542857,27.914286,6.457143,31.7,23.3,69.6


In [10]:
# Preview the first five San Juan training targets.
sj_train_target.head(n=5)

75     116
119     30
882      4
319      6
693     32
Name: total_cases, dtype: int64

In [32]:
# Fill the submission template with the predictions and write it out. The
# concatenation puts the San Juan predictions first and Iquitos second.
# NOTE(review): sj_predictions / iq_predictions must be defined by earlier
# cells; they do not appear in the cells visible here.
# NOTE(review): the output name says "MLP" while the notebook title says
# xgboost -- confirm the intended file name.
submission = pd.read_csv("data/dengue_labels_test.csv",
                         index_col=[0, 1, 2])

# Item assignment always writes a column. Attribute assignment
# (submission.total_cases = ...) only sets a Python attribute when the column
# is missing, and the predictions would then be left out of the CSV.
submission["total_cases"] = np.concatenate([sj_predictions, iq_predictions])
submission.to_csv("submission/submission_MLP.csv")


In [None]:
#submission