In [1]:
import sys
if sys.version_info >= (3, 5):
    from importlib.util import spec_from_file_location
    
import os
from time import time
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import roc_curve, auc, log_loss
from scipy.stats import randint as sp_randint
from scipy import interp
from drivendata_validator import DrivenDataValidator
import itertools
from tpot import TPOTRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

In [2]:
def pre_process_train_test_data(train, test, label_var, exclude_scaling,
                                sort_cols=('year', 'weekofyear')):
    """One-hot encode, align, zero-fill and standard-scale train and test together.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows, including the ``label_var`` column.
    test : pandas.DataFrame
        Test rows, without the label column.
    label_var : str
        Name of the label column in ``train``. It is removed before
        processing and re-attached, unscaled, to the returned training frame.
    exclude_scaling : list of str
        int64/float64 columns that must NOT be standardised.
    sort_cols : sequence of str, optional
        Columns the combined frame is ordered by before interpolation.
        Defaults to ``('year', 'weekofyear')``, the previously hard-coded keys.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Processed frames carrying the original row order and index of their
        inputs. ``train`` additionally has ``label_var`` as its last column.

    Notes
    -----
    The scaler is fitted on train and test combined, so test-set statistics
    influence the scaling of the training features.
    """
    labels = np.ravel(train[label_var])
    train = pd.get_dummies(train.drop(label_var, axis=1))
    test = pd.get_dummies(test)

    # Match test set and training set columns: drop dummies that only exist in
    # test, and add (as all-zero) the ones that only exist in train.
    extra_cols = np.setdiff1d(test.columns, train.columns)
    missing_cols = np.setdiff1d(train.columns, test.columns)
    test = test.drop(extra_cols, axis=1).assign(**{col: 0 for col in missing_cols})

    # NOTE(review): because every NaN is replaced by 0 here, the linear
    # interpolation further down appears to be a no-op. If interpolating
    # missing readings was the intent, the zero-fill should happen afterwards;
    # left unchanged so the model inputs stay the same.
    train = train.fillna(0)
    test = test.fillna(0)

    train_indices = train.index
    test_indices = test.index
    n_train = train.shape[0]

    # Stack with a fresh positional index (train rows first). Splitting by
    # position afterwards stays correct even when train and test share index
    # labels, which a label-based .loc lookup would not.
    train_test = pd.concat([train, test], ignore_index=True)
    train_test = train_test.sort_values(list(sort_cols))
    train_test = train_test.interpolate(method='linear')

    print("Shapes before transformation")
    print("Train : ", train.shape)
    print("Test : ", test.shape)
    print("Train + Test : ", train_test.shape)

    # Only int64/float64 columns are scaled; the dummy columns produced by
    # get_dummies normally have another dtype and are therefore left alone.
    numeric_cols = [col for col in
                    train_test.select_dtypes(include=['int64', 'float64']).columns
                    if col not in exclude_scaling]
    if numeric_cols:  # StandardScaler raises on a frame with zero columns
        train_test[numeric_cols] = StandardScaler().fit_transform(train_test[numeric_cols])

    # Undo the sort, then split by position and restore the callers' indices.
    train_test = train_test.sort_index()
    train = train_test.iloc[:n_train].copy()
    train.index = train_indices
    test = train_test.iloc[n_train:].copy()
    test.index = test_indices

    train[label_var] = labels

    print("Shapes after transformation")
    print("Train : ", train.shape)
    print("Test : ",  test.shape)

    return train, test

In [3]:
# Directory that holds the competition CSV files, relative to this notebook.
DATA_DIR = '../data'

## define data paths: logical dataset name -> CSV file inside DATA_DIR
data_paths = {
    key: os.path.join(DATA_DIR, file_name)
    for key, file_name in (
        ('train_x', 'dengue_features_train.csv'),
        ('train_y', 'dengue_labels_train.csv'),
        ('test_x', 'dengue_features_test.csv'),
    )
}

# load training data; the 'week_start_date' column is dropped from the features
X_train = pd.read_csv(data_paths['train_x']).drop(columns='week_start_date')
y_train = pd.read_csv(data_paths['train_y'])

# load test data, dropping the same column
X_test = pd.read_csv(data_paths['test_x']).drop(columns='week_start_date')

# ### Pre-process Data
# Train and test features are pre-processed together and a single model is
# fitted on the combined rows; this notebook does not train separate models
# per city.
print("Shapes before transformation")
print("Train : ", X_train.shape)
print("Train Labels : ", y_train.shape)
print("Test : ", X_test.shape)
print("Columns : ", X_train.columns)

# Attach the labels to their feature rows.
train_data = pd.merge(X_train, y_train, on=['city', 'year', 'weekofyear'])
# An inner merge silently drops unmatched rows (or duplicates repeated keys);
# fail loudly instead of training on a different number of rows.
assert train_data.shape[0] == X_train.shape[0], \
    "merging features and labels changed the number of training rows"

# Give train and test contiguous, non-overlapping row labels: train rows are
# 0..n_train-1 and the test rows follow directly after.
n_train = train_data.shape[0]
train_data.index = np.arange(n_train)
X_test.index = np.arange(n_train, n_train + X_test.shape[0])

print("Preprocessing Training")
label_var = 'total_cases'
exclude_scaling = ['year', 'weekofyear']
a_train, a_test = pre_process_train_test_data(train_data, X_test, label_var, exclude_scaling)
X_train = a_train.drop(label_var, axis=1)
y_train = np.ravel(a_train[label_var])

# The model consumes bare arrays, so train and test columns must line up
# position by position before the column names are discarded below.
assert list(X_train.columns) == list(a_test.columns), \
    "train and test feature columns are not aligned"

## restructure train data
all_train_data = {'features': X_train,
                  'labels': y_train}

## restructure test data
all_test_data = {'features': a_test}

# ### Cross-validation -- Tune Parameters
# Plain arrays used by the modelling cells below.
X = all_train_data['features'].values.astype(np.float32)
y = all_train_data['labels'].astype(np.int16)
X_test = all_test_data['features'].values.astype(np.float32)

Shapes before transformation
('Train : ', (1456, 23))
('Train Labels : ', (1456, 4))
('Test : ', (416, 23))
('Columns : ', Index([u'city', u'year', u'weekofyear', u'ndvi_ne', u'ndvi_nw', u'ndvi_se',
       u'ndvi_sw', u'precipitation_amt_mm', u'reanalysis_air_temp_k',
       u'reanalysis_avg_temp_k', u'reanalysis_dew_point_temp_k',
       u'reanalysis_max_air_temp_k', u'reanalysis_min_air_temp_k',
       u'reanalysis_precip_amt_kg_per_m2',
       u'reanalysis_relative_humidity_percent',
       u'reanalysis_sat_precip_amt_mm',
       u'reanalysis_specific_humidity_g_per_kg', u'reanalysis_tdtr_k',
       u'station_avg_temp_c', u'station_diur_temp_rng_c',
       u'station_max_temp_c', u'station_min_temp_c', u'station_precip_mm'],
      dtype='object'))
Preprocessing Training
Shapes before transformation
('Train : ', (1456, 24))
('Test : ', (416, 24))
('Train + Test : ', (1872, 24))
Shapes after transformation
('Train : ', (1456, 25))
('Test : ', (416, 24))


In [4]:
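# # This cell appears to be the data-loading template that comes with a TPOT-exported
# # pipeline; it is left commented out because X, y and X_test are built by the cells above.
# # NOTE: Make sure that the class is labeled 'target' in the data file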
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# features = tpot_data.drop('target', axis=1).values
# training_features, testing_features, training_target, testing_target = \
#             train_test_split(features, tpot_data['target'].values, random_state=42)

In [5]:
# Score on the training set was:-16.536910679310278
# (recorded before the seeds below were fixed, so a re-run may not reproduce
# this exact figure)

# Fixed seed for the estimators that draw random samples (row/feature
# subsampling in gradient boosting, component sampling in Nystroem), so that
# re-running the notebook yields the same predictions.
RANDOM_STATE = 42

exported_pipeline = make_pipeline(
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.01, max_depth=1, min_child_weight=16, n_estimators=100, nthread=1, subsample=0.55)),
    StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.9, learning_rate=0.001, loss="ls", max_depth=2, max_features=0.7500000000000001, min_samples_leaf=12, min_samples_split=17, n_estimators=100, subsample=1.0, random_state=RANDOM_STATE)),
    Nystroem(gamma=0.25, kernel="laplacian", n_components=10, random_state=RANDOM_STATE),
    GradientBoostingRegressor(alpha=0.95, learning_rate=0.1, loss="lad", max_depth=2, max_features=0.7000000000000001, min_samples_leaf=19, min_samples_split=7, n_estimators=100, subsample=0.6500000000000001, random_state=RANDOM_STATE)
)

# Fit on the full training matrix and predict for the test rows.
exported_pipeline.fit(X, y)
results = exported_pipeline.predict(X_test)

In [6]:
# Build both paths from DATA_DIR so they stay in sync with the data-loading cell.
submission_format_path = os.path.join(DATA_DIR, 'submission_format.csv')
submission_path = os.path.join(DATA_DIR, 'submission_30_gen.csv')

submission = pd.read_csv(submission_format_path)

# Convert the real-valued predictions to case counts: round to the nearest
# integer (a plain integer cast would truncate toward zero and bias every
# prediction downwards) and floor at zero, since a count cannot be negative.
predicted_cases = np.maximum(np.rint(results), 0).astype(np.int32)
submission.loc[:, 'total_cases'] = predicted_cases

## Submission Format
submission.to_csv(submission_path, index=False)

# no parameters unless we have a read_csv kwargs file
v = DrivenDataValidator()

# Compare the written file against the official format file.
# print(...) with a single argument behaves the same on Python 2 and 3.
if v.is_valid(submission_format_path, submission_path):
    print("I am awesome.")
else:
    print("I am not so cool.")


I am awesome.
