In [None]:
#!/usr/local/bin/python3
import os
from time import time
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from scipy.stats import randint as sp_randint
from scipy import interp
from drivendata_validator import DrivenDataValidator
from tpot import TPOTRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, log_loss, mean_absolute_error, make_scorer

import warnings
warnings.filterwarnings("ignore")

In [None]:
def pre_process_train_test_data(train, test, label_var, exclude_scaling):
    """One-hot encode, jointly interpolate and standardise train/test features.

    Steps, in order:
      1. Pull the label column out of ``train`` and one-hot encode both frames
         with ``pd.get_dummies``.
      2. Align the test columns to the training columns (extra test columns
         are dropped, missing ones are added filled with 0).
      3. Stack train on top of test, sort by ``year``/``weekofyear`` and fill
         missing values by linear interpolation along that ordering.
      4. Standardise every int64/float64 column not named in
         ``exclude_scaling`` with a ``StandardScaler`` fitted on the stacked
         frame (i.e. on train and test together).
      5. Split back into train and test, restore their original indices and
         row order, and re-attach the labels to the training frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features plus the ``label_var`` column. Must contain
        ``year`` and ``weekofyear``.
    test : pandas.DataFrame
        Test features. Must contain ``year`` and ``weekofyear``.
    label_var : str
        Name of the label column in ``train``.
    exclude_scaling : collection of str
        Numeric columns that must be left unscaled.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The processed training frame (including ``label_var``) and the
        processed test frame, each carrying its original index.
    """
    labels = np.ravel(train[label_var])
    train = pd.get_dummies(train.drop(label_var, axis=1))
    test = pd.get_dummies(test)

    # match test set and training set columns
    to_drop = np.setdiff1d(test.columns, train.columns)
    to_add = np.setdiff1d(train.columns, test.columns)
    test = test.drop(columns=to_drop).assign(**{c: 0 for c in to_add})

    train_indices = train.index
    test_indices = test.index
    n_train = len(train)

    # Stack on a fresh positional index. Splitting later by position instead
    # of by label keeps the split correct even when train and test share index
    # labels (e.g. two default RangeIndex frames), where a label-based .loc
    # lookup would return rows from both frames.
    train_test = pd.concat([train, test], ignore_index=True)
    train_test = train_test.sort_values(['year', 'weekofyear'])
    train_test = train_test.interpolate(method='linear')

    print("Shapes before transformation")
    print("Train : ", train.shape)
    print("Test : ", test.shape)
    print("Train + Test : ", train_test.shape)

    numeric_cols = [
        col
        for col in train_test.select_dtypes(include=['int64', 'float64']).columns
        if col not in exclude_scaling
    ]
    # StandardScaler rejects an empty feature matrix, so skip when nothing qualifies.
    if numeric_cols:
        scaler = StandardScaler()
        train_test[numeric_cols] = scaler.fit_transform(train_test[numeric_cols])

    # Undo the chronological sort: positions [0, n_train) are the training rows
    # in their original order, the remainder are the test rows.
    train_test = train_test.sort_index()
    train = train_test.iloc[:n_train].copy()
    test = train_test.iloc[n_train:].copy()
    train.index = train_indices
    test.index = test_indices

    # `train` is an independent copy, so this assignment cannot write into
    # (or warn about) a view of the stacked frame.
    train[label_var] = labels

    print("Shapes after transformation")
    print("Train : ", train.shape)
    print("Test : ", test.shape)

    return train, test

In [None]:
## define data paths
DATA_DIR = '../data'
data_paths = {
    key: os.path.join(DATA_DIR, filename)
    for key, filename in (
        ('train_x', 'dengue_features_train.csv'),
        ('train_y', 'dengue_labels_train.csv'),
        ('test_x', 'dengue_features_test.csv'),
    )
}

## load training data (features without the week_start_date column, plus labels)
X_train = pd.read_csv(data_paths['train_x']).drop(columns='week_start_date')
y_train = pd.read_csv(data_paths['train_y'])

## load test data (same column removed)
X_test = pd.read_csv(data_paths['test_x']).drop(columns='week_start_date')

## Pre-process Data
print("Shapes before transformation")
print("Train : ", X_train.shape)
print("Train Labels : ", y_train.shape)
print("Test : ", X_test.shape)
print("Columns : ", X_train.columns)

# Join the labels onto the features by their shared key columns.
train_data = X_train.merge(y_train, on=['city', 'year', 'weekofyear'])

# Give train and test disjoint integer index labels: train gets 0..n-1 and
# test starts at n+1, which leaves label n unused between the two frames.
train_data.index = np.arange(len(train_data))
X_test.index = np.arange(len(X_test)) + len(train_data) + 1


In [None]:
# Raw -> short column names, shared by the train and test feature frames so
# both end up with identical column names. (Previously the test frame renamed
# 'precipitation_amt_mm' to 'pred_precip' while the training frame used
# 'sat_precip', leaving the two frames with mismatched columns.)
COLUMN_RENAMES = {
    'precipitation_amt_mm': 'sat_precip',
    'reanalysis_air_temp_k': 'pred_temp',
    'reanalysis_avg_temp_k': 'pred_avg_temp',
    'reanalysis_dew_point_temp_k': 'pred_dew_temp',
    'reanalysis_max_air_temp_k': 'pred_max_temp',
    'reanalysis_min_air_temp_k': 'pred_min_temp',
    'reanalysis_precip_amt_kg_per_m2': 'pred_precip_vol',
    'reanalysis_specific_humidity_g_per_kg': 'pred_spec_humidity',
    'reanalysis_tdtr_k': 'pred_temp_rng',
    'reanalysis_relative_humidity_percent': 'pred_rel_humidity_per',
    'reanalysis_sat_precip_amt_mm': 'pred_sat_precip',
}

# Renamed columns that hold temperatures in Kelvin and are shifted to Celsius.
kelvin_cols = ['pred_temp', 'pred_avg_temp', 'pred_dew_temp', 'pred_max_temp', 'pred_min_temp']
KELVIN_TO_CELSIUS_OFFSET = 273.15


def shorten_and_convert(features):
    """Rename raw columns, fill gaps linearly and convert Kelvin columns to Celsius.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature frame using either the raw column names (the keys of
        ``COLUMN_RENAMES``) or the already shortened ones.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``features`` itself is left untouched.
    """
    # Decide what to convert *before* renaming: only columns that still carry
    # their raw name are shifted, so re-running this cell on an already
    # processed frame does not subtract the offset a second time.
    pending_kelvin = [
        short
        for raw, short in COLUMN_RENAMES.items()
        if raw in features.columns and short in kelvin_cols
    ]
    tidy = features.rename(columns=COLUMN_RENAMES).interpolate(method='linear')
    if pending_kelvin:
        tidy[pending_kelvin] = tidy[pending_kelvin] - KELVIN_TO_CELSIUS_OFFSET
    return tidy


train_data = shorten_and_convert(train_data)
X_test = shorten_and_convert(X_test)

train_data.head()


In [None]:
# Inspect the column names currently present in train_data.
train_data.columns.values

In [None]:
# Group the training columns by name prefix.
# NOTE(review): the original cell was left unfinished (`sat_cols =` had no
# right-hand side, a SyntaxError) and listed 'pred_precip', which is not a
# train_data column. Grouping by the 'pred_' / 'sat_' prefixes is an assumption
# about the intended split; confirm or replace with explicit lists.
pred_cols = [col for col in train_data.columns if col.startswith('pred_')]
sat_cols = [col for col in train_data.columns if col.startswith('sat_')]

In [None]:
# Show the first rows of the two average-temperature columns side by side.
train_data[['pred_avg_temp', 'station_avg_temp_c']].head()

In [None]:
# Check the dtype of every column. Are there any data inconsistencies
# (for example a numeric feature that was parsed as object)?
train_data.dtypes

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) of the training data
train_data.describe()

In [None]:
# Create a correlation matrix. Which features correlate the most with
# total_cases? What other correlations stand out?
#
# Restrict to numeric/bool columns explicitly: train_data still holds 'city'
# (presumably a string column; confirm), and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 instead of silently skipping them.
corr = train_data.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values, cmap="vlag")
# Trailing semicolon keeps the Text(...) repr out of the cell output.
plt.title('Heatmap of Correlation Matrix');

In [None]:
# Clustered heatmap of the correlation matrix `corr` computed in the previous
# cell; center=0 anchors the diverging "vlag" colormap at zero correlation.
sns.clustermap(corr, center=0, cmap="vlag",
               linewidths=.75, figsize=(13, 13))

In [None]:
# sns.pairplot(train_data, hue="total_cases")

In [None]:
# print("Preprocessing Training")
# label_var = 'total_cases'
# exclude_scaling = ['year', 'weekofyear']
# a_train, a_test = pre_process_train_test_data(train_data, X_test, label_var, exclude_scaling)
# X_train = a_train.drop(label_var, axis=1)
# y_train = np.ravel(a_train[label_var])

# ## restructure train data
# all_train_data = {'features': X_train,
#                   'labels': y_train}

# ## restructure test data
# all_test_data = {'features': a_test}

# # ### Cross-validation -- Tune Parameters
# X = all_train_data['features'].values.astype(np.float32)
# y = all_train_data['labels'].astype(np.int16)
# X_test = all_test_data['features'].values.astype(np.float32)

In [None]:
# tune_params = 0
# if tune_params > 0:
#     bestParams = []
#     X = all_train_data['features'].values.astype(np.float32)
#     y = all_train_data['labels'].astype(np.int32)
#     mae_score = make_scorer(mean_absolute_error, greater_is_better=False)
#     pipeline_optimizer = TPOTRegressor(scoring=mae_score, cv=5,
#                                         periodic_checkpoint_folder='tpot_best_models_100',
#                                         n_jobs=2, random_state=42, verbosity=1, memory='auto',
#                                         generations=100, max_eval_time_mins=10)
#     pipeline_optimizer.fit(X, y)
#     pipeline_optimizer.export('tpot_best_model_pipeline_gen_100_mae.py')