# 06MBIG - Machine Learning  
## Álvaro González Rodríguez  
### 74746657S

En este notebook se pretende devolver una solución al problema de la competición del Dengue de DrivenData.
(https://www.drivendata.org/competitions/44/dengai-predicting-disease-spread/).  
El objetivo es predecir los casos totales de dengue en los días especificados en el dataset *dengue_features_test.csv*.  
La puntuación mínima que debemos obtener es de MAE = 29.2764

---
# Actividad 3 - Optimización


In [15]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import mean_absolute_error
# from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
# import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

In [2]:
# Cargamos los datos de la actividad 1
train = pd.read_csv('dengue_train.csv')
test = pd.read_csv('dengue_test.csv')

In [3]:
# Definimos dataset de Train, Test y separamos en variables dependientes e independiente.
X_train = train.drop('total_cases', axis=1)
X_test = test
y_train = train['total_cases']

# Definimos transformador
transformer = MinMaxScaler(feature_range=[0,1]).fit(X_train)

# # Normalizamos los datos:
X_train_norm = transformer.transform(X_train)
X_test_norm = transformer.transform(X_test)

# Definimos transformador PCA con 4 componentes
pca = PCA(n_components=4)

# # Reducimos dimensionalidad
X_train_norm_pca = pca.fit_transform(X_train_norm)
X_test_norm_pca = pca.transform(X_test_norm)

In [4]:
def report(results, n_top=3): # Función para mostrar resultados
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [5]:
param_dist = {"n_estimators": [16, 32], # Number of trees in random forest
              "max_features": ['auto', 'sqrt'], # Number of features to consider at every split
              "max_depth": [4,5,6], # Maximum number of levels in tree
              "min_samples_split": [2, 4, 6], #  Minimum number of samples required to split a node
              "min_samples_leaf": [8, 12, 16], # Minimum number of samples required at each leaf node
              "bootstrap": [True, False], # Method of selecting samples for training each tree
              "criterion": ['mae'],
              'n_jobs': [-1],
              'random_state': [23]
            }

In [6]:
model = RandomForestRegressor() 
grid_regres = GridSearchCV(estimator = model,
                           param_grid= param_dist,
                           cv=5)

# Fit the random search model
grid_regres.fit(X = X_train_norm_pca, 
                y = y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, rand...
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None,
  

In [10]:
report(grid_regres.cv_results_, n_top = 5)

Model with rank: 1
Mean validation score: -0.087 (std: 0.182)
Parameters: {'bootstrap': True, 'criterion': 'mae', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 16, 'min_samples_split': 2, 'n_estimators': 16, 'n_jobs': -1, 'random_state': 23}

Model with rank: 1
Mean validation score: -0.087 (std: 0.182)
Parameters: {'bootstrap': True, 'criterion': 'mae', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 16, 'min_samples_split': 4, 'n_estimators': 16, 'n_jobs': -1, 'random_state': 23}

Model with rank: 1
Mean validation score: -0.087 (std: 0.182)
Parameters: {'bootstrap': True, 'criterion': 'mae', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 16, 'min_samples_split': 6, 'n_estimators': 16, 'n_jobs': -1, 'random_state': 23}

Model with rank: 4
Mean validation score: -0.107 (std: 0.165)
Parameters: {'bootstrap': False, 'criterion': 'mae', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 16, 'min_samples_split': 2, 'n_estimators': 16, 'n_job

In [11]:
grid_regres.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=4, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=16,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=16, n_jobs=-1, oob_score=False,
                      random_state=23, verbose=0, warm_start=False)

In [12]:
# Escogemos el mejor estimador
best_grid = grid_regres.best_estimator_

# Ajustamos el mejor estimador y predecimos
best_grid.fit(X = X_train_norm_pca,
              y = y_train)

# Predecimos los valores de test de la competicion              
predictions = best_grid.predict(X = X_test_norm_pca).astype('int')

# Convertimos a 0 los valores negativos
predictions[predictions<0] = 0

# Guardamos los resultados en un DataFrame
results = pd.read_csv('./dengue_features_test.csv', usecols=['city', 'year', 'weekofyear'])
results['total_cases'] = predictions

results.to_csv('RandomForestRegressor_GSCV_results.csv', index=False)

## RandomSearch

In [19]:
model = RandomForestRegressor()
rnd_regres = RandomizedSearchCV(estimator = model,
                                param_distributions = param_dist, 
                                n_iter = 100,
                                cv = 5,
                                random_state=23,
                                n_jobs = -1)

# Fit the random search model
rnd_regres.fit(X = X_train_norm_pca, 
               y = y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [20]:
report(rnd_regres.cv_results_, n_top = 5)

Model with rank: 1
Mean validation score: -0.087 (std: 0.182)
Parameters: {'random_state': 23, 'n_jobs': -1, 'n_estimators': 16, 'min_samples_split': 2, 'min_samples_leaf': 16, 'max_features': 'sqrt', 'max_depth': 4, 'criterion': 'mae', 'bootstrap': True}

Model with rank: 2
Mean validation score: -0.107 (std: 0.165)
Parameters: {'random_state': 23, 'n_jobs': -1, 'n_estimators': 16, 'min_samples_split': 4, 'min_samples_leaf': 16, 'max_features': 'sqrt', 'max_depth': 4, 'criterion': 'mae', 'bootstrap': False}

Model with rank: 2
Mean validation score: -0.107 (std: 0.165)
Parameters: {'random_state': 23, 'n_jobs': -1, 'n_estimators': 16, 'min_samples_split': 2, 'min_samples_leaf': 16, 'max_features': 'sqrt', 'max_depth': 4, 'criterion': 'mae', 'bootstrap': False}

Model with rank: 4
Mean validation score: -0.117 (std: 0.172)
Parameters: {'random_state': 23, 'n_jobs': -1, 'n_estimators': 16, 'min_samples_split': 2, 'min_samples_leaf': 12, 'max_features': 'sqrt', 'max_depth': 4, 'criterion

In [21]:
rnd_regres.best_params_

{'random_state': 23,
 'n_jobs': -1,
 'n_estimators': 16,
 'min_samples_split': 2,
 'min_samples_leaf': 16,
 'max_features': 'sqrt',
 'max_depth': 4,
 'criterion': 'mae',
 'bootstrap': True}

In [22]:
# Escogemos el mejor estimador
best_random = rnd_regres.best_estimator_

# Ajustamos el mejor estimador y predecimos
best_random.fit(X = X_train_norm_pca,
                y = y_train)

# Predecimos los valores de test de la competicion              
predictions = best_random.predict(X = X_test_norm_pca).astype('int')

# Convertimos a 0 los valores negativos
predictions[predictions<0] = 0

# Guardamos los resultados en un DataFrame
results = pd.read_csv('./dengue_features_test.csv', usecols=['city', 'year', 'weekofyear'])
results['total_cases'] = predictions

results.to_csv('RandomForestRegressor_RandomSearchCV_results.csv', index=False)