# Gradient Boosting Classifier

In [11]:
%load_ext autoreload
%autoreload 2

In [17]:
import pickle 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import recall_score, hamming_loss, f1_score, precision_score 
from src.utilities import competition_submission, data_cleaning

### Import the pickled dataframes from the Eda_cleaning notebook:

In [5]:
with open("pump_predictors_data_cleaned.pickle", "rb") as f: # "wb" means "write as bytes"
    dum_df = pickle.load(f)
    
    
with open("pump_target_data_cleaned.pickle", "rb") as f: # "wb" means "write as bytes"
    target = pickle.load(f)

In [6]:
# Train, test, split:
X_train, X_test, y_train, y_test = train_test_split(dum_df, target)



In [151]:
#running the GridSearch takes several hours, the best params are used below


def model_GridSearch(X_train, X_test, y_train, y_test):
    
    best_params = {'learning_rate': [0.075, 0.7],
                      'max_depth': [13, 14],
                      'min_samples_leaf': [15, 16],
                      'max_features': [1.0],
                      'n_estimators': [100, 200]}                     

    estimator = GridSearchCV(estimator=GradientBoostingClassifier(),
                             param_grid=best_params,
                             n_jobs=-1)

    estimator.fit(X_train, y_train)

    best_params = estimator.best_params_

    print (best_params)

    validation_accuracy = estimator.score(X_test, y_test)
    print('Validation accuracy: ', validation_accuracy)

In [2]:
#running the Gradient Boosting Model with the best params found from the Grid Search:


def best_GBC_model(X_train, X_test, y_train, y_test):
    
    best_params = {'learning_rate': [0.075],
                        'max_depth': [14],
                        'min_samples_leaf': [16],
                        'max_features': [1.0],
                        'n_estimators': [100]}                     

    estimator = GridSearchCV(estimator=GradientBoostingClassifier(),
                             param_grid=best_params,
                             n_jobs=-1)

    estimator.fit(X_train, y_train)

    best_params = estimator.best_params_

    print (best_params)

    validation_accuracy = estimator.score(X_test, y_test)
    print('Validation accuracy: ', validation_accuracy)
    return estimator

In [20]:
best_GBC_model(X_train, X_test, y_train, y_test)


{'learning_rate': 0.075, 'max_depth': 14, 'max_features': 1.0, 'min_samples_leaf': 16, 'n_estimators': 100}
Validation accuracy:  0.8045117845117845


In [7]:
gbc = best_GBC_model(X_train, X_test, y_train, y_test)


{'learning_rate': 0.075, 'max_depth': 14, 'max_features': 1.0, 'min_samples_leaf': 16, 'n_estimators': 100}
Validation accuracy:  0.7999326599326599


In [159]:
model_GridSearch(X_train, X_test, y_train, y_test)

{'learning_rate': 0.075, 'max_depth': 14, 'max_features': 1.0, 'min_samples_leaf': 16, 'n_estimators': 100}
Validation accuracy:  0.800942760942761


### submit predictions to competition

In [18]:
competition_submission.submit_test_data(gbc)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


data is cleaned and encoded
Prediction for test data finished
Store submission dataframe into file: successfully
