# Gradient Boosting Classifier

In [None]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import recall_score, hamming_loss, f1_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score


### Import the pickled dataframes from the Eda_cleaning notebook:

In [14]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# pickle files that come from a trusted source.

# Predictors: used as the feature matrix in the train/test split below.
with open("pump_predictors_data_cleaned.pickle", "rb") as f:  # "rb" = read in binary mode
    dum_df = pickle.load(f)
    
    
# Target: used as the label vector in the train/test split below.
with open("pump_target_data_cleaned.pickle", "rb") as f:  # "rb" = read in binary mode
    target = pickle.load(f)

In [15]:
# Train/test split.
# A fixed random_state pins the shuffle, so the same rows land in train and
# test after every kernel restart and the scores reported further down can be
# reproduced. Without it, each run evaluates on a different hold-out set.
X_train, X_test, y_train, y_test = train_test_split(dum_df, target, random_state=42)



In [151]:
# Running the grid search takes several hours; the best parameters it found are reused in best_GBC_model below.


def model_GridSearch(X_train, X_test, y_train, y_test):
    """Grid-search a GradientBoostingClassifier and print the winning setup.

    Every combination in the hyper-parameter grid below is evaluated by
    GridSearchCV (using all available cores), the best combination is
    printed, and the score of the search object on the hold-out data is
    printed as the validation accuracy.

    Parameters
    ----------
    X_train, y_train : training features and labels the search is fit on.
    X_test, y_test : hold-out features and labels used for the final score.

    Returns
    -------
    None. Results are reported through ``print`` only.
    """
    # Candidate values explored for each hyper-parameter.
    search_space = {
        'learning_rate': [0.075, 0.7],
        'max_depth': [13, 14],
        'min_samples_leaf': [15, 16],
        'max_features': [1.0],
        'n_estimators': [100, 200],
    }

    search = GridSearchCV(
        estimator=GradientBoostingClassifier(),
        param_grid=search_space,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    # Winning combination, available once the search has been fit.
    print(search.best_params_)

    print('Validation accuracy: ', search.score(X_test, y_test))

In [19]:
# Run the gradient boosting model with the best parameters found by the grid search:


def best_GBC_model(X_train, X_test, y_train, y_test, random_state=None):
    """Fit the tuned GradientBoostingClassifier and print its hold-out accuracy.

    The hyper-parameters are the ones selected by the earlier grid search.
    Because there is only a single combination, the model is fit directly
    instead of going through GridSearchCV: a one-point grid has nothing to
    select, yet it would still fit the model once per cross-validation fold
    and then refit on the full training set. The printed output is the same
    as before.

    Parameters
    ----------
    X_train, y_train : training features and labels the model is fit on.
    X_test, y_test : hold-out features and labels used for the accuracy.
    random_state : optional seed forwarded to GradientBoostingClassifier.
        The default ``None`` leaves the classifier unseeded, as before;
        pass an int to make repeated runs return the same score.

    Returns
    -------
    None. Results are reported through ``print`` only.
    """
    # Keys are listed alphabetically so the printed dict reads exactly like
    # the ``best_params_`` line this function used to print.
    best_params = {'learning_rate': 0.075,
                   'max_depth': 14,
                   'max_features': 1.0,
                   'min_samples_leaf': 16,
                   'n_estimators': 100}

    estimator = GradientBoostingClassifier(random_state=random_state, **best_params)
    estimator.fit(X_train, y_train)

    print (best_params)

    # For a classifier, .score() is mean accuracy on the given data.
    validation_accuracy = estimator.score(X_test, y_test)
    print('Validation accuracy: ', validation_accuracy)

In [20]:
best_GBC_model(X_train, X_test, y_train, y_test)


{'learning_rate': 0.075, 'max_depth': 14, 'max_features': 1.0, 'min_samples_leaf': 16, 'n_estimators': 100}
Validation accuracy:  0.8045117845117845


In [31]:
best_GBC_model(X_train, X_test, y_train, y_test)


{'learning_rate': 0.075, 'max_depth': 14, 'max_features': 1.0, 'min_samples_leaf': 16, 'n_estimators': 100}
Validation accuracy:  0.803973063973064


In [159]:
model_GridSearch(X_train, X_test, y_train, y_test)

{'learning_rate': 0.075, 'max_depth': 14, 'max_features': 1.0, 'min_samples_leaf': 16, 'n_estimators': 100}
Validation accuracy:  0.800942760942761


In [15]:
def get_score(model, X_train, y_train, y_train_hat):
    """Print a short classification report for one set of predictions.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)``; used for the accuracy.
    X_train : features passed to ``model.score``.
    y_train : true labels.
    y_train_hat : predicted labels compared against ``y_train``.

    Recall, F1 and precision are computed with ``average='weighted'``;
    Hamming loss takes no averaging argument. Nothing is returned; the
    report is written with ``print``.
    """
    weighted = {'average': 'weighted'}

    recall = recall_score(y_train, y_train_hat, **weighted)
    ham_loss = hamming_loss(y_train, y_train_hat)
    f1_weighted = f1_score(y_train, y_train_hat, **weighted)
    precision = precision_score(y_train, y_train_hat, **weighted)
    accuracy = model.score(X_train, y_train)

    # Layout (leading blank line, 8-space indent, trailing spaces) is kept
    # exactly as the report has always been printed.
    report = (
        "\n"
        f"        Recall Score: {recall} \n"
        f"        Hamming Loss: {ham_loss} \n"
        f"        f1 Score: {f1_weighted} \n"
        f"        Precision Score: {precision}\n"
        f"        Accuracy: {accuracy}"
    )
    print(report)

In [16]:
# Bagging ensemble: 200 base estimators, each fit on 40% of the samples and
# 60% of the features (max_samples / max_features given as fractions).
# random_state pins the random sample/feature draws so the scores printed
# below are the same on every run.
bt2 = BaggingClassifier(max_features=0.6, max_samples=0.4,
                        n_estimators=200, random_state=42)

bt2.fit(X_train, y_train)

# NOTE: these predictions are made on the data the model was fit on, so the
# metrics reported by get_score here are training scores and will be
# optimistic compared with hold-out or cross-validated results.
y_train_pred_bt2 = bt2.predict(X_train)

get_score(bt2, X_train, y_train, y_train_pred_bt2)


        Recall Score: 0.904736251402918 
        Hamming Loss: 0.09526374859708193 
        f1 Score: 0.9010870919337772 
        Precision Score: 0.9061715069016353
        Accuracy: 0.904736251402918


In [17]:
# 7-fold cross-validation of the bagging model on the training split;
# report the mean of the per-fold scores as a percentage.
bt2_cv_score = cross_val_score(bt2, X_train, y_train, cv=7)
mean_bt2_cv_score = bt2_cv_score.mean()
print("Mean Cross Validation Best Param Score: {:.2%}".format(mean_bt2_cv_score))



Mean Cross Validation Best Param Score: 79.94%
