In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

In [2]:
df = pd.read_csv('datasets/processed_dataset.csv')

### Data splitting

In [3]:
random_seed = 47
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=['PumpFailed']), 
                                                df['PumpFailed'], test_size=0.3, random_state=random_seed)

In [4]:
print(f'Train df size, X: {x_train.shape}, Y: {y_train.shape}')
print(f'Test df size, X: {x_test.shape}, Y: {y_test.shape}')

Train df size, X: (18830, 388), Y: (18830,)
Test df size, X: (8070, 388), Y: (8070,)


### Modeling using weights

Set class weights to combat unbalanced data. The `scale_pos_weight` parameter in XGBoost allows to set the weight of a rare class compared to a frequent class.

In [5]:
start = time.time()

ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
xgb_clf = XGBClassifier(objective='binary:logistic', 
                        learning_rate=0.01, 
                        max_depth=10, 
                        subsample=0.5, 
                        colsample_bytree=0.7, 
                        n_estimators=200, 
                        reg_lambda=1.1, 
                        min_child_weight=1,
                        scale_pos_weight = ratio)

xgb_clf.fit(x_train, y_train)

time_in_minutes = round((time.time() - start) / 60, 1)
print('All done. This operation took', time_in_minutes, 'minutes.')

All done. This operation took 0.2 minutes.


#### Metrics

In [6]:
y_pred_train = xgb_clf.predict(x_train)
print('Classification report for training set:')
print(classification_report(y_train, y_pred_train))

Classification report for training set:
              precision    recall  f1-score   support

           0       1.00      0.88      0.94     16769
           1       0.52      1.00      0.68      2061

    accuracy                           0.90     18830
   macro avg       0.76      0.94      0.81     18830
weighted avg       0.95      0.90      0.91     18830



In [7]:
y_pred_test = xgb_clf.predict(x_test)
print('Classification report for testing set:')
print(classification_report(y_test, y_pred_test))

Classification report for testing set:
              precision    recall  f1-score   support

           0       1.00      0.88      0.93      7189
           1       0.50      0.98      0.66       881

    accuracy                           0.89      8070
   macro avg       0.75      0.93      0.80      8070
weighted avg       0.94      0.89      0.90      8070



### Modeling without using weights

In [11]:
start = time.time()

xgb_clf = XGBClassifier(objective='binary:logistic', 
                        learning_rate=0.01, 
                        max_depth=10, 
                        subsample=0.5, 
                        colsample_bytree=0.7, 
                        n_estimators=200, 
                        reg_lambda=1.1, 
                        min_child_weight=1)

xgb_clf.fit(x_train, y_train)

time_in_minutes = round((time.time() - start) / 60, 1)
print('All done. This operation took', time_in_minutes, 'minutes.')

All done. This operation took 0.3 minutes.


In [12]:
y_pred_train = xgb_clf.predict(x_train)
print('Classification report for training set:')
print(classification_report(y_train, y_pred_train))

Classification report for training set:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     16769
           1       0.90      0.63      0.74      2061

    accuracy                           0.95     18830
   macro avg       0.93      0.81      0.86     18830
weighted avg       0.95      0.95      0.95     18830



In [13]:
y_pred_test = xgb_clf.predict(x_test)
print('Classification report for testing set:')
print(classification_report(y_test, y_pred_test))

Classification report for testing set:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      7189
           1       0.82      0.48      0.61       881

    accuracy                           0.93      8070
   macro avg       0.88      0.73      0.79      8070
weighted avg       0.93      0.93      0.92      8070



In [None]:
#comment results

### Hyperparameters Optimization for XGBoost

In [8]:
start = time.time()

xgb_clf = XGBClassifier(objective='binary:logistic', scale_pos_weight = ratio)

param_grid_xgb_clf = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [10],
    'subsample': [0.6, 0.9],
    'colsample_bytree': [0.8],
    'n_estimators': [100, 200],
    'min_child_weight': [3],
    'reg_alpha': [0.1, 0.5],
    'reg_lambda': [1, 1.5]
}

grid_count_gxb_clf = ParameterGrid(param_grid_xgb_clf)
print('Total combinations of parameters: ', len(grid_count_gxb_clf))

grid_search_xgb_clf = GridSearchCV(estimator=xgb_clf, param_grid=param_grid_xgb_clf, cv=5, scoring='f1', n_jobs=-1)
grid_search_xgb_clf.fit(x_train, y_train)

time_in_minutes = round((time.time() - start) / 60, 1)
print('All done. This operation took', time_in_minutes, 'minutes.')

Total combinations of parameters:  32
All done. This operation took 18.5 minutes.


#### Metrics

In [9]:
y_pred_train = grid_search_xgb_clf.predict(x_train)
print('Classification report for training set:')
print(classification_report(y_train, y_pred_train))

Classification report for training set:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     16769
           1       0.84      1.00      0.91      2061

    accuracy                           0.98     18830
   macro avg       0.92      0.99      0.95     18830
weighted avg       0.98      0.98      0.98     18830



In [10]:
y_pred_test = grid_search_xgb_clf.predict(x_test)
print('Classification report for testing set:')
print(classification_report(y_test, y_pred_test))

Classification report for testing set:
              precision    recall  f1-score   support

           0       0.99      0.93      0.96      7189
           1       0.60      0.89      0.72       881

    accuracy                           0.92      8070
   macro avg       0.80      0.91      0.84      8070
weighted avg       0.94      0.92      0.93      8070



In [None]:
#comment rezults