In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import svm

In [2]:
# Load the training and test sets, treating the string 'na' as a missing value.
train = pd.read_csv('aps_failure_training_set.csv', na_values='na')
test = pd.read_csv('aps_failure_test_set.csv', na_values='na')

# Separate the predictor columns from the target column ('class').
train_features = train.drop('class', axis=1)
test_features = test.drop('class', axis=1)

# Encode the target as integers: 'neg' -> 0, 'pos' -> 1.
# Series.map plus an explicit integer cast is used instead of Series.replace:
# replace() depends on implicit object -> int downcasting (deprecated in recent
# pandas releases) and would silently keep any unexpected label as a string.
# With map() an unexpected label becomes NaN, so the cast below fails right here
# instead of surfacing later inside a model.
label_codes = {'neg': 0, 'pos': 1}
train_labels = train['class'].map(label_codes).astype('int64')
test_labels = test['class'].map(label_codes).astype('int64')

In [3]:
# Missing-value analysis: percentage of missing entries per predictor,
# sorted from the most incomplete column to the least.
miss_val = (
    train_features.isnull().sum()
    .div(len(train_features))
    .mul(100)
    .rename('Missing_Percentage')
    .rename_axis('Predictors')
    .reset_index()
    .sort_values('Missing_Percentage', ascending=False)
    .reset_index(drop=True)
)
miss_val.head()

Unnamed: 0,Predictors,Missing_Percentage
0,br_000,82.106667
1,bq_000,81.203333
2,bp_000,79.566667
3,bo_000,77.221667
4,cr_000,77.215


In [4]:
# Discard every predictor whose share of missing values exceeds 35%.
# The same column list is removed from both sets so they keep identical columns.
too_sparse = miss_val['Missing_Percentage'] > 35
dropped = miss_val.loc[too_sparse, 'Predictors'].tolist()
train_features = train_features.drop(columns=dropped)
test_features = test_features.drop(columns=dropped)

In [5]:
# Feature scaling with MinMaxScaler (imported in the first cell).
# The scaler is fitted on the training predictors only and then applied to
# both the training and the test predictors.
scaler = MinMaxScaler().fit(train_features)
train_features = pd.DataFrame(
    scaler.transform(train_features),
    columns=train_features.columns,
)
test_features = pd.DataFrame(
    scaler.transform(test_features),
    columns=test_features.columns,
)

In [6]:
#Imputing Missing Values
# SimpleImputer replaces sklearn.preprocessing.Imputer, which was deprecated in
# scikit-learn 0.20 and removed in 0.22.
from sklearn.impute import SimpleImputer

# The per-column medians are learned from the training set only and are then
# used to fill the gaps in both the training and the test set.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer.fit(train_features)
train_features = pd.DataFrame(imputer.transform(train_features), columns=train_features.columns)
test_features = pd.DataFrame(imputer.transform(test_features), columns=test_features.columns)



In [7]:
# Dimensionality reduction with Principal Component Analysis.
# The value passed to PCA defines how much variance to explain.
explained_variance_target = 0.99
pca = PCA(explained_variance_target).fit(train_features)

# Project both sets onto the components learned from the training set.
best_train_features = pd.DataFrame(pca.transform(train_features))
best_test_features = pd.DataFrame(pca.transform(test_features))

In [8]:
# Report how many components the fitted PCA kept.
n_components_kept = pca.n_components_
print(f'Number of components {n_components_kept}')

Number of components 49


In [32]:
#Dimensionality Reduction using SelectKBest -- chosen over PCA
# Keep the k_best predictors with the highest chi-squared score against the label.
k_best = 80
# `k` is passed by keyword: recent scikit-learn releases accept it only as a
# keyword argument, so the positional form raises a TypeError there.
selectKBest = SelectKBest(chi2, k=k_best)
selectKBest.fit(train_features, train_labels)
# Select by column position (rather than via transform()) so the results stay
# DataFrames that keep their column names and row index.
idxs_selected = selectKBest.get_support(indices=True)
best_train_features = train_features.iloc[:,idxs_selected]
best_test_features = test_features.iloc[:,idxs_selected]

In [33]:
#Balancing the dataset
# Undersample the majority class: keep every positive sample and a fixed-size
# random subset (seeded for reproducibility) of the negative samples.
number_samples = 2500
idxs_pos = train_labels[train_labels==1].index
idxs_neg = train_labels[train_labels==0].sample(n=number_samples, replace=False, random_state=0).index
idxs_balanced = np.concatenate((idxs_pos,idxs_neg))
best_train_features_balanced = best_train_features.loc[idxs_balanced]
train_labels_balanced = train_labels.loc[idxs_balanced]
# Derive the ratio from the actual number of positives instead of a hard-coded
# 1000, so the message stays correct if the data or the sampling changes.
print(f'Proportion balanced: {number_samples/len(idxs_pos)}/1')

Proportion balanced: 2.5/1


In [36]:
# Classification with gradient boosting.
# Hyper-parameters are chosen by a 5-fold cross-validated grid search that is
# scored on recall and trained on the balanced training set.
params = [{
    'loss': ['deviance', 'exponential'],
    'n_estimators': range(50, 71, 10),
    'min_samples_split': [2, 100, 500],
    'min_samples_leaf': [5, 10, 50],
}]
gbcc = GridSearchCV(
    GradientBoostingClassifier(random_state=0),
    params,
    cv=5,
    scoring='recall',
    verbose=10,
    n_jobs=3,
)
gbcc.fit(best_train_features_balanced, train_labels_balanced)
# From here on `gbc` refers to the fitted grid search, not to a bare classifier.
gbc = gbcc
display(gbc)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    3.7s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    9.6s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   13.9s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   20.8s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   27.7s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   35.6s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   44.1s
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:   53.3s
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:  1.3min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  1.5min
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  1.9min
[Parallel(n_jobs=3)]: Done 139 tasks      | elapsed:  2.2min
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:  2.5min
[Parallel(

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no...
                                                  presort='auto',
  

In [38]:
# Evaluate the gradient boosting model on the test set and compute its cost.
y_pred = gbc.predict(best_test_features)
report = classification_report(test_labels, y_pred)
print(report)

# Lay the confusion matrix out as a single labelled row: tn, fp, fn, tp.
cm = pd.DataFrame(
    confusion_matrix(test_labels, y_pred).reshape(1, 4),
    columns=['tn', 'fp', 'fn', 'tp'],
)

# The cost weights each false positive at 10 and each false negative at 500.
total_cost = cm['fp'] * 10 + cm['fn'] * 500
print(f'Total cost is: {float(total_cost.values[0])}')

              precision    recall  f1-score   support

           0       1.00      0.97      0.98     15625
           1       0.42      0.96      0.59       375

    accuracy                           0.97     16000
   macro avg       0.71      0.96      0.79     16000
weighted avg       0.99      0.97      0.97     16000

Total cost is: 12400.0


In [39]:
# Classification with a random forest.
params = [{
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
    'n_estimators': range(100, 201, 25),
    'max_depth': [10, 20, 30],
}]
# Run the grid search with 5-fold cross-validation, scored on recall.
rfcc = GridSearchCV(
    RandomForestClassifier(random_state=0),
    params,
    cv=5,
    scoring='recall',
    verbose=10,
    n_jobs=3,
)
rfcc.fit(best_train_features_balanced, train_labels_balanced)
# From here on `rfc` refers to the fitted grid search, not to a bare classifier.
rfc = rfcc

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    3.7s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   11.2s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   16.8s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   30.8s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   45.3s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   55.0s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:  2.1min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  2.5min
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:  2.8min
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  3.4min
[Parallel(n_jobs=3)]: Done 139 tasks      | elapsed:  3.7min
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:  4.3min
[Parallel(

In [40]:
# Evaluate the random forest model on the test set and compute its cost.
y_pred = rfc.predict(best_test_features)
report = classification_report(test_labels, y_pred)
print(report)

# Lay the confusion matrix out as a single labelled row: tn, fp, fn, tp.
cm = pd.DataFrame(
    confusion_matrix(test_labels, y_pred).reshape(1, 4),
    columns=['tn', 'fp', 'fn', 'tp'],
)

# The cost weights each false positive at 10 and each false negative at 500.
total_cost = cm['fp'] * 10 + cm['fn'] * 500
print(f'Total cost is: {float(total_cost.values[0])}')

              precision    recall  f1-score   support

           0       1.00      0.97      0.98     15625
           1       0.42      0.97      0.59       375

    accuracy                           0.97     16000
   macro avg       0.71      0.97      0.79     16000
weighted avg       0.99      0.97      0.97     16000

Total cost is: 10460.0


In [41]:
# Classification with a support vector machine, tuned by a 5-fold
# cross-validated grid search scored on recall.
# NOTE(review): gamma is varied for the 'linear' kernel as well, which
# presumably ignores it, so part of the grid may be redundant fits. Confirm
# before trimming, since removing candidates can change tie-breaking.
params = [{
    'kernel': ['rbf', 'linear'],
    'gamma': [0.01, 0.05, 0.1, 0.2, 0.3, 0.4],
    'C': [0.001, 0.01, 0.1, 1, 10],
}]
svmc = GridSearchCV(
    svm.SVC(C=1),
    params,
    cv=5,
    scoring='recall',
    verbose=10,
    n_jobs=3,
)
svmc.fit(best_train_features_balanced, train_labels_balanced)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    2.2s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    5.1s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    7.6s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   11.8s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   15.7s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   21.5s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   26.3s
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:   33.0s
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:   38.6s
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:   44.3s
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:   50.3s
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:   56.5s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  1.0min
[Parallel(n_jobs=3)]: Done 139 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:  1.2min
[Parallel(

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=3,
             param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10],
                          'gamma': [0.01, 0.05, 0.1, 0.2, 0.3, 0.4],
                          'kernel': ['rbf', 'linear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=10)

In [42]:
# Evaluate the SVM model on the test set and compute its cost.
y_pred = svmc.predict(best_test_features)
report = classification_report(test_labels, y_pred)
print(report)

# Lay the confusion matrix out as a single labelled row: tn, fp, fn, tp.
cm = pd.DataFrame(
    confusion_matrix(test_labels, y_pred).reshape(1, 4),
    columns=['tn', 'fp', 'fn', 'tp'],
)

# The cost weights each false positive at 10 and each false negative at 500.
total_cost = cm['fp'] * 10 + cm['fn'] * 500
print(f'Total cost is: {float(total_cost.values[0])}')

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     15625
           1       0.48      0.92      0.63       375

    accuracy                           0.97     16000
   macro avg       0.74      0.95      0.81     16000
weighted avg       0.99      0.97      0.98     16000

Total cost is: 18720.0


In [43]:
#Building a Neural Network
import keras
from keras.models import Sequential
from keras.layers import Dense

# Size the input layer from the training matrix instead of hard-coding 80, so
# the network keeps matching the data if the number of selected features changes.
n_inputs = best_train_features_balanced.shape[1]

# Two hidden layers of 6 ReLU units each, followed by one sigmoid output unit.
classifier = Sequential()
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = n_inputs))
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [44]:
# Train the network on the balanced training set (10 epochs, batches of 10).
classifier.fit(
    x=best_train_features_balanced,
    y=train_labels_balanced,
    batch_size=10,
    epochs=10,
)
# Raw network outputs for the test set.
y_pred = classifier.predict(best_test_features)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [45]:
# Evaluate the neural network on the test set and compute its cost.
# Outputs above the 0.5 cut-off count as positive predictions.
y_pred = y_pred > 0.5
report = classification_report(test_labels, y_pred)
print(report)

# Lay the confusion matrix out as a single labelled row: tn, fp, fn, tp.
cm = pd.DataFrame(
    confusion_matrix(test_labels, y_pred).reshape(1, 4),
    columns=['tn', 'fp', 'fn', 'tp'],
)

# The cost weights each false positive at 10 and each false negative at 500.
total_cost = cm['fp'] * 10 + cm['fn'] * 500
print(f'Total cost is: {float(total_cost.values[0])}')

              precision    recall  f1-score   support

           0       1.00      0.96      0.98     15625
           1       0.36      0.96      0.52       375

    accuracy                           0.96     16000
   macro avg       0.68      0.96      0.75     16000
weighted avg       0.98      0.96      0.97     16000

Total cost is: 14390.0


In [34]:
# Classification with XGBoost, tuned by a 5-fold cross-validated grid search
# scored on recall.
from xgboost import XGBClassifier

params = [{
    'max_depth': [3, 4, 5],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1],
}]
xgb = GridSearchCV(
    XGBClassifier(),
    params,
    cv=5,
    scoring='recall',
    verbose=10,
    n_jobs=3,
)
xgb.fit(best_train_features_balanced, train_labels_balanced)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    4.6s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   14.2s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   26.5s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   40.3s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  1.0min
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:  2.3min
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  2.7min
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:  3.3min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  4.0min
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:  4.6min
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  5.3min
[Parallel(n_jobs=3)]: Done 135 out of 135 | elapsed:  6.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=3,
             param_grid=[{'learning_rate': [0.01, 0.05, 0.1],
                          'max_depth': [3, 4, 5],
                          'n_estimators': [50, 100, 150]}]

In [35]:
# Evaluate the XGBoost model on the test set and compute its cost.
y_pred = xgb.predict(best_test_features)
report = classification_report(test_labels, y_pred)
print(report)

# Lay the confusion matrix out as a single labelled row: tn, fp, fn, tp.
cm = pd.DataFrame(
    confusion_matrix(test_labels, y_pred).reshape(1, 4),
    columns=['tn', 'fp', 'fn', 'tp'],
)

# The cost weights each false positive at 10 and each false negative at 500.
total_cost = cm['fp'] * 10 + cm['fn'] * 500
print(f'Total cost is: {float(total_cost.values[0])}')

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     15625
           1       0.49      0.97      0.65       375

    accuracy                           0.98     16000
   macro avg       0.75      0.97      0.82     16000
weighted avg       0.99      0.98      0.98     16000

Total cost is: 10230.0
