In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix,roc_auc_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest,chi2,RFE
from sklearn.preprocessing import Imputer,Normalizer,MinMaxScaler

ModuleNotFoundError: No module named 'imblearn'

In [513]:
raw_data, meta = arff.loadarff('data/1year.arff')
data = pd.DataFrame(raw_data)
data

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr56,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class
0,0.200550,0.379510,0.396410,2.04720,32.3510,0.388250,0.249760,1.330500,1.13890,0.504940,...,0.121960,0.397180,0.87804,0.001924,8.4160,5.1372,82.65800,4.41580,7.42770,b'0'
1,0.209120,0.499880,0.472250,1.94470,14.7860,0.000000,0.258340,0.996010,1.69960,0.497880,...,0.121300,0.420020,0.85300,0.000000,4.1486,3.2732,107.35000,3.40000,60.98700,b'0'
2,0.248660,0.695920,0.267130,1.55480,-1.1523,0.000000,0.309060,0.436950,1.30900,0.304080,...,0.241140,0.817740,0.76599,0.694840,4.9909,3.9510,134.27000,2.71850,5.20780,b'0'
3,0.081483,0.307340,0.458790,2.49280,51.9520,0.149880,0.092704,1.866100,1.05710,0.573530,...,0.054015,0.142070,0.94598,0.000000,4.5746,3.6147,86.43500,4.22280,5.54970,b'0'
4,0.187320,0.613230,0.229600,1.40630,-7.3128,0.187320,0.187320,0.630700,1.15590,0.386770,...,0.134850,0.484310,0.86515,0.124440,6.3985,4.3158,127.21000,2.86920,7.89800,b'0'
5,0.228220,0.497940,0.359690,1.75020,-47.7170,0.000000,0.281390,1.008300,1.97860,0.502060,...,0.139320,0.454570,0.85891,0.023002,3.4028,8.9949,88.44400,4.12690,12.29900,b'0'
6,0.111090,0.647440,0.289710,1.47050,2.5349,0.000000,0.111090,0.544540,1.73480,0.352560,...,0.605900,0.315100,0.40871,0.000000,6.3222,2.9098,129.55000,2.81730,18.35200,b'0'
7,0.532320,0.027059,0.705540,53.95400,299.5800,0.000000,0.652400,35.957000,0.65273,0.972940,...,0.086730,0.547130,0.49521,0.013194,9.1300,82.0500,7.45030,48.99100,2.32170,b'0'
8,0.009020,0.632020,0.053735,1.12630,-37.8420,0.000000,0.014434,0.582230,1.33320,0.367980,...,0.180110,0.024512,0.84165,0.340940,9.9665,4.2382,116.50000,3.13300,2.56030,b'0'
9,0.124080,0.838370,0.142040,1.16940,-91.8830,0.000000,0.153280,0.192790,2.11560,0.161630,...,0.079665,0.767680,0.92847,0.000000,3.3192,6.4994,144.63000,2.52360,107.67000,b'0'


This function uses SkLearn GridSearch to test out all the possible combinates defined in the parameters, to determine which features generate the best results for the strategy defined in the pipeline.

In [514]:
def select_best_combination(pipe, parameters, X_train, y_train):
    CV = GridSearchCV(pipe, parameters, scoring = 'roc_auc', n_jobs= 1)
    CV.fit(X_train, y_train)

    print('Best score and parameter combination = ')

    print(CV.best_score_)    
    print(CV.best_params_)   

    predictions = CV.predict_proba(X_test)
    return predictions

Here, the function is retrieving the predictions and classifying those with a value higher than the threshold to 1. The rest are classified as 0.

In [515]:
def categorize(predictions,threshold):
    result = []
    proba = []

    for i in predictions:
        proba.append(i[1])
        if i[1] > threshold:
            result.append(1)
        else:
            result.append(0)

    view = pd.DataFrame(np.vstack((proba, y_test)).T,columns=['Predicted Outcomes','Actual Outcomes'])

    return result

This function creates a confusion matrix, and calculates all the relating metrics to determine the performance of that strategy's test.

In [516]:
def results_metrics(y_test, result):
    conf_matrix = confusion_matrix(y_test, result)

    accuracy = np.sum(result == y_test) / len(result) * 100
    precision = conf_matrix[0,0] / (conf_matrix[0,0] + conf_matrix[0,1]) * 100
    recall = conf_matrix[0,0] / (conf_matrix[0,0] + conf_matrix[1,0]) * 100
    specificity = conf_matrix[1,1] / (conf_matrix[1,1] + conf_matrix[0,1]) * 100

    print(conf_matrix)
    print("Accuracy: {0:.2f}%".format(accuracy))
    print("Precision: {0:.2f}%".format(precision))
    print("Recall: {0:.2f}%".format(recall))
    print("Specificity: {0:.2f}%".format(specificity))
    
    return accuracy,precision,recall,specificity

Generate the ROC(Receiver Operating Characteristic) AUC (Area Under Curve) score.

In [517]:
def roc_score(y_test, predictions):

    score = roc_auc_score(y_test, predictions)
    print("ROC AUC score: {}".format(score))
    
    return score

In [None]:
final_pipeline = CV.best_estimator_
select_indices = final_pipeline.named_steps['selector'].transform(
    np.arange(len(X_train.columns)).reshape(1, -1)
)
feature_names = X_train.columns[select_indices]
feature_names

Here we add the results from the current epoch to the total, allowing us to find the average later.

In [545]:
def add_results_to_strategy(strategy_results, X_train, X_test, y_train, y_test, pipe, parameters):
    
    predictions = select_best_combination(pipe, parameters, X_train, y_train)
    result = categorize(predictions,0.55)
    
    accuracy,precision,recall,specificity = results_metrics(y_test,result)
    score = roc_score(y_test,predictions[:,1])
    
    strategy_results.accuracy += accuracy
    strategy_results.precision += precision
    strategy_results.recall += recall
    strategy_results.specificity += specificity
    strategy_results.roc_auc += score
    
    return strategy_results;

In [546]:
class StrategyResult:
    accuracy = 0
    precision = 0
    recall = 0
    specificity = 0
    roc_auc = 0

Below, a simulation with 5 different strategies is run for a number of times. 

With each epoch, a new training/testing set is retrieved from the global dataset. 

For each strategy, GridSearches are used to determine the best factors for that particular strategy with the given data.

After the simulation is complete, the average results for each strategy is determined, in order to distinguish the best performing strategy.

In [547]:
input = data

input['class'] = input['class'].apply(lambda x: float(x))

strategy_results = []
for i in range(0,5):
    strategy_results.append(StrategyResult())
        

epochs = 3
        
for i in range(0,epochs):
    
    print("\n\n *****************************")
    print("*****  EPOCH {}  *****".format(i))
    print("*****************************\n\n ")
    

    X_train, X_test, y_train, y_test = train_test_split(input.loc[:,input.columns != 'class'], input['class'], test_size=0.3)

    cls = LogisticRegression()
    
    

    print("\n\n **************** Robust Scaling + PCA + RFE + Smote *************")

    pipe = Pipeline([
        ('imp', Imputer(missing_values='NaN', axis=0)),
        ('scaler', preprocessing.RobustScaler()),
        ('pca', PCA()),
        ('selector', RFE(cls)),
        ('smote', SMOTE(random_state=42)),
        ('logistic', cls)])

    parameters = {}
    parameters['imp__strategy'] = ['mean','median']
    parameters['pca__n_components'] = [60,45,30]
    parameters['selector__n_features_to_select'] = [10,15,30]

    strategy_results[0] = add_results_to_strategy(strategy_results[0], X_train, X_test, y_train, y_test, pipe, parameters);
    

    print("\n\n **************** MinMaxScaling + PCA + RFE + Smote *************")

    pipe = Pipeline([
        ('imp', Imputer(missing_values='NaN', axis=0)),
        ('scaler', preprocessing.MinMaxScaler()),
        ('pca', PCA()),
        ('selector', RFE(cls)),
        ('smote', SMOTE(random_state=42)),
        ('logistic', cls)])

    parameters = {}
    parameters['imp__strategy'] = ['mean','median']
    parameters['pca__n_components'] = [60,45,30]
    parameters['selector__n_features_to_select'] = [10,15,30]

    strategy_results[1] = add_results_to_strategy(strategy_results[1], X_train, X_test, y_train, y_test, pipe, parameters);

    
    print("\n\n **************** Robust Scaling + PCA + RFE *************")

    pipe = Pipeline([
        ('imp', Imputer(missing_values='NaN', axis=0)),
        ('scaler', preprocessing.RobustScaler()),
        ('pca', PCA()),
        ('selector', RFE(cls)),
        ('logistic', cls)])

    parameters = {}
    parameters['imp__strategy'] = ['mean','median']
    parameters['pca__n_components'] = [60,45,30]
    parameters['selector__n_features_to_select'] = [10,15,30]

    strategy_results[2] = add_results_to_strategy(strategy_results[2], X_train, X_test, y_train, y_test, pipe, parameters);


    print("\n\n **************** Robust Scaling + RFE + Smote *************")

    pipe = Pipeline([
        ('imp', Imputer(missing_values='NaN', axis=0)),
        ('scaler', preprocessing.RobustScaler()),
        ('selector', RFE(cls)),
        ('smote', SMOTE(random_state=42)),
        ('logistic', cls)])

    parameters = {}
    parameters['imp__strategy'] = ['mean','median']
    parameters['selector__n_features_to_select'] = [10,15,30]

    strategy_results[3] = add_results_to_strategy(strategy_results[3], X_train, X_test, y_train, y_test, pipe, parameters);


    print("\n\n **************** RFE + Smote *************")

    pipe = Pipeline([
        ('imp', Imputer(missing_values='NaN', axis=0)),
        ('selector', RFE(cls)),
        ('smote', SMOTE(random_state=42)),
        ('logistic', cls)])

    parameters = {}
    parameters['imp__strategy'] = ['mean','median']
    parameters['selector__n_features_to_select'] = [10,15,30]

    strategy_results[4] = add_results_to_strategy(strategy_results[4], X_train, X_test, y_train, y_test, pipe, parameters);
    
    
print("\n\n *****************************")
print("*****  RESULTS *****")
print("*****************************\n\n ")
    
for i in range(0,len(strategy_results)):
    print("\n\nStrategy {} results".format(i+1))
    print("Accuracy: {}".format(strategy_results[i].accuracy / epochs))
    print("Precision: {}".format(strategy_results[i].precision / epochs))
    print("Recall: {}".format(strategy_results[i].recall / epochs))
    print("Specificity: {}".format(strategy_results[i].specificity / epochs))
    print("ROC AUC: {}".format(strategy_results[i].roc_auc / epochs))
    
    
    
    
    



 *****************************
*****  EPOCH 0  *****
*****************************

 


 **************** Robust Scaling + PCA + RFE + Smote *************
Best score and parameter combination = 
0.7158025428984746
{'imp__strategy': 'mean', 'pca__n_components': 30, 'selector__n_features_to_select': 30}
[[1966   47]
 [  84   12]]
Accuracy: 93.79%
Precision: 97.67%
Recall: 95.90%
Specificity: 20.34%
ROC AUC score: 0.6744028398741515


 **************** MinMaxScaling + PCA + RFE + Smote *************
Best score and parameter combination = 
0.6729396867907963
{'imp__strategy': 'median', 'pca__n_components': 30, 'selector__n_features_to_select': 30}
[[1706  307]
 [  68   28]]
Accuracy: 82.22%
Precision: 84.75%
Recall: 96.17%
Specificity: 8.36%
ROC AUC score: 0.618417784401391


 **************** Robust Scaling + PCA + RFE *************
Best score and parameter combination = 
0.7321596915263161
{'imp__strategy': 'mean', 'pca__n_components': 30, 'selector__n_features_to_select': 30}
[[1998  