In [27]:
from arff2pandas import a2p
import pandas as pd
import numpy as np


Treinando ensembles para a base JM1

In [28]:
with open('jm1.arff') as f:
    df = a2p.load(f)
    print(df)

       loc@NUMERIC  v(g)@NUMERIC  ev(g)@NUMERIC  iv(g)@NUMERIC  n@NUMERIC  \
0              1.1           1.4            1.4            1.4        1.3   
1              1.0           1.0            1.0            1.0        1.0   
2             72.0           7.0            1.0            6.0      198.0   
3            190.0           3.0            1.0            3.0      600.0   
4             37.0           4.0            1.0            4.0      126.0   
5             31.0           2.0            1.0            2.0      111.0   
6             78.0           9.0            5.0            4.0        0.0   
7              8.0           1.0            1.0            1.0       16.0   
8             24.0           2.0            1.0            2.0        0.0   
9            143.0          22.0           20.0           10.0        0.0   
10            73.0          10.0            4.0            6.0        0.0   
11            83.0          11.0           10.0            7.0        0.0   

Base com missing values, preencher com a media -> 4 valores nan

In [29]:
df = df.interpolate()

In [30]:
input_features = df.drop(["defects@{false,true}"], axis=1)

Base desbalanceada -> lembrar de pegar k-fold estratificado

In [31]:
output_class = df["defects@{false,true}"]
len(output_class[output_class == "true"])

2106

In [32]:
len(output_class[output_class == "false"])

8779

In [33]:
output_class = np.where(output_class == 'true', 1, 0)

from imblearn.over_sampling import SMOTE
input_features, output_class = SMOTE().fit_sample(input_features, output_class)

In [34]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [35]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer

#Taxa de negativos verdadeiros
def specificity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn+fp)
    return specificity

#Tava de positivos verdadeiros
def sensitivity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp+fn)
    return sensitivity

#Media geometrica entre as taxas
def gmean(y_true, y_pred):
    import math
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn+fp)
    sensitivity = tp / (tp+fn)
    return (specificity*sensitivity)**(1/2)



In [36]:
def get_bagging_results(input_features, output_class):
    import warnings
    warnings.filterwarnings("ignore")

    scoring = {'acc': 'accuracy',
               'auc': 'roc_auc',
               #'specificity': make_scorer(specificity),
               #'sensitivity': make_scorer(sensitivity),
               'gmean': make_scorer(gmean),
               #'precision': 'precision',
               #'recall': 'recall',
               'f1': 'f1'

    }

    print("----Bagging----")
    start = True
    for name, clf in [("Decision Tree", DecisionTreeClassifier()), ("Perceptron", Perceptron())]:
        print(name)
        for perc in [1.0]:
            es = BaggingClassifier(base_estimator=clf, n_estimators=100, 
                           max_samples=perc, max_features=1.0, bootstrap=True, 
                           bootstrap_features=False, n_jobs=4)

            scores = cross_validate(es, input_features, output_class, scoring=scoring,
                             cv=10, return_train_score=False)

            if(start):          
                start = False


                for key in scores.keys():
                    print(key, end='\t')
                print()

            print(perc, end='\t')
            for val in scores.values():
                print("{}+-({})".format(str(round(np.mean(val), 2)), str(round(np.std(val), 2))), end='\t')
            print()
            

In [37]:
get_bagging_results(input_features, output_class)

----Bagging----
Decision Tree
fit_time	score_time	test_acc	test_auc	test_gmean	test_f1	
1.0	5.68+-(0.32)	0.88+-(0.2)	0.86+-(0.11)	0.93+-(0.1)	0.85+-(0.13)	0.84+-(0.16)	
Perceptron
1.0	0.79+-(0.09)	1.18+-(0.09)	0.62+-(0.04)	0.62+-(0.05)	0.61+-(0.04)	0.61+-(0.03)	


In [42]:
def get_random_subspace_results(input_features, output_class):
    import warnings
    warnings.filterwarnings("ignore")

    scoring = {'acc': 'accuracy',
               'auc': 'roc_auc',
               #'specificity': make_scorer(specificity),
               #'sensitivity': make_scorer(sensitivity),
               'gmean': make_scorer(gmean),
               #'precision': 'precision',
               #'recall': 'recall',
               'f1': 'f1'

    }
    
    print("----Random Subspace----")
    start = True
    for name, clf in [("Decision Tree", DecisionTreeClassifier()), ("Perceptron", Perceptron())]:
        print(name)
        es = BaggingClassifier(base_estimator=clf, n_estimators=100, 
                       max_samples=1.0, max_features=0.5, bootstrap=False, 
                       bootstrap_features=True, n_jobs=2)

        scores = cross_validate(es, input_features, output_class, scoring=scoring,
                         cv=10, return_train_score=False)

        if(start):          
            start = False
            for key in scores.keys():
                print(key, end='\t')
            print()

        for val in scores.values():
            print("{}+-({})".format(str(round(np.mean(val), 2)), str(round(np.std(val), 2))), end='\t')
        print()

Treinando ensembles para a base CM1

In [12]:
with open('cm1.arff') as f:
    cm1 = a2p.load(f)
    print(cm1)

     loc@NUMERIC  v(g)@NUMERIC  ev(g)@NUMERIC  iv(g)@NUMERIC  n@NUMERIC  \
0            1.1           1.4            1.4            1.4        1.3   
1            1.0           1.0            1.0            1.0        1.0   
2           24.0           5.0            1.0            3.0       63.0   
3           20.0           4.0            4.0            2.0       47.0   
4           24.0           6.0            6.0            2.0       72.0   
5           24.0           6.0            6.0            2.0       72.0   
6            7.0           1.0            1.0            1.0       11.0   
7           12.0           2.0            1.0            2.0       23.0   
8           25.0           5.0            5.0            5.0      107.0   
9           46.0          15.0            3.0            1.0      239.0   
10          34.0           5.0            5.0            1.0      155.0   
11          10.0           2.0            1.0            1.0       35.0   
12          23.0         

In [13]:
cm1_input = cm1.drop(["defects@{false,true}"], axis=1)
cm1_output = cm1["defects@{false,true}"]

In [14]:
len(cm1_output[cm1_output == "true"])

49

In [15]:
len(cm1_output[cm1_output == "false"])

449

In [16]:
cm1_output = np.where(cm1_output == 'true', 1, 0)
cm1_input, cm1_output = SMOTE().fit_sample(cm1_input, cm1_output)

In [21]:
get_bagging_results(cm1_input, cm1_output)

----Bagging----
Decision Tree
fit_time	score_time	test_acc	test_auc	test_gmean	test_f1	
0.5	0.3+-(0.02)	0.82+-(0.04)	0.91+-(0.04)	0.97+-(0.04)	0.91+-(0.04)	0.92+-(0.04)	
0.6	0.35+-(0.07)	0.81+-(0.07)	0.92+-(0.05)	0.97+-(0.03)	0.91+-(0.05)	0.92+-(0.05)	
0.7	0.33+-(0.06)	0.8+-(0.04)	0.92+-(0.04)	0.97+-(0.03)	0.91+-(0.04)	0.92+-(0.04)	
0.8	0.42+-(0.06)	0.83+-(0.06)	0.92+-(0.04)	0.98+-(0.03)	0.92+-(0.04)	0.92+-(0.04)	
0.9	0.39+-(0.05)	0.85+-(0.06)	0.92+-(0.04)	0.98+-(0.03)	0.92+-(0.04)	0.92+-(0.04)	
1.0	0.39+-(0.03)	0.8+-(0.08)	0.92+-(0.05)	0.98+-(0.03)	0.91+-(0.05)	0.92+-(0.05)	
Perceptron
0.5	0.19+-(0.02)	0.81+-(0.06)	0.5+-(0.01)	0.47+-(0.23)	0.02+-(0.06)	0.67+-(0.0)	
0.6	0.25+-(0.06)	0.83+-(0.08)	0.5+-(0.01)	0.5+-(0.23)	0.04+-(0.08)	0.67+-(0.0)	
0.7	0.22+-(0.06)	0.78+-(0.05)	0.5+-(0.01)	0.48+-(0.23)	0.02+-(0.06)	0.67+-(0.0)	
0.8	0.24+-(0.06)	0.84+-(0.06)	0.5+-(0.01)	0.4+-(0.18)	0.02+-(0.06)	0.67+-(0.0)	
0.9	0.19+-(0.03)	0.79+-(0.05)	0.5+-(0.01)	0.42+-(0.22)	0.02+-(0.06)	0.67+-(0.0)	
1.0

In [43]:
get_random_subspace_results(cm1_input, cm1_output)

----Random Subspace----
Decision Tree
fit_time	score_time	test_acc	test_auc	test_gmean	test_f1	
0.36+-(0.01)	0.74+-(0.03)	0.95+-(0.06)	0.99+-(0.02)	0.95+-(0.06)	0.95+-(0.07)	
Perceptron
0.17+-(0.03)	0.7+-(0.02)	0.55+-(0.03)	0.48+-(0.23)	0.31+-(0.13)	0.68+-(0.01)	
