In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn import svm

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

import joblib


In [2]:
def print_confusion_matrix(y_true, y_pred, classifier_name,
                           out_dir='Dokumenty/kolejnosc5', labels=(0, 1, 2)):
    """Save a confusion-matrix heatmap (PNG) and a classification report (CSV).

    Despite its name this function prints nothing; it writes two files:
    ``<out_dir>/confusion_matrix_<classifier_name>.png`` and
    ``<out_dir>/classification_report_<classifier_name>.csv``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels, passed unchanged to
        ``confusion_matrix`` and ``classification_report``.
    classifier_name : str
        Suffix used in both output file names.
    out_dir : str, default 'Dokumenty/kolejnosc5'
        Directory the two files are written to. It is not created here.
    labels : sequence, default (0, 1, 2)
        Class labels (and their order) used for the confusion-matrix
        rows/columns. The classification report still derives its labels
        from the data.
    """
    cmx_data = confusion_matrix(y_true, y_pred, labels=list(labels))
    df_cmx = pd.DataFrame(cmx_data)

    fig, ax = plt.subplots(figsize=(2, 2))
    try:
        # Draw on the axes created above instead of pyplot's implicit
        # "current axes", so unrelated open figures cannot be affected.
        sns.heatmap(df_cmx, annot=True, fmt='g', square=False, ax=ax)
        fig.savefig(out_dir + '/confusion_matrix_' + classifier_name + '.png')
    finally:
        # Always release the figure, even when plotting or saving fails;
        # otherwise figures pile up when this is called repeatedly.
        plt.close(fig)

    report = classification_report(y_true, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    report_df.to_csv(out_dir + '/classification_report_' + classifier_name + '.csv')


def hgbt_train(X, Y, i, random_state=0):
    """Cross-validate a HistGradientBoostingClassifier with 5 shuffled folds.

    Every fold's fitted model is pickled to
    ``Dokumenty/kolejnosc5/hgbt_<fold>.pkl`` (folds are numbered from 1).
    The path depends only on the fold number, so a later call replaces the
    files written by an earlier one. The out-of-fold predictions of all folds
    are pooled, scored, and handed to ``print_confusion_matrix``.

    The score is the balanced accuracy, i.e. the same metric that
    ``svn_train``, ``rf_train`` and ``knn_train`` return, so the results of
    the four classifiers are directly comparable (this function previously
    returned plain accuracy).

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix (rows are selected with ``X[idx, :]``).
    Y : numpy.ndarray
        Class labels, 1-D or a single column; flattened with ``ravel()``
        before fitting.
    i : int
        Tag embedded in the report name; the notebook passes the number of
        feature columns in ``X``.
    random_state : int or None, default 0
        Seed forwarded to the classifier so repeated runs are reproducible.

    Returns
    -------
    float
        Balanced accuracy of the pooled out-of-fold predictions.
    """
    kfold = KFold(n_splits=5, shuffle=True, random_state=0)
    # Float buffers shaped like Y: the labels therefore reach the metrics and
    # the report as floats, exactly as before.
    all_preds = np.zeros(Y.shape)
    all_tru = np.zeros(Y.shape)
    start = 0
    for fold_no, (train, test) in enumerate(kfold.split(X, Y), start=1):
        model = HistGradientBoostingClassifier(random_state=random_state)
        model.fit(X[train, :], Y[train].ravel())
        preds_test = model.predict(X[test, :])
        joblib.dump(model, "Dokumenty/kolejnosc5/hgbt_" + str(fold_no) + ".pkl")

        # Append this fold's results right after the previous fold's.
        stop = start + len(preds_test)
        # predict() returns a 1-D vector; reshape so it also fits a column-shaped Y.
        all_preds[start:stop] = preds_test.reshape(all_preds[start:stop].shape)
        all_tru[start:stop] = Y[test]
        start = stop

    acc = metrics.balanced_accuracy_score(all_tru, all_preds)
    name = "HGBT_" + str(i) + "_" + str(acc)
    print_confusion_matrix(all_tru, all_preds, name)
    return acc


def svn_train(X, Y, gamma, C, i):
    """Cross-validate an ``svm.SVC`` with 5 shuffled folds (split seed 0).

    Each fold's fitted model is pickled to
    ``Dokumenty/kolejnosc5/svn_<fold>.pkl`` (folds numbered from 1). The
    out-of-fold predictions of all folds are pooled, scored with balanced
    accuracy, and passed to ``print_confusion_matrix`` under a name that
    encodes ``i``, ``C``, ``gamma`` and the score.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix (rows are selected with ``X[idx, :]``).
    Y : numpy.ndarray
        Class labels, 1-D or a single column; flattened with ``ravel()``
        before fitting.
    gamma, C
        Forwarded unchanged to ``svm.SVC``.
    i : int
        Tag embedded in the report name; the notebook passes the number of
        feature columns in ``X``.

    Returns
    -------
    float
        Balanced accuracy of the pooled out-of-fold predictions.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=0)
    # Float buffers with the same shape as Y, filled fold after fold.
    pooled_pred = np.zeros(Y.shape)
    pooled_true = np.zeros(Y.shape)
    offset = 0
    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(X, Y), start=1):
        clf = svm.SVC(gamma=gamma, C=C)
        clf.fit(X[train_idx, :], Y[train_idx].ravel())

        fold_pred = clf.predict(X[test_idx, :])
        joblib.dump(clf, "Dokumenty/kolejnosc5/svn_" + str(fold_no) + ".pkl")

        end = offset + len(fold_pred)
        # predict() yields a 1-D vector; match the buffer slice's shape so a
        # column-shaped Y is handled as well.
        pooled_pred[offset:end] = fold_pred.reshape(pooled_pred[offset:end].shape)
        pooled_true[offset:end] = Y[test_idx]
        offset = end

    acc = metrics.balanced_accuracy_score(pooled_true, pooled_pred)
    name = "_".join(["SVN", str(i), "C", str(C), "gamma", str(gamma), str(acc)])
    print_confusion_matrix(pooled_true, pooled_pred, name)
    return acc


def rf_train(X, Y, crit, i, random_state=0):
    """Cross-validate a RandomForestClassifier with 5 shuffled folds.

    Every fold's fitted model is pickled to
    ``Dokumenty/kolejnosc5/rf_<fold>.pkl`` (folds are numbered from 1). The
    path depends only on the fold number, so a later call replaces the files
    written by an earlier one. The out-of-fold predictions of all folds are
    pooled, scored with balanced accuracy, and handed to
    ``print_confusion_matrix``.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix (rows are selected with ``X[idx, :]``).
    Y : numpy.ndarray
        Class labels, 1-D or a single column; flattened with ``ravel()``
        before fitting.
    crit : str
        Split criterion forwarded to ``RandomForestClassifier`` and embedded
        in the report name.
    i : int
        Tag embedded in the report name; the notebook passes the number of
        feature columns in ``X``.
    random_state : int or None, default 0
        Seed forwarded to the forest. Without a seed the forest differs on
        every run, so the score and the pickled models could not be
        reproduced even though the fold split itself is seeded. Pass ``None``
        to get the previous unseeded behaviour.

    Returns
    -------
    float
        Balanced accuracy of the pooled out-of-fold predictions.
    """
    kfold = KFold(n_splits=5, shuffle=True, random_state=0)
    # Float buffers shaped like Y: the labels therefore reach the metrics and
    # the report as floats, exactly as before.
    all_preds = np.zeros(Y.shape)
    all_tru = np.zeros(Y.shape)
    start = 0
    for fold_no, (train, test) in enumerate(kfold.split(X, Y), start=1):
        model = RandomForestClassifier(criterion=crit, random_state=random_state)
        model.fit(X[train, :], Y[train].ravel())
        preds_test = model.predict(X[test, :])
        joblib.dump(model, "Dokumenty/kolejnosc5/rf_" + str(fold_no) + ".pkl")

        # Append this fold's results right after the previous fold's.
        stop = start + len(preds_test)
        # predict() returns a 1-D vector; reshape so it also fits a column-shaped Y.
        all_preds[start:stop] = preds_test.reshape(all_preds[start:stop].shape)
        all_tru[start:stop] = Y[test]
        start = stop

    acc = metrics.balanced_accuracy_score(all_tru, all_preds)
    name = "RF_" + str(i) + "_criterion_" + crit + "_" + str(acc)
    print_confusion_matrix(all_tru, all_preds, name)
    return acc


def knn_train(X, Y, n, i):
    """Cross-validate a KNeighborsClassifier with 5 shuffled folds (split seed 0).

    Each fold's fitted model is pickled to
    ``Dokumenty/kolejnosc5/knn_<fold>.pkl`` (folds numbered from 1). The
    out-of-fold predictions of all folds are pooled, scored with balanced
    accuracy, and passed to ``print_confusion_matrix`` under a name that
    encodes ``i``, ``n`` and the score.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix (rows are selected with ``X[idx, :]``).
    Y : numpy.ndarray
        Class labels, 1-D or a single column; flattened with ``ravel()``
        before fitting.
    n : int
        Number of neighbours (``n_neighbors``).
    i : int
        Tag embedded in the report name; the notebook passes the number of
        feature columns in ``X``.

    Returns
    -------
    float
        Balanced accuracy of the pooled out-of-fold predictions.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=0)

    # Float buffers with the same shape as Y, filled fold after fold.
    pooled_pred = np.zeros(Y.shape)
    pooled_true = np.zeros(Y.shape)
    offset = 0
    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(X, Y), start=1):
        clf = KNeighborsClassifier(n_neighbors=n, algorithm='auto')
        clf.fit(X[train_idx, :], Y[train_idx].ravel())

        fold_pred = clf.predict(X[test_idx, :])
        joblib.dump(clf, "Dokumenty/kolejnosc5/knn_" + str(fold_no) + ".pkl")

        end = offset + len(fold_pred)
        # predict() yields a 1-D vector; match the buffer slice's shape so a
        # column-shaped Y is handled as well.
        pooled_pred[offset:end] = fold_pred.reshape(pooled_pred[offset:end].shape)
        pooled_true[offset:end] = Y[test_idx]
        offset = end

    acc = metrics.balanced_accuracy_score(pooled_true, pooled_pred)
    name = "_".join(["KNN", str(i), "n", str(n), str(acc)])
    print_confusion_matrix(pooled_true, pooled_pred, name)
    return acc


In [3]:
# Input files for the CHI2 run: feature table and class labels.
path_to_data = 'Dokumenty/kolejnosc5/cechy_episleb_smote_norm_CHI2.csv'
path_to_Y = "Dokumenty/Ynew.csv"

# The feature file is read with its first row as header; the label file has
# no header row.
X_train = pd.read_csv(path_to_data).to_numpy()
# Every label value is shifted down by one
# (presumably 1-based class ids in the CSV -> 0-based; confirm).
Y_train = pd.read_csv(path_to_Y, header=None).to_numpy() - 1

In [10]:
# Each experiment keeps only the first `i` columns of X_train and
# cross-validates one classifier on them.

# HistGradientBoosting, first 83 columns.
i = 83
X_new = X_train[:, :i]
acchgbt = hgbt_train(X=X_new, Y=Y_train, i=i)
print(acchgbt)

# k-nearest neighbours (n=1), first 415 columns.
i = 415
X_new = X_train[:, :i]
accknn = knn_train(X=X_new, Y=Y_train, n=1, i=i)
print(accknn)

# Random forest (gini), first 87 columns.
i = 87
X_new = X_train[:, :i]
accrf = rf_train(X=X_new, Y=Y_train, crit='gini', i=i)
print(accrf)

# SVC (gamma=0.001, C=50), first 426 columns.
i = 426
X_new = X_train[:, :i]
accsvn = svn_train(X=X_new, Y=Y_train, gamma=0.001, C=50, i=i)
print(accsvn)


0.7964912280701755
0.768421052631579
0.7368421052631579
0.8526315789473684


In [62]:
# Input files for the RMRM run: feature table and class labels.
path_to_data = 'Dokumenty/kolejnosc5/cechy_episleb_smote_norm_RMRM.csv'
path_to_Y = "Dokumenty/Ynew.csv"

# The feature file is read with its first row as header; the label file has
# no header row.
X_train = pd.read_csv(path_to_data).to_numpy()
# Every label value is shifted down by one
# (presumably 1-based class ids in the CSV -> 0-based; confirm).
Y_train = pd.read_csv(path_to_Y, header=None).to_numpy() - 1

In [30]:
# Each experiment keeps only the first `i` columns of X_train and
# cross-validates one classifier on them.

# HistGradientBoosting, first 39 columns.
i = 39
X_new = X_train[:, :i]
acchgbt = hgbt_train(X=X_new, Y=Y_train, i=i)
print(acchgbt)

# k-nearest neighbours (n=1), first 36 columns.
i = 36
X_new = X_train[:, :i]
accknn = knn_train(X=X_new, Y=Y_train, n=1, i=i)
print(accknn)

# Random forest (entropy), first 82 columns.
i = 82
X_new = X_train[:, :i]
accrf = rf_train(X=X_new, Y=Y_train, crit='entropy', i=i)
print(accrf)

# SVC (gamma=0.001, C=100), first 293 columns.
i = 293
X_new = X_train[:, :i]
accsvn = svn_train(X=X_new, Y=Y_train, gamma=0.001, C=100, i=i)
print(accsvn)


0.7789473684210526
0.7929824561403508
0.7473684210526316
0.8491228070175438


In [46]:
# Input files for the RelefF run: feature table and class labels.
path_to_data = 'Dokumenty/kolejnosc5/cechy_episleb_smote_norm_RelefF.csv'
path_to_Y = "Dokumenty/Ynew.csv"

# The feature file is read with its first row as header; the label file has
# no header row.
X_train = pd.read_csv(path_to_data).to_numpy()
# Every label value is shifted down by one
# (presumably 1-based class ids in the CSV -> 0-based; confirm).
Y_train = pd.read_csv(path_to_Y, header=None).to_numpy() - 1

In [47]:
# Each experiment keeps only the first `i` columns of X_train and
# cross-validates one classifier on them.

# HistGradientBoosting, first 465 columns.
i = 465
X_new = X_train[:, :i]
acchgbt = hgbt_train(X=X_new, Y=Y_train, i=i)
print(acchgbt)

# k-nearest neighbours (n=1), first 424 columns.
i = 424
X_new = X_train[:, :i]
accknn = knn_train(X=X_new, Y=Y_train, n=1, i=i)
print(accknn)

# Random forest (gini), first 137 columns.
i = 137
X_new = X_train[:, :i]
accrf = rf_train(X=X_new, Y=Y_train, crit='gini', i=i)
print(accrf)

# SVC (gamma=0.001, C=100), first 445 columns.
i = 445
X_new = X_train[:, :i]
accsvn = svn_train(X=X_new, Y=Y_train, gamma=0.001, C=100, i=i)
print(accsvn)


0.7824561403508772
0.7789473684210527
0.743859649122807
0.8456140350877193
