In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn import svm

In [3]:

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

In [4]:
def print_confusion_matrix(y_true, y_pred, classifier_name,
                           cm_dir='Dokumenty/kolejnosc5/cm/releff',
                           report_dir='Dokumenty/kolejnosc5/cm/chi2'):
    """Save a confusion-matrix heatmap (PNG) and a classification report (CSV).

    Nothing is displayed or returned; the function only writes two files.

    Parameters
    ----------
    y_true, y_pred
        True and predicted class labels, passed straight to scikit-learn's
        ``confusion_matrix`` and ``classification_report``.
    classifier_name
        Text embedded in both output file names.
    cm_dir
        Directory that receives ``confusion_matrix_<classifier_name>.png``.
    report_dir
        Directory that receives ``classification_report_<classifier_name>.csv``.

    NOTE(review): the two default directories differ ("releff" vs "chi2")
    because that is what the original hard-coded paths did. It looks like a
    copy/paste leftover -- pass both explicitly so the artefacts of one
    experiment end up together.
    """
    # The label order is fixed, so the matrix is always 3x3 even when a class
    # is missing from a particular prediction set.
    cmx_data = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])
    df_cmx = pd.DataFrame(cmx_data)

    # Draw on an explicit Axes and close that exact Figure afterwards so
    # repeated calls inside a grid search do not accumulate open figures.
    fig, ax = plt.subplots(figsize=(2, 2))
    sns.heatmap(df_cmx, annot=True, fmt='g', square=False, ax=ax)
    fig.savefig(f'{cm_dir}/confusion_matrix_{classifier_name}.png')
    plt.close(fig)

    report = classification_report(y_true, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    report_df.to_csv(f'{report_dir}/classification_report_{classifier_name}.csv')


def hgbt_train(X, Y,i):
    """Cross-validate a HistGradientBoostingClassifier and return its score.

    A seeded 5-fold ``KFold`` (shuffled, ``random_state=0``) is used. The
    out-of-fold predictions of all folds are pooled and scored once.

    Parameters
    ----------
    X
        Feature matrix; must support ``X[rows, :]`` indexing.
    Y
        Class labels, one per row of ``X``. Flattened to 1-D before use, so
        both ``(n,)`` and ``(n, 1)`` inputs work.
    i
        Not used by the computation; only referenced by the optional
        confusion-matrix code that is commented out below.

    Returns
    -------
    float
        Balanced accuracy over the pooled out-of-fold predictions. This
        function previously used plain accuracy, unlike ``svn_train``,
        ``rf_train`` and ``knn_train``; it now uses the same metric so the
        ``acc`` values of the four models are comparable.
    """
    kfold = KFold(n_splits=5, shuffle=True, random_state=0)
    y_true = np.asarray(Y).ravel()
    # Every sample falls into exactly one test fold, so each slot of this
    # buffer is overwritten exactly once.
    y_pred = np.empty_like(y_true)
    for train, test in kfold.split(X):
        model = HistGradientBoostingClassifier().fit(X[train, :], y_true[train])
        # Store predictions at the samples' own positions; the metric only
        # needs y_true / y_pred to be aligned, not ordered by fold.
        y_pred[test] = model.predict(X[test, :])
    acc = metrics.balanced_accuracy_score(y_true, y_pred)
    # Optional: uncomment to save a confusion matrix for strong runs.
    # name = "HGBT_" + str(i) + "_" + str(acc)
    # if acc>0.7: print_confusion_matrix(y_true, y_pred, name)
    return acc


def svn_train(X, Y, gamma, C,i):
    """Cross-validate an ``svm.SVC`` and return its balanced accuracy.

    A seeded 5-fold ``KFold`` (shuffled, ``random_state=0``) is used; the
    predictions of all test folds are pooled and scored once.

    Parameters
    ----------
    X
        Feature matrix; must support ``X[rows, :]`` indexing.
    Y
        Label array; must expose ``.shape`` and ``.ravel()`` and support
        indexing by an array of row numbers.
    gamma, C
        Forwarded unchanged to ``svm.SVC``.
    i
        Not used by the computation; only referenced by the optional
        confusion-matrix code that is commented out below.

    Returns
    -------
    float
        ``balanced_accuracy_score`` of the pooled out-of-fold predictions.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=0)
    predicted = np.zeros(Y.shape)
    actual = np.zeros(Y.shape)
    write_pos = 0
    for train_idx, test_idx in splitter.split(X, Y):
        classifier = svm.SVC(gamma=gamma, C=C)
        classifier.fit(X[train_idx, :], Y[train_idx].ravel())

        fold_truth = Y[test_idx]
        fold_preds = classifier.predict(X[test_idx, :])
        # Append this fold's results after those of the previous folds.
        for label_pred, label_true in zip(fold_preds, fold_truth):
            predicted[write_pos] = label_pred
            actual[write_pos] = label_true
            write_pos += 1
    score = metrics.balanced_accuracy_score(actual, predicted)
    # Optional: uncomment to save a confusion matrix for strong runs.
    # name = "SVN_" + str(i)  + "_C_" + str(C) + "_gamma_"+str(gamma) + "_" + str(score)
    # if score>0.7:    print_confusion_matrix(actual, predicted, name)
    return score


def rf_train(X, Y,crit,i, random_state=0):
    """Cross-validate a RandomForestClassifier and return its balanced accuracy.

    A seeded 5-fold ``KFold`` (shuffled, ``random_state=0``) is used. The
    out-of-fold predictions of all folds are pooled and scored once.

    Parameters
    ----------
    X
        Feature matrix; must support ``X[rows, :]`` indexing.
    Y
        Class labels, one per row of ``X``. Flattened to 1-D before use, so
        both ``(n,)`` and ``(n, 1)`` inputs work.
    crit
        Split criterion forwarded to ``RandomForestClassifier(criterion=...)``.
    i
        Not used by the computation; only referenced by the optional
        confusion-matrix code that is commented out below.
    random_state
        Seed forwarded to every forest. The forest was previously unseeded,
        so scores changed between runs even though the folds were fixed.
        Pass ``None`` to get the old unseeded behaviour back.

    Returns
    -------
    float
        ``balanced_accuracy_score`` of the pooled out-of-fold predictions.
    """
    kfold = KFold(n_splits=5, shuffle=True, random_state=0)
    y_true = np.asarray(Y).ravel()
    # Every sample falls into exactly one test fold, so each slot of this
    # buffer is overwritten exactly once.
    y_pred = np.empty_like(y_true)
    for train, test in kfold.split(X):
        model = RandomForestClassifier(criterion=crit, random_state=random_state)
        model.fit(X[train, :], y_true[train])
        # Store predictions at the samples' own positions; the metric only
        # needs y_true / y_pred to be aligned, not ordered by fold.
        y_pred[test] = model.predict(X[test, :])
    acc = metrics.balanced_accuracy_score(y_true, y_pred)
    # Optional: uncomment to save a confusion matrix for strong runs.
    # name = "RF_" + str(i) + "_criterion_" + crit+"_"   + str(acc)
    # if acc>0.7:    print_confusion_matrix(y_true, y_pred, name)
    return acc


def knn_train(X, Y, n, i ):
    """Cross-validate a KNeighborsClassifier and return its balanced accuracy.

    A seeded 5-fold ``KFold`` (shuffled, ``random_state=0``) is used; the
    predictions of all test folds are pooled and scored once.

    Parameters
    ----------
    X
        Feature matrix; must support ``X[rows, :]`` indexing.
    Y
        Label array; must expose ``.shape`` and ``.ravel()`` and support
        indexing by an array of row numbers.
    n
        Number of neighbours (``n_neighbors``) for the classifier.
    i
        Not used by the computation; only referenced by the optional
        confusion-matrix code that is commented out below.

    Returns
    -------
    float
        ``balanced_accuracy_score`` of the pooled out-of-fold predictions.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=0)

    pooled_pred = np.zeros(Y.shape)
    pooled_true = np.zeros(Y.shape)
    filled = 0  # number of slots already written in the pooled buffers
    for fit_rows, eval_rows in folds.split(X, Y):
        knn = KNeighborsClassifier(n_neighbors=n, algorithm='auto')
        knn.fit(X[fit_rows, :], Y[fit_rows].ravel())
        eval_true = Y[eval_rows]
        eval_pred = knn.predict(X[eval_rows, :])

        # Write this fold's pairs directly behind the previous fold's.
        for slot, (guess, truth) in enumerate(zip(eval_pred, eval_true), start=filled):
            pooled_pred[slot] = guess
            pooled_true[slot] = truth
        filled += len(eval_pred)

    score = metrics.balanced_accuracy_score(pooled_true, pooled_pred)
    # Optional: uncomment to save a confusion matrix for strong runs.
    # name = "KNN_" + str(i) + "_n_" + str(n) +"_"   + str(score)
    # if score>0.7:    print_confusion_matrix(pooled_true, pooled_pred, name)
    return score


def iterative_training(rescaledX, k, Y=None):
    """Grid-search four classifiers on growing prefixes of the feature columns.

    For every ``i`` in ``1..k`` the first ``i`` columns of ``rescaledX`` are
    evaluated with ``hgbt_train``, ``knn_train``, ``rf_train`` and
    ``svn_train``. Each model family is run in its own pass, so the ``i/k``
    progress line is printed ``k`` times per family.

    Parameters
    ----------
    rescaledX
        2-D feature matrix supporting ``rescaledX[:, 0:i]`` slicing.
        Presumably the columns are already ordered by a feature-selection
        ranking -- confirm against the code that produced the CSV.
    k
        Largest number of leading columns to evaluate.
    Y
        Label array passed to every ``*_train`` helper. When omitted, the
        notebook-level global ``Y_train`` is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    tuple of four lists of dict
        ``(hgbt_tmp, knn_tmp, rf_tmp, svn_tmp)``. Every record holds
        ``ft_num``, ``name`` and ``acc`` plus the hyper-parameters of that
        model (``n`` for KNN, ``criterion`` for RF, ``C`` and ``gamma`` for SVN).
    """
    if Y is None:
        # Backward-compatible fallback for existing two-argument calls.
        # Passing Y explicitly is preferred, since it removes the dependency
        # on which data cell was executed last.
        Y = Y_train

    hgbt_tmp = []
    knn_tmp = []
    rf_tmp = []
    svn_tmp = []

    # Pass 1: histogram gradient boosting (no hyper-parameters searched).
    for i in range(1, k + 1):
        print(f'{i}/{k}')
        X_new = rescaledX[:, 0:i]
        acchgbt = hgbt_train(X_new, Y, i)
        hgbt_tmp.append({'ft_num': i, 'name': 'HGBT', 'acc': acchgbt})

    # Pass 2: k-nearest neighbours over three neighbourhood sizes.
    for i in range(1, k + 1):
        print(f'{i}/{k}')
        X_new = rescaledX[:, 0:i]
        for n in [1, 3, 5]:
            accknn = knn_train(X_new, Y, n=n, i=i)
            knn_tmp.append({'ft_num': i, 'name': 'KNN', 'n': n, 'acc': accknn})

    # Pass 3: random forest over both split criteria.
    for i in range(1, k + 1):
        print(f'{i}/{k}')
        X_new = rescaledX[:, 0:i]
        for cit in ['gini', 'entropy']:
            accrf = rf_train(X_new, Y, crit=cit, i=i)
            rf_tmp.append({'ft_num': i, 'name': 'RF', 'criterion': cit, 'acc': accrf})

    # Pass 4: SVC over a 7 x 6 grid of C and gamma values.
    for i in range(1, k + 1):
        print(f'{i}/{k}')
        X_new = rescaledX[:, 0:i]
        for C in [1, 5, 10, 50, 100, 500, 1000]:
            for gamma in [0.001, 0.01, 0.1, 1, 'auto', 'scale']:
                accsvn = svn_train(X_new, Y, gamma=gamma, C=C, i=i)
                svn_tmp.append({'ft_num': i, 'name': 'SVN', 'C': C, 'gamma': gamma, 'acc': accsvn})

    return hgbt_tmp, knn_tmp, rf_tmp, svn_tmp

In [96]:
# Input files for this experiment. Judging by the file name, the feature
# columns are presumably ordered by a CHI2 ranking -- confirm.
path_to_data = 'Dokumenty/kolejnosc5/cechy_episleb_smote_norm_CHI2.csv'
path_to_Y = "Dokumenty/Ynew.csv"

# Features: the first CSV row is consumed as the header (read_csv default).
X_train = pd.read_csv(path_to_data).to_numpy()
# Labels: the file has no header row; every value is shifted down by one
# (presumably 1-based class ids -> 0-based -- confirm).
Y_train = pd.read_csv(path_to_Y, header=None).to_numpy() - 1

In [97]:
# Run the full grid search on every prefix of the feature columns.
w, k = X_train.shape
hgbt_tmp, knn_tmp, rf_tmp, svn_tmp = iterative_training(X_train, k)

# For each model family: tabulate the records, find the best-scoring row and
# report it. idxmax() returns an index *label*, hence the .loc look-ups.
HGBT_res = pd.DataFrame(hgbt_tmp)
maxidx_hgbt = HGBT_res['acc'].idxmax()
print(f"HGBT acc max: {HGBT_res.loc[maxidx_hgbt, 'acc']}, number of features: {HGBT_res.loc[maxidx_hgbt, 'ft_num']} ")

KNN_res = pd.DataFrame(knn_tmp)
maxidx_knn = KNN_res['acc'].idxmax()
print(
    f"KNN acc max: {KNN_res.loc[maxidx_knn, 'acc']}, number of features: {KNN_res.loc[maxidx_knn, 'ft_num']} , n = {KNN_res.loc[maxidx_knn, 'n']} ")

RF_res = pd.DataFrame(rf_tmp)
maxidx_rf = RF_res['acc'].idxmax()
print(
    f"RF acc max: {RF_res.loc[maxidx_rf, 'acc']}, number of features: {RF_res.loc[maxidx_rf, 'ft_num']}, criterion: {RF_res.loc[maxidx_rf, 'criterion']} ")

SVN_res = pd.DataFrame(svn_tmp)
maxidx_svn = SVN_res['acc'].idxmax()
print(
    f"SVN acc max: {SVN_res.loc[maxidx_svn, 'acc']}, number of features: {SVN_res.loc[maxidx_svn, 'ft_num']}, C: {SVN_res.loc[maxidx_svn, 'C']}, gamma: {SVN_res.loc[maxidx_svn, 'gamma']}")

# Persist the complete result tables.
# NOTE(review): the KNN/RF/SVN file names spell "ccechy" while the HGBT one
# spells "cechy" -- looks like a typo; left unchanged so anything that already
# reads these files keeps working. Confirm before renaming.
HGBT_res.to_csv('Dokumenty/gridsearch_HGBT_cechy_episleb_smote_norm_CHI2.csv')
KNN_res.to_csv('Dokumenty/gridsearch_KNN_ccechy_episleb_smote_norm_CHI2.csv')
RF_res.to_csv('Dokumenty/gridsearch_RF_ccechy_episleb_smote_norm_CHI2.csv')
SVN_res.to_csv('Dokumenty/gridsearch_SVN_ccechy_episleb_smote_norm_CHI2.csv')

1/482
2/482
3/482
4/482
5/482
6/482
7/482
8/482
9/482
10/482
11/482
12/482
13/482
14/482
15/482
16/482
17/482
18/482
19/482
20/482
21/482
22/482
23/482
24/482
25/482
26/482
27/482
28/482
29/482
30/482
31/482
32/482
33/482
34/482
35/482
36/482
37/482
38/482
39/482
40/482
41/482
42/482
43/482
44/482
45/482
46/482
47/482
48/482
49/482
50/482
51/482
52/482
53/482
54/482
55/482
56/482
57/482
58/482
59/482
60/482
61/482
62/482
63/482
64/482
65/482
66/482
67/482
68/482
69/482
70/482
71/482
72/482
73/482
74/482
75/482
76/482
77/482
78/482
79/482
80/482
81/482
82/482
83/482
84/482
85/482
86/482
87/482
88/482
89/482
90/482
91/482
92/482
93/482
94/482
95/482
96/482
97/482
98/482
99/482
100/482
101/482
102/482
103/482
104/482
105/482
106/482
107/482
108/482
109/482
110/482
111/482
112/482
113/482
114/482
115/482
116/482
117/482
118/482
119/482
120/482
121/482
122/482
123/482
124/482
125/482
126/482
127/482
128/482
129/482
130/482
131/482
132/482
133/482
134/482
135/482
136/482
137/482
138/482
139/

In [99]:
# Input files for this experiment. Judging by the file name, the feature
# columns are presumably ordered by an "RMRM" ranking -- confirm.
path_to_data = 'Dokumenty/kolejnosc5/cechy_episleb_smote_norm_RMRM.csv'
path_to_Y = "Dokumenty/Ynew.csv"

# Features: the first CSV row is consumed as the header (read_csv default).
X_train = pd.read_csv(path_to_data).to_numpy()
# Labels: the file has no header row; every value is shifted down by one
# (presumably 1-based class ids -> 0-based -- confirm).
Y_train = pd.read_csv(path_to_Y, header=None).to_numpy() - 1

In [100]:
# Run the full grid search on every prefix of the feature columns.
w, k = X_train.shape
hgbt_tmp, knn_tmp, rf_tmp, svn_tmp = iterative_training(X_train, k)

# For each model family: tabulate the records, find the best-scoring row and
# report it. idxmax() returns an index *label*, hence the .loc look-ups.
HGBT_res = pd.DataFrame(hgbt_tmp)
maxidx_hgbt = HGBT_res['acc'].idxmax()
print(f"HGBT acc max: {HGBT_res.loc[maxidx_hgbt, 'acc']}, number of features: {HGBT_res.loc[maxidx_hgbt, 'ft_num']} ")

KNN_res = pd.DataFrame(knn_tmp)
maxidx_knn = KNN_res['acc'].idxmax()
print(
    f"KNN acc max: {KNN_res.loc[maxidx_knn, 'acc']}, number of features: {KNN_res.loc[maxidx_knn, 'ft_num']} , n = {KNN_res.loc[maxidx_knn, 'n']} ")

RF_res = pd.DataFrame(rf_tmp)
maxidx_rf = RF_res['acc'].idxmax()
print(
    f"RF acc max: {RF_res.loc[maxidx_rf, 'acc']}, number of features: {RF_res.loc[maxidx_rf, 'ft_num']}, criterion: {RF_res.loc[maxidx_rf, 'criterion']} ")

SVN_res = pd.DataFrame(svn_tmp)
maxidx_svn = SVN_res['acc'].idxmax()
print(
    f"SVN acc max: {SVN_res.loc[maxidx_svn, 'acc']}, number of features: {SVN_res.loc[maxidx_svn, 'ft_num']}, C: {SVN_res.loc[maxidx_svn, 'C']}, gamma: {SVN_res.loc[maxidx_svn, 'gamma']}")

# Persist the complete result tables.
# NOTE(review): the KNN/RF/SVN file names spell "ccechy" while the HGBT one
# spells "cechy" -- looks like a typo; left unchanged so anything that already
# reads these files keeps working. Confirm before renaming.
HGBT_res.to_csv('Dokumenty/gridsearch_HGBT_cechy_episleb_smote_norm_RMRM.csv')
KNN_res.to_csv('Dokumenty/gridsearch_KNN_ccechy_episleb_smote_norm_RMRM.csv')
RF_res.to_csv('Dokumenty/gridsearch_RF_ccechy_episleb_smote_norm_RMRM.csv')
SVN_res.to_csv('Dokumenty/gridsearch_SVN_ccechy_episleb_smote_norm_RMRM.csv')

1/482
2/482
3/482
4/482
5/482
6/482
7/482
8/482
9/482
10/482
11/482
12/482
13/482
14/482
15/482
16/482
17/482
18/482
19/482
20/482
21/482
22/482
23/482
24/482
25/482
26/482
27/482
28/482
29/482
30/482
31/482
32/482
33/482
34/482
35/482
36/482
37/482
38/482
39/482
40/482
41/482
42/482
43/482
44/482
45/482
46/482
47/482
48/482
49/482
50/482
51/482
52/482
53/482
54/482
55/482
56/482
57/482
58/482
59/482
60/482
61/482
62/482
63/482
64/482
65/482
66/482
67/482
68/482
69/482
70/482
71/482
72/482
73/482
74/482
75/482
76/482
77/482
78/482
79/482
80/482
81/482
82/482
83/482
84/482
85/482
86/482
87/482
88/482
89/482
90/482
91/482
92/482
93/482
94/482
95/482
96/482
97/482
98/482
99/482
100/482
101/482
102/482
103/482
104/482
105/482
106/482
107/482
108/482
109/482
110/482
111/482
112/482
113/482
114/482
115/482
116/482
117/482
118/482
119/482
120/482
121/482
122/482
123/482
124/482
125/482
126/482
127/482
128/482
129/482
130/482
131/482
132/482
133/482
134/482
135/482
136/482
137/482
138/482
139/

In [5]:
# Input files for this experiment. Judging by the file name, the feature
# columns are presumably ordered by a ReliefF ranking -- confirm.
path_to_data = 'Dokumenty/kolejnosc5/cechy_episleb_smote_norm_RelefF.csv'
path_to_Y = "Dokumenty/Ynew.csv"

# Features: the first CSV row is consumed as the header (read_csv default).
X_train = pd.read_csv(path_to_data).to_numpy()
# Labels: the file has no header row; every value is shifted down by one
# (presumably 1-based class ids -> 0-based -- confirm).
Y_train = pd.read_csv(path_to_Y, header=None).to_numpy() - 1

In [6]:
# Run the full grid search on every prefix of the feature columns.
w, k = X_train.shape
hgbt_tmp, knn_tmp, rf_tmp, svn_tmp = iterative_training(X_train, k)

# For each model family: tabulate the records, find the best-scoring row and
# report it. idxmax() returns an index *label*, hence the .loc look-ups.
HGBT_res = pd.DataFrame(hgbt_tmp)
maxidx_hgbt = HGBT_res['acc'].idxmax()
print(f"HGBT acc max: {HGBT_res.loc[maxidx_hgbt, 'acc']}, number of features: {HGBT_res.loc[maxidx_hgbt, 'ft_num']} ")

KNN_res = pd.DataFrame(knn_tmp)
maxidx_knn = KNN_res['acc'].idxmax()
print(
    f"KNN acc max: {KNN_res.loc[maxidx_knn, 'acc']}, number of features: {KNN_res.loc[maxidx_knn, 'ft_num']} , n = {KNN_res.loc[maxidx_knn, 'n']} ")

RF_res = pd.DataFrame(rf_tmp)
maxidx_rf = RF_res['acc'].idxmax()
print(
    f"RF acc max: {RF_res.loc[maxidx_rf, 'acc']}, number of features: {RF_res.loc[maxidx_rf, 'ft_num']}, criterion: {RF_res.loc[maxidx_rf, 'criterion']} ")

SVN_res = pd.DataFrame(svn_tmp)
maxidx_svn = SVN_res['acc'].idxmax()
print(
    f"SVN acc max: {SVN_res.loc[maxidx_svn, 'acc']}, number of features: {SVN_res.loc[maxidx_svn, 'ft_num']}, C: {SVN_res.loc[maxidx_svn, 'C']}, gamma: {SVN_res.loc[maxidx_svn, 'gamma']}")

# Persist the complete result tables.
# NOTE(review): the KNN/RF/SVN file names spell "ccechy" while the HGBT one
# spells "cechy" -- looks like a typo; left unchanged so anything that already
# reads these files keeps working. Confirm before renaming.
HGBT_res.to_csv('Dokumenty/gridsearch_HGBT_cechy_episleb_smote_norm_RelefF.csv')
KNN_res.to_csv('Dokumenty/gridsearch_KNN_ccechy_episleb_smote_norm_RelefF.csv')
RF_res.to_csv('Dokumenty/gridsearch_RF_ccechy_episleb_smote_norm_RelefF.csv')
SVN_res.to_csv('Dokumenty/gridsearch_SVN_ccechy_episleb_smote_norm_RelefF.csv')

1/482
2/482
3/482
4/482
5/482
6/482
7/482
8/482
9/482
10/482
11/482
12/482
13/482
14/482
15/482
16/482
17/482
18/482
19/482
20/482
21/482
22/482
23/482
24/482
25/482
26/482
27/482
28/482
29/482
30/482
31/482
32/482
33/482
34/482
35/482
36/482
37/482
38/482
39/482
40/482
41/482
42/482
43/482
44/482
45/482
46/482
47/482
48/482
49/482
50/482
51/482
52/482
53/482
54/482
55/482
56/482
57/482
58/482
59/482
60/482
61/482
62/482
63/482
64/482
65/482
66/482
67/482
68/482
69/482
70/482
71/482
72/482
73/482
74/482
75/482
76/482
77/482
78/482
79/482
80/482
81/482
82/482
83/482
84/482
85/482
86/482
87/482
88/482
89/482
90/482
91/482
92/482
93/482
94/482
95/482
96/482
97/482
98/482
99/482
100/482
101/482
102/482
103/482
104/482
105/482
106/482
107/482
108/482
109/482
110/482
111/482
112/482
113/482
114/482
115/482
116/482
117/482
118/482
119/482
120/482
121/482
122/482
123/482
124/482
125/482
126/482
127/482
128/482
129/482
130/482
131/482
132/482
133/482
134/482
135/482
136/482
137/482
138/482
139/