In [6]:
import warnings
from scipy.io import arff
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import tensorflow as tf
from typing import Tuple, List
from keras import Sequential
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
import catboost as cb
from mlxtend.classifier import StackingClassifier
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
warnings.filterwarnings("ignore")

## Przygotowanie danych

In [7]:
data = arff.loadarff('rethinpathy.arff')
df = pd.DataFrame(data[0])
df[df.columns[0:8]] = df[df.columns[0:8]].astype(int)
df[df.columns[18:20]] = df[df.columns[18:20]].astype(int)
col_names = ['Quality', 'Pre-screening', 'MA result1', 'MA result2', 'MA result3', 'MA result4', 'MA result5', 'MA result6', 'Exudates result1', 'Exudates result2', 'Exudates result3', 'Exudates result4', 'Exudates result5', 'Exudates result6', 'Exudates result7',
            'Exudates result', 'Macula - Optic disc dist', 'Optic disc diameter', 'AM/FM classification', 'Class']
df.columns = col_names

In [8]:
# ograniczenie wartości odstających
df = df[df['MA result1'] <= 110]
df = df[df['Exudates result1'] <= 230]
# wybór kolumn z wyższym wsp Pearsona oraz przekazanie etykiet do osobnego DataFrame
data = df[df.columns[2:16]]
target = df['Class']
# podzielenie danych na zestaw trenujący i testowy
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.10, random_state=10)
# standaryzacja danych
std_scale = StandardScaler(with_mean=True, with_std=True).fit(data_train)
data_train_norm = std_scale.transform(data_train)
data_test_norm = std_scale.transform(data_test)

### Stacking

In [10]:
# #neural network
# adam = Adam(lr=0.01)

# classifier = Sequential()
# classifier.add(Dense(12, activation='sigmoid', kernel_initializer='random_normal', input_dim=14))
# classifier.add(Dense(8, activation='sigmoid', kernel_initializer='random_normal'))
# classifier.add(Dense(4, activation='sigmoid', kernel_initializer='random_normal'))
# classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))
# classifier.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
# history1 = classifier.fit(data_train_norm, target_train, epochs=1000, verbose=0)
# _, accuracy = classifier.evaluate(data_test_norm, target_test)
# pred_nn = classifier.predict(data_test_norm)

#print('Accuracy: %.2f' % (accuracy*100))

# plt.plot(history1.history['accuracy'])
# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.show()

In [13]:
svc_model = LinearSVC(max_iter = 1000, class_weight = 'balanced', C=0.5)
clf_svc = svc_model.fit(data_train_norm, target_train)
pred_svc = clf_svc.predict(data_test_norm)
print(f1_score(pred_svc, target_test))
print("LinearSVC acc: %.2f" % (accuracy_score(target_test, pred_svc)*100))
print('ROC AUC: %.2f' % (roc_auc_score(target_test, pred_svc)*100))

0.75
LinearSVC acc: 82.14
ROC AUC: 79.69


## Bagging meta-estimator

In [52]:
bag = BaggingClassifier(svc_model, max_samples=0.2, max_features=0.9)
clf_bag = bag.fit(data_train_norm, target_train)
pred_bag = clf_bag.predict(data_test_norm)
print(f1_score(pred_bag, target_test))
print("Bag acc: ", accuracy_score(target_test, pred_bag, normalize = True))
print('ROC AUC: %.2f' % (roc_auc_score(target_test, pred_bag)*100))

0.6923076923076923
Bag acc:  0.7857142857142857
ROC AUC: 75.78


## Forest of randomized trees

In [40]:
parameters = {'bootstrap': True,
              'min_samples_leaf': 3,
              'n_estimators': 20, 
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 6,
              'max_leaf_nodes': None}
rfc = RandomForestClassifier(**parameters)
clf_rfc = rfc.fit(data_train_norm, target_train)
pred_rfc = clf_rfc.predict(data_test_norm)
print(f1_score(pred_rfc, target_test))
print("RFC acc: ", accuracy_score(target_test, pred_rfc, normalize = True))
print('ROC AUC: %.2f' % (roc_auc_score(target_test, pred_rfc)*100))

0.6597938144329897
RFC acc:  0.7053571428571429
ROC AUC: 70.05


## LogisticRegression

In [41]:
lr = LogisticRegression(max_iter = 1000000, class_weight = 'balanced', C=0.5)
clf_lr = lr.fit(data_train_norm, target_train)
pred_lr = clf_lr.predict(data_test_norm)
print(f1_score(pred_lr, target_test))
print("LR acc: %.2f" % (accuracy_score(target_test, pred_lr)*100))
print('ROC AUC: %.2f' % (roc_auc_score(target_test, pred_lr)*100))

0.7088607594936709
LR acc: 79.46
ROC AUC: 76.82


## DT

In [42]:
dt = DecisionTreeClassifier(random_state=0)
clf_dt = dt.fit(data_train_norm, target_train)
pred_dt = clf_dt.predict(data_test_norm)
print(f1_score(pred_dt, target_test))
print("DT acc: ", accuracy_score(target_test, pred_dt, normalize = True))
print('ROC AUC: %.2f' % (roc_auc_score(target_test, pred_dt)*100))

0.6851851851851851
DT acc:  0.6964285714285714
ROC AUC: 70.57


## SVC

In [43]:
svc = SVC(probability=True, class_weight='balanced', random_state=500)
clf_svc = svc.fit(data_train_norm, target_train)
pred_svc = clf_svc.predict(data_test_norm)
print(f1_score(pred_svc, target_test))
print("SVC acc: ", accuracy_score(target_test, pred_svc, normalize = True))
print('ROC AUC: %.2f' % (roc_auc_score(target_test, pred_svc)*100))

0.7560975609756098
SVC acc:  0.8214285714285714
ROC AUC: 79.95


## Voting 1 - 3 modele

In [44]:
classifiers = [svc_model, bag, rfc, lr, dt, svc]
from itertools import combinations
kombinacje = list(combinations(classifiers, 3))
i = 0
accuracies = []

for komb in kombinacje:
    clf_vote1 = VotingClassifier(
        estimators=[('model1', komb[0]), ('model2', komb[1]), ('model3', komb[2])]
    )
    clf_vote1.fit(data_train_norm, target_train)
    pred_vote1 = clf_vote1.predict(data_test_norm)
    score_vote = f1_score(pred_vote1, target_test)
    i= i+1
    accuracies.append(accuracy_score(target_test, pred_vote1, normalize = True))
    del clf_vote1, pred_vote1

print("max acc: ", max(accuracies), " na indeksie: ", accuracies.index(max(accuracies)))
print(kombinacje[accuracies.index(max(accuracies))])

In [45]:
test = {"bag, rfc, dt"      : [0.875, 4],
        "svm_model, dt, svc": [0.866, 11],
        "svm_model, rfc, dt": [0.866, 5]}

## Voting 2 - 5 modeli

In [46]:
test_5 = {"svm_model, bag, dt, rfc, svc": [[0.875, 8], [0.866, 2]]}

In [47]:
classifiers = [svc_model, bag, rfc, lr, dt, svc]
from itertools import combinations
kombinacje5 = list(combinations(classifiers, 5))
i5 = 0
accuracies5 = []

for komb5 in kombinacje5:
    clf_vote1 = VotingClassifier(
        estimators=[('model1', komb5[0]), ('model2', komb5[1]), ('model3', komb5[2]), ('model4', komb5[3]), ('model5', komb5[4])]
    )
    clf_vote1.fit(data_train_norm, target_train)
    pred_vote1 = clf_vote1.predict(data_test_norm)
    score_vote = f1_score(pred_vote1, target_test)
    i5= i5+1
    accuracies5.append(accuracy_score(target_test, pred_vote1, normalize = True))
    del clf_vote1, pred_vote1

print("max acc: ", max(accuracies5), " na indeksie: ", accuracies5.index(max(accuracies5)))
print(kombinacje[accuracies5.index(max(accuracies5))])

## Bagging

In [48]:
# classifiers = [svc_model, rfc, lr, dt, svc]
# for clf in classifiers:
#     bag_t = BaggingClassifier(clf, max_samples=0.2, max_features=0.9, bootstrap=False)
#     clf_bag_t = bag_t.fit(data_train_norm, target_train)
#     pred_bag_t = clf_bag_t.predict(data_test_norm)
#     print("/////////")
#     print(clf)
#     print(f1_score(pred_bag_t, target_test))
#     print("Bag acc: ", accuracy_score(target_test, pred_bag_t, normalize = True))
#     print('ROC AUC: %.2f' % (roc_auc_score(target_test, pred_bag_t)*100))
#     del bag_t, clf_bag_t, pred_bag_t

In [49]:
# classifiers = [bag, rfc, lr, dt, svc]
# for clf in classifiers:
#     bag_b = AdaBoostClassifier(clf, n_estimators=100, learning_rate=0.01)
#     clf_bag_b = bag_b.fit(data_train_norm, target_train)
#     pred_bag_b = clf_bag_b.predict(data_test_norm)
#     print("/////////")
#     print(clf)
#     print(f1_score(pred_bag_b, target_test))
#     print("Bag acc: ", accuracy_score(target_test, pred_bag_b, normalize = True))
#     print('ROC AUC: %.2f' % (roc_auc_score(target_test, pred_bag_b)*100))
#     del bag_b, clf_bag_b, pred_bag_b

In [50]:
clf_meta = DecisionTreeClassifier(min_samples_leaf=3, min_samples_split=9)
clf_stack = StackingClassifier(classifiers = [svc_model, bag, rfc, lr, dt, svc], meta_classifier=clf_meta, use_probas=False, use_features_in_secondary=False)
clf_stack_fit = clf_stack.fit(data_train_norm, target_train)
clf_stack_pred = clf_stack_fit.predict(data_test_norm)
print(f1_score(clf_stack_pred, target_test))
print("DT acc: ", accuracy_score(target_test, clf_stack_pred, normalize = True))
print('ROC AUC: %.2f' % (roc_auc_score(target_test, clf_stack_pred)*100))

0.6851851851851851
DT acc:  0.6964285714285714
ROC AUC: 70.57
