# Usporedba ML modela

## Uvoz biblioteka

In [1]:
import numpy as np
import pandas as pd
from tabulate import tabulate

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import (confusion_matrix, classification_report, recall_score, 
                             precision_score, f1_score, accuracy_score, 
                             roc_auc_score, average_precision_score)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier

## Uvoz podataka

In [2]:
dataset = pd.read_csv('data/training_data.csv')

## Čišćenje i analiza podataka

In [3]:
duplicates = dataset[dataset.duplicated()]
print(duplicates)

dataset.drop_duplicates().reset_index()

         Red  Green   Blue    NIR      ndvi      ndwi    msavi2     mtvi2  \
4944   209.0  287.0  321.0  289.0  0.160643 -0.003472  0.276471  0.513115   
15406  115.0  209.0  308.0  199.0  0.267516  0.024510  0.421498  0.842424   
23195  493.0  603.0  670.0  565.0  0.068053  0.032534  0.127335  0.304770   
33453  147.0  205.0  226.0  135.0 -0.042553  0.205882 -0.088532  0.340005   
33953  142.0  211.0  205.0  158.0  0.053333  0.143631  0.100979  0.518414   
43782  113.0  234.0  305.0  213.0  0.306748  0.046980  0.468899  0.978560   
44548  109.0  230.0  296.0  202.0  0.299035  0.064815  0.459781  1.000706   

           vari     tgi  Class  
4944   0.445714   690.0      1  
15406  5.875000 -2650.0      1  
23195  0.258216  -170.0      1  
33453  0.460317   770.0      1  
33953  0.466216  2775.0      1  
43782  2.880952   -25.0      1  
44548  2.813953   275.0      1  


Unnamed: 0,index,Red,Green,Blue,NIR,ndvi,ndwi,msavi2,mtvi2,vari,tgi,Class
0,0,1376.0,1150.0,795.0,2598.0,0.307499,-0.386339,0.470314,0.338574,-0.130560,13390.0,0
1,1,1342.0,1101.0,776.0,2676.0,0.332006,-0.416998,0.498459,0.360913,-0.144571,11065.0,0
2,2,1347.0,1101.0,785.0,2671.0,0.329517,-0.416225,0.495648,0.356393,-0.147925,10350.0,0
3,3,1303.0,1140.0,722.0,2628.0,0.337064,-0.394904,0.504138,0.393367,-0.094712,19375.0,0
4,4,1358.0,1141.0,727.0,2758.0,0.340136,-0.414722,0.507569,0.380207,-0.122460,17245.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
48130,48137,2493.0,2214.0,1764.0,2726.0,0.044645,-0.103644,0.085459,-0.022867,-0.094801,17235.0,2
48131,48138,2517.0,2298.0,1835.0,2722.0,0.039130,-0.084462,0.075299,-0.010665,-0.073490,20115.0,2
48132,48139,2452.0,2291.0,1868.0,2724.0,0.052550,-0.086341,0.099837,0.032247,-0.056000,19745.0,2
48133,48140,2285.0,2130.0,1642.0,2595.0,0.063525,-0.098413,0.119440,0.049287,-0.055896,23855.0,2


In [4]:
class_counts = dataset['Class'].value_counts()
total_instances = dataset.count()

class_ratios = class_counts / total_instances

# Izračun omjera nerazmjernosti između najbrojnije i najmanje brojne klase
imbalance_ratio = class_counts.max() / class_counts.min()

print('Class Distribution:\n', class_counts)
print('Imbalance Ratio (IR):', imbalance_ratio)

Class Distribution:
 Class
1    25552
2    15650
0     6940
Name: count, dtype: int64
Imbalance Ratio (IR): 3.6818443804034584


## Podjela podataka

In [5]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [6]:
from sklearn.model_selection import train_test_split

# Podjela podataka u omjeru 70-15-15: 70% za treniranje, 15% za validaciju, 15% za testiranje
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.1765, stratify=y_temp, random_state=0)

## Skaliranje podataka

In [7]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

## Neuravnoteženi podaci

In [8]:
# Inicijalizacija SMOTE-a za oversampling (balansiranje klasa)
smote = SMOTE(random_state=0)

X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

In [9]:
# Inicijalizacija SMOTE-a za undersampling (balansiranje klasa)
tomek = TomekLinks()

X_train_under, y_train_under = tomek.fit_resample(X_train, y_train)

## Klasa

In [10]:
class MLModel:
    # Inicijalizacija MLModel s imenom, klasom modela i opcionalnim parametrima.
    # Ako je params lista rječnika, to znači da će se koristiti za podešavanje hiperparametara s GridSearchCV. 
    # Ako je params rječnik s direktnim parametrima, tada se model inicijalizira s tim parametrima bez potrebe za daljnjim podešavanjem.
    def __init__(self, name, model_class, params=None):
        
        self.name = name
        self.params_arg = params
        self.params = params  # Spremanje parametara za podešavanje
        self.model_class = model_class  # Spremanje klase modela, a ne instance

        if isinstance(params, list):
            self.model = model_class()  # Inicijalizacija bez parametara
            self.is_tuning_required = True  # Oznaka da je potrebno podešavanje hiperparametara
        elif isinstance(params, dict):
            self.model = model_class(**params)  # Inicijalizacija s danim parametrima
            self.is_tuning_required = False  # Nije potrebno podešavanje hiperparametara
        else:
            self.model = model_class()
            self.is_tuning_required = False

        self.validation = {
            'Accuracy': None,
            'Precision': None,
            'Recall': None,
            'F1 Score': None,
            'Confusion Matrix': None,
            'AUC-ROC': None,
            'AUC-PR': None,
            'Classification Report': None,
        }

        self.test = {
            'Accuracy': None,
            'Precision': None,
            'Recall': None,
            'F1 Score': None,
            'Confusion Matrix': None,
            'AUC-ROC': None,
            'AUC-PR': None,
            'Classification Report': None,
        }

    # Treniranje modela
    def train(self, X_train, y_train):
        if self.is_tuning_required:
            print('Tuning hyperparameters using GridSearchCV...\n')
            self.tune_hyperparameters(X_train, y_train, self.params)
        else:
            self.model.fit(X_train, y_train)

    # Podešavanje hiperparametara pomoću GridSearchCV
    def tune_hyperparameters(self, X, y, param_grid, scoring='f1_macro', cv=10, n_jobs=-1):
        grid_search = GridSearchCV(estimator=self.model, param_grid=param_grid, scoring=scoring, cv=cv, n_jobs=n_jobs)
        grid_search.fit(X, y)
        
        self.model = grid_search.best_estimator_
        best_f1 = grid_search.best_score_
        self.params = grid_search.best_params_
        print('Best found parameters: ', self.params)
        print(f'Best F1 score with cross-validation: {best_f1 * 100:.2f}\n')

    # Evaluacija modela na danim podacima i spremanje metrike
    def evaluate(self, X, y, data_type='test'):
        y_pred = self.model.predict(X)
        y_prob = self.model.predict_proba(X)
        
        y_bin = label_binarize(y, classes=np.unique(y))

        accuracy = accuracy_score(y, y_pred)
        precision = precision_score(y, y_pred, average='weighted')
        recall = recall_score(y, y_pred, average='weighted')
        f1 = f1_score(y, y_pred, average='weighted')
        cm = confusion_matrix(y, y_pred)
        class_report = classification_report(y, y_pred, output_dict=True)
                                                     
        try:
            auc_roc = roc_auc_score(y_bin, y_prob, average='macro', multi_class='ovr')
            auc_pr = average_precision_score(y_bin, y_prob, average='macro')
        except ValueError as e:
            print(f'Greška u izračunu AUC: {e}')
            auc_roc, auc_pr = None, None

        metrics = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'Confusion Matrix': cm,
            'AUC-ROC': auc_roc,
            'AUC-PR': auc_pr, 
            'Classification Report': class_report,
        }

        if data_type == 'test':
            self.test.update(metrics)
        elif data_type == 'validation':
            self.validation.update(metrics)
        else:
            raise ValueError('data_type mora biti \'test\' ili \'validation\'')

        self.print_metrics(data_type)

    # Ispis metrika u lijepo formatiranoj tablici
    def print_metrics(self, data_type='test'):
        if data_type == 'test':
            metrics = self.test
        elif data_type == 'validation':
            metrics = self.validation
        else:
            raise ValueError('data_type mora biti \'test\' ili \'validation\'')

        metrics_keys = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC', 'AUC-PR']
        formatted_metrics = [f'{metrics[key]:10.10f}' for key in metrics_keys]

        print(f'\n{data_type.capitalize()} Metrics: \n')
        print(f'{"Metric":<15}', end='')
        for key in metrics_keys:
            print(f'{key:<15}', end='')
        print()
        
        print('-' * (15 + 15 * len(metrics_keys)))

        print(f'{"Value":<15}', end='')
        for value in formatted_metrics:
            print(f'{value:<15}', end='')
        print()

        cm = metrics['Confusion Matrix']
        cm_str = '\n'.join(['\t'.join(map(str, row)) for row in cm])
        print(f'\nConfusion Matrix:\n{cm_str}')
        print()

        class_report = metrics['Classification Report']
        
        if class_report:
            print('\nClassification Report:')
            print(f'{"Class":<15} {"Precision":<15} {"Recall":<15} {"F1 Score":<15} {"Support":<15}')
            print('-' * 70)
            
            for label, metrics in class_report.items():
                if isinstance(metrics, dict):
                    precision = metrics.get('precision', 'N/A')
                    recall = metrics.get('recall', 'N/A')
                    f1 = metrics.get('f1-score', 'N/A')
                    support = metrics.get('support', 'N/A')
                    print(f'{label:<15} {precision:<15} {recall:<15} {f1:<15} {support:<15}')

        print()

## Modeli

### Logistic Regression

In [11]:
params = {'max_iter': 5000}

# Kreiranje instance MLModel
logistic_model = MLModel('Logistic Regression', LogisticRegression, params)

# Obuka modela
logistic_model.train(X_train, y_train)

print(f"{logistic_model.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
logistic_model.evaluate(X_valid, y_valid, data_type='validation')
logistic_model.evaluate(X_test, y_test, data_type='test')

Logistic regression


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9965388343   0.9965382937   0.9965388343   0.9965385403   0.9999719476   0.9998783897   

Confusion Matrix:
1030	0	11
2	3832	0
9	3	2336


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.989433237271854 0.989433237271854 0.989433237271854 1041           
1               0.9992177314211212 0.9994783515910276 0.9993480245142782 3834           
2               0.9953131657435024 0.9948892674616695 0.995101171458999 2348           
macro avg       0.9946547114788258 0.9946002854415169 0.9946274777483771 7223           
weighted avg    0.9965382937054302 0.9965388342793853 0.9965385

In [12]:
params = {'max_iter': 5000}

# Kreiranje instance MLModel
logistic_model_over = MLModel('Logistic Regression (Oversampling)', LogisticRegression, params)

# Obuka modela
logistic_model_over.train(X_train_over, y_train_over)

print(f"{logistic_model_over.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
logistic_model_over.evaluate(X_valid, y_valid, data_type='validation')
logistic_model_over.evaluate(X_test, y_test, data_type='test')

Logistic regression (oversampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9969541742   0.9969634641   0.9969541742   0.9969562640   0.9999762519   0.9998921634   

Confusion Matrix:
1035	0	6
2	3832	0
11	3	2334


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9875954198473282 0.9942363112391931 0.990904739109622 1041           
1               0.9992177314211212 0.9994783515910276 0.9993480245142782 3834           
2               0.9974358974358974 0.9940374787052811 0.9957337883959044 2348           
macro avg       0.9947496829014489 0.995917380511834 0.9953288506732682 7223           
weighted avg    0.9969634641435877 0.996954174

In [13]:
params = {'max_iter': 5000}

# Kreiranje instance MLModel
logistic_model_under = MLModel('Logistic Regression (Undersampling)', LogisticRegression, params)

# Obuka modela
logistic_model_under.train(X_train_under, y_train_under)

print(f"{logistic_model_under.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
logistic_model_under.evaluate(X_valid, y_valid, data_type='validation')
logistic_model_under.evaluate(X_test, y_test, data_type='test')

Logistic regression (undersampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9965388343   0.9965382937   0.9965388343   0.9965385403   0.9999719476   0.9998783897   

Confusion Matrix:
1030	0	11
2	3832	0
9	3	2336


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.989433237271854 0.989433237271854 0.989433237271854 1041           
1               0.9992177314211212 0.9994783515910276 0.9993480245142782 3834           
2               0.9953131657435024 0.9948892674616695 0.995101171458999 2348           
macro avg       0.9946547114788258 0.9946002854415169 0.9946274777483771 7223           
weighted avg    0.9965382937054302 0.9965388342

### k-Nearest Neighbour

In [14]:
# Kreiranje instance MLModel
knn_model = MLModel('k-Nearest Neighbour', KNeighborsClassifier)

# Obuka modela
knn_model.train(X_train, y_train)

print(f"{knn_model.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
knn_model.evaluate(X_valid, y_valid, data_type='validation')
knn_model.evaluate(X_test, y_test, data_type='test')

K-nearest neighbour


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9994462135   0.9994461165   0.9994462135   0.9994461227   0.9999989287   0.9999949803   

Confusion Matrix:
1039	0	2
0	3834	0
1	1	2346


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9990384615384615 0.9980787704130644 0.9985583853916387 1041           
1               0.9997392438070404 1.0             0.9998696049028557 3834           
2               0.9991482112436116 0.9991482112436116 0.9991482112436116 2348           
macro avg       0.9993086388630378 0.9990756605522254 0.9991920671793686 7223           
weighted avg    0.999446116463759 0.9994462134847016 0.99944612

In [15]:
# Kreiranje instance MLModel
knn_model_over = MLModel('k-Nearest Neighbour (Oversampling)', KNeighborsClassifier)

# Obuka modela
knn_model_over.train(X_train_over, y_train_over)

print(f"{knn_model_over.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
knn_model_over.evaluate(X_valid, y_valid, data_type='validation')
knn_model_over.evaluate(X_test, y_test, data_type='test')

K-nearest neighbour (oversampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9993077669   0.9993076850   0.9993077669   0.9993077022   0.9999014038   0.9995801134   

Confusion Matrix:
1039	0	2
0	3834	0
2	1	2345


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9980787704130644 0.9980787704130644 0.9980787704130644 1041           
1               0.9997392438070404 1.0             0.9998696049028557 3834           
2               0.9991478483170004 0.9987223168654173 0.9989350372736955 2348           
macro avg       0.9989886208457017 0.998933695759494 0.9989611375298719 7223           
weighted avg    0.9993076849791651 0.999307766855

In [16]:
# Kreiranje instance MLModel
knn_model_under = MLModel('k-Nearest Neighbour (Undersampling)', KNeighborsClassifier)

# Obuka modela
knn_model_under.train(X_train_under, y_train_under)

print(f"{knn_model_under.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
knn_model_under.evaluate(X_valid, y_valid, data_type='validation')
knn_model_under.evaluate(X_test, y_test, data_type='test')

K-nearest neighbour (undersampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9994462135   0.9994461165   0.9994462135   0.9994461227   0.9999013100   0.9995798715   

Confusion Matrix:
1039	0	2
0	3834	0
1	1	2346


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9990384615384615 0.9980787704130644 0.9985583853916387 1041           
1               0.9997392438070404 1.0             0.9998696049028557 3834           
2               0.9991482112436116 0.9991482112436116 0.9991482112436116 2348           
macro avg       0.9993086388630378 0.9990756605522254 0.9991920671793686 7223           
weighted avg    0.999446116463759 0.99944621348

### k-Nearest Neighbour (Grid Search Cross Validation)

In [17]:
params = [{'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}]

# Kreiranje instance MLModel
knn_model_gscv = MLModel('k-Nearest Neighbour (Grid Search Cross Validation)', KNeighborsClassifier, params)

# Obuka modela
knn_model_gscv.train(X_train, y_train)

print(f"{knn_model_gscv.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
knn_model_gscv.evaluate(X_valid, y_valid, data_type='validation')
knn_model_gscv.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'uniform'}
Best F1 score with cross-validation: 99.93

K-nearest neighbour (grid search cross validation)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9991693202   0.9991694275   0.9991693202   0.9991690066   0.9998790200   0.9998124830   

Confusion Matrix:
1037	0	4
0	3834	0
1	1	2346


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9990366088631984 0.9961575408261287 0.9975949975949976 1041           
1               0.9997392438070404 1.0             0.9998696049028557 3834           
2               0.9982978723404256

In [18]:
params = [{'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}]

# Kreiranje instance MLModel
knn_model_gscv_over = MLModel('k-Nearest Neighbour (Grid Search Cross Validation, Oversampling)', KNeighborsClassifier, params)

# Obuka modela
knn_model_gscv_over.train(X_train_over, y_train_over)

print(f"{knn_model_gscv_over.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
knn_model_gscv_over.evaluate(X_valid, y_valid, data_type='validation')
knn_model_gscv_over.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'distance'}
Best F1 score with cross-validation: 99.99

K-nearest neighbour (grid search cross validation, oversampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9991693202   0.9991690901   0.9991693202   0.9991691629   0.9997067687   0.9991634782   

Confusion Matrix:
1038	0	3
0	3834	0
2	1	2345


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9980769230769231 0.9971181556195965 0.9975973089860644 1041           
1               0.9997392438070404 1.0             0.9998696049028557 3834           
2               0.9

In [19]:
params = [{'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}]

# Kreiranje instance MLModel
knn_model_gscv_under = MLModel('k-Nearest Neighbour (Grid Search Cross Validation, Undersampling)', KNeighborsClassifier, params)

# Obuka modela
knn_model_gscv_under.train(X_train_under, y_train_under)

print(f"{knn_model_gscv_under.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
knn_model_gscv_under.evaluate(X_valid, y_valid, data_type='validation')
knn_model_gscv_under.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'uniform'}
Best F1 score with cross-validation: 99.94

K-nearest neighbour (grid search cross validation, undersampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9991693202   0.9991694275   0.9991693202   0.9991690066   0.9997812945   0.9993976226   

Confusion Matrix:
1037	0	4
0	3834	0
1	1	2346


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9990366088631984 0.9961575408261287 0.9975949975949976 1041           
1               0.9997392438070404 1.0             0.9998696049028557 3834           
2               0.9

### Support Vector Classifier

In [20]:
params = {'C': 1.0, 'kernel': 'linear', 'probability': True} #probability = True zbog AUC-ROC i AUC-PR 

# Kreiranje instance MLModel
svm_model = MLModel('Support Vector Classifier', SVC, params=params)

# Obuka modela
svm_model.train(X_train, y_train)

print(f"{svm_model.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
svm_model.evaluate(X_valid, y_valid, data_type='validation')
svm_model.evaluate(X_test, y_test, data_type='test')

Support vector classifier


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9970926208   0.9970915184   0.9970926208   0.9970918790   0.9999794902   0.9999117327   

Confusion Matrix:
1030	0	11
1	3833	0
8	1	2339


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9913378248315688 0.989433237271854 0.9903846153846155 1041           
1               0.9997391757955139 0.9997391757955139 0.9997391757955139 3834           
2               0.9953191489361702 0.9961669505962522 0.9957428693060877 2348           
macro avg       0.995465383187751 0.9951131212212067 0.9952888868287392 7223           
weighted avg    0.9970915183928826 0.9970926207946836 0

In [21]:
params = {'C': 1.0, 'kernel': 'linear', 'probability': True} #probability = True zbog AUC-ROC i AUC-PR 

# Kreiranje instance MLModel
svm_model_over = MLModel('Support Vector Classifier (Oversampling)', SVC, params=params)

# Obuka modela
svm_model_over.train(X_train_over, y_train_over)

print(f"{svm_model_over.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
svm_model_over.evaluate(X_valid, y_valid, data_type='validation')
svm_model_over.evaluate(X_test, y_test, data_type='test')

Support vector classifier (oversampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9977848539   0.9977974960   0.9977848539   0.9977873293   0.9999847852   0.9999338853   

Confusion Matrix:
1038	0	3
1	3833	0
11	1	2336


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9885714285714285 0.9971181556195965 0.9928263988522237 1041           
1               0.9997391757955139 0.9997391757955139 0.9997391757955139 3834           
2               0.9987174005985464 0.9948892674616695 0.996799658630254 2348           
macro avg       0.995676001655163 0.99724886629226 0.9964550777593305 7223           
weighted avg    0.9977974960194163 0.99778

In [22]:
params = {'C': 1.0, 'kernel': 'linear', 'probability': True} #probability = True zbog AUC-ROC i AUC-PR 

# Kreiranje instance MLModel
svm_model_under = MLModel('Support Vector Classifier (Undersampling)', SVC, params=params)

# Obuka modela
svm_model_under.train(X_train_under, y_train_under)

print(f"{svm_model_under.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
svm_model_under.evaluate(X_valid, y_valid, data_type='validation')
svm_model_under.evaluate(X_test, y_test, data_type='test')

Support vector classifier (undersampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9970926208   0.9970915184   0.9970926208   0.9970918790   0.9999797783   0.9999133234   

Confusion Matrix:
1030	0	11
1	3833	0
8	1	2339


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9913378248315688 0.989433237271854 0.9903846153846155 1041           
1               0.9997391757955139 0.9997391757955139 0.9997391757955139 3834           
2               0.9953191489361702 0.9961669505962522 0.9957428693060877 2348           
macro avg       0.995465383187751 0.9951131212212067 0.9952888868287392 7223           
weighted avg    0.9970915183928826 0.99

### Support Vector Classifier (Grid Search Cross Validation)

In [23]:
params = [{'C': [0.25, 0.5, 0.75, 1], 'kernel': ['linear'], 'probability': [True]},
            {'C': [0.25, 0.5, 0.75, 1], 'kernel': ['rbf'], 'gamma': ['scale', 'auto'], 'probability': [True]}]

# Kreiranje instance MLModel
svm_model_gscv = MLModel('Support Vector Classifier (Grid Search Cross Validation)', SVC, params)

# Obuka modela
svm_model_gscv.train(X_train, y_train)

print(f"{svm_model_gscv.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
svm_model_gscv.evaluate(X_valid, y_valid, data_type='validation')
svm_model_gscv.evaluate(X_test, y_test, data_type='test')


Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True}
Best F1 score with cross-validation: 99.83

Support vector classifier (grid search cross validation)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9993077669   0.9993077133   0.9993077669   0.9993075833   0.9999988543   0.9999952342   

Confusion Matrix:
1038	0	3
0	3834	0
1	1	2346


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9990375360923965 0.9971181556195965 0.9980769230769231 1041           
1               0.9997392438070404 1.0             0.9998696049028557 3834           
2               0.9987228

In [24]:
params = [{'C': [0.25, 0.5, 0.75, 1], 'kernel': ['linear'], 'probability': [True]},
            {'C': [0.25, 0.5, 0.75, 1], 'kernel': ['rbf'], 'gamma': ['scale', 'auto'], 'probability': [True]}]

# Kreiranje instance MLModel
svm_model_gscv_over = MLModel('Support Vector Classifier (Grid Search Cross Validation, Oversampling)', SVC, params)

# Obuka modela
svm_model_gscv_over.train(X_train_over, y_train_over)

print(f"{svm_model_gscv_over.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
svm_model_gscv_over.evaluate(X_valid, y_valid, data_type='validation')
svm_model_gscv_over.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True}
Best F1 score with cross-validation: 99.90

Support vector classifier (grid search cross validation, oversampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9994462135   0.9994462135   0.9994462135   0.9994462135   0.9999992135   0.9999965725   

Confusion Matrix:
1039	0	2
0	3834	0
2	0	2346


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9980787704130644 0.9980787704130644 0.9980787704130644 1041           
1               1.0             1.0             1.0             3834           
2               0

In [25]:
params = [{'C': [0.25, 0.5, 0.75, 1], 'kernel': ['linear'], 'probability': [True]},
            {'C': [0.25, 0.5, 0.75, 1], 'kernel': ['rbf'], 'gamma': ['scale', 'auto'], 'probability': [True]}]

# Kreiranje instance MLModel
svm_model_gscv_under = MLModel('Support Vector Classifier (Grid Search Cross Validation, Undersampling)', SVC, params)

# Obuka modela
svm_model_gscv_under.train(X_train_under, y_train_under)

print(f"{svm_model_gscv_under.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
svm_model_gscv_under.evaluate(X_valid, y_valid, data_type='validation')
svm_model_gscv_under.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True}
Best F1 score with cross-validation: 99.84

Support vector classifier (grid search cross validation, undersampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9991693202   0.9991690901   0.9991693202   0.9991691629   0.9999988543   0.9999952342   

Confusion Matrix:
1038	0	3
0	3834	0
2	1	2345


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9980769230769231 0.9971181556195965 0.9975973089860644 1041           
1               0.9997392438070404 1.0             0.9998696049028557 3834           
2         

### Naive Bayes

In [26]:
# Kreiranje instance MLModel
nb_model = MLModel('Naive Bayes', GaussianNB)

# Obuka modela
nb_model.train(X_train, y_train)

print(f"{nb_model.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
nb_model.evaluate(X_valid, y_valid, data_type='validation')
nb_model.evaluate(X_test, y_test, data_type='test')

Naive bayes


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9954312612   0.9954718808   0.9954312612   0.9954378999   0.9995227965   0.9980679439   

Confusion Matrix:
1034	0	7
0	3834	0
23	3	2322


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.978240302743614 0.9932756964457252 0.9857006673021925 1041           
1               0.9992181391712275 1.0             0.999608916699257 3834           
2               0.9969944182052383 0.9889267461669506 0.9929441949967928 2348           
macro avg       0.9914842867066933 0.9940674808708919 0.9927512596660808 7223           
weighted avg    0.995471880753771 0.9954312612487886 0.9954378999223318 

In [27]:
# Kreiranje instance MLModel
nb_model_over = MLModel('Naive Bayes (Oversampling)', GaussianNB)

# Obuka modela
nb_model_over.train(X_train_over, y_train_over)

print(f"{nb_model_over.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
nb_model_over.evaluate(X_valid, y_valid, data_type='validation')
nb_model_over.evaluate(X_test, y_test, data_type='test')

Naive bayes (oversampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9954312612   0.9954805663   0.9954312612   0.9954388814   0.9995111777   0.9980696383   

Confusion Matrix:
1035	0	6
0	3834	0
24	3	2321


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9773371104815864 0.9942363112391931 0.9857142857142859 1041           
1               0.9992181391712275 1.0             0.999608916699257 3834           
2               0.9974215728405672 0.9885008517887564 0.9929411764705881 2348           
macro avg       0.9913256074977936 0.9942457210093165 0.992754792961377 7223           
weighted avg    0.9954805663330292 0.9954312612487886 0.9

In [28]:
# Kreiranje instance MLModel
nb_model_under = MLModel('Naive Bayes (Undersampling)', GaussianNB)

# Obuka modela
nb_model_under.train(X_train_under, y_train_under)

print(f"{nb_model_under.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
nb_model_under.evaluate(X_valid, y_valid, data_type='validation')
nb_model_under.evaluate(X_test, y_test, data_type='test')

Naive bayes (undersampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9954312612   0.9954718808   0.9954312612   0.9954378999   0.9995220103   0.9980681728   

Confusion Matrix:
1034	0	7
0	3834	0
23	3	2322


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.978240302743614 0.9932756964457252 0.9857006673021925 1041           
1               0.9992181391712275 1.0             0.999608916699257 3834           
2               0.9969944182052383 0.9889267461669506 0.9929441949967928 2348           
macro avg       0.9914842867066933 0.9940674808708919 0.9927512596660808 7223           
weighted avg    0.995471880753771 0.9954312612487886 0.9

### Decision Tree Classification

In [29]:
# Kreiranje instance MLModel
dtc_model = MLModel('Decision Tree Classification', DecisionTreeClassifier)

# Obuka modela
dtc_model.train(X_train, y_train)

print(f"{dtc_model.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
dtc_model.evaluate(X_valid, y_valid, data_type='validation')
dtc_model.evaluate(X_test, y_test, data_type='test')

Decision tree classification


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9980617472   0.9980626087   0.9980617472   0.9980607758   0.9978635598   0.9955800313   

Confusion Matrix:
1034	0	7
0	3833	1
1	5	2342


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9990338164251208 0.9932756964457252 0.9961464354527939 1041           
1               0.9986972381448671 0.9997391757955139 0.9992179353493221 3834           
2               0.9965957446808511 0.9974446337308348 0.9970200085142614 2348           
macro avg       0.9981089330836129 0.9968198353240246 0.9974614597721257 7223           
weighted avg    0.9980626086745964 0.99806174719645

In [30]:
# Kreiranje instance MLModel
dtc_model_over = MLModel('Decision Tree Classification (Oversampling)', DecisionTreeClassifier)

# Obuka modela
dtc_model_over.train(X_train_over, y_train_over)

print(f"{dtc_model_over.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
dtc_model_over.evaluate(X_valid, y_valid, data_type='validation')
dtc_model_over.evaluate(X_test, y_test, data_type='test')

Decision tree classification (oversampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9983386405   0.9983408885   0.9983386405   0.9983370664   0.9979131209   0.9954795625   

Confusion Matrix:
1032	0	9
0	3834	0
1	2	2345


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9990319457889641 0.9913544668587896 0.9951783992285439 1041           
1               0.9994786235662148 1.0             0.9997392438070404 3834           
2               0.9961767204757859 0.9987223168654173 0.9974478945129732 2348           
macro avg       0.9982290966103217 0.9966922612414023 0.9974551791828525 7223           
weighted avg    0.9983408885499548 0.99

In [31]:
# Kreiranje instance MLModel
dtc_model_under = MLModel('Decision Tree Classification (Undersampling)', DecisionTreeClassifier)

# Obuka modela
dtc_model_under.train(X_train_under, y_train_under)

print(f"{dtc_model_under.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
dtc_model_under.evaluate(X_valid, y_valid, data_type='validation')
dtc_model_under.evaluate(X_test, y_test, data_type='test')

Decision tree classification (undersampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9980617472   0.9980616029   0.9980617472   0.9980602073   0.9977172361   0.9946499993   

Confusion Matrix:
1032	0	9
0	3834	0
3	2	2343


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9971014492753624 0.9913544668587896 0.9942196531791908 1041           
1               0.9994786235662148 1.0             0.9997392438070404 3834           
2               0.9961734693877551 0.997870528109029 0.9970212765957447 2348           
macro avg       0.997584514076444 0.9964083316559394 0.996993391193992 7223           
weighted avg    0.9980616028756706 0.9980

### Decision Tree (Grid Search Cross Validation)

In [32]:
params = [{'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}]

# Kreiranje instance MLModel
dtc_model_gscv = MLModel('Decision Tree (Grid Search Cross Validation)', DecisionTreeClassifier, params)

# Obuka modela
dtc_model_gscv.train(X_train, y_train)

print(f"{dtc_model_gscv.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
dtc_model_gscv.evaluate(X_valid, y_valid, data_type='validation')
dtc_model_gscv.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'criterion': 'entropy', 'splitter': 'random'}
Best F1 score with cross-validation: 99.84

Decision tree (grid search cross validation)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9987539803   0.9987537425   0.9987539803   0.9987538136   0.9986367780   0.9962622495   

Confusion Matrix:
1036	0	5
0	3834	0
4	0	2344


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9961538461538462 0.9951969260326609 0.9956751561749159 1041           
1               1.0             1.0             1.0             3834           
2               0.9978714346530438 0.9982964224872232 0.99808

In [33]:
params = [{'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}]

# Kreiranje instance MLModel
dtc_model_gscv_over = MLModel('Decision Tree (Grid Search Cross Validation, Oversampling)', DecisionTreeClassifier, params)

# Obuka modela
dtc_model_gscv_over.train(X_train_over, y_train_over)

print(f"{dtc_model_gscv_over.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
dtc_model_gscv_over.evaluate(X_valid, y_valid, data_type='validation')
dtc_model_gscv_over.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'criterion': 'entropy', 'splitter': 'random'}
Best F1 score with cross-validation: 99.95

Decision tree (grid search cross validation, oversampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9986155337   0.9986150373   0.9986155337   0.9986152434   0.9985166168   0.9960795155   

Confusion Matrix:
1036	0	5
0	3834	0
4	1	2343


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9961538461538462 0.9951969260326609 0.9956751561749159 1041           
1               0.9997392438070404 1.0             0.9998696049028557 3834           
2               0.997870528109029 0.99787

In [34]:
params = [{'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}]

# Kreiranje instance MLModel
dtc_model_gscv_under = MLModel('Decision Tree (Grid Search Cross Validation, Undersampling)', DecisionTreeClassifier, params)

# Obuka modela
dtc_model_gscv_under.train(X_train_under, y_train_under)

print(f"{dtc_model_gscv_under.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
dtc_model_gscv_under.evaluate(X_valid, y_valid, data_type='validation')
dtc_model_gscv_under.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'criterion': 'gini', 'splitter': 'random'}
Best F1 score with cross-validation: 99.86

Decision tree (grid search cross validation, undersampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9984770871   0.9984767578   0.9984770871   0.9984762497   0.9982259782   0.9956627948   

Confusion Matrix:
1034	0	7
0	3834	0
3	1	2344


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9971070395371263 0.9932756964457252 0.9951876804619827 1041           
1               0.9997392438070404 1.0             0.9998696049028557 3834           
2               0.9970225435984688 0.998296

### Random Forest

In [35]:
# Kreiranje instance MLModel
rf_model = MLModel('Random Forest', RandomForestClassifier)

# Obuka modela
rf_model.train(X_train, y_train)

print(f"{rf_model.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
rf_model.evaluate(X_valid, y_valid, data_type='validation')
rf_model.evaluate(X_test, y_test, data_type='test')

Random forest


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9982001938   0.9982011114   0.9982001938   0.9981986183   0.9999944216   0.9999778924   

Confusion Matrix:
1032	0	9
0	3834	0
2	2	2344


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9980657640232108 0.9913544668587896 0.9946987951807229 1041           
1               0.9994786235662148 1.0             0.9997392438070404 3834           
2               0.9961750956226094 0.9982964224872232 0.9972346309295893 2348           
macro avg       0.9979064944040116 0.996550296448671 0.9972242233057842 7223           
weighted avg    0.9982011113973304 0.9982001938252804 0.99819861829738

In [36]:
# Kreiranje instance MLModel
rf_model_over = MLModel('Random Forest (Oversampling)', RandomForestClassifier)

# Obuka modela
rf_model_over.train(X_train_over, y_train_over)

print(f"{rf_model_over.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
rf_model_over.evaluate(X_valid, y_valid, data_type='validation')
rf_model_over.evaluate(X_test, y_test, data_type='test')

Random forest (oversampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9986155337   0.9986150582   0.9986155337   0.9986149157   0.9999961320   0.9999857277   

Confusion Matrix:
1036	0	5
0	3834	0
2	3	2343


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9980732177263969 0.9951969260326609 0.9966329966329965 1041           
1               0.9992181391712275 1.0             0.999608916699257 3834           
2               0.997870528109029 0.997870528109029 0.997870528109029 2348           
macro avg       0.9983872950022178 0.9976891513805631 0.9980374804804275 7223           
weighted avg    0.9986150581802112 0.9986155337117542 0.99

In [37]:
# Kreiranje instance MLModel
rf_model_under = MLModel('Random Forest (Undersampling)', RandomForestClassifier)

# Obuka modela
rf_model_under.train(X_train_under, y_train_under)

print(f"{rf_model_under.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
rf_model_under.evaluate(X_valid, y_valid, data_type='validation')
rf_model_under.evaluate(X_test, y_test, data_type='test')

Random forest (undersampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9984770871   0.9984766974   0.9984770871   0.9984762644   0.9999960609   0.9999861760   

Confusion Matrix:
1035	0	6
0	3834	0
2	3	2343


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9980713596914176 0.9942363112391931 0.9961501443695862 1041           
1               0.9992181391712275 1.0             0.999608916699257 3834           
2               0.9974457215836526 0.997870528109029 0.9976580796252928 2348           
macro avg       0.9982450734820992 0.9973689464494072 0.9978057135647119 7223           
weighted avg    0.9984766973971574 0.9984770870829295 0

### Random Forest(Grid Search Cross Validation)

In [38]:
params = [{'criterion': ['gini', 'entropy', 'log_loss']}]

# Kreiranje instance MLModel
rf_model_gscv = MLModel('Random Forest(Grid Search Cross Validation)', RandomForestClassifier, params)

rf_model_gscv.train(X_train, y_train)

print(f"{rf_model_gscv.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
rf_model_gscv.evaluate(X_valid, y_valid, data_type='validation')
rf_model_gscv.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'criterion': 'entropy'}
Best F1 score with cross-validation: 99.88

Random forest(grid search cross validation)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9987539803   0.9987545165   0.9987539803   0.9987532477   0.9999968837   0.9999898258   

Confusion Matrix:
1035	0	6
0	3834	0
1	2	2345


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.999034749034749 0.9942363112391931 0.9966297544535387 1041           
1               0.9994786235662148 1.0             0.9997392438070404 3834           
2               0.9974478945129732 0.9987223168654173 0.9980846988721004 2348  

In [39]:
params = [{'criterion': ['gini', 'entropy', 'log_loss']}]

# Kreiranje instance MLModel
rf_model_gscv_over = MLModel('Random Forest(Grid Search Cross Validation, Oversampling)', RandomForestClassifier, params)

rf_model_gscv_over.train(X_train_over, y_train_over)

print(f"{rf_model_gscv_over.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
rf_model_gscv_over.evaluate(X_valid, y_valid, data_type='validation')
rf_model_gscv_over.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'criterion': 'entropy'}
Best F1 score with cross-validation: 99.96

Random forest(grid search cross validation, oversampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9988924270   0.9988920383   0.9988924270   0.9988920637   0.9999958462   0.9999840193   

Confusion Matrix:
1037	0	4
0	3834	0
2	2	2344


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9980750721847931 0.9961575408261287 0.9971153846153846 1041           
1               0.9994786235662148 1.0             0.9997392438070404 3834           
2               0.9982964224872232 0.9982964224872232 0.99829642

In [40]:
params = [{'criterion': ['gini', 'entropy', 'log_loss']}]

# Kreiranje instance MLModel
rf_model_gscv_under = MLModel('Random Forest(Grid Search Cross Validation, Undersampling)', RandomForestClassifier, params)

rf_model_gscv_under.train(X_train_under, y_train_under)

print(f"{rf_model_gscv_under.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
rf_model_gscv_under.evaluate(X_valid, y_valid, data_type='validation')
rf_model_gscv_under.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'criterion': 'entropy'}
Best F1 score with cross-validation: 99.89

Random forest(grid search cross validation, undersampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9986155337   0.9986153170   0.9986155337   0.9986147985   0.9999969879   0.9999895861   

Confusion Matrix:
1035	0	6
0	3834	0
2	2	2344


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9980713596914176 0.9942363112391931 0.9961501443695862 1041           
1               0.9994786235662148 1.0             0.9997392438070404 3834           
2               0.9974468085106383 0.9982964224872232 0.9978714

### XGBoost

In [41]:
# Kreiranje instance MLModel
xgb_model = MLModel('XGBoost', XGBClassifier)

# Obuka modela
xgb_model.train(X_train, y_train)

print(f"{xgb_model.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
xgb_model.evaluate(X_valid, y_valid, data_type='validation')
xgb_model.evaluate(X_test, y_test, data_type='test')

Xgboost


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9991693202   0.9991692285   0.9991693202   0.9991691794   0.9999757924   0.9999598126   

Confusion Matrix:
1039	0	2
0	3834	0
2	2	2344


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9980787704130644 0.9980787704130644 0.9980787704130644 1041           
1               0.9994786235662148 1.0             0.9997392438070404 3834           
2               0.9991474850809889 0.9982964224872232 0.9987217724755005 2348           
macro avg       0.9989016263534226 0.9987917309667625 0.9988465955652018 7223           
weighted avg    0.9991692285370385 0.9991693202270525 0.9991691793615767 72

In [42]:
# Kreiranje instance MLModel
xgb_model_over = MLModel('XGBoost (Oversampling)', XGBClassifier)

# Obuka modela
xgb_model_over.train(X_train_over, y_train_over)

print(f"{xgb_model_over.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
xgb_model_over.evaluate(X_valid, y_valid, data_type='validation')
xgb_model_over.evaluate(X_test, y_test, data_type='test')

Xgboost (oversampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9986155337   0.9986153170   0.9986155337   0.9986147985   0.9999897241   0.9999782282   

Confusion Matrix:
1035	0	6
0	3834	0
2	2	2344


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9980713596914176 0.9942363112391931 0.9961501443695862 1041           
1               0.9994786235662148 1.0             0.9997392438070404 3834           
2               0.9974468085106383 0.9982964224872232 0.9978714346530438 2348           
macro avg       0.998332263922757 0.9975109112421388 0.9979202742765567 7223           
weighted avg    0.9986153169838864 0.9986155337117542 0.99861

In [43]:
# Kreiranje instance MLModel
xgb_model_under = MLModel('XGBoost (Undersampling)', XGBClassifier)

# Obuka modela
xgb_model_under.train(X_train_under, y_train_under)

print(f"{xgb_model_under.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
xgb_model_under.evaluate(X_valid, y_valid, data_type='validation')
xgb_model_under.evaluate(X_test, y_test, data_type='test')

Xgboost (undersampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9993077669   0.9993076601   0.9993077669   0.9993076294   0.9999886320   0.9999777060   

Confusion Matrix:
1039	0	2
0	3834	0
1	2	2345


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9990384615384615 0.9980787704130644 0.9985583853916387 1041           
1               0.9994786235662148 1.0             0.9997392438070404 3834           
2               0.9991478483170004 0.9987223168654173 0.9989350372736955 2348           
macro avg       0.9992216444738923 0.998933695759494 0.9990775554907915 7223           
weighted avg    0.9993076601222101 0.9993077668558771 0.9993

### XGBoost (Grid Search Cross Validation)

In [44]:
params = [{'max_depth': [3, 6, 9], 'learning_rate': [0.01, 0.1], 'n_estimators': [50, 100]}]

# Kreiranje instance MLModel
xgb_model_gscv = MLModel('XGBoost (Grid Search Cross Validation)', XGBClassifier, params)

# Obuka modela
xgb_model_gscv.train(X_train, y_train)

print(f"{xgb_model_gscv.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
xgb_model_gscv.evaluate(X_valid, y_valid, data_type='validation')
xgb_model_gscv.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100}
Best F1 score with cross-validation: 99.87

Xgboost (grid search cross validation)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9991693202   0.9991691981   0.9991693202   0.9991690901   0.9999492695   0.9999320208   

Confusion Matrix:
1038	0	3
0	3834	0
1	2	2345


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9990375360923965 0.9971181556195965 0.9980769230769231 1041           
1               0.9994786235662148 1.0             0.9997392438070404 3834           
2               0.9987223168654173 0.99872231686

In [45]:
params = [{'max_depth': [3, 6, 9], 'learning_rate': [0.01, 0.1], 'n_estimators': [50, 100]}]

# Kreiranje instance MLModel
xgb_model_gscv_over = MLModel('XGBoost (Grid Search Cross Validation, Oversampling)', XGBClassifier, params)

# Obuka modela
xgb_model_gscv_over.train(X_train_over, y_train_over)

print(f"{xgb_model_gscv_over.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
xgb_model_gscv_over.evaluate(X_valid, y_valid, data_type='validation')
xgb_model_gscv_over.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100}
Best F1 score with cross-validation: 99.96

Xgboost (grid search cross validation, oversampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9987539803   0.9987540084   0.9987539803   0.9987533211   0.9999793423   0.9999637480   

Confusion Matrix:
1035	0	6
0	3834	0
2	1	2345


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9980713596914176 0.9942363112391931 0.9961501443695862 1041           
1               0.9997392438070404 1.0             0.9998696049028557 3834           
2               0.9974478945129732

In [46]:
params = [{'max_depth': [3, 6, 9], 'learning_rate': [0.01, 0.1], 'n_estimators': [50, 100]}]

# Kreiranje instance MLModel
xgb_model_gscv_under = MLModel('XGBoost (Grid Search Cross Validation, Undersampling)', XGBClassifier, params)

# Obuka modela
xgb_model_gscv_under.train(X_train_under, y_train_under)

print(f"{xgb_model_gscv_under.name.capitalize()}\n")

# Evaluacija modela na validacijskim i testnim podacima
xgb_model_gscv_under.evaluate(X_valid, y_valid, data_type='validation')
xgb_model_gscv_under.evaluate(X_test, y_test, data_type='test')

Tuning hyperparameters using GridSearchCV...

Best found parameters:  {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100}
Best F1 score with cross-validation: 99.89

Xgboost (grid search cross validation, undersampling)


Validation Metrics: 

Metric         Accuracy       Precision      Recall         F1 Score       AUC-ROC        AUC-PR         
---------------------------------------------------------------------------------------------------------
Value          0.9993077669   0.9993076601   0.9993077669   0.9993076294   0.9999568458   0.9999398533   

Confusion Matrix:
1039	0	2
0	3834	0
1	2	2345


Classification Report:
Class           Precision       Recall          F1 Score        Support        
----------------------------------------------------------------------
0               0.9990384615384615 0.9980787704130644 0.9985583853916387 1041           
1               0.9994786235662148 1.0             0.9997392438070404 3834           
2               0.999147848317000

## Usporedba

In [47]:
results = {
    'names': [logistic_model.name, logistic_model_over.name, logistic_model_under.name, 
              knn_model.name, knn_model_over.name, knn_model_under.name,
              knn_model_gscv.name, knn_model_gscv_over.name, knn_model_gscv_under.name,
              svm_model.name, svm_model_over.name, svm_model_under.name, 
              svm_model_gscv.name, svm_model_gscv_over.name, svm_model_gscv_under.name,
              nb_model.name, nb_model_over.name, nb_model_under.name, 
              dtc_model.name, dtc_model_over.name, dtc_model_under.name,
              dtc_model_gscv.name, dtc_model_gscv_over.name, dtc_model_gscv_under.name,
              rf_model.name, rf_model_over.name, rf_model_under.name, 
              rf_model_gscv.name, rf_model_gscv_over.name, rf_model_gscv_under.name, 
              xgb_model.name, xgb_model_over.name, xgb_model_under.name, 
              xgb_model_gscv.name, xgb_model_gscv_over.name, xgb_model_gscv_under.name],
    'test': [logistic_model.test, logistic_model_over.test, logistic_model_under.test, 
              knn_model.test, knn_model_over.test, knn_model_under.test,
              knn_model_gscv.test, knn_model_gscv_over.test, knn_model_gscv_under.test,
              svm_model.test, svm_model_over.test, svm_model_under.test, 
              svm_model_gscv.test, svm_model_gscv_over.test, svm_model_gscv_under.test,
              nb_model.test, nb_model_over.test, nb_model_under.test, 
              dtc_model.test, dtc_model_over.test, dtc_model_under.test,
              dtc_model_gscv.test, dtc_model_gscv_over.test, dtc_model_gscv_under.test,
              rf_model.test, rf_model_over.test, rf_model_under.test, 
              rf_model_gscv.test, rf_model_gscv_over.test, rf_model_gscv_under.test, 
              xgb_model.test, xgb_model_over.test, xgb_model_under.test, 
              xgb_model_gscv.test, xgb_model_gscv_over.test, xgb_model_gscv_under.test]
}


names = results['names']
validations = results['test']

metrics_data = []
confusion_matrices = {}

for name, validation in zip(names, validations):
    metrics_data.append({
        'Model': name,
        'F1 Score': validation['F1 Score'],
        'Accuracy': validation['Accuracy'],
        'Precision': validation['Precision'],
        'Recall': validation['Recall'],
        'AUC-ROC': validation['AUC-ROC'],
        'AUC-PR': validation['AUC-PR']
    })
    confusion_matrices[name] = validation['Confusion Matrix']

metrics_df = pd.DataFrame(metrics_data)
metrics_df = metrics_df.sort_values(by='F1 Score', ascending=False)

print("Model Evaluation Metrics:\n")
print(tabulate(metrics_df, headers='keys', tablefmt='grid', showindex=False))


Model Evaluation Metrics:

+-------------------------------------------------------------------------+------------+------------+-------------+----------+-----------+----------+
| Model                                                                   |   F1 Score |   Accuracy |   Precision |   Recall |   AUC-ROC |   AUC-PR |
| k-Nearest Neighbour (Oversampling)                                      |   0.999585 |   0.999585 |    0.999585 | 0.999585 |  0.999902 | 0.999582 |
+-------------------------------------------------------------------------+------------+------------+-------------+----------+-----------+----------+
| k-Nearest Neighbour (Undersampling)                                     |   0.999585 |   0.999585 |    0.999585 | 0.999585 |  1        | 0.999998 |
+-------------------------------------------------------------------------+------------+------------+-------------+----------+-----------+----------+
| Decision Tree (Grid Search Cross Validation, Oversampling)             