In [1]:
import imblearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import time
from joblib import dump, load
import os

In [4]:
def fit_model(model, xs_train, xs_test, ys_train, ys_test, mode, method):
    """Optionally train and persist ``model``, then print its train and test scores.

    Args:
        model: Estimator exposing ``fit`` and ``score``.
        xs_train: Training feature matrix.
        xs_test: Test feature matrix.
        ys_train: Training labels; must provide ``ravel()``.
        ys_test: Test labels.
        mode: When equal to ``'save'`` the model is fitted on the training data
            and written to ``models/<method>.joblib``. Any other value skips
            both steps, so ``model`` has to be fitted already.
        method: Base name (without extension) of the joblib file.

    Returns:
        None. The two scores are printed, not returned.
    """
    if mode == 'save':
        model.fit(xs_train, ys_train.ravel())
        model_dir = 'models'
        # Make sure the target directory exists so the very first save on a
        # clean checkout does not fail when dump() opens the file.
        os.makedirs(model_dir, exist_ok=True)
        dump(model, os.path.join(model_dir, method + '.joblib'))
    print(f"Rezultat trening skupa: {model.score(xs_train, ys_train):.3f}")
    print(f"Rezultat test skupa: {model.score(xs_test, ys_test):.3f}")


def prediction(model, x_train, x_test, y_train, y_test):
    """Print the confusion matrix of ``model`` for the training set and for the test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_train: Training feature matrix.
        x_test: Test feature matrix.
        y_train: True training labels.
        y_test: True test labels.

    Returns:
        None. Both matrices are printed.
    """
    # Both predictions are computed before anything is printed.
    guessed_train, guessed_test = (model.predict(features) for features in (x_train, x_test))

    for heading, actual, guessed in (
        ("Matrica kofuzije trening skupa:\n", y_train, guessed_train),
        ("Matrica kofuzije test skupa:\n", y_test, guessed_test),
    ):
        print(heading + str(confusion_matrix(actual, guessed)))
    # A per-class report (sklearn.metrics.classification_report) is not printed here.


# Decision tree classifier
def dtc(xs_train, xs_test, ys_train, ys_test, mode, rtype, criteria, depth):
    """Run one decision-tree experiment and print scores, confusion matrices and timing.

    Args:
        xs_train: Training feature matrix.
        xs_test: Test feature matrix.
        ys_train: Training labels.
        ys_test: Test labels.
        mode: ``'load'`` reads a stored model from ``models/dtc_<rtype>.joblib``;
            ``'save'`` builds a new tree, which ``fit_model`` then trains and
            stores under that same name.
        rtype: Experiment label, shown in the banner and used in the file name.
        criteria: Split criterion handed to ``DecisionTreeClassifier``.
        depth: ``max_depth`` handed to ``DecisionTreeClassifier``.

    Raises:
        ValueError: If ``mode`` is neither ``'load'`` nor ``'save'``.
    """
    if mode not in ('load', 'save'):
        # Any other value would build a fresh tree that fit_model never trains,
        # and the run would only fail later, inside scoring.
        raise ValueError(f"mode must be 'load' or 'save', got {mode!r}")

    print("-----Drvo odlucivanja: " + rtype + " -----")
    start = time.time()
    method = 'dtc_' + rtype
    if mode == 'load':
        # joblib files are pickles: only load model files produced by this notebook.
        model = load(os.path.join('models', method + '.joblib'))
    else:
        model = DecisionTreeClassifier(criterion=criteria, max_depth=depth)
    fit_model(model, xs_train, xs_test, ys_train, ys_test, mode, method)
    prediction(model, xs_train, xs_test, ys_train, ys_test)
    print(f"Vreme izvrsavanja: {time.time() - start:.3f} \n")
    

# Random forest classifier
def rfc(x_train, x_test, y_train, y_test, n_est, mode, rtype, maxd):
    """Run one random-forest experiment and print scores, confusion matrices and timing.

    Args:
        x_train: Training feature matrix.
        x_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        n_est: ``n_estimators`` handed to ``RandomForestClassifier``.
        mode: ``'load'`` reads a stored model from ``models/rfc_<rtype>.joblib``;
            ``'save'`` builds a new forest, which ``fit_model`` then trains and
            stores under that same name.
        rtype: Experiment label, shown in the banner and used in the file name.
        maxd: ``max_depth`` handed to ``RandomForestClassifier``.

    Raises:
        ValueError: If ``mode`` is neither ``'load'`` nor ``'save'``.
    """
    if mode not in ('load', 'save'):
        # Any other value would build a fresh forest that fit_model never trains,
        # and the run would only fail later, inside scoring.
        raise ValueError(f"mode must be 'load' or 'save', got {mode!r}")

    print("-----Nasumicna suma: " + rtype + " -----")
    start = time.time()
    method = 'rfc_' + rtype
    if mode == 'load':
        # joblib files are pickles: only load model files produced by this notebook.
        model = load(os.path.join('models', method + '.joblib'))
    else:
        model = RandomForestClassifier(n_estimators=n_est, max_depth=maxd, criterion='gini')
    # noinspection PyTypeChecker
    fit_model(model, x_train, x_test, y_train, y_test, mode, method)
    prediction(model, x_train, x_test, y_train, y_test)
    print(f"Vreme izvrsavanja: {time.time() - start:.3f} \n")

In [7]:
file = 'data_for_classification.csv'
df = pd.read_csv(file)

# Features: every column except the 'url' column and the 'class' label.
x = df.loc[:, (df.columns != 'url') & (df.columns != 'class')].values
# Label: selected by name so it always matches the column excluded from the
# features, instead of relying on 'class' happening to be the last column.
# NOTE(review): assumes 'class' holds the target - confirm against the CSV.
y = df['class'].values.astype('int')

# A fixed seed keeps the split identical between sessions. Without it, a model
# stored with mode='save' and evaluated later with mode='load' would be scored
# on a different split that contains rows it was trained on.
# stratify=y keeps the class proportions the same in both parts.
seed = 0
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=seed, stratify=y
)

# Resampling is applied to the training data only. The test set stays
# untouched so that scores are measured on real rows with the real class
# balance rather than on duplicated or synthetic samples. The *_test_res*
# names are kept as aliases of the original test set for the cells below.
ratio = 1/2
friends_ros = imblearn.over_sampling.RandomOverSampler(random_state=seed, sampling_strategy=ratio)
x_train_res, y_train_res = friends_ros.fit_resample(x_train, y_train)
x_test_res, y_test_res = x_test, y_test

smote = imblearn.over_sampling.SMOTE(random_state=seed, k_neighbors=5)
x_train_res_smote, y_train_res_smote = smote.fit_resample(x_train, y_train)
x_test_res_smote, y_test_res_smote = x_test, y_test

In [None]:
# Gini trees without a depth limit: one run per training-set variant
# (original split, random oversampling, SMOTE).
for tag, split in (
    ('Normal', (x_train, x_test, y_train, y_test)),
    ('Ratio', (x_train_res, x_test_res, y_train_res, y_test_res)),
    ('Smote', (x_train_res_smote, x_test_res_smote, y_train_res_smote, y_test_res_smote)),
):
    dtc(*split, mode='save', rtype=tag + 'GiniNone', criteria='gini', depth=None)

In [None]:
# Entropy trees without a depth limit: one run per training-set variant
# (original split, random oversampling, SMOTE).
for tag, split in (
    ('Normal', (x_train, x_test, y_train, y_test)),
    ('Ratio', (x_train_res, x_test_res, y_train_res, y_test_res)),
    ('Smote', (x_train_res_smote, x_test_res_smote, y_train_res_smote, y_test_res_smote)),
):
    dtc(*split, mode='save', rtype=tag + 'EntropyNone', criteria='entropy', depth=None)

In [None]:
# Gini trees limited to depth 5: one run per training-set variant
# (original split, random oversampling, SMOTE).
for tag, split in (
    ('Normal', (x_train, x_test, y_train, y_test)),
    ('Ratio', (x_train_res, x_test_res, y_train_res, y_test_res)),
    ('Smote', (x_train_res_smote, x_test_res_smote, y_train_res_smote, y_test_res_smote)),
):
    dtc(*split, mode='save', rtype=tag + 'Gini5', criteria='gini', depth=5)

In [None]:
 dtc(x_train, x_test, y_train, y_test, mode='save', rtype='NormalEntropy5', criteria='entropy', depth=5)
dtc(x_train_res, x_test_res, y_train_res, y_test_res, mode='save', rtype='RatioEntropy5', criteria='entropy', depth=5)
dtc(x_train_res_smote, x_test_res_smote, y_train_res_smote, y_test_res_smote, mode='save', rtype='SmoteEntropy5', criteria='entropy', depth=5)

In [None]:
# Forests of 10 trees without a depth limit: one run per training-set variant
# (original split, random oversampling, SMOTE).
for tag, split in (
    ('Normal', (x_train, x_test, y_train, y_test)),
    ('Ratio', (x_train_res, x_test_res, y_train_res, y_test_res)),
    ('Smote', (x_train_res_smote, x_test_res_smote, y_train_res_smote, y_test_res_smote)),
):
    rfc(*split, n_est=10, mode='save', rtype=tag + '10', maxd=None)

In [None]:
# Forests of 50 trees without a depth limit: one run per training-set variant
# (original split, random oversampling, SMOTE).
# Every label is <variant><n_est>; the first run used to be tagged 'Normal0',
# which stored the model as rfc_Normal0.joblib instead of rfc_Normal50.joblib.
for tag, split in (
    ('Normal', (x_train, x_test, y_train, y_test)),
    ('Ratio', (x_train_res, x_test_res, y_train_res, y_test_res)),
    ('Smote', (x_train_res_smote, x_test_res_smote, y_train_res_smote, y_test_res_smote)),
):
    rfc(*split, n_est=50, mode='save', rtype=tag + '50', maxd=None)

In [None]:
# Forests of 100 trees without a depth limit: one run per training-set variant
# (original split, random oversampling, SMOTE).
for tag, split in (
    ('Normal', (x_train, x_test, y_train, y_test)),
    ('Ratio', (x_train_res, x_test_res, y_train_res, y_test_res)),
    ('Smote', (x_train_res_smote, x_test_res_smote, y_train_res_smote, y_test_res_smote)),
):
    rfc(*split, n_est=100, mode='save', rtype=tag + '100', maxd=None)