In [1]:
import imblearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn import metrics
import time
from joblib import dump, load
import os

In [2]:
def fit_model(model, xs_train, xs_test, ys_train, ys_test, mode, method):
    """Optionally fit and persist ``model``, then print its train/test scores.

    Parameters
    ----------
    model : object
        Estimator exposing ``score`` (and ``fit`` when ``mode == 'save'``).
    xs_train, xs_test :
        Feature sets passed to ``fit`` / ``score``.
    ys_train, ys_test :
        Labels; ``ys_train`` must provide ``.ravel()`` when ``mode == 'save'``.
    mode : str
        ``'save'`` fits the model and writes it to ``models/<method>.joblib``.
        Any other value skips fitting and saving, so the model must already
        be fitted (e.g. loaded from disk) for the scoring below to work.
    method : str
        File stem used for the persisted model.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    if mode == 'save':
        models_dir = 'models'
        # Create the output directory up front so that a fresh checkout does
        # not fail at dump time, after the (possibly long) fit has finished.
        os.makedirs(models_dir, exist_ok=True)
        model.fit(xs_train, ys_train.ravel())
        dump(model, os.path.join(models_dir, method + '.joblib'))
    print(f"Rezultat trening skupa: {model.score(xs_train, ys_train):.3f}")
    print(f"Rezultat test skupa: {model.score(xs_test, ys_test):.3f}")


def prediction(model, x_train, x_test, y_train, y_test):
    """Print the confusion matrices of ``model`` on the training and test sets.

    ``model`` only needs a ``predict`` method; nothing is returned.
    """
    # Run both predictions before anything is printed.
    predicted_train, predicted_test = (
        model.predict(features) for features in (x_train, x_test)
    )
    train_matrix = confusion_matrix(y_train, predicted_train)
    print(f"Matrica kofuzije trening skupa:\n{train_matrix}")
    test_matrix = confusion_matrix(y_test, predicted_test)
    print(f"Matrica kofuzije test skupa:\n{test_matrix}")
    # Optional full per-class report, currently disabled:
    # print("Metrika: \n" + str(metrics.classification_report(y_test, predicted_test)))


# K nearest neighbours
def knn(x_train, x_test, y_train, y_test, non, mode, rtype):
    """Evaluate a k-nearest-neighbours classifier and report the elapsed time.

    With ``mode == 'load'`` the model is read from ``models/knn_<rtype>.joblib``;
    otherwise a new uniform-weight classifier with ``non`` neighbours is built.
    ``fit_model`` then decides (from ``mode``) whether it is fitted and saved.
    The printed time covers model creation/loading, fitting, scoring and prediction.
    """
    method = 'knn_' + rtype
    print(f"-----K najblizih suseda: {rtype} -----")
    started = time.time()
    if mode != 'load':
        model = KNeighborsClassifier(n_neighbors=non, weights='uniform')
    else:
        model = load(os.path.join('models', method + '.joblib'))
    fit_model(model, x_train, x_test, y_train, y_test, mode, method)
    prediction(model, x_train, x_test, y_train, y_test)
    elapsed = time.time() - started
    print(f"Vreme izvrsavanja: {elapsed:.3f} \n")


# Decision tree classifier
def dtc(xs_train, xs_test, ys_train, ys_test, mode, rtype, criteria, depth, random_state=None):
    """Evaluate a decision tree classifier and report the elapsed time.

    Parameters
    ----------
    xs_train, xs_test, ys_train, ys_test :
        Train/test features and labels, forwarded to ``fit_model`` and ``prediction``.
    mode : str
        ``'load'`` reads the model from ``models/dtc_<rtype>.joblib``; any other
        value builds a new tree. ``fit_model`` fits and saves it only when
        ``mode == 'save'``.
    rtype : str
        Run label used in the printed header and in the model file name.
    criteria : str
        Split criterion passed to ``DecisionTreeClassifier(criterion=...)``.
    depth : int or None
        Passed as ``max_depth``.
    random_state : int or None, optional
        Seed for the tree; the default ``None`` keeps the previous unseeded
        behaviour, pass an int for repeatable results.

    The printed time covers model creation/loading, fitting, scoring and prediction.
    """
    print("-----Drvo odlucivanja: " + rtype + " -----")
    start = time.time()
    method = 'dtc_' + rtype
    if mode == 'load':
        model = load(os.path.join('models', method + '.joblib'))
    else:
        model = DecisionTreeClassifier(
            criterion=criteria,
            max_depth=depth,
            random_state=random_state,
        )
    fit_model(model, xs_train, xs_test, ys_train, ys_test, mode, method)
    prediction(model, xs_train, xs_test, ys_train, ys_test)
    print(f"Vreme izvrsavanja: {time.time() - start:.3f} \n")
    

# Random forest classifier
def rfc(x_train, x_test, y_train, y_test, n_est, mode, rtype, maxd, criteria='gini', random_state=None):
    """Evaluate a random forest classifier and report the elapsed time.

    Parameters
    ----------
    x_train, x_test, y_train, y_test :
        Train/test features and labels, forwarded to ``fit_model`` and ``prediction``.
    n_est : int
        Number of trees (``n_estimators``).
    mode : str
        ``'load'`` reads the model from ``models/rfc_<rtype>.joblib``; any other
        value builds a new forest. ``fit_model`` fits and saves it only when
        ``mode == 'save'``.
    rtype : str
        Run label used in the printed header and in the model file name.
    maxd : int or None
        Passed as ``max_depth``.
    criteria : str, optional
        Split criterion; defaults to ``'gini'``, the value that used to be
        hard-coded. Named like the matching parameter of ``dtc``.
    random_state : int or None, optional
        Seed for the forest; the default ``None`` keeps the previous unseeded
        behaviour, pass an int for repeatable results.

    The printed time covers model creation/loading, fitting, scoring and prediction.
    """
    print("-----Nasumicna suma: " + rtype + " -----")
    start = time.time()
    method = 'rfc_' + rtype
    if mode == 'load':
        model = load(os.path.join('models', method + '.joblib'))
    else:
        model = RandomForestClassifier(
            n_estimators=n_est,
            max_depth=maxd,
            criterion=criteria,
            random_state=random_state,
        )
    # noinspection PyTypeChecker
    fit_model(model, x_train, x_test, y_train, y_test, mode, method)
    prediction(model, x_train, x_test, y_train, y_test)
    print(f"Vreme izvrsavanja: {time.time() - start:.3f} \n")

In [4]:
file = 'data_for_classification.csv'
df = pd.read_csv(file)

# Features are every column except the raw URL string and the target label.
feature_columns = df.columns[~df.columns.isin(['url', 'class'])]
x = df[feature_columns]

# Dropping the columns below made the results worse by 0.02 - 0.03, so they are kept.
# x = x.loc[:, x.columns != 'deep_url_len']  # correlation with url_len is 0.98
# x = x.loc[:, x.columns != 'tld_in_abused']  # correlation is NaN for some reason
# x = x.loc[:, x.columns != 'num_of_reserved_char']  # correlation with num_of_sub_deli is 0.96

x = x.to_numpy()
# Select the target by name, matching how it is excluded from the features
# above; the previous code took the last column positionally instead.
y = df['class'].to_numpy().astype('int')

In [5]:
# Hold out 30% of the data for testing, then 20% of the remainder for validation.
# Both splits are seeded so that a fresh "Restart & Run All" gives the same
# partitions, and stratified so that each part keeps the class proportions of
# the data it is drawn from.
x_train_val, x_test, y_train_val, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0, stratify=y
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, test_size=0.2, random_state=0, stratify=y_train_val
)

In [None]:
# Standardise the features: the scaler is fitted on the training split only and
# then applied to the training, validation and test splits.
# NOTE: the splits are overwritten in place, so running this cell a second time
# without re-running the split cell would re-scale already scaled data.
scaler = preprocessing.StandardScaler().fit(x_train)
x_train, x_val, x_test = (
    scaler.transform(split) for split in (x_train, x_val, x_test)
)

In [8]:
# Validation sweep for the random forest: every combination of tree count and
# maximum depth is fitted on the training split and scored (F1) on the
# validation split. One line is printed per combination: "<n> <max_depth> <f1>".
n_est = [5, 10, 20, 50, 75, 100]
max_depths = [None, 5, 10, 20]
for n in n_est:
    for max_d in max_depths:
        # A fixed seed means differences between rows come from the
        # hyper-parameters, not from the randomness of forest construction.
        model = RandomForestClassifier(
            n_estimators=n, max_depth=max_d, criterion='gini', random_state=0
        )
        model.fit(x_train, y_train)
        f1_score = metrics.f1_score(y_val, model.predict(x_val))
        print(f"{n} {max_d} {f1_score}")

5 None 0.9613199614430157
5 5 0.9347419070069749
5 10 0.9521800843645668
5 20 0.9611661333739652
10 None 0.9628412044504259
10 5 0.9377366387323465
10 10 0.9500113767893711
10 20 0.9623947082313015
20 None 0.9641817361709107
20 5 0.9359464441641603
20 10 0.9524912399796348
20 20 0.9635950404837761
50 None 0.9651273542876456
50 5 0.9366904097874781
50 10 0.9530961712992051
50 20 0.9640677828431023
75 None 0.9650514250626822
75 5 0.9378413372143962
75 10 0.9532508974870363
75 20 0.963978964039879
100 None 0.9651910397641181
100 5 0.9370772101702822
100 10 0.9533337327023303
100 20 0.964114225384998


In [16]:
# Validation sweep for k-nearest-neighbours: one F1 score is printed per
# candidate neighbour count, in the order listed in `non`.
non = [1, 2, 3, 5, 7, 10, 20]
for n in non:
    model = KNeighborsClassifier(weights='uniform', n_neighbors=n)
    model.fit(x_train, y_train)
    val_predictions = model.predict(x_val)
    f1_score = metrics.f1_score(y_val, val_predictions)
    print(f1_score)

0.9492091986752916
0.9378218577150486
0.9492039423805912
0.9504145700490965
0.9497377089178968
0.9478814381903078
0.9468790828159656


In [10]:
# Class re-balancing. Two variants are produced: random under-sampling with the
# given sampling ratio, and SMOTE over-sampling.
# NOTE(review): the resampled *test* sets are not used by the evaluation cells
# visible in this notebook (they pass x_test / y_test); scoring on a resampled
# test set would not reflect the original class balance - confirm they are
# intentionally unused.
ratio = 0.5
friends_ros = imblearn.under_sampling.RandomUnderSampler(
    sampling_strategy=ratio,
    random_state=0,
)
(x_train_res, y_train_res), (x_test_res, y_test_res) = (
    friends_ros.fit_resample(features, labels)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

smote = imblearn.over_sampling.SMOTE(
    k_neighbors=5,
    random_state=0,
)
(x_train_res_smote, y_train_res_smote), (x_test_res_smote, y_test_res_smote) = (
    smote.fit_resample(features, labels)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

In [11]:
# Gini decision tree on each training variant (original, under-sampled, SMOTE),
# always evaluated on the untouched test split.
for run_name, features, labels in (
    ('NormalGiniNone', x_train, y_train),
    ('RatioGiniNone', x_train_res, y_train_res),
    ('SmoteGiniNone', x_train_res_smote, y_train_res_smote),
):
    dtc(features, x_test, labels, y_test, mode='save', rtype=run_name, criteria='gini', depth=None)

-----Drvo odlucivanja: NormalGiniNone -----
Rezultat trening skupa: 0.981
Rezultat test skupa: 0.925
Matrica kofuzije trening skupa:
[[ 35479   2155]
 [  2120 190666]]
Matrica kofuzije test skupa:
[[15457  4431]
 [ 4811 98741]]
Vreme izvrsavanja: 3.435 

-----Drvo odlucivanja: RatioGiniNone -----
Rezultat trening skupa: 0.978
Rezultat test skupa: 0.903
Matrica kofuzije trening skupa:
[[36514  1120]
 [ 1382 73886]]
Matrica kofuzije test skupa:
[[16752  3136]
 [ 8830 94722]]
Vreme izvrsavanja: 1.401 

-----Drvo odlucivanja: SmoteGiniNone -----
Rezultat trening skupa: 0.980
Rezultat test skupa: 0.921
Matrica kofuzije trening skupa:
[[188875   3911]
 [  3733 189053]]
Matrica kofuzije test skupa:
[[15948  3940]
 [ 5755 97797]]
Vreme izvrsavanja: 4.323 



In [12]:
# Entropy decision tree on each training variant (original, under-sampled, SMOTE),
# always evaluated on the untouched test split.
for run_name, features, labels in (
    ('NormalEntropyNone', x_train, y_train),
    ('RatioEntropyNone', x_train_res, y_train_res),
    ('SmoteEntropyNone', x_train_res_smote, y_train_res_smote),
):
    dtc(features, x_test, labels, y_test, mode='save', rtype=run_name, criteria='entropy', depth=None)

-----Drvo odlucivanja: NormalEntropyNone -----
Rezultat trening skupa: 0.981
Rezultat test skupa: 0.927
Matrica kofuzije trening skupa:
[[ 35479   2155]
 [  2120 190666]]
Matrica kofuzije test skupa:
[[15424  4464]
 [ 4563 98989]]
Vreme izvrsavanja: 1.904 

-----Drvo odlucivanja: RatioEntropyNone -----
Rezultat trening skupa: 0.978
Rezultat test skupa: 0.905
Matrica kofuzije trening skupa:
[[36514  1120]
 [ 1382 73886]]
Matrica kofuzije test skupa:
[[16822  3066]
 [ 8635 94917]]
Vreme izvrsavanja: 0.842 

-----Drvo odlucivanja: SmoteEntropyNone -----
Rezultat trening skupa: 0.980
Rezultat test skupa: 0.921
Matrica kofuzije trening skupa:
[[188875   3911]
 [  3733 189053]]
Matrica kofuzije test skupa:
[[15859  4029]
 [ 5720 97832]]
Vreme izvrsavanja: 3.430 



In [13]:
# 50-tree random forest on each training variant (original, under-sampled, SMOTE),
# always evaluated on the untouched test split.
# NOTE(review): 'Norma50' looks like a typo for 'Normal50'; it is kept as is
# because it determines the name of the saved model file.
for run_name, features, labels in (
    ('Norma50', x_train, y_train),
    ('Ratio50', x_train_res, y_train_res),
    ('Smote50', x_train_res_smote, y_train_res_smote),
):
    rfc(features, x_test, labels, y_test, n_est=50, mode='save', rtype=run_name, maxd=None)

-----Nasumicna suma: Norma50 -----
Rezultat trening skupa: 0.981
Rezultat test skupa: 0.941
Matrica kofuzije trening skupa:
[[ 35005   2629]
 [  1666 191120]]
Matrica kofuzije test skupa:
[[ 15420   4468]
 [  2784 100768]]
Vreme izvrsavanja: 24.705 

-----Nasumicna suma: Ratio50 -----
Rezultat trening skupa: 0.978
Rezultat test skupa: 0.928
Matrica kofuzije trening skupa:
[[36257  1377]
 [ 1146 74122]]
Matrica kofuzije test skupa:
[[17030  2858]
 [ 6015 97537]]
Vreme izvrsavanja: 9.690 

-----Nasumicna suma: Smote50 -----
Rezultat trening skupa: 0.980
Rezultat test skupa: 0.938
Matrica kofuzije trening skupa:
[[188546   4240]
 [  3420 189366]]
Matrica kofuzije test skupa:
[[16235  3653]
 [ 4061 99491]]
Vreme izvrsavanja: 36.111 



In [14]:
# 5-nearest-neighbours on each training variant (original, under-sampled, SMOTE),
# always evaluated on the untouched test split.
for run_name, features, labels in (
    ('Normal5', x_train, y_train),
    ('Ratio5', x_train_res, y_train_res),
    ('Smote5', x_train_res_smote, y_train_res_smote),
):
    knn(features, x_test, labels, y_test, non=5, mode='save', rtype=run_name)

-----K najblizih suseda: Normal5 -----
Rezultat trening skupa: 0.936
Rezultat test skupa: 0.914
Matrica kofuzije trening skupa:
[[ 27722   9912]
 [  4766 188020]]
Matrica kofuzije test skupa:
[[13202  6686]
 [ 3939 99613]]
Vreme izvrsavanja: 76.832 

-----K najblizih suseda: Ratio5 -----
Rezultat trening skupa: 0.885
Rezultat test skupa: 0.856
Matrica kofuzije trening skupa:
[[33152  4482]
 [ 8550 66718]]
Matrica kofuzije test skupa:
[[16451  3437]
 [14330 89222]]
Vreme izvrsavanja: 34.467 

-----K najblizih suseda: Smote5 -----
Rezultat trening skupa: 0.935
Rezultat test skupa: 0.890
Matrica kofuzije trening skupa:
[[181157  11629]
 [ 13268 179518]]
Matrica kofuzije test skupa:
[[16188  3700]
 [ 9926 93626]]
Vreme izvrsavanja: 197.852 

