In [4]:
import imblearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn import metrics
import time
from joblib import dump, load
import os

In [18]:
def fit_model(model, xs_train, xs_test, ys_train, ys_test, mode, method):
    if mode == 'save':
        model.fit(xs_train, ys_train.ravel())
        dump(model, os.path.join('models', method + '.joblib'))
    print(f"Rezultat trening skupa: {model.score(xs_train, ys_train):.3f}")
    print(f"Rezultat test skupa: {model.score(xs_test, ys_test):.3f}")


def prediction(model, x_train, x_test, y_train, y_test):
    y_train_predicted = model.predict(x_train)
    y_test_predicted = model.predict(x_test)
    print("Matrica kofuzije trening skupa:\n" + str(confusion_matrix(y_train, y_train_predicted)))
    print("Matrica kofuzije test skupa:\n" + str(confusion_matrix(y_test, y_test_predicted)))
    print("Metrika: \n" + str(metrics.classification_report(y_test, y_test_predicted)))


# K nearest neighbours
def knn(x_train, x_test, y_train, y_test, non, mode, rtype):
    print("-----K najblizih suseda: " + rtype + " -----")
    start = time.time()
    if mode == 'load':
        model = load(os.path.join('models', 'knn_' + rtype + '.joblib'))
    else:
        model = KNeighborsClassifier(n_neighbors=non, weights='uniform')
    fit_model(model, x_train, x_test, y_train, y_test, mode, 'knn_' + rtype)
    prediction(model, x_train, x_test, y_train, y_test)
    print(f"Vreme izvrsavanja: {time.time()-start:.3f} \n")


# Decision tree classifier
def dtc(xs_train, xs_test, ys_train, ys_test, mode, rtype, criteria, depth):
    print("-----Drvo odlucivanja: " + rtype + " -----")
    start = time.time()
    if mode == 'load':
        model = load(os.path.join('models', 'dtc_' + rtype + '.joblib'))
    else:
        model = DecisionTreeClassifier(criterion=criteria, max_depth=depth)
    fit_model(model, xs_train, xs_test, ys_train, ys_test, mode, 'dtc_' + rtype)
    prediction(model, xs_train, xs_test, ys_train, ys_test)
    print(f"Vreme izvrsavanja: {time.time() - start:.3f} \n")
    

# Random forest classifier
def rfc(x_train, x_test, y_train, y_test, n_est, mode, rtype, maxd):
    print("-----Nasumicna suma: " + rtype + " -----")
    start = time.time()
    if mode == 'load':
        model = load(os.path.join('models', 'rfc_' + rtype + '.joblib'))
    else:
        model = RandomForestClassifier(n_estimators=n_est, max_depth=maxd, criterion='gini')
    # noinspection PyTypeChecker
    fit_model(model, x_train, x_test, y_train, y_test, mode, 'rfc_' + rtype)
    prediction(model, x_train, x_test, y_train, y_test)
    print(f"Vreme izvrsavanja: {time.time() - start:.3f} \n")

In [8]:
file = 'data_for_classification.csv'
df = pd.read_csv(file)

In [9]:
print(df.corr())

                                         url_len  tld_in_known  tld_in_abused  \
url_len                                 1.000000      0.054655       0.017623   
tld_in_known                            0.054655      1.000000       0.004444   
tld_in_abused                           0.017623      0.004444       1.000000   
contain_ip                             -0.010510     -0.098099      -0.000436   
deep_url_len                            0.983431      0.041488       0.012436   
num_of_gen_deli                         0.277748      0.023596       0.027620   
num_of_sub_deli                         0.349419      0.028439      -0.000060   
num_of_reserved_char                    0.342232      0.028272       0.011611   
num_of_unreserved_spec_char             0.397850      0.077650      -0.008112   
num_of_sub_domains                      0.478932      0.026997       0.004802   
contain_http                            0.259301      0.003289       0.000234   
number_of_suspicious_words  

In [10]:
x = df.loc[:, df.columns != 'url']  # string
x = x.loc[:, x.columns != 'class']  # class

# With ejecting this columns, results are getting worse for 0.02 - 0.03 => Do not eject them
# x = x.loc[:, x.columns != 'deep_url_len']  # correlation with url_len is 0.98
# x = x.loc[:, x.columns != 'num_of_reserved_char']  # correlation with num_of_sub_deli is 0.96

x = x.values[:, :]
y = df.values[:, -1]
y = y.astype('int')

In [11]:
print(df.describe())

             url_len   tld_in_known  tld_in_abused     contain_ip  \
count  411263.000000  411263.000000  411263.000000  411263.000000   
mean       48.428648       0.988757       0.001734       0.000109   
std        35.063820       0.105437       0.041601       0.010460   
min         1.000000       0.000000       0.000000       0.000000   
25%        29.000000       1.000000       0.000000       0.000000   
50%        41.000000       1.000000       0.000000       0.000000   
75%        59.000000       1.000000       0.000000       0.000000   
max      2307.000000       1.000000       1.000000       1.000000   

        deep_url_len  num_of_gen_deli  num_of_sub_deli  num_of_reserved_char  \
count  411263.000000    411263.000000    411263.000000         411263.000000   
mean       32.546478         1.154633         0.244892              1.399525   
std        34.587544         0.390959         0.597133              0.926967   
min         1.000000         0.000000         0.000000    

In [12]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 411263 entries, 0 to 411262
Data columns (total 22 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   url                                     411263 non-null  object 
 1   url_len                                 411263 non-null  int64  
 2   tld_in_known                            411263 non-null  int64  
 3   tld_in_abused                           411263 non-null  int64  
 4   contain_ip                              411263 non-null  int64  
 5   deep_url_len                            411263 non-null  int64  
 6   num_of_gen_deli                         411263 non-null  int64  
 7   num_of_sub_deli                         411263 non-null  int64  
 8   num_of_reserved_char                    411263 non-null  int64  
 9   num_of_unreserved_spec_char             411263 non-null  int64  
 10  num_of_sub_domains                      4112

In [13]:
x_train_val, x_test, y_train_val, y_test = train_test_split(x, y, test_size=0.3)
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.2)

In [14]:
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)

In [15]:
n_est = [5, 10, 20, 50, 75, 100]
max_depths = [None, 5, 10, 20]
for n in n_est:
    for max_d in max_depths:
        model = RandomForestClassifier(n_estimators=n, max_depth=max_d, criterion='gini')
        model.fit(x_train, y_train)
        f1_score = metrics.f1_score(y_val, model.predict(x_val))
        print(str(n) + " " + str(max_d) + " " + str(f1_score))

5 None 0.9623791271464245
5 5 0.9349044486825204
5 10 0.9527939883284036
5 20 0.9623033239023794
10 None 0.9641100570040952
10 5 0.9361354349095155
10 10 0.9538295274881423
10 20 0.9633415293328729
20 None 0.9649612848898107
20 5 0.9363657605732534
20 10 0.9532866798253484
20 20 0.9643347190177565
50 None 0.9653233749179252
50 5 0.9368348279924829
50 10 0.9549249751921978
50 20 0.9645267523174501
75 None 0.9655844488895727
75 5 0.9380779585625658
75 10 0.9544445111337854
75 20 0.9647953168828001
100 None 0.9655596808270598
100 5 0.9364094384033302
100 10 0.9541229385307346
100 20 0.9643967566908491


In [16]:
non = [1, 2, 3, 5, 7, 10, 20]
for n in non:
    model = KNeighborsClassifier(n_neighbors=n, weights='uniform')
    model.fit(x_train, y_train)
    f1_score = metrics.f1_score(y_val, model.predict(x_val))
    print(f1_score)

0.9492091986752916
0.9378218577150486
0.9492039423805912
0.9504145700490965
0.9497377089178968
0.9478814381903078
0.9468790828159656


In [16]:
ratio = 1/2
friends_ros = imblearn.under_sampling.RandomUnderSampler(random_state=0, sampling_strategy=ratio)
x_train_res, y_train_res = friends_ros.fit_resample(x_train, y_train)
x_test_res, y_test_res = friends_ros.fit_resample(x_test, y_test)

smote = imblearn.over_sampling.SMOTE(random_state=0, k_neighbors=5)
x_train_res_smote, y_train_res_smote = smote.fit_resample(x_train, y_train)
x_test_res_smote, y_test_res_smote = smote.fit_resample(x_test, y_test)

In [19]:
dtc(x_train, x_test, y_train, y_test, mode='save', rtype='NormalGiniNone', criteria='gini', depth=None)
dtc(x_train_res, x_test, y_train_res, y_test, mode='save', rtype='RatioGiniNone', criteria='gini', depth=None)
dtc(x_train_res_smote, x_test, y_train_res_smote, y_test, mode='save', rtype='SmoteGiniNone', criteria='gini', depth=None)

-----Drvo odlucivanja: NormalGiniNone -----
Rezultat trening skupa: 0.982
Rezultat test skupa: 0.925
Matrica kofuzije trening skupa:
[[ 35157   2097]
 [  2054 190999]]
Matrica kofuzije test skupa:
[[15457  4495]
 [ 4729 98698]]
Metrika: 
              precision    recall  f1-score   support

           0       0.77      0.77      0.77     19952
           1       0.96      0.95      0.96    103427

    accuracy                           0.93    123379
   macro avg       0.86      0.86      0.86    123379
weighted avg       0.93      0.93      0.93    123379

Vreme izvrsavanja: 3.528 

-----Drvo odlucivanja: RatioGiniNone -----
Rezultat trening skupa: 0.978
Rezultat test skupa: 0.904
Matrica kofuzije trening skupa:
[[36157  1097]
 [ 1388 73120]]
Matrica kofuzije test skupa:
[[16788  3164]
 [ 8734 94693]]
Metrika: 
              precision    recall  f1-score   support

           0       0.66      0.84      0.74     19952
           1       0.97      0.92      0.94    103427

    accurac

In [20]:
dtc(x_train, x_test, y_train, y_test, mode='save', rtype='NormalEntropyNone', criteria='entropy', depth=None)
dtc(x_train_res, x_test, y_train_res, y_test, mode='save', rtype='RatioEntropyNone', criteria='entropy', depth=None)
dtc(x_train_res_smote, x_test, y_train_res_smote, y_test, mode='save', rtype='SmoteEntropyNone', criteria='entropy', depth=None)

-----Drvo odlucivanja: NormalEntropyNone -----
Rezultat trening skupa: 0.982
Rezultat test skupa: 0.926
Matrica kofuzije trening skupa:
[[ 35157   2097]
 [  2054 190999]]
Matrica kofuzije test skupa:
[[15413  4539]
 [ 4631 98796]]
Metrika: 
              precision    recall  f1-score   support

           0       0.77      0.77      0.77     19952
           1       0.96      0.96      0.96    103427

    accuracy                           0.93    123379
   macro avg       0.86      0.86      0.86    123379
weighted avg       0.93      0.93      0.93    123379

Vreme izvrsavanja: 2.031 

-----Drvo odlucivanja: RatioEntropyNone -----
Rezultat trening skupa: 0.978
Rezultat test skupa: 0.905
Matrica kofuzije trening skupa:
[[36157  1097]
 [ 1388 73120]]
Matrica kofuzije test skupa:
[[16798  3154]
 [ 8538 94889]]
Metrika: 
              precision    recall  f1-score   support

           0       0.66      0.84      0.74     19952
           1       0.97      0.92      0.94    103427

    a

In [21]:
rfc(x_train, x_test, y_train, y_test, n_est=50, mode='save', rtype='Norma50', maxd=None)
rfc(x_train_res, x_test, y_train_res, y_test, n_est=50, mode='save', rtype='Ratio50', maxd=None)
rfc(x_train_res_smote, x_test, y_train_res_smote, y_test, n_est=50, mode='save', rtype='Smote50', maxd=None)

-----Nasumicna suma: Norma50 -----
Rezultat trening skupa: 0.982
Rezultat test skupa: 0.941
Matrica kofuzije trening skupa:
[[ 34663   2591]
 [  1591 191462]]
Matrica kofuzije test skupa:
[[ 15392   4560]
 [  2752 100675]]
Metrika: 
              precision    recall  f1-score   support

           0       0.85      0.77      0.81     19952
           1       0.96      0.97      0.96    103427

    accuracy                           0.94    123379
   macro avg       0.90      0.87      0.89    123379
weighted avg       0.94      0.94      0.94    123379

Vreme izvrsavanja: 21.832 

-----Nasumicna suma: Ratio50 -----
Rezultat trening skupa: 0.978
Rezultat test skupa: 0.929
Matrica kofuzije trening skupa:
[[35899  1355]
 [ 1143 73365]]
Matrica kofuzije test skupa:
[[16991  2961]
 [ 5854 97573]]
Metrika: 
              precision    recall  f1-score   support

           0       0.74      0.85      0.79     19952
           1       0.97      0.94      0.96    103427

    accuracy           

In [21]:
knn(x_train, x_test, y_train, y_test, non=5, mode='save', rtype='Normal5')
knn(x_train_res, x_test, y_train_res, y_test, non=5, mode='save', rtype='Ratio5')
knn(x_train_res_smote, x_test, y_train_res_smote, y_test, non=5, mode='save', rtype='Smote5')

-----K najblizih suseda: Normal5 -----
Rezultat trening skupa: 0.945
Rezultat test skupa: 0.930
Matrica kofuzije trening skupa:
[[ 29043   8379]
 [  4299 188699]]
Matrica kofuzije test skupa:
[[ 14468   5524]
 [  3140 100308]]
Metrika: 
              precision    recall  f1-score   support

           0       0.82      0.72      0.77     19992
           1       0.95      0.97      0.96    103448

    accuracy                           0.93    123440
   macro avg       0.88      0.85      0.86    123440
weighted avg       0.93      0.93      0.93    123440

Vreme izvrsavanja: 2226.056 

-----K najblizih suseda: Ratio5 -----
Rezultat trening skupa: 0.899
Rezultat test skupa: 0.881
Matrica kofuzije trening skupa:
[[33383  4039]
 [ 7311 67533]]
Matrica kofuzije test skupa:
[[17059  2933]
 [11809 91639]]
Metrika: 
              precision    recall  f1-score   support

           0       0.59      0.85      0.70     19992
           1       0.97      0.89      0.93    103448

    accuracy  