### Setup

In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
from sklearn import tree 
from matplotlib import pyplot as plt


### Dataset

In [3]:
# Read the feature table first, then the label table, each from its own CSV file.
X, y = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/feature_updated_dataset_X.csv",
        "datasets/feature_updated_dataset_y.csv",
    )
)

### Split

In [4]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

# Flatten each label frame into a 1-D NumPy array.
y_train = y_train.values.ravel()
y_val = y_val.values.ravel()

In [5]:
# Show how many samples of each class are in the training split, then the validation split.
for split_labels in (y_train, y_val):
    print(pd.Series(split_labels).value_counts())

0    342451
1     77182
2     75393
3     25926
Name: count, dtype: int64
0    85652
1    19275
2    18718
3     6594
Name: count, dtype: int64


### Model

In [6]:
# Random forest of 100 trees, each limited to depth 20.
# random_state fixes the forest's randomness so that re-running the notebook
# reproduces the same model and metrics; 69 is the same seed the split uses.
# NOTE: outputs recorded before the seed was added may differ slightly.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=69,
)

### Evaluation

In [7]:
# Train on the training split and predict labels for the validation split.
# fit() returns the fitted estimator itself, so the two calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_val)

In [8]:
# Overall validation metrics: precision, recall and F1 are computed with
# average='weighted', and accuracy is computed separately.
precision, recall, f1, support = precision_recall_fscore_support(
    y_val,
    y_pred,
    average='weighted',
)
accuracy = accuracy_score(y_val, y_pred)

print(
    f"Accuracy: {accuracy} | Precision: {precision} | Recall: {recall} | f1 score: {f1} | {model.__class__.__name__}"
)

Accuracy: 0.9621158024861984 | Precision: 0.9614548136474571 | Recall: 0.9621158024861984 | f1 score: 0.9612759333976157 | RandomForestClassifier


In [9]:
# Per-class metrics
# Take the class labels from the fitted model instead of hard-coding four
# classes, and pass them as `labels` so that row k of every returned array
# corresponds to class_labels[k] even if a class is missing from y_val/y_pred.
class_labels = model.classes_
precision, recall, f1, support = precision_recall_fscore_support(
    y_val,
    y_pred,
    labels=class_labels,
)
for label, class_precision, class_recall, class_f1 in zip(class_labels, precision, recall, f1):
    print(f"Class {label}:\nPrecision: {class_precision} | Recall: {class_recall} | f1 score: {class_f1} | {model.__class__.__name__}")

Class 0:
Precision: 0.9688847213804618 | Recall: 0.9892121608368748 | f1 score: 0.9789429292725057 | RandomForestClassifier
Class 1:
Precision: 0.9597145604534643 | Recall: 0.9838132295719845 | f1 score: 0.9716144899318543 | RandomForestClassifier
Class 2:
Precision: 0.9206500956022945 | Recall: 0.8231648680414574 | f1 score: 0.869182602809274 | RandomForestClassifier
Class 3:
Precision: 0.9858617950754567 | Recall: 0.9411586290567182 | f1 score: 0.962991698347428 | RandomForestClassifier


### Extract model structure

In [10]:
# Read the single-sample feature file first, then its label file.
X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/single_X.csv",
        "datasets/single_y.csv",
    )
)

In [11]:
# Print the sample's features and its label back to back, separated by one space.
print(X_test, end=" ")
print(y_test)

   http  https  www  url_length  digit_count  percentage_count  dot_count  \
0   0.0    0.0  0.0        16.0          0.0               0.0       16.0   

   bs_count  dash_count  url_entropy  params_count  subdomains_count  \
0       0.0         1.0       -3.375           0.0               0.0   

   domain_extension  
0               0.0      type_val
0         2


In [12]:
# Display names for the model's input features, in the order the importances are listed.
# NOTE(review): these names are typed by hand and differ from the CSV column
# names (e.g. "csr_url_len" vs "url_length"), so confirm the order matches.
feature_name_ls = [
    "csr_http", "csr_https", "csr_www", "csr_url_len", "csr_digit_count",
    "csr_percentage_count", "csr_dot_count", "csr_bs_count", "csr_dash_count",
    "csr_url_entropy", "csr_url_num_params", "csr_url_num_subdomains", "csr_domain_extension",
]

# Read the importances once: on a forest this attribute is recomputed from all
# trees on every access, so reading it inside the loop repeats that work.
feature_importances = model.feature_importances_

# Fail loudly if the name list and the model disagree on the number of
# features, rather than printing importances under the wrong names.
if len(feature_importances) != len(feature_name_ls):
    raise ValueError(
        f"Expected {len(feature_name_ls)} feature importances, "
        f"but the model reports {len(feature_importances)}"
    )

print("Feature Importance")
for feature_name, importance in zip(feature_name_ls, feature_importances):
    print(f'{feature_name}: {importance}')

Feature Importance
csr_http: 0.19271972011089436
csr_https: 0.03517554943973927
csr_www: 0.17264609049344912
csr_url_len: 0.05567823215436183
csr_digit_count: 0.05329402486790133
csr_percentage_count: 0.009422722099244284
csr_dot_count: 0.05557625316959293
csr_bs_count: 0.10422209645555534
csr_dash_count: 0.03164923254525475
csr_url_entropy: 0.04634592900664131
csr_url_num_params: 0.04964989537466519
csr_url_num_subdomains: 0.0985291565040882
csr_domain_extension: 0.09509109777861231


In [46]:
# Class probabilities for each row of X_test; column k belongs to model.classes_[k].
y_pred_proba = model.predict_proba(X_test)
class_labels = model.classes_

for prediction in y_pred_proba:
    # Report the class label (not the column index) with the highest probability.
    # np.argmax returns the first maximum, the same tie-break as the previous max(...) form.
    print(f'Classification: {class_labels[np.argmax(prediction)]}')
    # Iterate over the model's own classes instead of assuming exactly four.
    for label, probability in zip(class_labels, prediction):
        print(f'Class {label} confidence: {(probability * 100):.4f}%')
    print('\n')

Classification: 2
Class 0 confidence: 0.0000%
Class 1 confidence: 0.0000%
Class 2 confidence: 99.5721%
Class 3 confidence: 0.4279%




In [None]:
# Draw the top of the first tree in the fitted forest (root plus one level).
# The fitted trees are stored on the model as `model.estimators_`; the bare name
# `estimators` was never defined, so this cell raised NameError on a fresh kernel.
# Feature names reuse `feature_name_ls` from the feature-importance cell so the
# two places cannot drift apart.
# The trailing semicolon suppresses the list-of-annotations repr in the cell output.
tree.plot_tree(model.estimators_[0],
               max_depth=1,
               feature_names=feature_name_ls,
               class_names=True,
               filled=True,
               label='none',
               impurity=False,
               fontsize=8);