### Setup

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import tree 
from matplotlib import pyplot as plt
import tldextract
from urllib.parse import urlparse, parse_qs
import re

### Dataset

In [2]:
df = pd.read_csv('../datasets/malicious_phish.csv')
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [3]:
X = df.iloc[:, 0]
y = df.iloc[:, 1]

print(X.head())
print(y.head())
print(y.value_counts())

0                                     br-icloud.com.br
1                  mp3raid.com/music/krizz_kaliko.html
2                      bopsecrets.org/rexroth/cr/1.htm
3    http://www.garage-pirenne.be/index.php?option=...
4    http://adventure-nicaragua.net/index.php?optio...
Name: url, dtype: object
0      phishing
1        benign
2        benign
3    defacement
4    defacement
Name: type, dtype: object
type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64


In [4]:
def classification_type(type):
    '''
    Convert classification type into values:
        1) Benign = 0
        2) Defacement = 1
        3) Phishing = 2
        4) Malware = 3    
    '''
    if type == "benign":
        return 0
    elif type == "defacement":
        return 1
    elif type == "phishing":
        return 2
    elif type == "malware":
        return 3
    else:
        print(f"Unable to find proper type: {type}")

y = y.apply(classification_type)
y.value_counts()

type
0    428103
1     96457
2     94111
3     32520
Name: count, dtype: int64

### Feature extraction

#### Trigram

In [5]:
# Initialize CountVectorizer for trigrams, 1000 features
tf_vectorizer = CountVectorizer(ngram_range=(3,3), max_features=1000)

In [6]:
def get_trigram_feature_set(X_, vectorizer, fit=False):
    if fit:
        return vectorizer.fit_transform(X_)
    else:
        return vectorizer.transform(X_)

In [7]:
feature_set = get_trigram_feature_set(X, tf_vectorizer, True)

In [8]:
print(tf_vectorizer.vocabulary_)
print(feature_set)

{'mp3raid com music': 732, 'be index php': 178, 'index php option': 660, 'php option com_content': 808, 'option com_content view': 769, 'com_content view article': 341, 'view article id': 934, 'net index php': 751, 'php option com_mailto': 813, 'option com_mailto tmpl': 774, 'com_mailto tmpl component': 347, 'tmpl component link': 903, 'http buzzfil net': 567, 'espn go com': 500, 'nl index php': 757, 'de index php': 432, 'uk linkedin com': 925, 'linkedin com pub': 695, 'baseball reference com': 161, 'reference com players': 843, '192 com atoz': 4, 'com atoz people': 277, 'en wikipedia org': 494, 'wikipedia org wiki': 966, 'com index php': 306, 'php option com_virtuemart': 818, 'option com_virtuemart page': 779, 'com_virtuemart page shop': 352, 'page shop browse': 792, 'shop browse category_id': 865, 'movies yahoo com': 731, 'com app member': 275, 'app member sportoption': 115, 'member sportoption php': 717, 'sportoption php uid': 880, 'php uid guest': 822, 'uid guest langx': 920, 'gues

### Train Test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(feature_set, y, test_size=0.2, random_state=69)

In [10]:
print(y_train.value_counts())
print(y_test.value_counts())

type
0    342451
1     77182
2     75393
3     25926
Name: count, dtype: int64
type
0    85652
1    19275
2    18718
3     6594
Name: count, dtype: int64


### Model

In [11]:
model = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=69)

In [12]:
model.fit(X_train, y_train)

### Evaluation

In [13]:
def calc_FNR_accuracy(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true, y_pred)
    for label_class in range(4):
        FN = sum(conf_matrix[label_class][i] for i in range(len(conf_matrix)) if i != label_class)  
        
        TP = conf_matrix[label_class][label_class]  
        
        TN = np.sum(np.delete(np.delete(conf_matrix, label_class, axis=0), label_class, axis=1))
        
        accuracy = (TP + TN) / np.sum(conf_matrix)
        print("Accuracy for class", label_class, ":", accuracy)

        FNR = FN / (FN + TP) if (FN + TP) > 0 else -1
        print("FNR for class", label_class, ":", FNR)

#### Malicious URLS (multi-class)
1) Benign       = 0
2) Defacement   = 1
3) Phishing     = 2
4) Malware      = 3 

In [14]:
y_pred = model.predict(X_test)

In [15]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

class_test_precision, class_test_recall, class_test_f1, class_ = precision_recall_fscore_support(y_test, y_pred)

In [16]:
print(f'OVERALL: Accuracy: {accuracy:.8f}, Precision: {precision:.8f}, Recall: {recall:.8f}, f1 Score: {f1:.8f}')

OVERALL: Accuracy: 0.74511475, Precision: 0.81467527, Recall: 0.74511475, f1 Score: 0.67663037


In [17]:
for i in range(4):
    print(f'Class {i}:\tTest Precision: {class_test_precision[i]:.8f},\tTest Recall: {class_test_recall[i]:.8f},\tTest f1: {class_test_f1[i]:.8f}')

Class 0:	Test Precision: 0.72116876,	Test Recall: 0.99905431,	Test f1: 0.83766666
Class 1:	Test Precision: 0.98806294,	Test Recall: 0.47237354,	Test f1: 0.63917164
Class 2:	Test Precision: 0.99871959,	Test Recall: 0.04167112,	Test f1: 0.08000410
Class 3:	Test Precision: 1.00000000,	Test Recall: 0.24067334,	Test f1: 0.38797213


In [18]:
calc_FNR_accuracy(y_test, y_pred)

Accuracy for class 0 : 0.745345096322914
FNR for class 0 : 0.0009456871993648718
Accuracy for class 1 : 0.9210681900198865
FNR for class 1 : 0.5276264591439689
Accuracy for class 2 : 0.862260920308049
FNR for class 2 : 0.9583288812907362
Accuracy for class 3 : 0.96155529449704
FNR for class 3 : 0.759326660600546


#### External Datasets

In [19]:
# phishing
df_phish = pd.read_csv('../datasets/phishtank_phish_urls.csv').iloc[:, 0]
df_phish.head()

0                       https://servicecu.mobi/Service
1          https://sp581716.sitebeat.crazydomains.com/
2          https://sp774647.sitebeat.crazydomains.com/
3          https://sp779562.sitebeat.crazydomains.com/
4    https://manaliindiancuisine.es//gmx/rechnung/d...
Name: url, dtype: object

In [20]:
y_pred_phish = model.predict(get_trigram_feature_set(df_phish, vectorizer=tf_vectorizer))
y_test_phish = np.full((df_phish.shape[0],), 2)

In [21]:
phish_accuracy = accuracy_score(y_test_phish, y_pred_phish)
phish_precision = precision_score(y_test_phish, y_pred_phish, average='weighted')
phish_recall = recall_score(y_test_phish, y_pred_phish, average='weighted')
phish_f1 = f1_score(y_test_phish, y_pred_phish, average='weighted')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
print(f'OVERALL: Accuracy: {phish_accuracy:.8f}, Precision: {phish_precision:.8f}, Recall: {phish_recall:.8f}, f1 Score: {phish_f1:.8f}')

OVERALL: Accuracy: 0.00000000, Precision: 0.00000000, Recall: 0.00000000, f1 Score: 0.00000000


In [23]:
calc_FNR_accuracy(y_test_phish, y_pred_phish)

Accuracy for class 0 : 8.434263351438886e-05
FNR for class 0 : -1
Accuracy for class 1 : 0.9999156573664856
FNR for class 1 : -1
Accuracy for class 2 : 0.0
FNR for class 2 : 1.0


IndexError: index 3 is out of bounds for axis 0 with size 3

___

In [None]:
# benign
df_benign = pd.read_csv('../datasets/benign_urls.csv').iloc[:, 0]
df_benign.head()

In [None]:
y_pred_benign = model.predict(get_trigram_feature_set(df_benign, vectorizer=tf_vectorizer))
y_test_benign = np.full((df_benign.shape[0],), 0)

In [None]:
benign_accuracy = accuracy_score(y_test_benign, y_pred_benign)
benign_precision = precision_score(y_test_benign, y_pred_benign, average='weighted')
benign_recall = recall_score(y_test_benign, y_pred_benign, average='weighted')
benign_f1 = f1_score(y_test_benign, y_pred_benign, average='weighted')

In [None]:
print(f'OVERALL: Accuracy: {benign_accuracy:.8f}, Precision: {benign_precision:.8f}, Recall: {benign_recall:.8f}, f1 Score: {benign_f1:.8f}')

In [None]:
calc_FNR_accuracy(y_test_benign, y_pred_benign)