### Setup

In [1]:
import math
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, confusion_matrix
import numpy as np
import tldextract
from urllib.parse import urlparse, parse_qs
import re

### Dataset

In [2]:
df = pd.read_csv('../datasets/malicious_phish.csv', header=None, skiprows=1)
df.head()

Unnamed: 0,0,1
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [3]:
X = df.iloc[:, 0]
y = df.iloc[:, 1]

print(X.head())
print(y.head())
print(y.value_counts())

0                                     br-icloud.com.br
1                  mp3raid.com/music/krizz_kaliko.html
2                      bopsecrets.org/rexroth/cr/1.htm
3    http://www.garage-pirenne.be/index.php?option=...
4    http://adventure-nicaragua.net/index.php?optio...
Name: 0, dtype: object
0      phishing
1        benign
2        benign
3    defacement
4    defacement
Name: 1, dtype: object
1
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64


In [4]:
def classification_type(type):
    '''
    Convert classification type into values:
        1) Benign = 0
        2) Defacement = 1
        3) Phishing = 2
        4) Malware = 3    
    '''
    if type == "benign":
        return 0
    elif type == "defacement":
        return 1
    elif type == "phishing":
        return 2
    elif type == "malware":
        return 3
    else:
        print(f"Unable to find proper type: {type}")

y = y.apply(classification_type)
y.value_counts()

1
0    428103
1     96457
2     94111
3     32520
Name: count, dtype: int64

### Feature extraction

#### `general`

In [5]:
# www
def get_www(url):
    return len(re.findall(r"www", url))

# url_length
# DUPLICATE

# digit_count
def get_num_digits(url):
    return len(re.findall(r'\d', url))

# dot_count
def get_num_dot(url):
    return len(re.findall(r'.', url))

# bs_count
def get_num_backslash(url):
    return len(re.findall(r'/', url))

# dash_count
def get_num_dash(url):
    return len(re.findall(r'-', url))

# url_entropy
def get_url_entropy(url):
    url_str = url.strip()
    prob = [float(url_str.count(c)) / len(url_str) for c in dict.fromkeys(list(url_str))]
    entropy = sum([(p * math.log(p) / math.log(2.0)) for p in prob])
    return entropy

# params_count
def get_num_params(url):
    params = url.split('&')
    return len(params) - 1

# subdomain_count
def get_num_subdomains(url):
    extracted = tldextract.extract(url)
    return len(extracted.subdomain.split('.'))

# domain_extension
def get_domain_extension(url):
    extracted = tldextract.extract(url)
    if extracted.suffix == '':
      return 'None'
    return extracted.suffix

# underscores_count
def get_num_underscores(url):
    return len(re.findall(r'_', url))

# questionmarks_count
def get_num_questionmarks(url):
    return len(re.findall(r'\?', url))

# equals_count
def get_num_equals(url):
    return len(re.findall(r'=', url))

# ampersands_count
def get_num_ampersands(url):
    return len(re.findall(r'\&', url))

# digit_letter_ratio
# DUPLICATE

# pd_num_count
# DUPLICATE

# pd_non_alphanumeric_count
# DUPLICATE

# uppercase_dirs
# DIFF
def uppercase_dirs_count(url):
    path = urlparse(url).path
    return sum(1 for dir_name in path.split('/') if any(c.isupper() for c in dir_name))

# path_count_special_chars
# DIFF
def special_chars_count(url):
    path = urlparse(url).path
    return sum(1 for c in path if not c.isalnum() and c != '/')

# path_uppercase_to_lowercase_ratio
# DIFF
def uppercase_to_lowercase_ratio(url):
    # Extract the path from the URL
    path = urlparse(url).path
    # Calculate the ratio of uppercase to lowercase characters
    uppercase_count = sum(1 for c in path if c.isupper())
    lowercase_count = sum(1 for c in path if c.islower())
    return uppercase_count / lowercase_count if lowercase_count > 0 else 0

# params_length
# DIFF
def params_get_length(url):
    query = urlparse(url).query
    return len(query)

# queries_count
# DIFF
def queries_get_count(url):
    query = urlparse(url).query
    return len(parse_qs(query))

def get_general_feature_set(X_):
    return pd.concat([
        X_.apply(get_www),
        X_.apply(get_num_digits),
        X_.apply(get_num_dot),
        X_.apply(get_num_backslash),
        X_.apply(get_num_dash),
        X_.apply(get_url_entropy),
        X_.apply(get_num_params),
        X_.apply(get_num_subdomains),
        pd.DataFrame(X_.apply(get_domain_extension).factorize()[0]),
        X_.apply(get_num_underscores),
        X_.apply(get_num_questionmarks),
        X_.apply(get_num_equals),
        X_.apply(get_num_ampersands),
        X_.apply(uppercase_dirs_count),
        X_.apply(special_chars_count),
        X_.apply(uppercase_to_lowercase_ratio),
        X_.apply(params_get_length),
        X_.apply(queries_get_count)],
        axis=1)

# general_feat = get_general_feat(X)
# general_feat.head()

___

#### `lexical`

#### URL

In [6]:
# Length
def get_url_length(url):
    return len(url)

# Number of ; _ ? = &
def get_url_punct_count(url):
    return len(re.findall(r'[;_\?=&]', url))

# Digit to Letter ratio
def get_url_dlr(url):
    num_digits = len(re.findall(r'\d', url))
    num_letters = len(re.findall(r'[a-zA-Z]', url))
    if num_letters:
        return num_digits / num_letters
    else:
        return 0

def get_url_feat(X_):
    return pd.concat([
        X_.apply(get_url_length),       # Length
        X_.apply(get_url_punct_count),  # Number of ; _ ? = &
        X_.apply(get_url_dlr)],             # Digit to Letter ratio
    axis=1)

# URL_feat = get_url_feat(X)
# URL_feat.head()

#### Primary domain

In [7]:
def get_primary_domain(url):
    extracted = tldextract.extract(url)
    return extracted.domain + '.' + extracted.suffix

# Presence of IP
def get_PD_has_IP(url):
    return int(bool(re.match(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', url)))

# Length
def get_PD_len(url):
    return len(url)

# Number of digits
def get_PD_digits(url):
    return len(re.findall(r'\d', url))

# Number of non-alphanumeric characters
def get_PD_num_non_alphanumeric(url):
    return len(re.findall(r'[^a-zA-Z0-9]', url))

# Number of hyphens
def get_PD_num_hyphens(url):
    return len(re.findall(r'-', url))

# Number of @s
def get_PD_num_at(url):
    return len(re.findall(r'@', url))

def get_PD_feat(X_):
    PD = X_.apply(lambda x: get_primary_domain(x))
    return pd.concat([
        PD.apply(get_PD_has_IP),               # Presence of IP
        PD.apply(get_PD_len),                  # Length
        PD.apply(get_PD_digits),               # Number of digits
        PD.apply(get_PD_num_non_alphanumeric), # Number of non-alphanumeric characters
        PD.apply(get_PD_num_hyphens),          # Number of hyphens
        PD.apply(get_PD_num_at)],              # Number of @s
        axis=1)

# PD_feat = get_PD_feat(X)
# PD_feat.head()

#### Subdomain

In [8]:
# Number of dots
def get_SD_num_dots(url):
    return url.count('.')

# Number of subdomains
def get_num_SD(url):
    return sum(1 for subdomain in url.split('.') if subdomain)

def get_SD_feat(X_):
    SD = X_.apply(lambda x: tldextract.extract(x).subdomain)
    return pd.concat([
        SD.apply(get_SD_num_dots),  # Number of dots
        SD.apply(get_num_SD)],      # Number of subdomains
        axis=1)

# SD_feat = get_SD_feat(X)
# SD_feat.head()

#### Path

In [9]:
# ensure proper parsing of paths
def parse_path(url):
    try:
        if url.startswith('https://') or url.startswith('http://'):
            return urlparse(url)
        else:
            return urlparse('https://' + url)
    except:
        return urlparse(url)

# Number of //
def get_path_num_dbl_fwdslash(url):
    return len(re.findall(r'//', url))

# Number of subdirectories
def get_path_num_subdirs(url):
    return url.count('/')

# Presence of %20
def get_path_has_percent20(url):
    return int('%20' in url)

# Presence of uppercase directories
def get_path_has_uppercase_dirs(url):
    return int(any(any(c.isupper() for c in dir_name) for dir_name in url.split('/')))

# Presence of single character directories
def get_path_has_char_dirs(url):
    return int(any((len(dir_name) == 1) for dir_name in url.split('/')))

# Number of special characters
def get_path_num_special_chars(url):
    return sum(len(re.findall(r'[^A-Za-z0-9]', dir_name)) for dir_name in url.split('/'))

# Number of 0s
def get_path_num_zeroes(url):
    return url.count('0')

# Ratio of uppercase to lowercase characters
def get_path_upper_to_lower_ratio(url):
    upper_count = sum(1 for c in url if c.isupper())
    lower_count = sum(1 for c in url if c.islower())
    if lower_count:
        return upper_count / lower_count
    else:
        return 0

def get_path_feat(X_):
    path_ = X_.apply(lambda x: parse_path(x).path)
    return pd.concat([
        path_.apply(get_path_num_dbl_fwdslash),         # Number of //
        path_.apply(get_path_num_subdirs),              # Number of subdirectories
        path_.apply(get_path_has_percent20),            # Presence of %20
        path_.apply(get_path_has_uppercase_dirs),       # Presence of uppercase directories
        path_.apply(get_path_has_char_dirs),            # Presence of single character directories
        path_.apply(get_path_num_special_chars),        # Number of special characters
        path_.apply(get_path_num_zeroes),               # Number of 0s
        path_.apply(get_path_upper_to_lower_ratio)],    # Ratio of uppercase to lowercase characters
        axis=1)

# path_feat = get_path_feat(X)
# path_feat.head()

#### Parameters

In [10]:
# Length
def get_param_length(url):
    return sum(len(value[0]) for value in parse_qs(urlparse(url).query).values())

def get_param_feat(X_):
    return pd.concat([
        X_.apply(get_param_length)],    # Length
        axis=1)

# param_feat = get_param_feat(X)
# param_feat.head()

#### Query

In [11]:
# Number of queries
def get_num_queries(url):
    return len(parse_qs(urlparse(url).query))

def get_query_feat(X_):
    return pd.concat([
        X_.apply(get_num_queries)
    ])

# query_feat = get_query_feat(X)
# query_feat.head()

In [12]:
def get_lexical_feature_set(X_):
    return pd.concat([
        get_url_feat(X_),
        get_PD_feat(X_),
        get_SD_feat(X_),
        get_path_feat(X_),
        get_param_feat(X_),
        get_query_feat(X_)],
        axis=1)

# feature_set = get_lexical_feature_set(X)
# feature_set.head()

In [13]:
feature_set = pd.concat([
    get_general_feature_set(X),
    get_lexical_feature_set(X)],
    axis=1)
feature_set.head()

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19
0,0,0,16,0,1,-3.375,0,1,0,0,...,0,0,0,0,0,0,0,0.0,0,0
1,0,1,35,2,0,-4.079143,0,1,1,1,...,0,2,0,0,0,2,0,0.0,0,0
2,0,1,31,3,0,-3.708093,0,1,2,0,...,0,3,0,0,0,1,0,0.0,0,0
3,1,7,88,3,1,-4.660343,3,1,3,2,...,0,1,0,0,0,1,0,0.0,22,4
4,0,22,235,3,1,-5.491293,2,1,4,1,...,0,1,0,0,0,1,0,0.0,175,3


### Train Test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(feature_set, y, test_size=0.2, random_state=69)

In [15]:
print(y_train.value_counts())
print(y_test.value_counts())

1
0    342451
1     77182
2     75393
3     25926
Name: count, dtype: int64
1
0    85652
1    19275
2    18718
3     6594
Name: count, dtype: int64


### Model

In [16]:
model = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=69)

In [17]:
model.fit(X_train, y_train)

In [37]:
np.argmin(model.feature_importances_)

26

### Evaluation

In [19]:
def calc_FNR_accuracy(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true, y_pred)
    for label_class in range(4):
        FN = sum(conf_matrix[label_class][i] for i in range(len(conf_matrix)) if i != label_class)  
        
        TP = conf_matrix[label_class][label_class]  
        
        TN = np.sum(np.delete(np.delete(conf_matrix, label_class, axis=0), label_class, axis=1))
        
        accuracy = (TP + TN) / np.sum(conf_matrix)
        print("Accuracy for class", label_class, ":", accuracy)

        FNR = FN / (FN + TP) if (FN + TP) > 0 else -1
        print("FNR for class", label_class, ":", FNR)

#### Malicious URLS (multi-class)
1) Benign       = 0
2) Defacement   = 1
3) Phishing     = 2
4) Malware      = 3 

In [20]:
y_pred = model.predict(X_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

class_test_precision, class_test_recall, class_test_f1, class_ = precision_recall_fscore_support(y_test, y_pred)

In [22]:
print(f'OVERALL: Accuracy: {accuracy:.8f}, Precision: {precision:.8f}, Recall: {recall:.8f}, f1 Score: {f1:.8f}')

OVERALL: Accuracy: 0.96562474, Precision: 0.96522551, Recall: 0.96562474, f1 Score: 0.96501186


In [23]:
for i in range(4):
    print(f'Class {i}:\tTest Precision: {class_test_precision[i]:.8f},\tTest Recall: {class_test_recall[i]:.8f},\tTest f1: {class_test_f1[i]:.8f}')

Class 0:	Test Precision: 0.97060270,	Test Recall: 0.99067155,	Test f1: 0.98053445
Class 1:	Test Precision: 0.96559107,	Test Recall: 0.98272374,	Test f1: 0.97408207
Class 2:	Test Precision: 0.93030462,	Test Recall: 0.85003740,	Test f1: 0.88836158
Class 3:	Test Precision: 0.99343832,	Test Recall: 0.91841068,	Test f1: 0.95445232


In [24]:
calc_FNR_accuracy(y_test, y_pred)

Accuracy for class 0 : 0.9741321723907586
FNR for class 0 : 0.009328445336944846
Accuracy for class 1 : 0.9922603828346348
FNR for class 1 : 0.01727626459143969
Accuracy for class 2 : 0.9692949116624052
FNR for class 2 : 0.149962602842184
Accuracy for class 3 : 0.9955620052365267
FNR for class 3 : 0.08158932362754019


#### External Datasets

In [25]:
# phishing
df_phish = pd.read_csv('../datasets/phishtank_phish_urls.csv', header=None, skiprows=1).iloc[:, 0]
df_phish.head()

0                       https://servicecu.mobi/Service
1          https://sp581716.sitebeat.crazydomains.com/
2          https://sp774647.sitebeat.crazydomains.com/
3          https://sp779562.sitebeat.crazydomains.com/
4    https://manaliindiancuisine.es//gmx/rechnung/d...
Name: 0, dtype: object

In [26]:
y_pred_phish = model.predict(pd.concat([
    get_general_feature_set(df_phish),
    get_lexical_feature_set(df_phish)],
    axis=1))
y_test_phish = np.full((df_phish.shape[0],), 2)

In [27]:
phish_accuracy = accuracy_score(y_test_phish, y_pred_phish)
phish_precision = precision_score(y_test_phish, y_pred_phish, average='weighted')
phish_recall = recall_score(y_test_phish, y_pred_phish, average='weighted')
phish_f1 = f1_score(y_test_phish, y_pred_phish, average='weighted')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
print(f'OVERALL: Accuracy: {phish_accuracy:.8f}, Precision: {phish_precision:.8f}, Recall: {phish_recall:.8f}, f1 Score: {phish_f1:.8f}')

OVERALL: Accuracy: 0.91864310, Precision: 1.00000000, Recall: 0.91864310, f1 Score: 0.95759665


In [29]:
calc_FNR_accuracy(y_test_phish, y_pred_phish)

Accuracy for class 0 : 0.9457170810701393
FNR for class 0 : -1
Accuracy for class 1 : 0.9754731621740157
FNR for class 1 : -1
Accuracy for class 2 : 0.9186430957120205
FNR for class 2 : 0.08135690428797948
Accuracy for class 3 : 0.9974528524678654
FNR for class 3 : -1


___

In [30]:
# benign
df_benign = pd.read_csv('../datasets/benign_urls.csv', header=None, skiprows=1).iloc[:, 0]
df_benign.head()

0       https://www.google.com
1      https://www.youtube.com
2     https://www.facebook.com
3        https://www.baidu.com
4    https://www.wikipedia.org
Name: 0, dtype: object

In [31]:
y_pred_benign = model.predict(pd.concat([
    get_general_feature_set(df_benign),
    get_lexical_feature_set(df_benign)],
    axis=1))
y_test_benign = np.full((df_benign.shape[0],), 0)

In [32]:
benign_accuracy = accuracy_score(y_test_benign, y_pred_benign)
benign_precision = precision_score(y_test_benign, y_pred_benign, average='weighted')
benign_recall = recall_score(y_test_benign, y_pred_benign, average='weighted')
benign_f1 = f1_score(y_test_benign, y_pred_benign, average='weighted')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
print(f'OVERALL: Accuracy: {benign_accuracy:.8f}, Precision: {benign_precision:.8f}, Recall: {benign_recall:.8f}, f1 Score: {benign_f1:.8f}')

OVERALL: Accuracy: 0.01429985, Precision: 1.00000000, Recall: 0.01429985, f1 Score: 0.02819649


In [34]:
calc_FNR_accuracy(y_test_benign, y_pred_benign)

Accuracy for class 0 : 0.014299845547784738
FNR for class 0 : 0.9857001544522153
Accuracy for class 1 : 0.5303206474266641
FNR for class 1 : -1
Accuracy for class 2 : 0.48652737043657335
FNR for class 2 : -1
Accuracy for class 3 : 0.9974518276845472
FNR for class 3 : -1
