### Setup

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import tldextract
from urllib.parse import urlparse, parse_qs
import re
from scipy.sparse import csr_matrix, hstack

### Dataset

In [2]:
df = pd.read_csv('../datasets/malicious_phish.csv')
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [3]:
X = df.iloc[:, 0]
y = df.iloc[:, 1]

print(X.head())
print(y.head())
print(y.value_counts())

0                                     br-icloud.com.br
1                  mp3raid.com/music/krizz_kaliko.html
2                      bopsecrets.org/rexroth/cr/1.htm
3    http://www.garage-pirenne.be/index.php?option=...
4    http://adventure-nicaragua.net/index.php?optio...
Name: url, dtype: object
0      phishing
1        benign
2        benign
3    defacement
4    defacement
Name: type, dtype: object
type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64


In [4]:
def classification_type(type):
    '''
    Convert classification type into values:
        1) Benign = 0
        2) Defacement = 1
        3) Phishing = 2
        4) Malware = 3    
    '''
    if type == "benign":
        return 0
    elif type == "defacement":
        return 1
    elif type == "phishing":
        return 2
    elif type == "malware":
        return 3
    else:
        print(f"Unable to find proper type: {type}")

y = y.apply(classification_type)
y.value_counts()

type
0    428103
1     96457
2     94111
3     32520
Name: count, dtype: int64

### Feature extraction

Extract any features not reliant on a lookup

#### URL

In [5]:
# Length
def get_url_length(url):
    return len(url)

# Number of ; _ ? = &
def get_url_punct_count(url):
    return len(re.findall(r'[;_\?=&]', url))

# Digit to Letter ratio
def get_dlr(url):
    num_digits = len(re.findall(r'\d', url))
    num_letters = len(re.findall(r'[a-zA-Z]', url))
    if num_letters:
        return num_digits / num_letters
    else:
        return 0

def get_url_feat(X_):
    return pd.concat([
        X_.apply(get_url_length),       # Length
        X_.apply(get_url_punct_count),  # Number of ; _ ? = &
        X_.apply(get_dlr)],             # Digit to Letter ratio
    axis=1)

# URL_feat = get_url_feat(X)
# URL_feat.head()

#### Primary domain

In [6]:
def get_primary_domain(url):
    extracted = tldextract.extract(url)
    return extracted.domain + '.' + extracted.suffix

# Presence of IP
def get_PD_has_IP(url):
    return int(bool(re.match(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', url)))

# Length
def get_PD_len(url):
    return len(url)

# Number of digits
def get_PD_digits(url):
    return len(re.findall(r'\d', url))

# Number of non-alphanumeric characters
def get_PD_num_non_alphanumeric(url):
    return len(re.findall(r'[^a-zA-Z0-9]', url))

# Number of hyphens
def get_PD_num_hyphens(url):
    return len(re.findall(r'-', url))

# Number of @s
def get_PD_num_at(url):
    return len(re.findall(r'@', url))

def get_PD_feat(X_):
    PD = X_.apply(lambda x: get_primary_domain(x))
    return pd.concat([
        PD.apply(get_PD_has_IP),               # Presence of IP
        PD.apply(get_PD_len),                  # Length
        PD.apply(get_PD_digits),               # Number of digits
        PD.apply(get_PD_num_non_alphanumeric), # Number of non-alphanumeric characters
        PD.apply(get_PD_num_hyphens),          # Number of hyphens
        PD.apply(get_PD_num_at)],              # Number of @s
        axis=1)

# PD_feat = get_PD_feat(X)
# PD_feat.head()

#### Subdomain

In [7]:
# Number of dots
def get_SD_num_dots(url):
    return url.count('.')

# Number of subdomains
def get_num_SD(url):
    return sum(1 for subdomain in url.split('.') if subdomain)

def get_SD_feat(X_):
    SD = X_.apply(lambda x: tldextract.extract(x).subdomain)
    return pd.concat([
        SD.apply(get_SD_num_dots),  # Number of dots
        SD.apply(get_num_SD)],      # Number of subdomains
        axis=1)

# SD_feat = get_SD_feat(X)
# SD_feat.head()

#### Path

In [8]:
# ensure proper parsing of paths
def parse_path(url):
    try:
        if url.startswith('https://') or url.startswith('http://'):
            return urlparse(url)
        else:
            return urlparse('https://' + url)
    except:
        return urlparse(url)

# Number of //
def get_path_num_dbl_fwdslash(url):
    return len(re.findall(r'//', url))

# Number of subdirectories
def get_path_num_subdirs(url):
    return url.count('/')

# Presence of %20
def get_path_has_percent20(url):
    return int('%20' in url)

# Presence of uppercase directories
def get_path_has_uppercase_dirs(url):
    return int(any(any(c.isupper() for c in dir_name) for dir_name in url.split('/')))

# Presence of single character directories
def get_path_has_char_dirs(url):
    return int(any((len(dir_name) == 1) for dir_name in url.split('/')))

# Number of special characters
def get_path_num_special_chars(url):
    return sum(len(re.findall(r'[^A-Za-z0-9]', dir_name)) for dir_name in url.split('/'))

# Number of 0s
def get_path_num_zeroes(url):
    return url.count('0')

# Ratio of uppercase to lowercase characters
def get_path_upper_to_lower_ratio(url):
    upper_count = sum(1 for c in url if c.isupper())
    lower_count = sum(1 for c in url if c.islower())
    if lower_count:
        return upper_count / lower_count
    else:
        return 0

def get_path_feat(X_):
    path_ = X_.apply(lambda x: parse_path(x).path)
    return pd.concat([
        path_.apply(get_path_num_dbl_fwdslash),         # Number of //
        path_.apply(get_path_num_subdirs),              # Number of subdirectories
        path_.apply(get_path_has_percent20),            # Presence of %20
        path_.apply(get_path_has_uppercase_dirs),       # Presence of uppercase directories
        path_.apply(get_path_has_char_dirs),            # Presence of single character directories
        path_.apply(get_path_num_special_chars),        # Number of special characters
        path_.apply(get_path_num_zeroes),               # Number of 0s
        path_.apply(get_path_upper_to_lower_ratio)],    # Ratio of uppercase to lowercase characters
        axis=1)

# path_feat = get_path_feat(X)
# path_feat.head()

#### Parameters

In [9]:
# Length
def get_param_length(url):
    return sum(len(value[0]) for value in parse_qs(urlparse(url).query).values())

def get_param_feat(X_):
    return pd.concat([
        X_.apply(get_param_length)],    # Length
        axis=1)

# param_feat = get_param_feat(X)
# param_feat.head()

#### Query

In [10]:
# Number of queries
def get_num_queries(url):
    return len(parse_qs(urlparse(url).query))

def get_query_feat(X_):
    return pd.concat([
        X_.apply(get_num_queries)
    ])

# query_feat = get_query_feat(X)
# query_feat.head()

In [11]:
def get_lexical_feature_set(X_):
    return pd.concat([
        get_url_feat(X_),
        get_PD_feat(X_),
        get_SD_feat(X_),
        get_path_feat(X_),
        get_param_feat(X_),
        get_query_feat(X_)],
        axis=1)

#### Trigram

In [12]:
# Initialize CountVectorizer for trigrams, 1000 features
tf_vectorizer = CountVectorizer(ngram_range=(3,3), max_features=1000)

In [13]:
def get_trigram_feature_set(X_, vectorizer, fit=False):
    if fit:
        return vectorizer.fit_transform(X_)
    else:
        return vectorizer.transform(X_)

In [14]:
feature_set = hstack([
    csr_matrix(get_lexical_feature_set(X).values),
    get_trigram_feature_set(X, tf_vectorizer, True)])

print(feature_set)

  (0, 0)	16.0
  (0, 4)	16.0
  (0, 6)	3.0
  (0, 7)	1.0
  (1, 0)	35.0
  (1, 1)	1.0
  (1, 2)	0.034482758620689655
  (1, 4)	11.0
  (1, 5)	1.0
  (1, 6)	1.0
  (1, 12)	2.0
  (1, 16)	2.0
  (1, 753)	1.0
  (2, 0)	31.0
  (2, 2)	0.04
  (2, 4)	14.0
  (2, 6)	1.0
  (2, 12)	3.0
  (2, 16)	1.0
  (3, 0)	88.0
  (3, 1)	10.0
  (3, 2)	0.1111111111111111
  (3, 4)	17.0
  (3, 6)	2.0
  (3, 7)	1.0
  :	:
  (651187, 18)	0.18181818181818182
  (651188, 0)	42.0
  (651188, 2)	0.09090909090909091
  (651188, 4)	12.0
  (651188, 6)	1.0
  (651188, 10)	1.0
  (651188, 12)	4.0
  (651188, 17)	1.0
  (651189, 0)	45.0
  (651189, 1)	3.0
  (651189, 4)	13.0
  (651189, 6)	1.0
  (651189, 10)	1.0
  (651189, 12)	2.0
  (651189, 14)	1.0
  (651189, 16)	5.0
  (651189, 18)	0.1
  (651189, 515)	1.0
  (651189, 987)	1.0
  (651190, 0)	41.0
  (651190, 4)	13.0
  (651190, 6)	1.0
  (651190, 10)	1.0
  (651190, 12)	3.0
  (651190, 1000)	1.0


### Train Test Split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(feature_set, y, test_size=0.2, random_state=69)

In [16]:
print(y_train.value_counts())
print(y_test.value_counts())

type
0    342451
1     77182
2     75393
3     25926
Name: count, dtype: int64
type
0    85652
1    19275
2    18718
3     6594
Name: count, dtype: int64


### Model

In [17]:
model = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=69)

In [18]:
model.fit(X_train, y_train)

### Evaluation

In [19]:
def calc_FNR_accuracy(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true, y_pred)
    for label_class in range(4):
        FN = sum(conf_matrix[label_class][i] for i in range(len(conf_matrix)) if i != label_class)  
        
        TP = conf_matrix[label_class][label_class]  
        
        TN = np.sum(np.delete(np.delete(conf_matrix, label_class, axis=0), label_class, axis=1))
        
        accuracy = (TP + TN) / np.sum(conf_matrix)
        print("Accuracy for class", label_class, ":", accuracy)

        FNR = FN / (FN + TP) if (FN + TP) > 0 else -1
        print("FNR for class", label_class, ":", FNR)

#### Malicious URLS (multi-class)
1) Benign       = 0
2) Defacement   = 1
3) Phishing     = 2
4) Malware      = 3 

In [20]:
y_pred = model.predict(X_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

class_test_precision, class_test_recall, class_test_f1, class_ = precision_recall_fscore_support(y_test, y_pred)

In [22]:
print(f'OVERALL: Accuracy: {accuracy:.8f}, Precision: {precision:.8f}, Recall: {recall:.8f}, f1 Score: {f1:.8f}')

OVERALL: Accuracy: 0.81705941, Precision: 0.84974928, Recall: 0.81705941, f1 Score: 0.79083978


In [23]:
for i in range(4):
    print(f'Class {i}:\tTest Precision: {class_test_precision[i]:.8f},\tTest Recall: {class_test_recall[i]:.8f},\tTest f1: {class_test_f1[i]:.8f}')

Class 0:	Test Precision: 0.78542197,	Test Recall: 0.99712791,	Test f1: 0.87870323
Class 1:	Test Precision: 0.98051576,	Test Recall: 0.53260700,	Test f1: 0.69026727
Class 2:	Test Precision: 0.95945725,	Test Recall: 0.31354846,	Test f1: 0.47263942
Class 3:	Test Precision: 0.99165479,	Test Recall: 0.73885350,	Test f1: 0.84678891


In [24]:
calc_FNR_accuracy(y_test, y_pred)

Accuracy for class 0 : 0.8189559195018389
FNR for class 0 : 0.002872087049922944
Accuracy for class 1 : 0.9292608204915578
FNR for class 1 : 0.4673929961089494
Accuracy for class 2 : 0.8994387241916784
FNR for class 2 : 0.6864515439683727
Accuracy for class 3 : 0.9864633481522432
FNR for class 3 : 0.2611464968152866


#### External Datasets

In [25]:
# phishing
df_phish = pd.read_csv('../datasets/phishtank_phish_urls.csv').iloc[:, 0]
df_phish.head()

0                       https://servicecu.mobi/Service
1          https://sp581716.sitebeat.crazydomains.com/
2          https://sp774647.sitebeat.crazydomains.com/
3          https://sp779562.sitebeat.crazydomains.com/
4    https://manaliindiancuisine.es//gmx/rechnung/d...
Name: url, dtype: object

In [26]:
y_pred_phish = model.predict(
    hstack([
        csr_matrix(get_lexical_feature_set(df_phish).values),
        get_trigram_feature_set(df_phish, tf_vectorizer)]))
y_test_phish = np.full((df_phish.shape[0],), 2)

In [27]:
phish_accuracy = accuracy_score(y_test_phish, y_pred_phish)
phish_precision = precision_score(y_test_phish, y_pred_phish, average='weighted')
phish_recall = recall_score(y_test_phish, y_pred_phish, average='weighted')
phish_f1 = f1_score(y_test_phish, y_pred_phish, average='weighted')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
print(f'OVERALL: Accuracy: {phish_accuracy:.8f}, Precision: {phish_precision:.8f}, Recall: {phish_recall:.8f}, f1 Score: {phish_f1:.8f}')

OVERALL: Accuracy: 0.10962856, Precision: 1.00000000, Recall: 0.10962856, f1 Score: 0.19759505


In [29]:
calc_FNR_accuracy(y_test_phish, y_pred_phish)

Accuracy for class 0 : 0.11121419655207314
FNR for class 0 : -1
Accuracy for class 1 : 0.9993589959852907
FNR for class 1 : -1
Accuracy for class 2 : 0.10962855504200263
FNR for class 2 : 0.8903714449579974
Accuracy for class 3 : 0.9990553625046389
FNR for class 3 : -1


___

In [30]:
# benign
df_benign = pd.read_csv('../datasets/benign_urls.csv').iloc[:, 0]
df_benign.head()

0       https://www.google.com
1      https://www.youtube.com
2     https://www.facebook.com
3        https://www.baidu.com
4    https://www.wikipedia.org
Name: url, dtype: object

In [31]:
y_pred_benign = model.predict(
    hstack([
        csr_matrix(get_lexical_feature_set(df_benign).values),
        get_trigram_feature_set(df_benign, tf_vectorizer)]))
y_test_benign = np.full((df_benign.shape[0],), 0)

In [32]:
benign_accuracy = accuracy_score(y_test_benign, y_pred_benign)
benign_precision = precision_score(y_test_benign, y_pred_benign, average='weighted')
benign_recall = recall_score(y_test_benign, y_pred_benign, average='weighted')
benign_f1 = f1_score(y_test_benign, y_pred_benign, average='weighted')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
print(f'OVERALL: Accuracy: {benign_accuracy:.8f}, Precision: {benign_precision:.8f}, Recall: {benign_recall:.8f}, f1 Score: {benign_f1:.8f}')

OVERALL: Accuracy: 0.98275573, Precision: 1.00000000, Recall: 0.98275573, f1 Score: 0.99130288


In [34]:
calc_FNR_accuracy(y_test_benign, y_pred_benign)

Accuracy for class 0 : 0.9827557283260735
FNR for class 0 : 0.0172442716739265
Accuracy for class 1 : 0.9922802816005183
FNR for class 1 : -1
Accuracy for class 2 : 0.9904754467255552
FNR for class 2 : -1


IndexError: index 3 is out of bounds for axis 0 with size 3