### Setup

In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
from sklearn import tree 
from matplotlib import pyplot as plt
import tldextract
from urllib.parse import urlparse, parse_qs
import re

### Dataset

In [2]:
df = pd.read_csv('../datasets/malicious_phish.csv')
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [3]:
X = df.iloc[:, 0]
y = df.iloc[:, 1]

print(X.head())
print(y.head())
print(y.value_counts())

0                                     br-icloud.com.br
1                  mp3raid.com/music/krizz_kaliko.html
2                      bopsecrets.org/rexroth/cr/1.htm
3    http://www.garage-pirenne.be/index.php?option=...
4    http://adventure-nicaragua.net/index.php?optio...
Name: url, dtype: object
0      phishing
1        benign
2        benign
3    defacement
4    defacement
Name: type, dtype: object
type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64


In [11]:
def classification_type(type):
    '''
    Convert classification type into values:
        1) Benign = 0
        2) Defacement = 1
        3) Phishing = 2
        4) Malware = 3    
    '''
    if type == "benign":
        return 0
    elif type == "defacement":
        return 1
    elif type == "phishing":
        return 2
    elif type == "malware":
        return 3
    else:
        print(f"Unable to find proper type: {type}")

y = y.apply(classification_type)
y.value_counts()

type
0    428103
1     96457
2     94111
3     32520
Name: count, dtype: int64

### Feature extraction

Extract any features not reliant on a lookup

#### URL

In [4]:
# Length
URL_length = X.apply(lambda x: len(x))

# Number of ; _ ? = &
URL_punct_count = X.apply(lambda x: len(re.findall(r'[;_\?=&]', x)))

# Digit to Letter ratio
def dlr(url):
    num_digits = len(re.findall(r'\d', url))
    num_letters = len(re.findall(r'[a-zA-Z]', url))
    if num_letters:
        return num_digits / num_letters
    else:
        return 0

URL_dlr = X.apply(lambda x: dlr(x))

URL_feat = pd.concat([URL_length, URL_punct_count, URL_dlr], axis=1)
URL_feat.head()

Unnamed: 0,url,url.1,url.2
0,16,0,0.0
1,35,1,0.034483
2,31,0,0.04
3,88,10,0.111111
4,235,7,0.110553


#### Primary domain

In [5]:
def get_primary_domain(url):
    extracted = tldextract.extract(url)
    return extracted.domain + '.' + extracted.suffix

PD = X.apply(lambda x: get_primary_domain(x))

# Presence of IP
PD_has_IP = PD.apply(lambda x: int(bool(re.match(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', x))))

# Length
PD_length = PD.apply(lambda x: len(x))

# Number of digits
PD_digits = PD.apply(lambda x: len(re.findall(r'\d', x)))

# Number of non-alphanumeric characters
PD_non_alphanumeric = PD.apply(lambda x: len(re.findall(r'[^a-zA-Z0-9]', x)))

# Number of hyphens
PD_hyphens = PD.apply(lambda x: len(re.findall(r'-', x)))
PD_hyphens.head()

# Number of @s
PD_at = PD.apply(lambda x: len(re.findall(r'@', x)))

PD_feat = pd.concat([PD_has_IP, PD_length, PD_digits, PD_non_alphanumeric, PD_hyphens, PD_at], axis=1)
PD_feat.head()

Unnamed: 0,url,url.1,url.2,url.3,url.4,url.5
0,0,16,0,3,1,0
1,0,11,1,1,0,0
2,0,14,0,1,0,0
3,0,17,0,2,1,0
4,0,23,0,2,1,0


#### Subdomain

In [6]:
SD = X.apply(lambda x: tldextract.extract(x).subdomain)

# Number of dots
SD_num_dots = SD.apply(lambda x: x.count('.'))
# Number of subdomains
SD_num_subdomains = SD.apply(lambda x: sum(1 for subdomain in x.split('.') if subdomain))

SD_feat = pd.concat([SD_num_dots, SD_num_subdomains], axis=1)
SD_feat.head()

Unnamed: 0,url,url.1
0,0,0
1,0,0
2,0,0
3,0,1
4,0,0


#### Path

In [7]:
# ensure proper parsing of paths
def parse_path(url):
    try:
        if url.startswith('https://') or url.startswith('http://'):
            return urlparse(url)
        else:
            return urlparse('https://' + url)
    except:
        return urlparse(url)

path_ = X.apply(lambda x: parse_path(x).path)

# Number of //
path_num_dbl_fwdslash = path_.apply(lambda x: (len(re.findall(r'//', x))))

# Number of subdirectories
path_num_subdirs = path_.apply(lambda x: x.count('/'))

# Presence of %20
path_has_percent20 = path_.apply(lambda x: int('%20' in x))

# Presence of uppercase directories
path_has_uppercase_dirs = path_.apply(lambda x: int(any(any(c.isupper() for c in dir_name) for dir_name in x.split('/'))))

# Presence of single character directories
path_has_char_dirs = path_.apply(lambda x: int(any((len(dir_name) == 1) for dir_name in x.split('/'))))

# Number of special characters
path_num_special_chars = path_.apply(lambda x: sum(len(re.findall(r'[^A-Za-z0-9]', dir_name)) for dir_name in x.split('/')))

# Number of 0s
path_num_zeroes = path_.apply(lambda x: x.count('0'))

# Ratio of uppercase to lowercase characters
def upper_to_lower_ratio(url):
    upper_count = sum(1 for c in url if c.isupper())
    lower_count = sum(1 for c in url if c.islower())
    if lower_count:
        return upper_count / lower_count
    else:
        return 0

path_upper_to_lower = path_.apply(lambda x: upper_to_lower_ratio(x))

path_feat = pd.concat([path_num_dbl_fwdslash, path_num_subdirs, path_has_percent20, path_has_uppercase_dirs, path_has_char_dirs, path_num_special_chars, path_num_zeroes, path_upper_to_lower], axis=1)
path_feat.head()

Unnamed: 0,url,url.1,url.2,url.3,url.4,url.5,url.6,url.7
0,0,0,0,0,0,0,0,0.0
1,0,2,0,0,0,2,0,0.0
2,0,3,0,0,0,1,0,0.0
3,0,1,0,0,0,1,0,0.0
4,0,1,0,0,0,1,0,0.0


#### Parameters

In [8]:
# Length
param_length = X.apply(lambda x: sum(len(value[0]) for value in parse_qs(urlparse(x).query).values()))

param_feat = param_length
param_feat.head()

0      0
1      0
2      0
3     22
4    175
Name: url, dtype: int64

#### Query

In [9]:
# Number of queries
num_queries = X.apply(lambda x: len(parse_qs(urlparse(x).query)))

query_feat = num_queries
query_feat.head()

0    0
1    0
2    0
3    4
4    3
Name: url, dtype: int64

In [10]:
feature_set = pd.concat([URL_feat, PD_feat, SD_feat, path_feat, param_feat, query_feat], axis=1)
feature_set.head()

Unnamed: 0,url,url.1,url.2,url.3,url.4,url.5,url.6,url.7,url.8,url.9,...,url.10,url.11,url.12,url.13,url.14,url.15,url.16,url.17,url.18,url.19
0,16,0,0.0,0,16,0,3,1,0,0,...,0,0,0,0,0,0,0,0.0,0,0
1,35,1,0.034483,0,11,1,1,0,0,0,...,0,2,0,0,0,2,0,0.0,0,0
2,31,0,0.04,0,14,0,1,0,0,0,...,0,3,0,0,0,1,0,0.0,0,0
3,88,10,0.111111,0,17,0,2,1,0,0,...,0,1,0,0,0,1,0,0.0,22,4
4,235,7,0.110553,0,23,0,2,1,0,0,...,0,1,0,0,0,1,0,0.0,175,3


### Train Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(feature_set, y, test_size=0.2, random_state=69)

In [13]:
print(y_train.value_counts())
print(y_test.value_counts())

type
0    342451
1     77182
2     75393
3     25926
Name: count, dtype: int64
type
0    85652
1    19275
2    18718
3     6594
Name: count, dtype: int64


### Model

In [14]:
model = RandomForestClassifier(n_estimators=50, max_depth=20, min_samples_split=2, min_samples_leaf=2)

In [15]:
model.fit(X_train, y_train)

In [16]:
y_pred = model.predict(X_test)

In [19]:
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

In [20]:
print(f"Accuracy: {accuracy} | F1 score: {f1} | {model.__class__.__name__}")

Accuracy: 0.9001681523967475 | F1 score: 0.8973312296986894 | RandomForestClassifier
