### Anomaly Detection

#### Importing Libraries

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import mae
from tensorflow.keras.metrics import Mean
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers

In [2]:
# Load the preprocessed URL feature table. low_memory=False parses the file in
# one pass instead of in chunks, which avoids mixed-dtype column inference.
DATA_PATH = 'preprocessed_data.csv'
df = pd.read_csv(DATA_PATH, low_memory=False)
df.head(5)

Unnamed: 0.1,Unnamed: 0,url,type,url_len,use_of_ip,domain,count_https,count_http,abnormal_url,@,...,sum_count_special_chars,pri_domain,redirection,hostname_length,tld,tld_length,short_url,count_digits,fd_len,is_sus_words
0,0,br-icloud.com.br,phishing,16,0,com.br,0,0,0,0,...,3,br-icloud.com.br,0,0,,-1,0,0,0,0
1,1,mp3raid.com/music/krizz_kaliko.html,benign,35,0,com,0,0,0,0,...,2,mp3raid.com,0,0,,-1,0,1,5,0
2,2,bopsecrets.org/rexroth/cr/1.htm,benign,31,0,org,0,0,0,0,...,2,bopsecrets.org,0,0,,-1,0,1,7,0
3,3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,0,be,0,1,1,0,...,9,www.garage-pirenne.be,0,21,be,2,0,7,9,0
4,4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,0,net,0,1,1,0,...,7,adventure-nicaragua.net,0,23,net,3,0,22,9,0


In [3]:
# Class balance of the label column (rows per URL type)
label_counts = df['type'].value_counts()
label_counts

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

#### Combining the benign URLs with a small fraction (10%) of the malicious URLs to help the model learn better distinctions.

In [4]:
# Split on the label: every benign row is kept, while only a reproducible
# 10% sample (fixed random_state) of the non-benign rows is retained.
benign_mask = df['type'] == 'benign'
benign_data = df[benign_mask]
malicious_data = df[~benign_mask].sample(frac=0.1, random_state=42)

* Combining both datasets

In [5]:
# Stack the benign rows and the sampled malicious rows into one training frame
training_parts = [benign_data, malicious_data]
combined_data = pd.concat(training_parts, axis=0)

In [6]:
# List every column so the feature subset below can be picked by name
df.columns

Index(['Unnamed: 0', 'url', 'type', 'url_len', 'use_of_ip', 'domain',
       'count_https', 'count_http', 'abnormal_url', '@', '?', '-', '=', '.',
       '#', '%', '+', '$', '!', '*', ',', '//', 'sum_count_special_chars',
       'pri_domain', 'redirection', 'hostname_length', 'tld', 'tld_length',
       'short_url', 'count_digits', 'fd_len', 'is_sus_words'],
      dtype='object')

In [7]:
# Columns fed to the model. The order of this list fixes the column order of
# every feature matrix built from it, so it is kept exactly as selected.
features = [
    # URL-level length, IP and protocol columns
    'url_len', 'use_of_ip', 'count_https', 'count_http', 'abnormal_url',
    # individual special-character columns
    '@', '?', '-', '=', '.', '#', '%', '+', '$',
    # aggregate and lexical columns
    'sum_count_special_chars', 'fd_len', 'is_sus_words',
    'redirection', 'short_url', 'count_digits',
    # host and TLD length columns
    'hostname_length', 'tld_length',
]

In [8]:
# Report how many columns were selected as model inputs
selection_msg = f"Selecting {len(features)} features from the dataset."
print(selection_msg)

Selecting 22 features from the dataset.


* Extracting the above features from the dataset

In [28]:
# Training matrix: all rows of the combined frame, restricted to the selected feature columns
X_train = combined_data.loc[:, features]

In [29]:
from sklearn.ensemble import IsolationForest

#### Note: The contamination value below was chosen by experimenting with different values.

In [83]:
# contamination is the share of samples the forest treats as outliers when it
# sets its decision threshold; random_state makes the fitted trees reproducible.
iso_forest_kwargs = {'contamination': 0.28, 'random_state': 42}
model = IsolationForest(**iso_forest_kwargs)
model.fit(X_train)

#### Dropping the target column (`type`) from the main dataframe `df` and extracting only the selected features for prediction.
1. By removing the label, I turn this into an unsupervised learning problem.

In [30]:
# Copy of the full frame without the label column, so no label can reach the model
df_features = df.drop('type', axis=1)

In [31]:
# Scoring matrix: every row of the dataset, same feature columns (and order) as X_train
X_full = df_features.loc[:, features]

In [86]:
# IsolationForest.predict returns 1 for inliers and -1 for outliers;
# store that label next to the original rows for inspection.
anomaly_labels = model.predict(X_full)
df['anomaly'] = anomaly_labels

* The model predicts anomalies (outliers) in the dataset:
1. Anomalies are labeled as -1 (potential outliers/malicious).
2. Non-anomalies are labeled as 1 (considered normal/benign by the model).

In [87]:
# Preview the first six rows, which now carry the model's `anomaly` label
df.head(6)

Unnamed: 0.1,Unnamed: 0,url,type,url_len,use_of_ip,domain,count_https,count_http,abnormal_url,@,...,pri_domain,redirection,hostname_length,tld,tld_length,short_url,count_digits,fd_len,is_sus_words,anomaly
0,0,br-icloud.com.br,phishing,16,0,com.br,0,0,0,0,...,br-icloud.com.br,0,0,,-1,0,0,0,0,1
1,1,mp3raid.com/music/krizz_kaliko.html,benign,35,0,com,0,0,0,0,...,mp3raid.com,0,0,,-1,0,1,5,0,1
2,2,bopsecrets.org/rexroth/cr/1.htm,benign,31,0,org,0,0,0,0,...,bopsecrets.org,0,0,,-1,0,1,7,0,1
3,3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,0,be,0,1,1,0,...,www.garage-pirenne.be,0,21,be,2,0,7,9,0,-1
4,4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,0,net,0,1,1,0,...,adventure-nicaragua.net,0,23,net,3,0,22,9,0,-1
5,5,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign,118,0,net,0,1,1,0,...,buzzfil.net,0,11,net,3,0,1,1,0,-1


### Results:

#### Benign URLs detected as anomalies (false positives): 110,205 out of 428,103 benign URLs.

In [88]:
# False positives: rows the model flagged (-1) whose true label is benign
flagged_as_anomaly = df['anomaly'].eq(-1)
labelled_benign = df['type'].eq('benign')
df[flagged_as_anomaly & labelled_benign]

Unnamed: 0.1,Unnamed: 0,url,type,url_len,use_of_ip,domain,count_https,count_http,abnormal_url,@,...,pri_domain,redirection,hostname_length,tld,tld_length,short_url,count_digits,fd_len,is_sus_words,anomaly
5,5,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign,118,0,net,0,1,1,0,...,buzzfil.net,0,11,net,3,0,1,1,0,-1
7,7,yourbittorrent.com/?q=anthony-hamilton-soulife,benign,46,0,com,0,0,0,0,...,yourbittorrent.com,0,0,,-1,1,0,0,0,-1
27,27,http://hollywoodlife.com/2014/05/01/rihanna-ih...,benign,85,0,com,0,1,1,0,...,hollywoodlife.com,0,17,com,3,0,12,4,0,-1
34,34,vanderbilt.rivals.com/viewcoach.asp?coach=2079...,benign,64,0,com,0,0,0,0,...,vanderbilt.rivals.com,0,0,,-1,0,9,13,0,-1
36,36,movies.yahoo.com/shop?d=hv&cf=info&id=1800340831,benign,48,0,com,0,0,0,0,...,movies.yahoo.com,0,0,,-1,0,10,4,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603169,603169,www.ssmf.co.uk//login.aspx.htm,benign,30,0,co.uk,0,0,0,0,...,www.ssmf.co.uk,1,0,,-1,0,0,0,1,-1
603171,603171,cephtex.com/Paypal/Processing.htm?cmd=_Process...,benign,191,0,com,0,0,0,0,...,cephtex.com,0,0,,-1,1,79,6,1,-1
603172,603172,petmarket.com.sg/js/asb/index.php,benign,33,0,com.sg,0,0,0,0,...,petmarket.com.sg,0,0,,-1,1,0,2,0,-1
603174,603174,www.crissoares.com.br/9af12aea97de54ed3a434b2f...,benign,135,0,com.br,0,0,0,0,...,www.crissoares.com.br,0,0,,-1,0,29,32,0,-1


#### Malicious URLs detected as non-anomalies (false negatives): 64,550 out of 223,088 malicious URLs.

In [89]:
# False negatives: rows the model accepted as normal (1) whose true label is not benign
accepted_as_normal = df['anomaly'].eq(1)
labelled_malicious = df['type'].ne('benign')
df[accepted_as_normal & labelled_malicious]

Unnamed: 0.1,Unnamed: 0,url,type,url_len,use_of_ip,domain,count_https,count_http,abnormal_url,@,...,pri_domain,redirection,hostname_length,tld,tld_length,short_url,count_digits,fd_len,is_sus_words,anomaly
0,0,br-icloud.com.br,phishing,16,0,com.br,0,0,0,0,...,br-icloud.com.br,0,0,,-1,0,0,0,0,1
72,72,retajconsultancy.com,phishing,20,0,com,0,0,0,0,...,retajconsultancy.com,0,0,,-1,0,0,0,0,1
94,94,alexpay2.beget.tech,phishing,19,0,tech,0,0,0,0,...,alexpay2.beget.tech,0,0,,-1,0,1,0,0,1
105,105,facebook.unitedcolleges.net,phishing,27,0,net,0,0,0,0,...,facebook.unitedcolleges.net,0,0,,-1,0,0,0,0,1
126,126,halkbankparaf-para.com,phishing,22,0,com,0,0,0,0,...,halkbankparaf-para.com,0,0,,-1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651184,651184,wii.ign.com/objects/142/14270799.html,phishing,37,0,com,0,0,0,0,...,wii.ign.com,0,0,,-1,0,11,7,0,1
651185,651185,xbox360.gamespy.com/xbox-360/dead-space/,phishing,40,0,com,0,0,0,0,...,xbox360.gamespy.com,0,0,,-1,0,6,8,0,1
651186,651186,xbox360.ign.com/objects/850/850402.html,phishing,39,0,com,0,0,0,0,...,xbox360.ign.com,0,0,,-1,0,12,7,0,1
651189,651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,0,org,0,0,0,0,...,en.wikipedia.org,0,0,,-1,0,0,4,0,1


#### The model produces both false positives (about 25.7% of benign URLs) and false negatives (about 28.9% of malicious URLs).

In [21]:
# Pairwise Pearson correlation (the pandas default) between the training features
correlation_matrix = X_train.corr(method='pearson')

In [22]:
# Render the matrix as a heat-mapped table with six decimal places per cell
corr_styler = correlation_matrix.style
corr_styler = corr_styler.background_gradient(cmap='coolwarm')
corr_styler.format("{:.6f}")

Unnamed: 0,url_len,use_of_ip,count_https,count_http,abnormal_url,@,?,-,=,.,#,%,+,$,sum_count_special_chars,fd_len,is_sus_words,redirection,short_url,count_digits,hostname_length,tld_length
url_len,1.0,-0.014861,0.126497,0.402252,0.371947,0.038905,0.314695,0.466458,0.410869,0.4041,0.021585,0.366763,0.136707,0.012697,0.717593,0.186052,0.352178,0.033089,0.017582,0.733825,0.321712,0.364704
use_of_ip,-0.014861,1.0,-0.00052,0.096983,0.103623,-0.000357,-0.014962,-0.020721,-0.011099,0.05695,-0.000751,-0.004589,-0.00533,-0.000313,-0.003826,-0.004743,-0.004717,-0.002194,-0.011395,0.024525,0.090298,-0.013903
count_https,0.126497,-0.00052,1.0,0.290561,0.220366,0.093239,0.065387,0.044687,0.046711,0.037458,0.017545,0.091475,0.059286,0.009942,0.120855,-0.005474,0.044155,0.023318,0.023667,0.09072,0.204028,0.220689
count_http,0.402252,0.096983,0.290561,1.0,0.961963,0.027619,0.159722,0.311515,0.227022,0.089602,0.000215,0.307541,0.108476,0.002653,0.473876,0.07296,0.008859,0.124803,-0.007747,0.233244,0.871324,0.908509
abnormal_url,0.371947,0.103623,0.220366,0.961963,1.0,0.028103,0.11557,0.335194,0.212371,0.054899,0.000658,0.2527,0.077737,0.002613,0.427137,0.082486,-0.023613,0.055019,-0.01198,0.204811,0.910778,0.945503
@,0.038905,-0.000357,0.093239,0.027619,0.028103,1.0,0.03587,0.018086,0.029465,0.023966,0.034602,-0.000891,-0.001676,0.03311,0.029319,0.006462,0.019436,-2.3e-05,-0.002496,0.02011,0.023299,0.035975
?,0.314695,-0.014962,0.065387,0.159722,0.11557,0.03587,1.0,-0.089086,0.672139,0.27122,0.016161,0.054529,0.050386,0.007316,0.247044,-0.042529,0.239234,0.048825,-0.038439,0.206436,0.118745,0.112898
-,0.466458,-0.020721,0.044687,0.311515,0.335194,0.018086,-0.089086,1.0,-0.051593,-0.071748,-0.004934,0.030281,-0.039365,0.002022,0.493231,0.252876,0.005411,-0.014159,0.053237,0.158795,0.254701,0.326694
=,0.410869,-0.011099,0.046711,0.227022,0.212371,0.029465,0.672139,-0.051593,1.0,0.218676,0.030566,0.036795,0.040443,0.009998,0.273285,-0.013544,0.183352,0.028455,-0.028071,0.20997,0.224214,0.207924
.,0.4041,0.05695,0.037458,0.089602,0.054899,0.023966,0.27122,-0.071748,0.218676,1.0,0.03187,-0.019452,-0.002997,0.006486,0.243439,0.086704,0.429762,0.046947,0.049295,0.352789,0.094914,0.058264
