Importing Libraries

In [None]:
import numpy as np 
import pandas as pd 
from urllib.parse import urlparse
from tld import get_tld
import os.path

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
urldata = pd.read_csv("/content/drive/MyDrive/dataset1.csv", engine='python', encoding='ISO-8859-9')

In [None]:
urldata.head()

Unnamed: 0,url,type
0,http://members.tripod.com/russiastation/,benign
1,http://www.ddj.com/cpp/184403822,benign
2,http://www.naef-usa.com/,benign
3,http://www.ff-b2b.de/,malicious
4,http://us.imdb.com/title/tt0176269/,benign


In [None]:
#urldata.shape

(435774, 2)

In [None]:
#urldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435774 entries, 0 to 435773
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     435774 non-null  object
 1   type    435774 non-null  object
dtypes: object(2)
memory usage: 6.6+ MB


In [None]:
#urldata.isnull().sum()

url     0
type    0
dtype: int64

Labelling malicious as 1 and benign as 0

In [None]:
l = {"result": {"benign": 0, "malicious": 1}}
urldata['result'] = urldata['type']
urldata = urldata.replace(l)

In [None]:
!pip install tld

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tld
  Downloading tld-0.12.6-py38-none-any.whl (412 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m412.2/412.2 KB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tld
Successfully installed tld-0.12.6


In [None]:
urldata['url_length'] = urldata['url'].apply(lambda i: len(str(i)))

In [None]:
urldata['hostname_length'] = urldata['url'].apply(lambda i: len(urlparse(i).netloc))

In [None]:
def fd_length(url):
    urlpath= urlparse(url).path
    try:
        return len(urlpath.split('/')[1])
    except:
        return 0

urldata['fd_length'] = urldata['url'].apply(lambda i: fd_length(i))

In [None]:
urldata['path_length'] = urldata['url'].apply(lambda i: len(urlparse(i).path))

In [None]:
urldata['tld'] = urldata['url'].apply(lambda i: get_tld(i,fail_silently=True))
def tld_length(tld):
    try:
        return len(tld)
    except:
        return 0

urldata['tld_length'] = urldata['tld'].apply(lambda i: tld_length(i))

In [None]:
urldata.head()

Unnamed: 0,url,type,result,url_length,hostname_length,path_length,fd_length,tld,tld_length
0,http://members.tripod.com/russiastation/,benign,0,40,18,15,13,com,3
1,http://www.ddj.com/cpp/184403822,benign,0,32,11,14,3,com,3
2,http://www.naef-usa.com/,benign,0,24,16,1,0,com,3
3,http://www.ff-b2b.de/,malicious,1,21,13,1,0,de,2
4,http://us.imdb.com/title/tt0176269/,benign,0,35,11,17,5,com,3


In [None]:
urldata = urldata.drop("tld",1)

  urldata = urldata.drop("tld",1)


Dataset after extracting length features

In [None]:
urldata.head()

Unnamed: 0,url,type,result,url_length,hostname_length,path_length,fd_length,tld_length
0,http://members.tripod.com/russiastation/,benign,0,40,18,15,13,3
1,http://www.ddj.com/cpp/184403822,benign,0,32,11,14,3,3
2,http://www.naef-usa.com/,benign,0,24,16,1,0,3
3,http://www.ff-b2b.de/,malicious,1,21,13,1,0,2
4,http://us.imdb.com/title/tt0176269/,benign,0,35,11,17,5,3


In [None]:
urldata['count-'] = urldata['url'].apply(lambda i: i.count('-'))

In [None]:
#urldata['count@'] = urldata['url'].apply(lambda i: i.count('@'))

In [None]:
urldata['count%'] = urldata['url'].apply(lambda i: i.count('%'))

In [None]:
urldata['count?'] = urldata['url'].apply(lambda i: i.count('?'))

In [None]:
urldata['count='] = urldata['url'].apply(lambda i: i.count('='))

In [None]:
urldata['count.'] = urldata['url'].apply(lambda i: i.count('.'))

In [None]:
urldata['count-www'] = urldata['url'].apply(lambda i: i.count('www'))

In [None]:
urldata['count-http'] = urldata['url'].apply(lambda i : i.count('http'))

In [None]:
urldata['count-https'] = urldata['url'].apply(lambda i : i.count('https'))

In [None]:
def no_of_dir(url):
    urldir = urlparse(url).path
    return urldir.count('/')
urldata['count_dir'] = urldata['url'].apply(lambda i: no_of_dir(i))

In [None]:
def digit_count(url):
    digits = 0
    for i in url:
        if i.isnumeric():
            digits = digits + 1
    return digits
urldata['count-digits']= urldata['url'].apply(lambda i: digit_count(i))

In [None]:
def letter_count(url):
    letters = 0
    for i in url:
        if i.isalpha():
            letters = letters + 1
    return letters
urldata['count-letters']= urldata['url'].apply(lambda i: letter_count(i))

Data after extracting Count Features

In [None]:
urldata.head()

Unnamed: 0,url,type,result,url_length,hostname_length,path_length,fd_length,tld_length,count-,count?,count%,count.,count=,count-http,count-https,count-www,count-digits,count-letters,count_dir
0,http://members.tripod.com/russiastation/,benign,0,40,18,15,13,3,0,0,0,2,0,1,0,0,0,33,2
1,http://www.ddj.com/cpp/184403822,benign,0,32,11,14,3,3,0,0,0,2,0,1,0,1,9,16,2
2,http://www.naef-usa.com/,benign,0,24,16,1,0,3,1,0,0,2,0,1,0,1,0,17,1
3,http://www.ff-b2b.de/,malicious,1,21,13,1,0,2,1,0,0,2,0,1,0,1,1,13,1
4,http://us.imdb.com/title/tt0176269/,benign,0,35,11,17,5,3,0,0,0,2,0,1,0,0,7,20,3


In [None]:
import re

In [None]:
def shortening_service(url):
    match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      url)
    if match:
        return 1
    else:
        return 0
urldata['short_url'] = urldata['url'].apply(lambda i: shortening_service(i))

Data after extracting Binary Features

In [None]:
urldata

Unnamed: 0,url,type,result,url_length,hostname_length,path_length,fd_length,tld_length,count-,count?,count%,count.,count=,count-http,count-https,count-www,count-digits,count-letters,count_dir,short_url
0,http://members.tripod.com/russiastation/,benign,0,40,18,15,13,3,0,0,0,2,0,1,0,0,0,33,2,0
1,http://www.ddj.com/cpp/184403822,benign,0,32,11,14,3,3,0,0,0,2,0,1,0,1,9,16,2,0
2,http://www.naef-usa.com/,benign,0,24,16,1,0,3,1,0,0,2,0,1,0,1,0,17,1,0
3,http://www.ff-b2b.de/,malicious,1,21,13,1,0,2,1,0,0,2,0,1,0,1,1,13,1,0
4,http://us.imdb.com/title/tt0176269/,benign,0,35,11,17,5,3,0,0,0,2,0,1,0,0,7,20,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435769,xbox360.ign.com/objects/850/850402.html,malicious,1,39,0,39,7,0,0,0,0,3,0,0,0,0,12,21,3,0
435770,games.teamxbox.com/xbox-360/1860/Dead-Space/,malicious,1,44,0,44,8,0,2,0,0,2,0,0,0,0,7,29,4,1
435771,www.gamespot.com/xbox360/action/deadspace/,malicious,1,42,0,42,7,0,0,0,0,2,0,0,0,1,3,33,4,1
435772,en.wikipedia.org/wiki/Dead_Space_(video_game),malicious,1,45,0,45,4,0,0,0,0,2,0,0,0,0,0,36,2,0


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [None]:
x = urldata[['url_length', 'hostname_length',
       'path_length', 'fd_length', 'tld_length', 'count-', 'count?',
       'count%', 'count.', 'count=', 'count-http', 'count-https', 'count-www', 'count-digits',
       'count-letters', 'count_dir', 'short_url']]

y = urldata['result']

##Feature selection

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
X_scaled =  StandardScaler().fit_transform(x)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Feature extraction
test = SelectKBest(score_func=chi2, k=6)
fit = test.fit(x, y)
np.set_printoptions(precision=5)
print(fit.scores_)
features = fit.transform(x)
print(features[0:6,:])

[1.64864e+04 3.17683e+05 1.13357e+06 1.31165e+05 6.12332e+04 1.73459e+04
 1.65353e+03 4.19509e+03 5.02093e+02 6.58647e+03 1.82227e+04 9.30683e+00
 3.59813e+02 1.23632e+04 2.00171e+04 1.09164e+04 6.06785e+02]
[[18 15 13  3  1 33]
 [11 14  3  3  1 16]
 [16  1  0  3  1 17]
 [13  1  0  2  1 13]
 [11 17  5  3  1 20]
 [15  8  7  3  1 29]]


In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
# Feature extraction
model = LogisticRegression()
rfe = RFE(model, step=6)
fit = rfe.fit(x, y)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Num Features: 8
Selected Features: [False False False False  True  True False  True  True  True  True False
  True False False False  True]
Feature Ranking: [2 3 3 3 1 1 3 1 1 1 1 3 1 3 2 2 1]


## 3.1 Train test split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1234)
print("Shape of x_train: ", x_train.shape)
print("Shape of x_valid: ", x_test.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of y_valid: ", y_test.shape)

Shape of x_train:  (348619, 17)
Shape of x_valid:  (87155, 17)
Shape of y_train:  (348619,)
Shape of y_valid:  (87155,)


In [None]:
urldata

Unnamed: 0,url,type,result,url_length,hostname_length,path_length,fd_length,tld_length,count-,count?,count%,count.,count=,count-http,count-https,count-www,count-digits,count-letters,count_dir,short_url
0,http://members.tripod.com/russiastation/,benign,0,40,18,15,13,3,0,0,0,2,0,1,0,0,0,33,2,0
1,http://www.ddj.com/cpp/184403822,benign,0,32,11,14,3,3,0,0,0,2,0,1,0,1,9,16,2,0
2,http://www.naef-usa.com/,benign,0,24,16,1,0,3,1,0,0,2,0,1,0,1,0,17,1,0
3,http://www.ff-b2b.de/,malicious,1,21,13,1,0,2,1,0,0,2,0,1,0,1,1,13,1,0
4,http://us.imdb.com/title/tt0176269/,benign,0,35,11,17,5,3,0,0,0,2,0,1,0,0,7,20,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435769,xbox360.ign.com/objects/850/850402.html,malicious,1,39,0,39,7,0,0,0,0,3,0,0,0,0,12,21,3,0
435770,games.teamxbox.com/xbox-360/1860/Dead-Space/,malicious,1,44,0,44,8,0,2,0,0,2,0,0,0,0,7,29,4,1
435771,www.gamespot.com/xbox360/action/deadspace/,malicious,1,42,0,42,7,0,0,0,0,2,0,0,0,1,3,33,4,1
435772,en.wikipedia.org/wiki/Dead_Space_(video_game),malicious,1,45,0,45,4,0,0,0,0,2,0,0,0,0,0,36,2,0


##Input-data preprocessing

In [None]:
def pre_processing(url):
    data_to_predict = pd.DataFrame([[url]], columns=['url'])
    data_to_predict['url'] = data_to_predict['url'].replace('www.', '', regex=True)
    data_to_predict['url_length'] = data_to_predict['url'].apply(lambda x: len(str(x)))
    data_to_predict['hostname_length'] = data_to_predict['url'].apply(lambda i: len(urlparse(i).netloc))
    data_to_predict['path_length'] = data_to_predict['url'].apply(lambda i: len(urlparse(i).path))
    data_to_predict['fd_length'] = data_to_predict['url'].apply(lambda i: fd_length(i))
    data_to_predict['tld'] = data_to_predict['url'].apply(lambda i: get_tld(i,fail_silently=True))
    data_to_predict['tld_length'] = data_to_predict['tld'].apply(lambda i: tld_length(i))
    data_to_predict = data_to_predict.drop("tld",1)
    data_to_predict['count-'] = data_to_predict['url'].apply(lambda i: i.count('-'))
    #data_to_predict['count@'] = data_to_predict['url'].apply(lambda i: i.count('@'))
    data_to_predict['count?'] = data_to_predict['url'].apply(lambda i: i.count('?'))
    data_to_predict['count%'] = data_to_predict['url'].apply(lambda i: i.count('%'))
    data_to_predict['count.'] = data_to_predict['url'].apply(lambda i: i.count('.'))
    data_to_predict['count='] = data_to_predict['url'].apply(lambda i: i.count('='))
    data_to_predict['count-http'] = data_to_predict['url'].apply(lambda i : i.count('http'))
    data_to_predict['count-https'] = data_to_predict['url'].apply(lambda i : i.count('https'))
    data_to_predict['count-www'] = data_to_predict['url'].apply(lambda i: i.count('www'))
    data_to_predict['count-digits']= data_to_predict['url'].apply(lambda i: digit_count(i))
    data_to_predict['count-letters']= data_to_predict['url'].apply(lambda i: letter_count(i))
    data_to_predict['count_dir'] = data_to_predict['url'].apply(lambda i: no_of_dir(i))
    #data_to_predict['use_of_ip'] = data_to_predict['url'].apply(lambda i: having_ip_address(i))
    data_to_predict['short_url'] = data_to_predict['url'].apply(lambda i: shortening_service(i))
    data_to_predict = data_to_predict.drop("url",1)


    return data_to_predict
#print(data_to_predict)

## 3.3 ML Models 

In [None]:
!pip install joblib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install sklearn.externals

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement sklearn.externals (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for sklearn.externals[0m[31m
[0m

In [None]:
from sklearn import model_selection, datasets
import joblib
import pickle

In [None]:

url='https://www.datacamp.com/tutorial/feature-selection-python'
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

# save the model to disk
filename = 'ml_model'
pickle.dump(model, open(filename, 'wb'))

In [None]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
url='https://www.datacamp.com/tutorial/feature-selection-python'
test_dataframe = pre_processing(url)
pred = loaded_model.predict(test_dataframe)
print(pred)

[0]


  data_to_predict = data_to_predict.drop("tld",1)
  data_to_predict = data_to_predict.drop("url",1)


References:

1) https://scikit-learn.org/stable/

2) https://www.kaggle.com/code/hamzamanssor/detection-malicious-url-using-ml-models/notebook

3)https://github.com/wisdomml2020/Malicious_url_detection_using_MachineLearning

4)https://machinelearningmastery.com/feature-selection-machine-learning-python/
5)

> Indented block

