In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import re
from sklearn.metrics import accuracy_score

### Malicious URL Detection Using Machine Learning 

In [3]:
dataset=pd.read_csv("dataNN.csv") #read csv dataset file

In [4]:
dataset.head() #first 5 data are displayed

Unnamed: 0,url,label
0,diaryofagameaddict.com,1
1,espdesign.com.au,1
2,iamagameaddict.com,1
3,kalantzis.net,1
4,slightlyoffcenter.net,1


In [5]:
dataset.shape

(388447, 2)

In [6]:
dataset.isnull().any() #data dont contain any null values

url      False
label    False
dtype: bool

In [7]:
dataset['url'].values

array(['diaryofagameaddict.com', 'espdesign.com.au', 'iamagameaddict.com',
       ..., 'owens.edu/news-releases/?p=2052',
       '1.safesecureweb.com/egale/index.asp?item=1173',
       'yurika.otakuthon.com/reg/main.pl/en/'], dtype=object)

In [8]:
len(dataset) #length of data

388447

In [9]:
#this function is taken from https://github.com/faizann24/Using-machine-learning-to-detect-malicious-URLs
def Tokens(input):
    tokensBySlash = str(input.encode('utf-8')).split('/')
    allTokens = []
    for i in tokensBySlash:
        tokens = str(i).split('-')
        tokensByDot = []
        for j in range(0,len(tokens)):
            tempTokens = str(tokens[j]).split('.')
            tokensByDot = tokensByDot + tempTokens
        allTokens = allTokens + tokens + tokensByDot
    allTokens = list(set(allTokens))
    if 'com' in allTokens:
        allTokens.remove('com')
    return allTokens

#function to remove "http://" from URL
def trim(url):
    return re.match(r'(?:\w*://)?(?:.*\.)?([a-zA-Z-1-9]*\.[a-zA-Z]{1,}).*', url).groups()[0]

In [10]:
dataset = np.array(dataset)
random.shuffle(dataset)

## We can use default TfidfVectorizer without parameters also

In [None]:
y = [d[1] for d in dataset]
corpus = [d[0] for d in dataset]
vectorizer = TfidfVectorizer(tokenizer=Tokens)
X = vectorizer.fit_transform(corpus)

In [12]:
vectorizer.vocabulary_

{"com'": 214531,
 "b'diaryofagameaddict": 100250,
 "b'diaryofagameaddict.com'": 100251,
 "b'espdesign": 107584,
 "au'": 57710,
 "b'espdesign.com.au'": 107585,
 "b'iamagameaddict": 125312,
 "b'iamagameaddict.com'": 125313,
 "b'toddscarwash": 182706,
 "b'toddscarwash.com'": 182707,
 "b'kalantzis.net'": 133124,
 "b'kalantzis": 133123,
 "net'": 292834,
 "b'tubemoviez.com'": 184487,
 "b'tubemoviez": 184486,
 "b'slightlyoffcenter": 173655,
 "b'slightlyoffcenter.net'": 173656,
 'kupang': 271230,
 'kupang.com': 271231,
 "'": 523,
 "b'pos": 161912,
 "b'rupor": 168788,
 "b'rupor.info'": 168789,
 "info'": 260998,
 'office': 297772,
 "office.js?google_ad_format=728x90_as'": 297788,
 'ma': 279945,
 "js?google_ad_format=728x90_as'": 267107,
 "b'officeon.ch.ma": 155644,
 "b'officeon": 155643,
 'ch': 209671,
 "b'ipl.hk'": 128308,
 "b'ipl": 128306,
 "hk'": 250224,
 "b'xindalawyer.com'": 191462,
 "b'xindalawyer": 191461,
 'toolbar': 346980,
 'install': 261519,
 "php?pack=exe'": 309368,
 'us': 351810,
 "

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
model = LogisticRegression(C=1)
model.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
print(model.score(X_test,y_test))

0.985043120092676


In [20]:
a = "https://github.com"
aa = vectorizer.transform([trim(a)])
s = model.predict(aa)
s[0] #0 for good

0