In [1]:
pip install -U imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\users\user\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [2]:
import numpy as np
import pandas as pd
from urllib.parse import urlparse
import tld
from tld import get_tld
import os.path
import scipy as sp
import re

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
urldata = pd.read_csv("all_data_featured_merged.csv")

In [5]:
# urldata = urldata.drop("Unnamed: 0",axis=1)

In [6]:
def processing(url):
    tokens_slash = str(url.encode('utf-8')).split('/')# make tokens after splitting by slash
    total_Tokens = []
    for i in tokens_slash:
        tokens = str(i).split('-')# make tokens after splitting by dash
        tokens_dot = []
        for j in range(0,len(tokens)):
            temp_Tokens = str(tokens[j]).split('.')# make tokens after splitting by dot
            tokens_dot = tokens_dot + temp_Tokens
        total_Tokens = total_Tokens + tokens + tokens_dot
    finaltest = list(set(total_Tokens))#remove redundant tokens
    return finaltest

In [7]:
vectorizer = TfidfVectorizer(tokenizer=processing)

In [8]:
X = vectorizer.fit_transform(urldata['url'])

In [9]:
features = sp.sparse.csr_matrix(urldata[['url_length', 'hostname_length',
       'path_length', 'fd_length', 'tld_length', 'count@', 'count?',
       'count%', 'count=', 'count-http', 'count-digits',
       'count-letters', 'count_dir', 'use_of_ip', 'short_url']].values)

In [10]:
from scipy.sparse import hstack

In [11]:
testing = hstack([X, features])

In [12]:
Y = urldata['label']

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [14]:
rfc = RandomForestClassifier(max_depth= 20)

In [15]:
x_train, x_test, y_train, y_test = train_test_split(testing, Y, train_size=0.3, random_state=42)

In [16]:
from collections import Counter

counter = Counter(y_train)
print("Before: ",counter)

from imblearn.over_sampling import SMOTE
#Oversampling the data
smote = SMOTE(random_state = 42)

X, y = smote.fit_resample(x_train, y_train)

counter = Counter(y)
print("After: ",counter)

Before:  Counter({'benign': 114176, 'malicious': 67435})
After:  Counter({'malicious': 114176, 'benign': 114176})


In [17]:
rfc.fit(X,y)

RandomForestClassifier(max_depth=20)

In [18]:
pred_results = rfc.predict(x_test)

In [19]:
accuracy_score(y_test,pred_results)

0.9424979764441581

In [20]:
confusion_matrix(y_test, pred_results)

array([[251174,  15766],
       [  8601, 148218]], dtype=int64)

In [21]:
print(classification_report(y_test,pred_results,digits=3))

              precision    recall  f1-score   support

      benign      0.967     0.941     0.954    266940
   malicious      0.904     0.945     0.924    156819

    accuracy                          0.942    423759
   macro avg      0.935     0.943     0.939    423759
weighted avg      0.944     0.942     0.943    423759



In [22]:
import joblib
joblib.dump(vectorizer, 'tfidfvectorizer.pkl')

['tfidfvectorizer.pkl']

In [23]:
joblib.dump(rfc, 'tfidfModel.pkl')

['tfidfModel.pkl']