In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import re
from urllib.parse import urlparse
from urllib.parse import urlparse
from tld import get_tld

In [2]:
# pip install tld

In [28]:
feature_dict = {}

input = ' https://www.google.com'
label_mapping = {'benign': 0, 'defacement': 1, 'malware': 2, 'phishing': 3}

feature_dict['use_of_ip'] = []
feature_dict['abnormal_url'] = []
feature_dict['count.'] = []
feature_dict['count-www'] = []
feature_dict['count@'] = []
feature_dict['count_dir'] = []
feature_dict['count_embed_domian'] = []
feature_dict['sus_words'] = []
feature_dict['short_url'] = []
feature_dict['count-https'] = []
feature_dict['count-http'] =  []
feature_dict['count%'] =[]
feature_dict['count-'] = []
feature_dict['count='] = []
feature_dict['count-digits'] = []
feature_dict['count-letters'] = []
feature_dict['url_length'] = []
feature_dict['hostname_length'] = []
feature_dict['fd_length'] = []

feature_dict['tld_length'] = []


In [6]:
input = 'https://www.my.com'

In [10]:
input = 'https://www.my.com2'

In [29]:

def having_ip_address(url):
    match = re.search(
        '(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.'
        '([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|'  # IPv4
        '((0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\/)' # IPv4 in hexadecimal
        '(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}', url)  # Ipv6
    return 1 if match else 0

def abnormal_url(url):
    hostname = urlparse(url).hostname
    hostname = str(hostname)
    match = re.search(hostname, url)
    
    return 1 if match else 0

def no_of_dir(url):
    urldir = urlparse(url).path
    return urldir.count('/')

def no_of_embed(url):
    urldir = urlparse(url).path
    return urldir.count('//')

def suspicious_words(url):
    match = re.search('PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr',
                      url)
    return 1 if match else 0

def shortening_service(url):
    match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      url)
    return 1 if match else 0

def count_digit(url):
    return sum(c.isdigit() for c in url)

def count_letter(url):
    return sum(c.isalpha() for c in url)

def fd_length(url):
    urlpath= urlparse(url).path
    try:
        return len(urlpath.split('/')[1])
    except:
        return 0
    
def tld_length(tld):
    try:
        return len(tld)
    except:
        return -1
    
    
feature_dict['use_of_ip'].append(having_ip_address(input)) 
feature_dict['abnormal_url'].append(abnormal_url(input)) 
feature_dict['count.'].append(input.count('.'))
feature_dict['count-www'].append(input.count('www')) 
feature_dict['count@'].append(input.count('@')) 
feature_dict['count_dir'].append(no_of_dir(input)) 
feature_dict['count_embed_domian'].append(no_of_embed(input)) 
feature_dict['sus_words'].append(suspicious_words(input))
feature_dict['short_url'].append(shortening_service(input))
feature_dict['count-https'].append(input.count('https'))
feature_dict['count-http'].append(input.count('http'))
feature_dict['count%'].append(input.count('%'))
feature_dict['count-'].append(input.count('-'))
feature_dict['count='].append(input.count('='))
feature_dict['count-digits'].append(count_digit(input))
feature_dict['count-letters'].append(count_letter(input))
feature_dict['url_length'].append(len(str(input)))
feature_dict['hostname_length'].append(len(urlparse(input).netloc))
feature_dict['fd_length'].append(fd_length(input))


tld = get_tld(input, fail_silently=True)
feature_dict['tld_length'].append(tld_length(tld))


In [30]:
df = pd.DataFrame(feature_dict)

In [31]:
df

Unnamed: 0,use_of_ip,abnormal_url,count.,count-www,count@,count_dir,count_embed_domian,sus_words,short_url,count-https,count-http,count%,count-,count=,count-digits,count-letters,url_length,hostname_length,fd_length,tld_length
0,0,0,2,1,0,2,1,0,0,1,1,0,0,0,0,17,23,0,0,-1


In [32]:
import joblib
with open('rf.joblib', 'rb') as f:
    loaded_rf = joblib.load(f)
    
rf_y_pred = loaded_rf.predict(df)
print(type(rf_y_pred.tolist()[0]))

FileNotFoundError: [Errno 2] No such file or directory: 'rf.joblib'

In [19]:
print(type(rf_y_pred.tolist()[0]))


<class 'int'>


In [22]:
label_mapping = ['safe', 'defacement', 'malware', 'phishing']
result = rf_y_pred.tolist()
url_types = [label_mapping[i] for i in result]

In [23]:
url_types

['phishing', 'phishing', 'phishing']

In [27]:
test_string = 'A|BC'
test_string.split('|')

['A', 'BC']