In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import re
from urllib.parse import urlparse
from urllib.parse import urlparse
from tld import get_tld

In [29]:
# Integer codes for the four URL classes.
type_to_int = {'benign': 0, 'defacement': 1, 'malware': 2, 'phishing': 3}
# Reverse lookup (code -> label), derived from the forward map so the two
# tables cannot drift apart.
int_to_type = {code: label for label, code in type_to_int.items()}

In [34]:
def having_ip_address(url):
    """Return 1 if ``url`` contains an IP-address literal, otherwise 0.

    Three forms are recognised anywhere in the string:

    * dotted-decimal IPv4 immediately followed by ``/`` (``192.168.0.1/``)
    * dotted-hexadecimal IPv4 immediately followed by ``/`` (``0xC0.0xA8.0x00.0x01/``)
    * a full, uncompressed IPv6 address (eight colon-separated hex groups)

    Parameters
    ----------
    url : str
        The URL text to scan.

    Returns
    -------
    int
        1 when any of the forms above is found, else 0.
    """
    octet = r'([01]?\d\d?|2[0-4]\d|25[0-5])'
    hex_octet = r'(0x[0-9a-fA-F]{1,2})'

    ipv4 = '(' + r'\.'.join([octet] * 4) + r'\/)'
    ipv4_hex = '(' + r'\.'.join([hex_octet] * 4) + r'\/)'
    ipv6 = r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}'

    # The three forms must be separate alternatives. Previously the hex-IPv4
    # and IPv6 patterns were concatenated without '|', so neither could match
    # on its own.
    # NOTE(review): a model fitted on features from the old pattern will see
    # different values for hex-IPv4 / IPv6 URLs -- confirm whether a refit is needed.
    pattern = '|'.join((ipv4, ipv4_hex, ipv6))
    return 1 if re.search(pattern, url) else 0

def abnormal_url(url):
    """Return 1 if the hostname parsed from ``url`` occurs verbatim in ``url``, else 0.

    Parameters
    ----------
    url : str
        The URL text to inspect.

    Returns
    -------
    int
        1 when ``urlparse(url).hostname`` is present and is a substring of
        ``url``; 0 when there is no hostname or it does not occur in ``url``
        (``urlparse`` lower-cases the hostname, so a mixed-case host yields 0).
    """
    hostname = urlparse(url).hostname
    # No hostname at all: previously this was stringified to 'None' and
    # searched for, which matched any URL that happened to contain "None".
    if hostname is None:
        return 0

    # Plain substring test instead of using the hostname as a regex:
    # characters such as '(' or '+' in the host would otherwise raise
    # re.error or silently change what is matched.
    return 1 if hostname in url else 0

def no_of_dir(url):
    """Count the '/' characters in the path component of ``url``."""
    return urlparse(url).path.count('/')

def no_of_embed(url):
    """Count non-overlapping '//' occurrences in the path component of ``url``."""
    return urlparse(url).path.count('//')

def suspicious_words(url):
    """Return 1 if ``url`` contains any keyword from the list below, else 0.

    The search is case-sensitive and matches the keywords anywhere in the
    string.
    """
    keywords = 'PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr'
    if re.search(keywords, url):
        return 1
    return 0

def shortening_service(url):
    """Return 1 if ``url`` contains the domain of a known URL shortener, else 0.

    The domains are matched case-sensitively anywhere in the string.

    Parameters
    ----------
    url : str
        The URL text to scan.

    Returns
    -------
    int
        1 when any listed shortener domain is found, else 0.
    """
    # Raw strings: '\.' inside a normal string literal is an invalid escape
    # sequence (a warning today, slated to become an error). The resulting
    # regular expression is unchanged.
    match = re.search(r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      r'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      r'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      r'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      r'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      r'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      r'tr\.im|link\.zip\.net',
                      url)
    return 1 if match else 0

def count_digit(url):
    """Return how many characters of ``url`` satisfy ``str.isdigit``."""
    digits = [ch for ch in url if ch.isdigit()]
    return len(digits)

def count_letter(url):
    """Return how many characters of ``url`` satisfy ``str.isalpha``."""
    letters = [ch for ch in url if ch.isalpha()]
    return len(letters)

def fd_length(url):
    """Return the length of the second '/'-separated piece of the URL path.

    For a path that starts with '/', this is the length of the first
    directory name (``/abc/def`` -> 3). When the path contains no '/',
    there is no such piece and 0 is returned.

    Parameters
    ----------
    url : str
        The URL text to inspect.

    Returns
    -------
    int
        Length of ``path.split('/')[1]``, or 0 when that element is absent.
    """
    segments = urlparse(url).path.split('/')
    # Explicit length check instead of a bare ``except:`` so that unrelated
    # errors are not silently turned into 0.
    if len(segments) < 2:
        return 0
    return len(segments[1])
    
def tld_length(tld):
    """Return ``len(tld)``, or -1 when ``tld`` has no length (e.g. ``None``).

    Parameters
    ----------
    tld : str or None
        The top-level-domain value to measure.

    Returns
    -------
    int
        The length of ``tld``, or -1 if ``len`` is not supported for it.
    """
    try:
        return len(tld)
    except TypeError:
        # Only the "object has no len()" case is expected here; anything
        # else should surface instead of being hidden by a bare ``except:``.
        return -1
    

def get_features(url):
    """Compute the feature values for one URL.

    Returns a dict mapping each feature name to a single-element list, so the
    result can be passed directly to ``pd.DataFrame`` to obtain a one-row
    frame. Key names and their insertion order are part of the contract.
    """
    parsed_netloc = urlparse(url).netloc

    # NOTE(review): 'count_embed_domian' is misspelled but is presumably the
    # column name the saved model was fitted with -- do not rename without
    # confirming.
    feature_dict = {
        'use_of_ip': [having_ip_address(url)],
        'abnormal_url': [abnormal_url(url)],
        'count.': [url.count('.')],
        'count-www': [url.count('www')],
        'count@': [url.count('@')],
        'count_dir': [no_of_dir(url)],
        'count_embed_domian': [no_of_embed(url)],
        'sus_words': [suspicious_words(url)],
        'short_url': [shortening_service(url)],
        'count-https': [url.count('https')],
        'count%': [url.count('%')],
        'count-': [url.count('-')],
        'count=': [url.count('=')],
        'count-digits': [count_digit(url)],
        'count-letters': [count_letter(url)],
        'url_length': [len(str(url))],
        'hostname_length': [len(parsed_netloc)],
        'fd_length': [fd_length(url)],
    }

    # With fail_silently=True, get_tld is expected to yield None rather than
    # raise when no TLD can be extracted; tld_length maps that to -1.
    top_level_domain = get_tld(url, fail_silently=True)
    feature_dict['tld_length'] = [tld_length(top_level_domain)]

    return feature_dict

In [35]:
# Build a one-row feature frame for a sample URL.
sample_url = 'https://www.google.com'
features = get_features(sample_url)

df = pd.DataFrame(features)

In [36]:
# Show the one-row feature frame built in the previous cell.
df

Unnamed: 0,use_of_ip,abnormal_url,count.,count-www,count@,count_dir,count_embed_domian,sus_words,short_url,count-https,count%,count-,count=,count-digits,count-letters,url_length,hostname_length,fd_length,tld_length
0,0,1,2,1,0,0,0,0,0,1,0,0,0,0,17,22,14,0,3


In [37]:
import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
with open('rf.joblib', 'rb') as model_file:
    loaded_rf = joblib.load(model_file)

# Predict class codes for the feature frame built above.
results = loaded_rf.predict(df)

In [38]:
# Translate each predicted class code back to its label.
url_types = [int_to_type[class_code] for class_code in results]
url_types

['phishing']