<a href="https://colab.research.google.com/github/BladeArya/phising-url-detection-ml/blob/main/legitmate_url_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Phishing Website Detection Feature Extraction(Part - 2)**

# **Legitimate URLs**

## **Data Collection**
For the legitimate URLs, I found a source that has a collection of benign, spam, phishing, malware & defacement URLs. The source of the dataset is University of New Brunswick, https://www.unb.ca/cic/datasets/url-2016.html. The number of legitimate URLs in this collection are 35,300. The URL collection is downloaded & from that, *'Benign_list_big_final.csv'* is the file of our interest. This file is then uploaded to the Colab for the feature extraction. 

In [None]:
import pandas as pd

In [None]:
raw_data = pd.read_csv("https://raw.githubusercontent.com/BladeArya/phising-url-detection-ml/main/dataset/Benign_list_big_final.csv")

In [None]:
raw_data.head()
raw_data.columns = ['URLs']
raw_data.head()

Unnamed: 0,URLs
0,http://1337x.to/torrent/1110018/Blackhat-2015-...
1,http://1337x.to/torrent/1122940/Blackhat-2015-...
2,http://1337x.to/torrent/1124395/Fast-and-Furio...
3,http://1337x.to/torrent/1145504/Avengers-Age-o...
4,http://1337x.to/torrent/1160078/Avengers-age-o...


In [None]:
legitdataset = raw_data.sample(n = 1000, random_state = 12).copy()
legitdataset = legitdataset.reset_index(drop=True)
legitdataset.head()

Unnamed: 0,URLs
0,http://graphicriver.net/search?date=this-month...
1,http://ecnavi.jp/redirect/?url=http://www.cros...
2,https://hubpages.com/signin?explain=follow+Hub...
3,http://extratorrent.cc/torrent/4190536/AOMEI+B...
4,http://icicibank.com/Personal-Banking/offers/o...


In [None]:
legitdataset.shape

(1000, 1)

In [None]:
seperation_of_protocol = legitdataset['URLs'].str.split("://",expand = True)

In [None]:
seperation_of_protocol.head()

Unnamed: 0,0,1,2
0,http,graphicriver.net/search?date=this-month&length...,
1,http,ecnavi.jp/redirect/?url=http,www.cross-a.net/x.php?id=1845_3212_22061_26563...
2,https,hubpages.com/signin?explain=follow+Hubs&url=%2...,
3,http,extratorrent.cc/torrent/4190536/AOMEI+Backuppe...,
4,http,icicibank.com/Personal-Banking/offers/offer-de...,


In [None]:
seperation_domain_name = seperation_of_protocol[1].str.split("/",1,expand = True)

In [None]:
seperation_domain_name.columns=["domain_name","address"]

In [None]:
seperation_domain_name.head()

Unnamed: 0,domain_name,address
0,graphicriver.net,search?date=this-month&length_max=&length_min=...
1,ecnavi.jp,redirect/?url=http
2,hubpages.com,signin?explain=follow+Hubs&url=%2Fhub%2FComfor...
3,extratorrent.cc,torrent/4190536/AOMEI+Backupper+Technician+%2B...
4,icicibank.com,Personal-Banking/offers/offer-detail.page?id=o...


In [None]:
legiturldata = pd.concat([seperation_of_protocol[0],seperation_domain_name],axis=1)

In [None]:
legiturldata.columns = ['protocol','domain_name','address']

In [None]:
legiturldata.head()

Unnamed: 0,protocol,domain_name,address
0,http,graphicriver.net,search?date=this-month&length_max=&length_min=...
1,http,ecnavi.jp,redirect/?url=http
2,https,hubpages.com,signin?explain=follow+Hubs&url=%2Fhub%2FComfor...
3,http,extratorrent.cc,torrent/4190536/AOMEI+Backupper+Technician+%2B...
4,http,icicibank.com,Personal-Banking/offers/offer-detail.page?id=o...


In [None]:
legiturldata.shape

(1000, 3)

# **Feature Extraction**
In this step, features are extracted from the URLs dataset.

The extracted features are categorized into


1.   Address Bar based Features
2.   Domain/Abnormal based Features
3.   HTML & Javascript based Features


### **1. Address Bar based Features**


*   IP Address in URL
*   Length of URL
*   Using URL Shortening Services “TinyURL”
*   "@" Symbol in URL
*   Redirection "//" in URL
*   Prefix or Suffix "-" in Domain
*   Depth of URL(Domain and Sub-Domain)
*   "http/https" in Domain name

In [None]:
import ipaddress
import re

In [None]:
#1. Using the IP Address
def having_ip_address(url):
  try:
    ipaddress.ip_address(url)
    ip = 1
  except:
    ip = 0
  return ip

In [None]:
legiturldata['having_ip_address'] = legitdataset['URLs'].apply(having_ip_address)

In [None]:
#2. Long URL
def long_url(l):
    if len(l) < 54:
        return 0
    elif len(l) >= 54 and len(l) <= 75:
        return 2
    return 1

In [None]:
legiturldata['long_url'] = legitdataset['URLs'].apply(long_url) 

In [None]:
#3. Using URL Shortening Services “TinyURL”
def shortening_service(url):
    match=re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                    'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                    'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                    'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                    'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                    'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                    'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|tr\.im|link\.zip\.net',url)
    if match:
        return 1
    else:
        return 0

In [None]:
legiturldata['shortening_service'] = legitdataset['URLs'].apply(shortening_service)

In [None]:
#4. URL’s having “@” Symbol
def have_at_symbol(l):
    if "@" in l:
        return 1
    return 0   

In [None]:
legiturldata['having_@_symbol'] = legitdataset['URLs'].apply(have_at_symbol)

In [None]:
#5. Redirecting using “//”
def redirection(l):
    if "//" in l:
        return 1
    return 0

In [None]:
legiturldata['redirection_//_symbol'] = seperation_of_protocol[1].apply(redirection)

In [None]:
#6. Adding Prefix or Suffix Separated by (-) to the Domain
def prefix_suffix_seperation(l):
    if '-' in l:
        return 1
    return 0

In [None]:
legiturldata['prefix_suffix_seperation'] = seperation_domain_name['domain_name'].apply(prefix_suffix_seperation)

In [None]:
#7. Sub Domain and Multi Sub Domains
def sub_domains(l):
    if l.count('.') < 3:
        return 0
    elif l.count('.') == 3:
        return 2
    return 1

In [None]:
legiturldata['sub_domains'] = legiturldata['domain_name'].apply(sub_domains)

In [None]:
#8. The Existence of “HTTPS” Token in the Domain Part of the URL
def https_token(url):
    match=re.search('https://|http://',url)
    if match.start(0)==0:
        url=url[match.end(0):]
    match=re.search('http|https',url)
    if match:
        return 1
    else:
        return 0

In [None]:
legiturldata['https_token'] = legitdataset['URLs'].apply(https_token)

### **2. Domain based Features**


*   Age of Domain
*   DNS Record
*   Website Traffic
*   Domain Registration Length
*   Statical-Report Based Feature

In [None]:
!pip install python-whois

Collecting python-whois
  Downloading python-whois-0.7.3.tar.gz (91 kB)
[?25l[K     |███▋                            | 10 kB 24.9 MB/s eta 0:00:01[K     |███████▏                        | 20 kB 31.2 MB/s eta 0:00:01[K     |██████████▊                     | 30 kB 35.3 MB/s eta 0:00:01[K     |██████████████▎                 | 40 kB 34.7 MB/s eta 0:00:01[K     |██████████████████              | 51 kB 33.8 MB/s eta 0:00:01[K     |█████████████████████▌          | 61 kB 26.3 MB/s eta 0:00:01[K     |█████████████████████████       | 71 kB 22.2 MB/s eta 0:00:01[K     |████████████████████████████▋   | 81 kB 24.0 MB/s eta 0:00:01[K     |████████████████████████████████| 91 kB 8.9 MB/s 
Building wheels for collected packages: python-whois
  Building wheel for python-whois (setup.py) ... [?25l[?25hdone
  Created wheel for python-whois: filename=python_whois-0.7.3-py3-none-any.whl size=87721 sha256=c36657534022357fcf0b5f57c58f5f78ca02e6e91ed829f29275b477565bde85
  Stored in d

In [None]:
import whois
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import quote
from datetime import datetime
import time
import socket
import re

In [None]:
#9. Age of Domain
def age_of_domain_sub(domain_name):
  creation_date = domain_name.creation_date
  expiration_date = domain_name.expiration_date
  if (isinstance(creation_date,str) or isinstance(expiration_date,str)):
    try:
      creation_date = datetime.strptime(creation_date,'%Y-%m-%d')
      expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
    except:
      return 1
  if ((expiration_date is None) or (creation_date is None)):
      return 1
  elif ((type(expiration_date) is list) or (type(creation_date) is list)):
      return 2
  else:
    ageofdomain = abs((expiration_date - creation_date).days)
    if ((ageofdomain/30) < 6):
      age = 1
    else:
      age = 0
  return age

In [None]:
def age_of_domain_main(domain):
    dns = 0
    try:
        domain_name = whois.whois(domain)
    except:
        dns = 1
        
    if dns == 1:
        return 1
    else:
        return age_of_domain_sub(domain_name)

In [None]:
legiturldata['age_of_domain'] = legitdataset['URLs'].apply(age_of_domain_main)

Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket


In [None]:
#10.DNS Record
def dns_record(domain):
    dns = 0
    try:
        domain_name = whois.whois(domain)
        print(domain_name)
    except:
        dns = 1
        
    if dns == 1:
        return 1
    else:
        return dns

In [None]:
legiturldata['dns_record'] = legiturldata['domain_name'].apply(dns_record)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    "clientTransferProhibited (https://www.icann.org/epp#clientTransferProhibited)",
    "clientDeleteProhibited (https://www.icann.org/epp#clientDeleteProhibited)",
    "serverUpdateProhibited (https://www.icann.org/epp#serverUpdateProhibited)",
    "serverTransferProhibited (https://www.icann.org/epp#serverTransferProhibited)",
    "serverDeleteProhibited (https://www.icann.org/epp#serverDeleteProhibited)"
  ],
  "emails": [
    "abusecomplaints@markmonitor.com",
    "whoisrequest@markmonitor.com"
  ],
  "dnssec": "signedDelegation",
  "name": null,
  "org": "Envato Pty Ltd",
  "address": null,
  "city": null,
  "state": "Victoria",
  "zipcode": null,
  "country": "AU"
}
Error trying to connect to socket: closing socket
{
  "domain_name": null,
  "registrar": null,
  "whois_server": null,
  "referral_url": null,
  "updated_date": null,
  "creation_date": null,
  "expiration_date": null,
  "name_servers": null,
  "status

In [None]:
# 11. Web traffic 
def web_traffic(url):
  try:
    url = urllib.parse.quote(url)
    rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find(
        "REACH")['RANK']
    rank = int(rank)
  except TypeError:
        return 1
  if rank <100000:
    return 1
  else:
    return 2

In [None]:
legiturldata['web_traffic'] = legitdataset['URLs'].apply(web_traffic)

In [None]:
#12. Domain Registration Length
def domain_registration_length_sub(domain):
    expiration_date = domain.expiration_date
    today = time.strftime('%Y-%m-%d')
    today = datetime.strptime(today, '%Y-%m-%d')
    if expiration_date is None:
        return 1
    elif type(expiration_date) is list or type(today) is list :
        return 2             #If it is a type of list then we can't select a single value from list. So,it is regarded as suspected website  
    else:
        registration_length = abs((expiration_date - today).days)
        if registration_length / 365 <= 1:
            return 1
        else:
            return 0

In [None]:
def domain_registration_length_main(domain):
    dns = 0
    try:
        domain_name = whois.whois(domain)
    except:
        dns = 1
        
    if dns == 1:
        return 1
    else:
        return domain_registration_length_sub(domain_name)

In [None]:
legiturldata['domain_registration_length'] = legitdataset['URLs'].apply(domain_registration_length_main)

Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket


In [None]:
#13.Statical-Report Based Feature
def statistical_report(url):
    hostname = url
    h = [(x.start(0), x.end(0)) for x in re.finditer('https://|http://|www.|https://www.|http://www.', hostname)]
    z = int(len(h))
    if z != 0:
        y = h[0][1]
        hostname = hostname[y:]
        h = [(x.start(0), x.end(0)) for x in re.finditer('/', hostname)]
        z = int(len(h))
        if z != 0:
            hostname = hostname[:h[0][0]]
    url_match=re.search('at\.ua|usa\.cc|baltazarpresentes\.com\.br|pe\.hu|esy\.es|hol\.es|sweddy\.com|myjino\.ru|96\.lt|ow\.ly',url)
    try:
        ip_address = socket.gethostbyname(hostname)
        ip_match=re.search('146\.112\.61\.108|213\.174\.157\.151|121\.50\.168\.88|192\.185\.217\.116|78\.46\.211\.158|181\.174\.165\.13|46\.242\.145\.103|121\.50\.168\.40|83\.125\.22\.219|46\.242\.145\.98|107\.151\.148\.44|107\.151\.148\.107|64\.70\.19\.203|199\.184\.144\.27|107\.151\.148\.108|107\.151\.148\.109|119\.28\.52\.61|54\.83\.43\.69|52\.69\.166\.231|216\.58\.192\.225|118\.184\.25\.86|67\.208\.74\.71|23\.253\.126\.58|104\.239\.157\.210|175\.126\.123\.219|141\.8\.224\.221|10\.10\.10\.10|43\.229\.108\.32|103\.232\.215\.140|69\.172\.201\.153|216\.218\.185\.162|54\.225\.104\.146|103\.243\.24\.98|199\.59\.243\.120|31\.170\.160\.61|213\.19\.128\.77|62\.113\.226\.131|208\.100\.26\.234|195\.16\.127\.102|195\.16\.127\.157|34\.196\.13\.28|103\.224\.212\.222|172\.217\.4\.225|54\.72\.9\.51|192\.64\.147\.141|198\.200\.56\.183|23\.253\.164\.103|52\.48\.191\.26|52\.214\.197\.72|87\.98\.255\.18|209\.99\.17\.27|216\.38\.62\.18|104\.130\.124\.96|47\.89\.58\.141|78\.46\.211\.158|54\.86\.225\.156|54\.82\.156\.19|37\.157\.192\.102|204\.11\.56\.48|110\.34\.231\.42',ip_address)  
    except:
        return 1

    if url_match:
        return 1
    else:
        return 0

In [None]:
legiturldata['statistical_report'] = legiturldata['domain_name'].apply(statistical_report)

### **3.HTML and JavaScript based Features**


*   IFrame Redirection
*   Status Bar Customization


In [None]:
import requests
import re

In [None]:
#14.iFrame Redirection
def iframe_sub(response):
  if response == "":
      return 1
  else:
      if re.findall(r"[<iframe>|<frameBorder>]", response.text):
          return 0
      else:
          return 1

In [None]:
def iframe_main(url):
  try:
    response = requests.get(url)
  except:
    response = ''
  
  return iframe_sub(response)

In [None]:
legiturldata['iframe'] = legitdataset['URLs'].apply(iframe_main)

In [None]:
#15. Status Bar Customization 
def mouse_over_sub(response): 
  if response == "" :
    return 1
  else:
    if re.findall("<script>.+onmouseover.+</script>", response.text):
      return 1
    else:
      return 0

In [None]:
def mouse_over_main(url):
  try:
    response = requests.get(url)
  except:
    response = ''
  
  return mouse_over_sub(response)

In [None]:
legiturldata['mouse_over'] = legitdataset['URLs'].apply(mouse_over_main)

In [None]:
legiturldata['label'] = 1

In [None]:
legiturldata.shape

(1000, 19)

In [None]:
legiturldata.head()

Unnamed: 0,protocol,domain_name,address,having_ip_address,long_url,shortening_service,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation,sub_domains,https_token,age_of_domain,dns_record,web_traffic,domain_registration_length,statistical_report,iframe,mouse_over,label
0,http,graphicriver.net,search?date=this-month&length_max=&length_min=...,0,1,0,0,0,0,0,0,2,0,1,2,0,0,0,1
1,http,ecnavi.jp,redirect/?url=http,0,1,0,0,0,0,0,1,2,0,1,1,0,0,0,1
2,https,hubpages.com,signin?explain=follow+Hubs&url=%2Fhub%2FComfor...,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1
3,http,extratorrent.cc,torrent/4190536/AOMEI+Backupper+Technician+%2B...,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,1
4,http,icicibank.com,Personal-Banking/offers/offer-detail.page?id=o...,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1


## Storing the extracted legitimate URLs fatures to csv file

In [None]:
legiturldata.to_csv('extracted_legitmate_dataset.csv',index=False)