In [3]:
#importing required packages for this module
import pandas as pd
from google.colab import files

In [2]:
# Upload 2-beniganurl.csv into the Colab workspace.
# files.upload() comes from google.colab, so this cell only works inside Colab.
files.upload()

Saving 2-beniganurl.csv to 2-beniganurl.csv




# **2.2. Legitimate URLs:**
From the uploaded 2-beniganurl.csv file, the URLs are loaded into a dataframe.

In [4]:
# Load the benign (legitimate) URLs from the uploaded CSV into a dataframe
legiurl = pd.read_csv("2-beniganurl.csv")
# Preview the first rows to confirm the 'URLs' column was read correctly
legiurl.head() 

Unnamed: 0,URLs
0,http://graphicriver.net/search?date=this-month...
1,http://ecnavi.jp/redirect/?url=http://www.cros...
2,https://hubpages.com/signin?explain=follow+Hub...
3,http://extratorrent.cc/torrent/4190536/AOMEI+B...
4,http://icicibank.com/Personal-Banking/offers/o...


# **3.Feature Extraction**
In this step, features are extracted from the URLs dataset.
The extracted features are categorized into


1.   Address Bar based Features
2.   Domain based Features
3.   HTML & Javascript Features

## **3.1.Address Bar Based Features:**
Many features can be extracted that can be considered address bar based features. Out of them, the ones mentioned below were considered for this project.


*   Domain of URL 
*   IP Address in URL
*   "@"Symbol in URL
* Length of URL
* Depth of URL
* Redirection "//" in URL
* "http/https" in Domain name
* Using URL Shortening Services "TinyURL"
* Prefix or Suffix "-" in Domain

Each of these features is explained and coded below:








In [5]:
#importing required packages for this section
from urllib.parse import urlparse,urlencode
import ipaddress
import re

# **3.1.1.Domain of the URL**

In [6]:
# 1.Domain of the URL(Domain) 
def getDomain(url):
  """Return the network location (host) of `url` without a leading "www.".

  Args:
    url: URL string, expected to carry a scheme (e.g. "http://...").

  Returns:
    The netloc reported by `urlparse`, with one leading "www." removed.
    URLs without a scheme yield an empty string, because `urlparse` then
    reports no netloc.
  """
  domain = urlparse(url).netloc
  # Strip only a literal "www." prefix. The previous regex/replace pair
  # treated '.' as a wildcard and removed every "www." occurrence, which
  # corrupted hosts such as "www.mywww.org".
  if domain.startswith("www."):
    domain = domain[len("www."):]
  return domain

# **3.1.2.IP Address in the URL**

In [7]:
# 2.Checks for IP address in URL (Have_IP)
def havingIP(url):
  """Report whether the host part of `url` is a literal IP address.

  The previous version passed the entire URL to `ipaddress.ip_address`,
  which rejects anything containing a scheme or path, so the feature was 0
  for every real URL. The host is now extracted first.

  Args:
    url: URL string, or a bare host / IP address.

  Returns:
    1 if the host is a valid IPv4 or IPv6 address, otherwise 0.
  """
  try:
    # Fall back to parsing as "//host/..." when there is no scheme, and to
    # the raw text when no host can be isolated (e.g. a bare IPv6 literal).
    host = urlparse(url).hostname or urlparse("//" + url).hostname or url
    ipaddress.ip_address(host)
  except (ValueError, TypeError, AttributeError):
    # Not an IP address, or not parseable as a URL at all
    return 0
  return 1

# **3.1.3. "@" Symbol in URL**

In [8]:
# 3.Checks the presence of @ in URL (Have_At)
def haveAtSign(url):
  """Return 1 when the URL contains an '@' character anywhere, else 0."""
  return 1 if "@" in url else 0

# **3.1.4. Length of URL**

In [9]:
# 4.Finding the length of URL and categorizing (URL_Length)
def getLength(url):
  """Categorize the URL by length: 0 for fewer than 54 characters, else 1."""
  is_long = len(url) >= 54
  return int(is_long)

# **3.1.5. Depth of URL**

In [10]:
# 5.Counts the non-empty '/'-separated segments of the URL path (URL_Depth)
def getDepth(url):
  """Return the number of non-empty segments in the path component of `url`."""
  segments = urlparse(url).path.split('/')
  # Empty strings come from leading, trailing or doubled slashes; skip them
  return sum(1 for segment in segments if len(segment) != 0)

# **3.1.6. Redirection "//" in URL**

In [11]:
# 6.Checking for redirection '//' in the url (Redirection)
def redirection(url):
  """Return 1 if the last '//' in the URL starts beyond index 7, else 0.

  The '//' that follows "http:" or "https:" sits at index 5 or 6, so a later
  occurrence indicates a second '//' further along the URL.
  """
  last_double_slash = url.rfind('//')
  return 1 if last_double_slash > 7 else 0

# **3.1.7. "http/https" in Domain name**

In [12]:
# 7.Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
def httpDomain(url):
  """Return 1 if the substring 'https' occurs in the URL's netloc, else 0."""
  host = urlparse(url).netloc
  return int('https' in host)

# **3.1.8. Using URL Shortening Services “TinyURL”**

In [13]:
#listing shortening services
# Regex alternation of known URL-shortening service hosts, consumed by tinyURL().
# Dots are escaped so they match literally. Several entries are repeated
# (e.g. bit.ly, goo.gl, t.co), which is harmless in an alternation.
# "tinyurl" appears without a TLD, so it matches that token on its own.
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

In [14]:
# 8. Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    """Report whether the URL's host is a known URL-shortening service.

    The previous version searched the whole URL with an unanchored pattern,
    so short entries such as "t.co" or "x.co" matched inside ordinary hosts
    ("microsoft.com" contains "t.co") and inside paths or query strings.
    The pattern is now applied to the host only and must cover it from a
    label boundary (start of host or a '.') through to the end of the host.

    Args:
        url: URL string; a scheme is optional.

    Returns:
        1 if the host matches an entry in `shortening_services`, else 0.
    """
    try:
        # Retry as "//host/..." so scheme-less input still yields a host
        host = urlparse(url).hostname or urlparse("//" + url).hostname or ""
    except ValueError:
        # urlparse rejects malformed authorities (e.g. unbalanced IPv6 brackets)
        return 0
    anchored = r"(?:^|\.)(?:" + shortening_services + r")$"
    # Host names are case-insensitive, and the list contains mixed-case entries
    if re.search(anchored, host, re.IGNORECASE):
        return 1
    return 0

# **3.1.9. Prefix or Suffix "-" in Domain**

In [15]:
# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefixSuffix(url):
    """Return 1 (phishing) if the URL's netloc contains a '-', else 0 (legitimate)."""
    host = urlparse(url).netloc
    has_dash = '-' in host
    return 1 if has_dash else 0

## **3.2. Domain Based Features:**
Many features can be extracted that come under this category. Out of them, below mentioned were considered for this project.


* DNS Record
* Website Traffic
* Age of Domain
* End Period of Domain

In [16]:
!pip install python-whois



In [17]:
# importing required packages for this section
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime

**3.2.1.DNS Record**

In [18]:
# 11.DNS Record availability (DNS_Record)
# obtained in the featureExtraction function itself

**3.2.2. Web Traffic**

In [19]:
# 12.Web traffic (Web_Traffic)
def web_traffic(url):
  """Classify a URL by its Alexa traffic rank.

  Fixes relative to the previous version:
    * Any lookup failure (network error, HTTP error, timeout, missing or
      malformed rank) now yields 1. Previously only TypeError was caught, so
      a single unreachable lookup aborted the whole extraction run.
    * The request has a timeout and the response is closed.
    * A rank below 100000 (a popular site) now maps to 0. It previously
      returned 1, the same value as the failure path, which left popular
      and unknown sites indistinguishable.

  NOTE(review): the data.alexa.com endpoint appears to have been retired, in
  which case every call takes the failure path and returns 1. Confirm, and
  replace the data source if so.

  Args:
    url: URL string to look up.

  Returns:
    0 if the reported rank is below 100000, otherwise 1 (including on any
    failure to obtain a rank).
  """
  try:
    # Percent-encode the URL (whitespace included) so it is safe as a query value
    url = urllib.parse.quote(url)
    with urllib.request.urlopen(
        "http://data.alexa.com/data?cli=10&dat=s&url=" + url, timeout=10) as reply:
      payload = reply.read()
    # find() returns None when there is no REACH element; the subscript then
    # raises TypeError, which is handled below like any other failure.
    rank = int(BeautifulSoup(payload, "xml").find("REACH")['RANK'])
  except Exception:
    # Best-effort feature: no rank available is treated as suspicious
    return 1
  return 0 if rank < 100000 else 1

**3.2.3. Age of Domain**

In [20]:
# 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)  
def domainAge(domain_name):
  """Classify a domain by the span between its creation and expiration dates.

  Fix relative to the previous version: when only one of the two dates was a
  string, `strptime` was also called on the non-string one; the resulting
  TypeError was swallowed by a bare `except`, so valid records were reported
  as 1. Each date is now parsed independently.

  Args:
    domain_name: WHOIS result object exposing `creation_date` and
      `expiration_date` (datetime, 'YYYY-MM-DD' string, list, or None).

  Returns:
    1 if either date is missing, unparseable, a list, or the span is shorter
    than six 30-day months; otherwise 0.
  """
  creation_date = domain_name.creation_date
  expiration_date = domain_name.expiration_date
  try:
    if isinstance(creation_date, str):
      creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
    if isinstance(expiration_date, str):
      expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
  except ValueError:
    # String present but not in YYYY-MM-DD form
    return 1
  if expiration_date is None or creation_date is None:
    return 1
  # Multi-valued WHOIS fields are treated as unusable, as before
  if isinstance(expiration_date, list) or isinstance(creation_date, list):
    return 1
  try:
    ageofdomain = abs((expiration_date - creation_date).days)
  except TypeError:
    # e.g. mixing timezone-aware and naive datetimes
    return 1
  return 1 if (ageofdomain / 30) < 6 else 0

**3.2.4. End Period of Domain**

In [21]:
# 14.End time of domain: The difference between termination time and current time (Domain_End) 
def domainEnd(domain_name):
  """Classify a domain by the time remaining until its registration expires.

  Fixes relative to the previous version:
    * The result was inverted with respect to the function's own failure
      paths. Missing or unparseable data returned 1, but so did a domain with
      more than six months left, while one about to expire returned 0. A
      short remaining period now maps to 1 and a long one to 0.
    * `abs()` made a long-expired domain look like one with a long remaining
      period; the signed difference is used instead.
    * `datetime.now()` is taken in the expiration date's timezone, so
      timezone-aware WHOIS dates no longer raise TypeError on subtraction.
    * The bare `except` is narrowed to the parse error it was guarding.

  Args:
    domain_name: WHOIS result object exposing `expiration_date`
      (datetime, 'YYYY-MM-DD' string, list, or None).

  Returns:
    1 if the expiration date is missing, unparseable, a list, already past,
    or fewer than six 30-day months away; otherwise 0.
  """
  expiration_date = domain_name.expiration_date
  if isinstance(expiration_date, str):
    try:
      expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
    except ValueError:
      return 1
  # Multi-valued WHOIS fields are treated as unusable, as before
  if expiration_date is None or isinstance(expiration_date, list):
    return 1
  today = datetime.now(getattr(expiration_date, "tzinfo", None))
  remaining_days = (expiration_date - today).days
  return 1 if (remaining_days / 30) < 6 else 0

# **3.3. HTML and JavaScript based Features**
Many features can be extracted that come under this category. Out of them, below mentioned were considered for this project.



* IFrame Redirection
* Status Bar Customization
* Disabling Right Click
* Website Forwarding

In [22]:
# importing required packages for this section
import requests

### **3.3.1. IFrame Redirection**

In [23]:
# 15. IFrame Redirection (iFrame)
def iframe(response):
  """Report whether the fetched page uses an iframe.

  Fixes relative to the previous version:
    * The pattern "[<iframe>|<frameBorder>]" was a character class, so it
      matched any single one of those letters or symbols and was true for
      practically every HTML page. It is replaced by a pattern that matches
      an opening <iframe tag or a frameBorder attribute.
    * The result was inverted with respect to the failure path: an empty
      response returned 1, but so did a page without a match, while a page
      with a match returned 0. A match now maps to 1.

  Args:
    response: object with a `.text` attribute holding the page source, or
      the empty string when the page could not be fetched.

  Returns:
    1 if the page could not be fetched or contains an iframe / frameBorder,
    otherwise 0.
  """
  if response == "":
    return 1
  if re.search(r"<\s*iframe\b|frameborder", response.text, re.IGNORECASE):
    return 1
  return 0

### **3.3.2. Status Bar Customization**

In [24]:
# 16.Checks the effect of mouse over on status bar (Mouse_Over)
def mouseOver(response): 
  """Return 1 if the page could not be fetched or a single-line <script>
  block mentions onmouseover; otherwise return 0."""
  if response == "":
    return 1
  # '.' does not cross newlines here, so only scripts on one line can match
  hit = re.search("<script>.+onmouseover.+</script>", response.text)
  return 1 if hit else 0

### **3.3.3. Disabling Right Click**

In [25]:
# 17.Checks the status of the right click attribute (Right_Click)
def rightClick(response):
  """Report whether the page script inspects the right mouse button.

  Fixes relative to the previous version:
    * The result was inverted with respect to the failure path: an empty
      response returned 1, but so did a page with no right-click handling,
      while a page testing `event.button == 2` returned 0. A match now maps
      to 1.
    * The '.' in the pattern is escaped so it matches a literal dot, and
      strict equality (`===`) is recognized as well.

  Args:
    response: object with a `.text` attribute holding the page source, or
      the empty string when the page could not be fetched.

  Returns:
    1 if the page could not be fetched or tests `event.button` against 2,
    otherwise 0.
  """
  if response == "":
    return 1
  if re.search(r"event\.button\s*={2,3}\s*2", response.text):
    return 1
  return 0

### **3.3.4. Website Forwarding**

In [26]:
# 18.Checks the number of forwardings (Web_Forwards)    
def forwarding(response):
  """Return 1 if the page could not be fetched or the response history holds
  more than two entries; otherwise return 0."""
  if response == "":
    return 1
  hop_count = len(response.history)
  return 1 if hop_count > 2 else 0

## **4. Computing URL Features**

In [27]:
#Function to extract features
def featureExtraction(url,label,timeout=10):
  """Compute the full feature row for one URL.

  Fixes relative to the previous version:
    * `requests.get` had no timeout, so a single unresponsive host could
      stall the batch indefinitely; `timeout` (seconds) is now passed.
    * Bare `except:` clauses also swallowed KeyboardInterrupt, so a
      long-running extraction loop could not be interrupted cleanly. They
      are narrowed to `Exception`.
    * The address-bar feature count in the comment is corrected to 9.

  Args:
    url: URL string to analyse.
    label: class label appended as the last element of the row.
    timeout: seconds to wait for the page fetch before giving up.

  Returns:
    A list in this order: domain, the 8 remaining address-bar features, the
    4 domain-based features (DNS flag, web traffic, domain age, domain end),
    the 4 HTML/JavaScript features, then `label`.
  """
  features = []
  #Address bar based features (9)
  features.append(getDomain(url))
  features.append(havingIP(url))
  features.append(haveAtSign(url))
  features.append(getLength(url))
  features.append(getDepth(url))
  features.append(redirection(url))
  features.append(httpDomain(url))
  features.append(tinyURL(url))
  features.append(prefixSuffix(url))
  
  #Domain based features (4)
  dns = 0
  try:
    domain_name = whois.whois(urlparse(url).netloc)
  except Exception:
    # WHOIS lookup failed: flag the DNS feature and skip the date features
    dns = 1

  features.append(dns)
  features.append(web_traffic(url))
  # domain_name is only bound when the lookup succeeded, hence the dns guard
  features.append(1 if dns == 1 else domainAge(domain_name))
  features.append(1 if dns == 1 else domainEnd(domain_name))
  
  # HTML & Javascript based features (4)
  try:
    response = requests.get(url, timeout=timeout)
  except Exception:
    # The HTML feature helpers treat "" as "page could not be fetched"
    response = ""
  features.append(iframe(response))
  features.append(mouseOver(response))
  features.append(rightClick(response))
  features.append(forwarding(response))
  features.append(label)
  
  return features

### **4.1. Reviewing Legitimate URLs:**

In [28]:
# (rows, columns) of the legitimate-URL dataframe; the two batches below assume 5000 rows
legiurl.shape

(5000, 1)

We will process the URLs in two batches of 2500, store each batch in a separate file, and merge the files afterwards.

In [29]:
# Column names for the feature dataframe. The order must match the order in
# which featureExtraction() appends values to its result list.
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 
                      'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']

# Class label passed to featureExtraction() for every URL in this (legitimate) set
label = 0

0 - 2500 legitimate URLs

In [None]:
# Extract the features for the first batch (rows 0-2499) and collect them in a list
legi_features = []

for i in range(0, 2500):
  # Label-based lookup; relies on the default 0..n-1 index produced by read_csv
  url = legiurl['URLs'][i]
  # Progress indicator: prints one line per URL processed
  print(i)
  legi_features.append(featureExtraction(url,label))
  

0
1
2
3
4
5
6
Error trying to connect to socket: closing socket
7
Error trying to connect to socket: closing socket
8
9
10
11
Error trying to connect to socket: closing socket
12
13
14
15
16
17
18
19
20
21
22
23
24
25
Error trying to connect to socket: closing socket
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
Error trying to connect to socket: closing socket
50
51
Error trying to connect to socket: closing socket
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
Error trying to connect to socket: closing socket
67
68
69
70
71
72
Error trying to connect to socket: closing socket
73
74
75
76
77
78
Error trying to connect to socket: closing socket
79
80
81
82
83
84
85
86
87
88
Error trying to connect to socket: closing socket
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
15

In [None]:
# Convert the list of feature rows into a dataframe with named columns
legitimate = pd.DataFrame(legi_features, columns= feature_names)
# Preview the first rows as a sanity check
legitimate.head()

In [None]:
# Store the extracted features of the first batch of legitimate URLs in a CSV file
legitimate.to_csv('5-legitimate.csv', index= False)
# Download the CSV file from the Colab workspace to the local machine
files.download('5-legitimate.csv')

2500 - 5000 legitimate URLs

In [None]:
# Extract the features for the second batch (rows 2500-4999).
# legi_features is reset here, so the first batch must already have been saved.
legi_features = []

for i in range(2500, 5000):
  # Label-based lookup; relies on the default 0..n-1 index produced by read_csv
  url = legiurl['URLs'][i]
  # Progress indicator: prints one line per URL processed
  print(i)
  legi_features.append(featureExtraction(url,label))

In [None]:
# Convert the second batch of feature rows into a dataframe (rebinds `legitimate`)
legitimate = pd.DataFrame(legi_features, columns= feature_names)
# Preview the first rows as a sanity check
legitimate.head()

In [None]:
# Store the extracted features of the second batch of legitimate URLs in a CSV file
legitimate.to_csv('6-legitimate.csv', index= False)
# Download the CSV file from the Colab workspace to the local machine
files.download('6-legitimate.csv')