In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **3.1. Address Bar Based Features:**

Many features can be extracted that can be consided as address bar base features. Out of them, below mentioned were considered for this project.


*   Domain of URL
*   IP Address in URL
*   "@" Symbol in URL
*   Length of URL
*   Depth of URL
*   Redirection "//" in URL
*   "http/https" in Domain name
*   Using URL Shortening Services “TinyURL”
*   Prefix or Suffix "-" in Domain

Each of these features are explained and the coded below:

In [3]:
from urllib.parse import urlparse,urlencode
import ipaddress
import re

In [4]:
# 1.Domain of the URL (Domain) 
def getDomain(url):  
  domain = urlparse(url).netloc
  if re.match(r"^www.",domain):
	       domain = domain.replace("www.","")
  return domain

In [5]:
# 2.Checks for IP address in URL (Have_IP)
def havingIP(url):
  try:
    ipaddress.ip_address(url)
    ip = 1
  except:
    ip = 0
  return ip


In [6]:
# 3.Checks the presence of @ in URL (Have_At)
def haveAtSign(url):
  if "@" in url:
    at = 1    
  else:
    at = 0    
  return at

In [7]:
# 4.Finding the length of URL and categorizing (URL_Length)
def getLength(url):
  if len(url) < 54:
    length = 0            
  else:
    length = 1            
  return length

In [8]:
# 5.Gives number of '/' in URL (URL_Depth)
def getDepth(url):
  s = urlparse(url).path.split('/')
  depth = 0
  for j in range(len(s)):
    if len(s[j]) != 0:
      depth = depth+1
  return depth

In [9]:
# 6.Checking for redirection '//' in the url (Redirection)
def redirection(url):
  pos = url.rfind('//')
  if pos > 6:
    if pos > 7:
      return 1
    else:
      return 0
  else:
    return 0

In [43]:
# 7.Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
def httpDomain(url):
  if 'https' in url.lower():
    return 1
  elif 'http' in url.lower():
    return 0
  else:
    return 0

In [11]:
#listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

In [12]:


# 8. Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    match=re.search(shortening_services,url)
    if match:
        return 1
    else:
        return 0

In [13]:

# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefixSuffix(url):
    if '-' in urlparse(url).netloc:
        return 1            # phishing
    else:
        return 0            # legitimate

In [14]:
def has_client_in_string(url):
  if 'client' in url.lower():
    return 1
  else:
    return 0

In [15]:
def has_admin_in_string(url):
  if 'admin' in url.lower():
    return 1
  else:
    return 0

In [16]:
def has_server_in_string(url):
  if 'server' in url.lower():
    return 1
  else:
    return 0

In [17]:
def has_login_in_string(url):
  if 'login' in url.lower():
    return 1
  else:
    return 0

In [18]:
def have_php(url):
  if '.php' in url.lower():
    return 1
  else:
    return 0

In [19]:
def have_html(url):
  if '.html' in url.lower():
    return 1
  else:
    if '.htm' in url.lower():
      return 1
    else:
      return 0

In [20]:
def have_info(url):
  if '.info' in url.lower():
    return 1
  else:
    return 0

In [21]:
def have_text(url):
  if '.txt' in url.lower():
    return 1
  else:
    return 0

In [22]:
def have_js(url):
  if '.js' in url.lower():
    return 1
  else:
    return 0

In [23]:
def have_exe(url):
  if '.exe' in url.lower():
    return 1
  else:
    if 'install' in url.lower():
      return 1
    else:
      return 0

In [24]:
def number_of_periods(url):
  periods = [i for i in url if i == '.']
  return len(periods)

In [25]:
def is_encoded(url):
  if '%' in url.lower():
    return 1
  else:
    return 0

In [26]:
def num_encoded_char(url):
  encs = [i for i in url if i == '%']
  return len(encs)

In [27]:
def number_of_parameters(url):
  params = urlparse(url).query
  if params == '':
    return 0 
  else:
    return len(params.split('&'))

In [28]:
def number_of_digits(url):
  digits = [i for i in url if i.isdigit()]
  return len(digits)

In [29]:
# Number of Special characters

def number_spec_char(url):
  count = 0
  sp1 = [i for i in url if i == '@']
  count = count + len(sp1)
  sp2 = [i for i in url if i == '#']
  count = count + len(sp2)
  sp3 = [i for i in url if i == '$']
  count = count + len(sp3)
  sp4 = [i for i in url if i == '%']
  count = count + len(sp4)
  sp5 = [i for i in url if i == '^']
  count = count + len(sp5)
  sp6 = [i for i in url if i == '&']
  count = count + len(sp6)
  sp7 = [i for i in url if i == '*']
  count = count + len(sp7)
  sp8 = [i for i in url if i == '_']
  count = count + len(sp8)
  sp9 = [i for i in url if i == '-']
  count = count + len(sp9)
  sp10 = [i for i in url if i == '=']
  count = count + len(sp10)
  sp11 = [i for i in url if i == '+']
  count = count + len(sp11)
  sp12 = [i for i in url if i == '!']
  count = count + len(sp12)
  return count

### **3.2. Domain Based Features:**

Many features can be extracted that come under this category. Out of them, below mentioned were considered for this project.

*   DNS Record
*   Website Traffic 
*   Age of Domain
*   End Period of Domain

Each of these features are explained and the coded below:

In [None]:
!pip install python-whois

Collecting python-whois
  Downloading python-whois-0.7.3.tar.gz (91 kB)
[?25l[K     |███▋                            | 10 kB 16.5 MB/s eta 0:00:01[K     |███████▏                        | 20 kB 12.0 MB/s eta 0:00:01[K     |██████████▊                     | 30 kB 8.9 MB/s eta 0:00:01[K     |██████████████▎                 | 40 kB 8.1 MB/s eta 0:00:01[K     |██████████████████              | 51 kB 4.3 MB/s eta 0:00:01[K     |█████████████████████▌          | 61 kB 5.0 MB/s eta 0:00:01[K     |█████████████████████████       | 71 kB 5.7 MB/s eta 0:00:01[K     |████████████████████████████▋   | 81 kB 6.1 MB/s eta 0:00:01[K     |████████████████████████████████| 91 kB 4.3 MB/s 
Building wheels for collected packages: python-whois
  Building wheel for python-whois (setup.py) ... [?25l[?25hdone
  Created wheel for python-whois: filename=python_whois-0.7.3-py3-none-any.whl size=87720 sha256=5a0f6183f3ab0020dd2d9dbe433ea0074f8d45d5b53adb1e24c5d65dae64e43b
  Stored in directo

In [None]:
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime

In [None]:
# 12.Web traffic (Web_Traffic)
def web_traffic(url):
  try:
    #Filling the whitespaces in the URL if any
    url = urllib.parse.quote(url)
    rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find(
        "REACH")['RANK']
    rank = int(rank)
  except TypeError:
        return 1
  if rank <100000:
    return 1
  else:
    return 0

In [None]:
# 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)  
def domainAge(domain_name):
  creation_date = domain_name.creation_date
  expiration_date = domain_name.expiration_date
  if (isinstance(creation_date,str) or isinstance(expiration_date,str)):
    try:
      creation_date = datetime.strptime(creation_date,'%Y-%m-%d')
      expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
    except:
      return 1
  if ((expiration_date is None) or (creation_date is None)):
      return 1
  elif ((type(expiration_date) is list) or (type(creation_date) is list)):
      return 1
  else:
    ageofdomain = abs((expiration_date - creation_date).days)
    if ((ageofdomain/30) < 6):
      age = 1
    else:
      age = 0
  return age

In [None]:
# 14.End time of domain: The difference between termination time and current time (Domain_End) 
def domainEnd(domain_name):
  expiration_date = domain_name.expiration_date
  if isinstance(expiration_date,str):
    try:
      expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
    except:
      return 1
  if (expiration_date is None):
      return 1
  elif (type(expiration_date) is list):
      return 1
  else:
    today = datetime.now()
    end = abs((expiration_date - today).days)
    if ((end/30) < 6):
      end = 0
    else:
      end = 1
  return end

## **3.3. HTML and JavaScript based Features**

Many features can be extracted that come under this category. Out of them, below mentioned were considered for this project.

*   IFrame Redirection
*   Status Bar Customization
*   Disabling Right Click
*   Website Forwarding

Each of these features are explained and the coded below:

In [None]:
import requests

In [None]:
# 15. IFrame Redirection (iFrame)
def iframe(response):
  if response == "":
      return 1
  else:
      if re.findall(r"[<iframe>|<frameBorder>]", response.text):
          return 0
      else:
          return 1

In [None]:
# 16.Checks the effect of mouse over on status bar (Mouse_Over)
def mouseOver(response): 
  if response == "" :
    return 1
  else:
    if re.findall("<script>.+onmouseover.+</script>", response.text):
      return 1
    else:
      return 0

In [None]:
# 17.Checks the status of the right click attribute (Right_Click)
def rightClick(response):
  if response == "":
    return 1
  else:
    if re.findall(r"event.button ?== ?2", response.text):
      return 0
    else:
      return 1

In [None]:
# 18.Checks the number of forwardings (Web_Forwards)    
def forwarding(response):
  if response == "":
    return 1
  else:
    if len(response.history) <= 2:
      return 0
    else:
      return 1

## **4. Computing URL Features**

Create a list and a function that calls the other functions and stores all the features of the URL in the list. We will extract the features of each URL and append to this list.

In [44]:
#Function to extract features
def featureExtraction(url,label):

  features = []
  #Address bar based features (10)
  features.append(getDomain(url))
  features.append(havingIP(url))
  features.append(haveAtSign(url))
  features.append(getLength(url))
  features.append(getDepth(url))
  features.append(redirection(url))
  features.append(httpDomain(url))
  features.append(tinyURL(url))
  features.append(prefixSuffix(url))
  features.append(has_client_in_string(url))
  features.append(has_admin_in_string(url))
  features.append(has_login_in_string(url))
  features.append(has_server_in_string(url))
  features.append(have_php(url))
  features.append(have_html(url))
  features.append(have_info(url))
  features.append(have_text(url))
  features.append(have_js(url))
  features.append(have_exe(url))
  features.append(number_of_periods(url))
  features.append(is_encoded(url))
  features.append(num_encoded_char(url))
  features.append(number_of_parameters(url))
  features.append(number_of_digits(url))
  features.append(number_spec_char(url))
  

  # #Domain based features (4)
  # dns = 0
  # try:
  #   domain_name = whois.whois(urlparse(url).netloc)
  # except:
  #   dns = 1

  # features.append(dns)
  # #features.append(web_traffic(url))
  # features.append(1 if dns == 1 else domainAge(domain_name))
  # features.append(1 if dns == 1 else domainEnd(domain_name))
  
  # HTML & Javascript based features (4)
  #try:
  #  response = requests.get(url)
  #except:
  #  response = ""
  #response = requests.get(url)

  #features.append(iframe(response))
  #features.append(mouseOver(response))
  #features.append(rightClick(response))
  #features.append(forwarding(response))
  features.append(label)
  
  return features

In [45]:
feature_names = ['Domain', 'Have IP', 'Have @', 'URL Length', 'URL Depth','Redirection', 
                      'https Domain', 'TinyURL', 'Prefix/Suffix', 'Have client', 'Have admin',
                      'Have login', 'Have server', '.php', '.html', '.info', '.txt', '.js', '.exe', 
                      'Num of periods', 'Is encoded', 'Num of encoded char', 'Num of parameters', 'Num of digits', 
                      'Num of spec char', 
                      # 'DNS Record', 
                      # # 'Web Traffic', 
                      # 'Domain Age', 'Domain End', 
                      'Label']


In [32]:
#Loading legitimate files 
Benign = pd.read_csv("/content/drive/MyDrive/Phishing/UNB 2016 Dataset/Benign_list_big_final.csv")
#Benign.columns = ['URLs']
Benign


Unnamed: 0,URLs
0,http://1337x.to/torrent/1048648/American-Snipe...
1,http://1337x.to/torrent/1110018/Blackhat-2015-...
2,http://1337x.to/torrent/1122940/Blackhat-2015-...
3,http://1337x.to/torrent/1124395/Fast-and-Furio...
4,http://1337x.to/torrent/1145504/Avengers-Age-o...
...,...
35373,https://lastpass.com/signup2.php?ac=1&from_uri...
35374,https://lastpass.com/signup2.php?ac=1&from_uri...
35375,https://lastpass.com/signup2.php?ac=1&from_uri...
35376,https://lastpass.com/signup2.php?ac=1&from_uri...


In [33]:
Benign.shape

(35378, 1)

In [34]:
#Extracting the feautres & storing them in a list
Benign_features = []
label = 0

for i in range(0, 35378):
  # print(i)
  url = Benign['URLs'][i]
  Benign_features.append(featureExtraction(url,label))
print('Feature Extracted')

Feature Extracted


In [35]:
#converting the list to dataframe


Benign = pd.DataFrame(Benign_features, columns= feature_names)
Benign

Unnamed: 0,Domain,Have IP,Have @,URL Length,URL Depth,Redirection,https Domain,TinyURL,Prefix/Suffix,Have client,...,.txt,.js,.exe,Num of periods,Is encoded,Num of encoded char,Num of parameters,Num of digits,Num of spec char,Label
0,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,18,8,0
1,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,23,9,0
2,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,22,9,0
3,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,18,11,0
4,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,18,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35373,lastpass.com,0,0,1,1,0,1,0,0,0,...,0,0,0,15,1,61,2,784,77,0
35374,lastpass.com,0,0,1,1,0,1,0,0,0,...,0,0,0,16,1,66,2,914,83,0
35375,lastpass.com,0,0,1,1,0,1,0,0,0,...,0,0,0,17,1,71,2,1054,89,0
35376,lastpass.com,0,0,1,1,0,1,0,0,0,...,0,0,0,18,1,76,2,1204,95,0


In [36]:
# Storing the extracted legitimate URLs fatures to csv file
Benign.to_csv('/content/drive/MyDrive/Phishing/UNB/UNB Dataset Features/Address_Benign_features.csv', index= False)

### **4.2. Phishing URLs:**

Now, feature extraction is performed on phishing URLs.

In [53]:
#Loading phishing files 
phishing = pd.read_csv("/content/drive/MyDrive/Phishing/UNB 2016 Dataset/phishing_dataset.csv")
#spam.columns = ['URLs']
phishing.head(10)

Unnamed: 0,URLs
0,http://v2.email-marketing.adminsimple.com/trac...
1,http://bid.openx.net/json?amp;amp;amp;amp;cid;...
2,http://webmail2.centurytel.net/hwebmail/servic...
3,http://www.google.com.ng/imgres?imgurl=http://...
4,http://webmail2.centurytel.net/hwebmail/servic...
5,http://www.liceonuzzi.it/cmd=_Inf/connectionSt...
6,http://bank0famerica-com.z2.newmail.ru/secure_...
7,http://superpromocao.net78.net/explosaodepremi...
8,https://www.google.fr/url?sa=t&amp;rct=j&amp;q...
9,http://www.cultivateyourlife.com/data/santande...


In [47]:
phishing.shape

(9965, 1)

In [48]:
#Extracting the feautres & storing them in a list
features = []
label = 1

for i in range(0, 9965):
  # print(i)
  url = phishing['URLs'][i]
  features.append(featureExtraction(url,label))
print('Feature Extracted')

Feature Extracted


In [54]:
#converting the list to dataframe

Phishing = pd.DataFrame(features, columns= feature_names)
Phishing.head(10)

Unnamed: 0,Domain,Have IP,Have @,URL Length,URL Depth,Redirection,https Domain,TinyURL,Prefix/Suffix,Have client,...,.txt,.js,.exe,Num of periods,Is encoded,Num of encoded char,Num of parameters,Num of digits,Num of spec char,Label
0,v2.email-marketing.adminsimple.com,0,0,1,2,0,0,0,1,0,...,0,0,0,9,1,7,6,45,21,1
1,bid.openx.net,0,0,1,1,0,0,0,0,0,...,0,0,0,8,0,0,6,48,21,1
2,webmail2.centurytel.net,0,0,1,3,1,0,0,0,0,...,0,0,0,6,0,0,1,79,2,1
3,google.com.ng,0,0,1,1,1,0,1,0,0,...,0,0,0,10,1,3,4,11,17,1
4,webmail2.centurytel.net,0,0,1,3,1,0,0,0,0,...,0,0,0,4,0,0,1,69,2,1
5,liceonuzzi.it,0,0,1,5,0,0,0,0,0,...,0,0,0,3,0,0,2,84,12,1
6,bank0famerica-com.z2.newmail.ru,0,0,1,1,0,0,0,1,0,...,0,0,0,4,0,0,4,54,14,1
7,superpromocao.net78.net,0,0,1,6,0,0,0,0,0,...,0,0,0,3,1,1,1,54,1,1
8,google.fr,0,0,1,1,0,1,0,0,0,...,0,0,0,3,1,5,12,16,30,1
9,cultivateyourlife.com,0,0,1,3,1,0,0,0,0,...,0,0,0,10,1,2,7,15,18,1


In [55]:
Phishing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9965 entries, 0 to 9964
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Domain               9965 non-null   object
 1   Have IP              9965 non-null   int64 
 2   Have @               9965 non-null   int64 
 3   URL Length           9965 non-null   int64 
 4   URL Depth            9965 non-null   int64 
 5   Redirection          9965 non-null   int64 
 6   https Domain         9965 non-null   int64 
 7   TinyURL              9965 non-null   int64 
 8   Prefix/Suffix        9965 non-null   int64 
 9   Have client          9965 non-null   int64 
 10  Have admin           9965 non-null   int64 
 11  Have login           9965 non-null   int64 
 12  Have server          9965 non-null   int64 
 13  .php                 9965 non-null   int64 
 14  .html                9965 non-null   int64 
 15  .info                9965 non-null   int64 
 16  .txt  

In [56]:
# Storing the extracted legitimate URLs fatures to csv file
Phishing.to_csv('/content/drive/MyDrive/Phishing/UNB/UNB Dataset Features/Address_phishing_features.csv', index= False)

### **Spam URLs:**

Now, feature extraction is performed on Spam URLs.

In [57]:
#Loading spam files 
spam = pd.read_csv("/content/drive/MyDrive/Phishing/UNB 2016 Dataset/spam_dataset.csv")
#spam.columns = ['URLs']
spam

Unnamed: 0,URLs
0,http://astore.amazon.co.uk/allezvinsfrenchr/de...
1,http://archive.salisburyjournal.co.uk/2007/3/6...
2,http://appbasic.jettons.co.uk/links/index.html
3,http://archive.yorkpress.co.uk/2003/11/6/25684...
4,http://acard4u.co.uk/product_reviews.php?cPath...
...,...
11995,http://archive.salisburyjournal.co.uk/2001/3/7/
11996,http://astore.amazon.co.uk/allezvinsfrenchr/de...
11997,http://archive.thisischeshire.co.uk/2000/1/14/...
11998,http://applerugs.co.uk/rugs/product_info.php?p...


In [58]:
spam.shape

(12000, 1)

In [59]:
#Extracting the feautres & storing them in a list
# label = 2
spam_features = [] 
label = 2
for i in range(0, 12000):
  url = spam['URLs'][i]
  # print(i)
  spam_features.append(featureExtraction(url,label))

print('Feature Extracted')

Feature Extracted


In [60]:
#converting the list to dataframe


spam = pd.DataFrame(spam_features, columns= feature_names)
spam.head()

Unnamed: 0,Domain,Have IP,Have @,URL Length,URL Depth,Redirection,https Domain,TinyURL,Prefix/Suffix,Have client,...,.txt,.js,.exe,Num of periods,Is encoded,Num of encoded char,Num of parameters,Num of digits,Num of spec char,Label
0,astore.amazon.co.uk,0,0,1,4,0,0,0,0,0,...,0,0,0,3,0,0,0,27,2,2
1,archive.salisburyjournal.co.uk,0,0,1,4,0,0,0,0,0,...,0,0,0,4,0,0,0,12,0,2
2,appbasic.jettons.co.uk,0,0,0,2,0,0,0,0,0,...,0,0,0,4,0,0,0,0,0,2
3,archive.yorkpress.co.uk,0,0,0,4,0,0,0,0,0,...,0,0,0,4,0,0,0,13,0,2
4,acard4u.co.uk,0,0,1,1,0,0,0,0,0,...,0,0,0,3,0,0,3,14,9,2


In [61]:
# Storing the extracted spam URLs fatures to csv file
spam.to_csv('/content/drive/MyDrive/Phishing/UNB/UNB Dataset Features/Address_spam_features.csv', index= False)

### **Malware URLs:**

Now, feature extraction is performed on malware URLs.

In [62]:

#Loading Malware files 
Malware = pd.read_csv("/content/drive/MyDrive/Phishing/UNB 2016 Dataset/Malware_dataset.csv")
#Malware.columns = ['URLs']
Malware.head()


Unnamed: 0,URLs
0,http://gzzax.livechatvalue.com/chat/chatClient...
1,http://gzzax.livechatvalue.com/chat/chatClient...
2,http://gzzax.livechatvalue.com/chat/chatClient...
3,http://gzzax.livechatvalue.com/chat/chatClient...
4,http://mtsx.com.cn/UploadFiles/2011-08/admin/%...


In [63]:
Malware.shape

(11566, 1)

In [64]:
#Extracting the feautres & storing them in a list
# label = 3
malware_features = [] 
label = 3
for i in range(0, 11566):
  url = Malware['URLs'][i]
  # print(i)
  malware_features.append(featureExtraction(url,label))

print('Feature Extracted')

Feature Extracted


In [65]:
#converting the list to dataframe


malware = pd.DataFrame(malware_features, columns= feature_names)
malware.head()

Unnamed: 0,Domain,Have IP,Have @,URL Length,URL Depth,Redirection,https Domain,TinyURL,Prefix/Suffix,Have client,...,.txt,.js,.exe,Num of periods,Is encoded,Num of encoded char,Num of parameters,Num of digits,Num of spec char,Label
0,gzzax.livechatvalue.com,0,0,1,3,0,0,0,0,1,...,0,1,0,5,1,42,7,106,55,3
1,gzzax.livechatvalue.com,0,0,1,3,0,0,0,0,1,...,0,1,0,5,1,42,7,106,55,3
2,gzzax.livechatvalue.com,0,0,1,3,0,0,0,0,1,...,0,1,0,5,1,42,7,106,55,3
3,gzzax.livechatvalue.com,0,0,1,3,0,0,0,0,1,...,0,1,0,5,1,42,7,106,55,3
4,mtsx.com.cn,0,0,1,4,0,0,1,0,0,...,0,0,0,3,1,72,0,83,73,3


In [66]:
# Storing the extracted Malware URLs fatures to csv file
malware.to_csv('/content/drive/MyDrive/Phishing/UNB/UNB Dataset Features/Address_Malware_features.csv', index= False)

### **Defacement URLs:**

Now, feature extraction is performed on defacement URLs.

In [67]:

#Loading Defacement files 
Defacement = pd.read_csv("/content/drive/MyDrive/Phishing/UNB 2016 Dataset/DefacementSitesURLFiltered.csv")
#Defacement.columns = ['URLs']
Defacement.head()

Unnamed: 0,URLs
0,http://www.sinduscongoias.com.br/index.html
1,http://www.sinduscongoias.com.br/index.php/ins...
2,http://www.sinduscongoias.com.br/index.php/ins...
3,http://www.sinduscongoias.com.br/index.php/ins...
4,http://www.sinduscongoias.com.br/index.php/ins...


In [68]:
Defacement.shape

(96457, 1)

In [69]:
#Extracting the feautres & storing them in a list
# label = 4
defacement_features = [] 
label = 4
for i in range(0, 96457):
  url = Defacement['URLs'][i]
  # print(i)
  defacement_features.append(featureExtraction(url,label))

print('feature Extracted')

feature Extracted


In [70]:
#converting the list to dataframe


defacemen = pd.DataFrame(defacement_features, columns= feature_names)
defacemen.head()

Unnamed: 0,Domain,Have IP,Have @,URL Length,URL Depth,Redirection,https Domain,TinyURL,Prefix/Suffix,Have client,...,.txt,.js,.exe,Num of periods,Is encoded,Num of encoded char,Num of parameters,Num of digits,Num of spec char,Label
0,sinduscongoias.com.br,0,0,0,1,0,0,0,0,0,...,0,0,0,4,0,0,0,0,0,4
1,sinduscongoias.com.br,0,0,1,2,0,0,0,0,0,...,0,0,0,5,0,0,0,1,0,4
2,sinduscongoias.com.br,0,0,1,3,0,0,0,0,0,...,0,0,0,4,0,0,0,0,0,4
3,sinduscongoias.com.br,0,0,1,3,0,0,0,0,0,...,0,0,0,4,0,0,0,0,0,4
4,sinduscongoias.com.br,0,0,1,3,0,0,0,0,0,...,0,0,0,4,0,0,0,0,7,4


In [71]:
# Storing the extracted defacemen URLs fatures to csv file
defacemen.to_csv('/content/drive/MyDrive/Phishing/UNB/UNB Dataset Features/Address_Defacemen_features.csv', index= False)

In [72]:
#Concatenating the dataframes into one 
urldata1 = pd.concat([Benign, Phishing]).reset_index(drop=True)
urldata1

Unnamed: 0,Domain,Have IP,Have @,URL Length,URL Depth,Redirection,https Domain,TinyURL,Prefix/Suffix,Have client,...,.txt,.js,.exe,Num of periods,Is encoded,Num of encoded char,Num of parameters,Num of digits,Num of spec char,Label
0,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,18,8,0
1,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,23,9,0
2,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,22,9,0
3,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,18,11,0
4,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,18,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45338,highedgesolar.com,0,0,0,2,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
45339,jimfangimporters.yolasite.com,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,1
45340,liuheng.chengxuren.com,0,0,0,1,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,1
45341,mcnaotempreco.net,0,0,0,2,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [73]:
data = urldata1.drop(['Domain'], axis = 1).copy()
data.shape
data.head()

Unnamed: 0,Have IP,Have @,URL Length,URL Depth,Redirection,https Domain,TinyURL,Prefix/Suffix,Have client,Have admin,...,.txt,.js,.exe,Num of periods,Is encoded,Num of encoded char,Num of parameters,Num of digits,Num of spec char,Label
0,0,0,1,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,18,8,0
1,0,0,1,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,23,9,0
2,0,0,1,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,22,9,0
3,0,0,1,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,18,11,0
4,0,0,1,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,18,9,0


In [74]:
# Storing the data in CSV file
data.to_csv('/content/drive/MyDrive/Phishing/UNB/UNB Dataset Features/Address_urldata(Binary).csv', index=False)

In [75]:
#Concatenating the dataframes into one 
urldata = pd.concat([Benign, Phishing, spam, malware, defacemen]).reset_index(drop=True)
urldata.head()

Unnamed: 0,Domain,Have IP,Have @,URL Length,URL Depth,Redirection,https Domain,TinyURL,Prefix/Suffix,Have client,...,.txt,.js,.exe,Num of periods,Is encoded,Num of encoded char,Num of parameters,Num of digits,Num of spec char,Label
0,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,18,8,0
1,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,23,9,0
2,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,22,9,0
3,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,18,11,0
4,1337x.to,0,0,1,3,0,0,0,0,0,...,0,0,0,1,0,0,0,18,9,0


In [76]:
urldata.shape

(165366, 26)

In [77]:
data = urldata.drop(['Domain'], axis = 1).copy()
data.shape
data

Unnamed: 0,Have IP,Have @,URL Length,URL Depth,Redirection,https Domain,TinyURL,Prefix/Suffix,Have client,Have admin,...,.txt,.js,.exe,Num of periods,Is encoded,Num of encoded char,Num of parameters,Num of digits,Num of spec char,Label
0,0,0,1,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,18,8,0
1,0,0,1,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,23,9,0
2,0,0,1,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,22,9,0
3,0,0,1,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,18,11,0
4,0,0,1,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,18,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165361,0,0,1,1,0,0,0,0,0,0,...,0,0,0,4,0,0,4,5,8,4
165362,0,0,1,3,0,0,0,0,0,0,...,0,0,0,4,0,0,0,1,0,4
165363,0,0,1,3,0,0,0,0,0,0,...,0,0,0,4,0,0,0,1,1,4
165364,0,0,0,1,0,0,0,0,0,0,...,1,0,0,4,0,0,0,0,0,4


In [78]:
# Storing the data in CSV file
data.to_csv('/content/drive/MyDrive/Phishing/UNB/UNB Dataset Features/Address_urldata.csv', index=False)