### Importing Dependencies

In [None]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import socket

from tqdm import tqdm
import requests

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

import urllib

from urllib.parse import urlparse
from http.client import HTTPConnection, HTTPSConnection

import re

import urllib.request


Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


### Loading Dataset

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Deceptive-Research/combined_dataset.csv")

In [None]:
df.head()

Unnamed: 0,domain,ranking,isIp,valid,activeDuration,urlLen,is@,isredirect,haveDash,domainLen,nosOfSubdomain,label
0,www.voting-yahoo.com,10000000,0,0,0,20,0,0,1,20,2,1
1,www.zvon.org/xxl/WSDL1.1/Output/index.html,194914,0,1,7305,42,0,0,0,12,2,0
2,tecportais.com/file-security-update-infonfmati...,10000000,0,0,0,155,0,0,0,14,1,1
3,bima.astro.umd.edu/nemo/linuxastro/,7001,0,0,0,35,0,0,0,18,3,0
4,huarui-tec.com/js/?us.battle.net/login/en/?ref...,10000000,0,1,730,79,0,0,1,14,1,1


About Dataset

Domain: The URL itself.

Ranking: Page Ranking

isIp: Is there an IP address in the weblink

valid: Fetched from Google's whois API; indicates the current status of the URL's domain registration.

activeDuration: Also from the whois API. The time elapsed from the domain's registration up until now.

urlLen: It is simply the length of the URL

is@: 1 if the link contains an '@' character, otherwise 0.

isredirect: If the link has double dashes, there is a chance that it is a redirect. 1-> multiple dashes present together.

haveDash: If there are any dashes in the domain name.

domainLen: The length of just the domain name.

nosOfSubdomain: The number of subdomains present in the URL.

label: 0 -> Legitimate website, 1 -> Phishing link / spam link

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95910 entries, 0 to 95909
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   domain          95910 non-null  object
 1   ranking         95910 non-null  int64 
 2   isIp            95910 non-null  int64 
 3   valid           95910 non-null  int64 
 4   activeDuration  95910 non-null  int64 
 5   urlLen          95910 non-null  int64 
 6   is@             95910 non-null  int64 
 7   isredirect      95910 non-null  int64 
 8   haveDash        95910 non-null  int64 
 9   domainLen       95910 non-null  int64 
 10  nosOfSubdomain  95910 non-null  int64 
 11  label           95910 non-null  int64 
dtypes: int64(11), object(1)
memory usage: 8.8+ MB


In [None]:
df.tail()

Unnamed: 0,domain,ranking,isIp,valid,activeDuration,urlLen,is@,isredirect,haveDash,domainLen,nosOfSubdomain,label
95905,www.freewebs.com/ryanrules2/,5001,0,1,8035,28,0,0,0,16,2,0
95906,www.ireland-information.com/freecelticfonts.htm,230251,0,1,8400,47,0,0,1,27,2,0
95907,www.clubtaunus.soroptimist.de/img/pro/e.php,10000000,0,0,0,43,0,0,0,29,3,1
95908,www.askmen.com/sports/business/index.html,2008,0,1,9862,41,0,0,0,14,2,0
95909,xosothudo.com.vn/paypal.co.il/paypal.co.il.cgi...,771840,0,0,0,323,0,0,0,16,2,1


In [None]:
# Draw a reproducible sample of 1510 rows labelled as phishing (label == 1).
is_phishing = df["label"] == 1
df_deceptive = df.loc[is_phishing].sample(n=1510, random_state=42)

In [None]:
# Remove fully duplicated rows from the phishing sample.
df_deceptive = df_deceptive.drop_duplicates()

In [None]:
df_deceptive.shape

(1502, 12)

In [None]:
# Load the phishing feed: one entry per line, surrounding whitespace removed.
with open("/content/feed.txt") as feed:
  deceptive_urls = [line.strip() for line in feed]

In [None]:
deceptive_urls.extend(df_deceptive["domain"].values)

In [None]:
df1 = pd.read_excel("/content/drive/MyDrive/Deceptive-Research/top_domains.xlsx")

In [None]:
df_normal = df1.sample(1000, random_state=42)

In [None]:
# Remove fully duplicated rows from the sampled top-domain list.
df_normal = df_normal.drop_duplicates()

In [None]:
df_normal.shape

(1000, 3)

In [None]:
# Collect the sampled top domains into a plain list (tqdm only shows progress).
normal_urls = list(tqdm(df_normal["Domain"].values))

100%|██████████| 1000/1000 [00:00<00:00, 1221049.20it/s]


In [None]:
# Draw a reproducible sample of 1000 rows labelled as legitimate (label == 0).
is_legitimate = df["label"] == 0
df_normal1 = df.loc[is_legitimate].sample(n=1000, random_state=42)

In [None]:
normal_urls.extend(df_normal1["domain"].values)

### Consolidating both the data

In [None]:
def _labelled_frame(urls, label):
    """Return a frame with a "URL" column and a constant "label" column."""
    return pd.DataFrame({"URL": urls, "label": [label] * len(urls)})


# Phishing rows come first, legitimate rows second; the index is not reset here.
deceptive = _labelled_frame(deceptive_urls, 1)
normal = _labelled_frame(normal_urls, 0)
data = pd.concat([deceptive, normal])

In [None]:
# Replace the concatenated (repeating) index with a fresh 0..n-1 range.
data = data.reset_index(drop=True)

In [None]:
data

Unnamed: 0,URL,label
0,https://www.imf.it/wp-content/themes/linkedin....,1
1,https://steamcommutilty.ru/profles/7409632692,1
2,https://delivery.buyvenoms.com/public/ApYUUvxr...,1
3,https://delivery.buyvenoms.com/public/Fv6iafRc...,1
4,https://apollo.baby/wp-content/upgrade/,1
...,...,...
3997,www.gnu.org/software/ncurses/ncurses.html,0
3998,www.angelfire.com/realm/warelords/,0
3999,www.sjgames.com/pyramid/,0
4000,www.ddj.com/cpp/184403751,0


### Building the Features

In [None]:
# Trim leading/trailing whitespace from every URL string.
data["URL"] = [raw_url.strip() for raw_url in data["URL"]]

In [None]:
# data["URL"] = data["URL"].apply(lambda x: x if x[:8] in ["http://", "https://"] else "http://"+x)

#### Is Secure

In [None]:
def check_https_url(url, timeout=2, connection_factory=None):
    """Return 1 if the URL's host answers a HEAD request over HTTPS, else 0.

    Any response at all counts as success, whatever its status code; every
    failure (DNS, TLS, timeout, malformed URL, ...) yields 0.

    Parameters
    ----------
    url : str
        URL with or without a leading ``http://`` / ``https://`` scheme.
    timeout : float, optional
        Timeout in seconds handed to the connection (default 2).
    connection_factory : callable, optional
        Called as ``connection_factory(host, timeout=timeout)``; defaults to
        ``http.client.HTTPSConnection``. Lets callers substitute a stub.

    Returns
    -------
    int
        1 when a response was received, otherwise 0.
    """
    if connection_factory is None:
        connection_factory = HTTPSConnection

    # Remove an existing scheme before forcing https. Prepending blindly turns
    # "https://host/..." into "https://https://host/...", whose netloc is
    # "https:", so such URLs could never connect.
    target = str(url).strip()
    lowered = target.lower()
    for scheme in ("https://", "http://"):
        if lowered.startswith(scheme):
            target = target[len(scheme):]
            break

    connection = None
    try:
        parsed = urlparse(f"https://{target}")
        # Drop any "user:pass@" prefix; the connection wants host[:port] only.
        host = parsed.netloc.rpartition("@")[2]
        if not host:
            return 0
        connection = connection_factory(host, timeout=timeout)
        connection.request("HEAD", parsed.path or "/")
        connection.getresponse()
        return 1
    except Exception:
        # Best-effort probe: any failure means "not reachable over HTTPS".
        # Unlike a bare except, this still lets KeyboardInterrupt through.
        return 0
    finally:
        if connection is not None:
            connection.close()

In [None]:
data["secure"] = data["URL"].progress_apply(check_https_url)

  0%|          | 0/4002 [00:00<?, ?it/s]

In [None]:
data["secure"].value_counts()

0    2048
1    1954
Name: secure, dtype: int64

#### Host Name Feature

In [None]:
def get_host(url):
  """Return 1 if the host part of ``url`` resolves via DNS, else 0.

  Only the host is resolved: a leading ``http://`` / ``https://`` scheme,
  any ``user@`` prefix, the port, and everything from the first ``/``, ``?``
  or ``#`` onward are removed first. The host is looked up exactly as written
  (a leading ``www.`` is kept).

  Parameters
  ----------
  url : str
      URL with or without a scheme.

  Returns
  -------
  int
      1 when ``socket.gethostbyname`` succeeds for the host, otherwise 0.
  """
  target = str(url).strip()
  lowered = target.lower()
  for scheme in ("https://", "http://"):
    if lowered.startswith(scheme):
      target = target[len(scheme):]
      break

  # Keep the authority only; passing the path along makes every lookup fail.
  for separator in ("/", "?", "#"):
    target = target.split(separator, 1)[0]
  host = target.rpartition("@")[2].partition(":")[0]

  # An empty name would otherwise resolve to a wildcard address.
  if not host:
    return 0

  try:
    socket.gethostbyname(host)
    return 1
  except Exception:
    # Resolution or encoding failure: treat the host as non-existent.
    return 0

In [None]:
data["host_name"] = data["URL"].progress_apply(get_host)

  0%|          | 0/4002 [00:00<?, ?it/s]

In [None]:
data["host_name"].value_counts()

0    3025
1     977
Name: host_name, dtype: int64

In [None]:
data["URL"]

0       https://www.imf.it/wp-content/themes/linkedin....
1           https://steamcommutilty.ru/profles/7409632692
2       https://delivery.buyvenoms.com/public/ApYUUvxr...
3       https://delivery.buyvenoms.com/public/Fv6iafRc...
4                 https://apollo.baby/wp-content/upgrade/
                              ...                        
3997            www.gnu.org/software/ncurses/ncurses.html
3998                   www.angelfire.com/realm/warelords/
3999                             www.sjgames.com/pyramid/
4000                            www.ddj.com/cpp/184403751
4001     www.ibiblio.org/pub/languages/fortran/ch1-1.html
Name: URL, Length: 4002, dtype: object

#### URL Len

In [None]:
# Character count of each URL string.
data["url_len"] = data["URL"].progress_apply(len)

  0%|          | 0/4002 [00:00<?, ?it/s]

#### Active/Inactive

In [None]:
def is_active(url, timeout=3, session=None):
  """Return 1 if a HEAD request to ``url`` answers with status 200, else 0.

  URLs without an ``http://`` / ``https://`` scheme are requested over
  ``http://``. Any other status code, and any request failure, yields 0.

  NOTE(review): HEAD requests issued through ``requests`` do not follow
  redirects by default, so a site answering 301/302 is reported as 0.
  Confirm that this is the intended meaning of "active".

  Parameters
  ----------
  url : str
      URL with or without a scheme.
  timeout : float, optional
      Timeout in seconds passed to the request (default 3).
  session : object, optional
      Anything with a ``head(url, timeout=...)`` method, e.g. a
      ``requests.Session``; defaults to the ``requests`` module itself.

  Returns
  -------
  int
      1 for a 200 response, otherwise 0.
  """
  client = requests if session is None else session

  # The old check compared url[:8] with "http://" (7 characters), which can
  # never match, so "http://..." URLs were given a second scheme.
  target = str(url).strip()
  if not target.lower().startswith(("http://", "https://")):
    target = "http://" + target

  try:
    response = client.head(target, timeout=timeout)
  except Exception:
    # Best-effort probe; still lets KeyboardInterrupt stop a long run.
    return 0
  return 1 if response.status_code == 200 else 0

In [None]:
data["is_active"] = data["URL"].progress_apply(is_active)

#### Is @

In [None]:
# Flag URLs that contain an '@' character anywhere (1 = present, 0 = absent).
data["is_@"] = data["URL"].progress_apply(lambda link: int("@" in link))

  0%|          | 0/4002 [00:00<?, ?it/s]

In [None]:
data["is_@"].value_counts()

0    3985
1      17
Name: is_@, dtype: int64

In [None]:
data.to_csv("/content/drive/MyDrive/Deceptive-Research/deceptive_urls.csv", index=False)

#### Check Redirect Urls

In [None]:
data = pd.read_csv("/content/drive/MyDrive/Deceptive-Research/deceptive_urls.csv")

In [None]:
def check_redirect(url, timeout=3, session=None):
  """Report whether fetching ``url`` went through HTTP redirects.

  URLs without an ``http://`` / ``https://`` scheme are requested over
  ``http://``. The response is opened in streaming mode and closed without
  reading the body, since only the redirect history is needed.

  Parameters
  ----------
  url : str
      URL with or without a scheme.
  timeout : float, optional
      Timeout in seconds passed to the request (default 3).
  session : object, optional
      Anything with a ``get(url, timeout=..., stream=...)`` method, e.g. a
      ``requests.Session``; defaults to the ``requests`` module itself.

  Returns
  -------
  list
      ``[1, n]`` when the response history holds ``n`` (> 0) redirects,
      ``[0, 0]`` when there were none or the request failed.
  """
  client = requests if session is None else session

  # The old check compared url[:8] with "http://" (7 characters), which can
  # never match, so "http://..." URLs were given a second scheme.
  target = str(url).strip()
  if not target.lower().startswith(("http://", "https://")):
    target = "http://" + target

  try:
    response = client.get(target, timeout=timeout, stream=True)
  except Exception:
    # Best-effort probe; still lets KeyboardInterrupt stop a long run.
    return [0, 0]

  try:
    hops = len(response.history)
  finally:
    response.close()
  return [1, hops] if hops else [0, 0]

In [None]:
# Probe every URL once, keeping the redirect flag and hop count in
# parallel lists ordered like data["URL"].
redirect = []
len_redirect = []

for link in tqdm(data["URL"]):
  flag, hops = check_redirect(link)
  redirect.append(flag)
  len_redirect.append(hops)

 77%|███████▋  | 3090/4002 [27:43<07:43,  1.97it/s]ERROR:urllib3.connection:Certificate did not match expected hostname: www.esi.es. Certificate: {'subject': ((('commonName', '*.tecnalia.com'),),), 'issuer': ((('countryName', 'GB'),), (('stateOrProvinceName', 'Greater Manchester'),), (('localityName', 'Salford'),), (('organizationName', 'Sectigo Limited'),), (('commonName', 'Sectigo RSA Domain Validation Secure Server CA'),)), 'version': 3, 'serialNumber': '201E88F4A2453BB6DFD8567790AD4B18', 'notBefore': 'Sep 12 00:00:00 2022 GMT', 'notAfter': 'Sep 17 23:59:59 2023 GMT', 'subjectAltName': (('DNS', '*.tecnalia.com'), ('DNS', 'tecnalia.com')), 'OCSP': ('http://ocsp.sectigo.com',), 'caIssuers': ('http://crt.sectigo.com/SectigoRSADomainValidationSecureServerCA.crt',)}
 93%|█████████▎| 3739/4002 [36:04<03:35,  1.22it/s]ERROR:urllib3.connection:Certificate did not match expected hostname: www.cse.dmu.ac.uk. Certificate: {'subject': ((('countryName', 'GB'),), (('stateOrProvinceName', 'Leicest

In [None]:
# Attach the redirect probe results as two new feature columns.
data["is_redirect"] = redirect
data["len_redirect"] = len_redirect

In [None]:
data["is_redirect"].value_counts()

0    2423
1    1579
Name: is_redirect, dtype: int64

In [None]:
data["len_redirect"].value_counts()

0    2423
1     916
2     493
3      95
4      71
5       3
6       1
Name: len_redirect, dtype: int64

In [None]:
data.to_csv("/content/drive/MyDrive/Deceptive-Research/deceptive_urls.csv", index=False)