# Detecting malicious URLs

In [91]:
# GENERAL
import os
import math
import re
import datetime
import time
import pandas as pd
import matplotlib as plt
import seaborn as sns
from tqdm import tqdm

For this experiment, we will be using the Sklearn library along with it's tools to assist us along the data science process.

In [92]:
# SKLEARN
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

Here we import some libraries to assist us in pulling, parsing, and transforming our domain information.

In [93]:
# DOMAIN SPECIFIC LIBRARIES
import tldextract
from datetime import datetime
import warnings
from urllib.parse import urlparse
from socket import gethostbyname, gaierror, timeout
import whois

In [94]:
# iPython and Notebook config
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" #display all results

%config InlineBackend.figure_format = 'retina' #see plots in retina displays
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Data Ingestion

### The bad stuff

Exploring the Internet, we find a nice list of maliciousness, provided by http://www.malwaredomainlist.com/

Link: http://www.malwaredomainlist.com/hostslist/hosts.txt

In [108]:
# Ingest malicious urls from malwaredomainlist
columns = ['ip', 'url']

mal_df = pd.read_csv('../data/hosts.txt', 
                     delimiter = '\s+', 
                     encoding = "ISO-8859-1", 
                     skiprows=6,
                     names = columns)[['domain']]

In [109]:
# Randomly sample 500 elements from your dataframe
mal_sample_df = mal_df.sample(n=50)

In [110]:
mal_sample_df['class'] = 'malicious'

In [111]:
mal_sample_df.head()

Unnamed: 0,domain,class
97,broadtech.co,malicious
782,tube8vidslmf.dnset.com,malicious
399,miespaciopilates.com,malicious
858,videoflyover.com,malicious
176,down.mykings.pw,malicious


### The mostly benign stuff

In [112]:
# Ingest Alex top 1 million urls 
columns = ['url']

benign_df = pd.read_csv('../data/top-1m.csv', 
                     encoding = "ISO-8859-1", 
                     names = columns)

In [113]:
benign_sample_df = benign_df.sample(n=50)

In [114]:
benign_sample_df['class'] = 'benign'

In [115]:
benign_sample_df.head()

Unnamed: 0,domain,class
633168,annaieservice.com,benign
796269,pdofna.com,benign
806060,netdreamers.co.jp,benign
846287,navins.in,benign
868009,4-u.info,benign


In [116]:
# Frames we would like to merge
full_data = [mal_sample_df, benign_sample_df]
print(mal_sample_df.shape, benign_sample_df.shape)

(50, 2) (50, 2)


In [117]:
tqdm.pandas()

def creation_date(domain_name):
    """
    Gets creation date of domain from whois
    """
    
    # Get creation date of Domain
    currentDT = datetime.now()
    default_date = currentDT.strftime("%Y-%m-%d %H:%M:%S")
    
    try:
        creation_date = whois.whois(domain_name).creation_date
        if type(creation_date) is list:
            return creation_date[0]
        elif str(creation_date).find('Aug'):
            creation_date = "1996-07-01 00:00:01"
            return creation_date
        elif creation_date == np.nan:
            return default_date
        else:
            return creation_date
    except whois.parser.PywhoisError:
        return default_date
    except gaierror:
        return default_date
    except socket.timeout:
        return default_date
    except socket.error:
        return default_date

# Generate creation date
for df in full_data:
    df['domain_creation'] = df['domain'].progress_apply(lambda x: creation_date(x))



  0%|          | 0/50 [00:00<?, ?it/s][A[A

  4%|▍         | 2/50 [00:03<01:19,  1.65s/it][A[A

  6%|▌         | 3/50 [00:05<01:18,  1.68s/it][A[A

  8%|▊         | 4/50 [00:05<01:00,  1.32s/it][A[A

 10%|█         | 5/50 [00:06<00:50,  1.13s/it][A[A

 12%|█▏        | 6/50 [00:09<01:11,  1.63s/it][A[A

 14%|█▍        | 7/50 [00:10<01:09,  1.61s/it][A[A

 16%|█▌        | 8/50 [00:10<00:52,  1.24s/it][A[A

 18%|█▊        | 9/50 [00:12<00:49,  1.20s/it][A[A

 20%|██        | 10/50 [00:13<00:46,  1.15s/it][A[A

 22%|██▏       | 11/50 [00:15<00:55,  1.42s/it][A[A

 24%|██▍       | 12/50 [00:16<00:48,  1.26s/it][A[A

 26%|██▌       | 13/50 [00:19<01:16,  2.06s/it][A[A

 28%|██▊       | 14/50 [00:21<01:12,  2.01s/it][A[A

 30%|███       | 15/50 [00:23<01:05,  1.86s/it][A[A

 32%|███▏      | 16/50 [00:24<00:52,  1.54s/it][A[A

 34%|███▍      | 17/50 [00:26<00:56,  1.70s/it][A[A

 36%|███▌      | 18/50 [00:28<00:58,  1.83s/it][A[A

 38%|███▊      | 19/50 [0

In [120]:
# Generate number of special characters
for df in full_data:
    df['specials'] = df['domain'].apply(lambda x: len(re.sub('[\w]+' ,'', x)))

In [None]:
# Generate number of special characters
for df in full_data:
    # Extract domain
    df['domain'] = df['domain'].apply(lambda x: tldextract.extract(x).domain)    

In [None]:
# Generate number of special characters
for df in full_data:
    # Extract suffix
    df['suffix'] = df['domain'].apply(lambda x: tldextract.extract(x).suffix)

In [None]:
# Generate number of special characters
for df in full_data:
    # Extract suffix
    df['domain_name'] = df['domain'].apply(lambda x: tldextract.extract(x).registered_domain)

In [121]:
mal_sample_df

Unnamed: 0,domain,class,domain_creation,specials
97,broadtech.co,malicious,1996-07-01 00:00:01,1
782,tube8vidslmf.dnset.com,malicious,1996-07-01 00:00:01,2
399,miespaciopilates.com,malicious,2019-07-10 21:59:51,1
858,videoflyover.com,malicious,2019-07-10 21:59:52,1
176,down.mykings.pw,malicious,1996-07-01 00:00:01,2
1009,www.panazan.ro,malicious,1996-07-01 00:00:01,2
290,hrdcvn.com.vn,malicious,2019-07-10 21:59:57,2
138,cznshuya.ivnet.ru,malicious,1996-07-01 00:00:01,2
101,cacl.fr,malicious,1996-07-01 00:00:01,1
1061,www.tvnews.or.kr,malicious,1996-07-01 00:00:01,3
