In [22]:
import pandas as pd
import time
from sklearn.tree import DecisionTreeClassifier
import re
import ipaddress as ip
import csv

In [2]:
gen_delims = [":", "/", "?", "#", "[", "]", "@"]
sub_delims = ["!", "$", "&", "'", "(", ")", "*", "+", ",", ";", "="]
reserved_characters = gen_delims + sub_delims
unreserved_characters = ["-", ".", "_", "~"]

In [3]:
def transform_class(x):
    return 1 if x == 'good' else 0

In [1]:
def load_known_tlds():
    with open('known_tlds.txt', 'r') as file:
        tlds = file.read().split('\n')
        tlds = [tld.lower() for tld in tlds]
        return tlds

In [2]:
load_known_tlds()

['aaa',
 'aarp',
 'abarth',
 'abb',
 'abbott',
 'abbvie',
 'abc',
 'able',
 'abogado',
 'abudhabi',
 'ac',
 'academy',
 'accenture',
 'accountant',
 'accountants',
 'aco',
 'actor',
 'ad',
 'adac',
 'ads',
 'adult',
 'ae',
 'aeg',
 'aero',
 'aetna',
 'af',
 'afamilycompany',
 'afl',
 'africa',
 'ag',
 'agakhan',
 'agency',
 'ai',
 'aig',
 'airbus',
 'airforce',
 'airtel',
 'akdn',
 'al',
 'alfaromeo',
 'alibaba',
 'alipay',
 'allfinanz',
 'allstate',
 'ally',
 'alsace',
 'alstom',
 'am',
 'amazon',
 'americanexpress',
 'americanfamily',
 'amex',
 'amfam',
 'amica',
 'amsterdam',
 'analytics',
 'android',
 'anquan',
 'anz',
 'ao',
 'aol',
 'apartments',
 'app',
 'apple',
 'aq',
 'aquarelle',
 'ar',
 'arab',
 'aramco',
 'archi',
 'army',
 'arpa',
 'art',
 'arte',
 'as',
 'asda',
 'asia',
 'associates',
 'at',
 'athleta',
 'attorney',
 'au',
 'auction',
 'audi',
 'audible',
 'audio',
 'auspost',
 'author',
 'auto',
 'autos',
 'avianca',
 'aw',
 'aws',
 'ax',
 'axa',
 'az',
 'azure',
 'ba'

In [4]:
def load_suspicious_tlds():
    with open('top_abused_tlds.txt') as file:
        tlds = file.read().split('\n')
        tlds = [tld.lower() for tld in tlds]
        return tlds

In [5]:
load_suspicious_tlds()

['.fit', '.tk', '.gq', '.ga', '.ml', '.cf', '.work', '.date', '.wang', '.men']

In [6]:
def load_suspicious_words():
    with open('suspicious_words.txt') as file:
        words = file.read().split('\n')
        words = [tld.lower() for tld in words]
        return words

In [7]:
load_suspicious_words()

['account',
 'webscr',
 'login',
 'ebayisapi',
 'signin',
 'banking',
 'confirm',
 'secure',
 'images',
 'exe',
 'account',
 'node.php',
 'username',
 'password',
 'urs',
 'user',
 'pass',
 'pwd']

In [8]:
def scrape_tld(url):
    try:
        index = url.index('/')
    except ValueError:
        index = len(url) - 1

    dot_index = url.rfind('.', 0, index)

    try:
        index = url.index(':', dot_index, index + 1)
    except ValueError:
        ()

    return url[(dot_index + 1):index]

In [9]:
def len_of_url(url):
    return len(url)


def is_tld_in_known_list(tld, known_list):
    return tld in known_list


def is_tld_in_suspicious_list(tld, suspicious_list):
    return tld in suspicious_list

In [10]:
def does_url_contain_ip_address(url):
    try:
        if ip.ip_address(url):
            return True
    except ValueError:
        return False


def len_of_deep_url(url):
    try:
        index = url.index('/')
        return len(url[index:])
    except ValueError:
        return len(url)

In [11]:
def number_of_gen_delimiters_in_url(url):
    return sum([1 if item in url else 0 for item in gen_delims])


def number_of_sub_delimiters_in_url(url):
    return sum([1 if item in url else 0 for item in sub_delims])


def number_of_reserved_characters_in_url(url):
    return number_of_gen_delimiters_in_url(url) + number_of_sub_delimiters_in_url(url)


def number_of_unreserved_special_characters_in_url(url):
    return sum([1 if item in url else 0 for item in unreserved_characters])


def number_of_sub_domains(url):
    try:
        index = url.index('/')
        return len(url[index:].split('.'))
    except ValueError:
        return 0

In [12]:
def does_url_contain_http_inside(url):
    return 'http' in url[1:]


def remove_http_from_begining(url):
    if url.startswith('http://www.'):
        return url[11:]
    elif url.startswith('http://'):
        return url[7:]
    return url

In [13]:
def count_suspicious_words_in_url(url, list_of_words):
    return sum(item in url for item in list_of_words)


def count_percent_character_in_url(url):
    return url.count('%')


def count_number_of_digits_in_url(url):
    return sum(c.isdigit() for c in url)


def number_length_ration_in_url(url):
    return count_number_of_digits_in_url(url) / len_of_url(url)

In [14]:
def does_url_contain_equal_sign_after_question_mark(url):
    try:
        index = url.index('?')
        return '=' in url[index:]
    except ValueError:
        return False


def does_url_contain_non_standard_port(url):
    tmp = re.search(":([0-9].?.?.?)", url)
    try:
        port = tmp.group(0)[1:]
        return (port != '8080') and (port != '80') and (port != '443')
    except AttributeError:
        return False

In [15]:
file = 'data.csv'
df = pd.read_csv(file, converters={'label': transform_class}, low_memory=False)
print(len(df))
df.drop_duplicates(subset=None, inplace=True)
print(len(df))
print(df.shape)
df_whois = pd.read_csv('whois_data.csv', low_memory=False)
df = pd.merge(df, df_whois, how='inner', on='url')
print(df.shape)

420464
411248
(411248, 2)
(411466, 5)


In [16]:
if df.isnull().values.any():
    print("Skup sadrzi NaN vrednosti!\n")
else:
    print("Skup ne sadrzi nijednu NaN vrednost\n")

Skup ne sadrzi nijednu NaN vrednost



In [17]:
x = df.values[:, 0]
y = df.values[:, 1]
y = y.astype('int')

df_whois = df[['rd', 'ed', 'ud']]
print(df_whois.head(10))
whois_rd = df_whois.values[:, 0]
whois_ed = df_whois.values[:, 1]
whois_ud = df_whois.values[:, 2]

     rd    ed    ud
0  4934   179   184
1    -1    -1    -1
2  4934   179   184
3  5378   100   264
4  6003   206   157
5  4789   324    40
6  3472   180   184
7  5914  1318    -1
8   223   142    54
9  4482   630  1185


In [18]:
x = [remove_http_from_begining(item) for item in x]

In [24]:
known_tlds = load_known_tlds()
abused_tlds = load_suspicious_tlds()
suspicious_words = load_suspicious_words()
data_columns = ['url', 'url_len', 'tld_in_known', 'tld_in_abused', 'contain_ip', 'deep_url_len', 'num_of_gen_deli',
                    'num_of_sub_deli', 'num_of_reserved_char', 'num_of_unreserved_spec_char', 'num_of_sub_domains',
                    'contain_http', 'number_of_suspicious_words', 'number_of_percentage_signs', 'number_of_numbers',
                    'number_of_numbers_length_of_url_ratio', 'contain_equal_sign_after_question_mark',
                    'contain_non_standard_port', 'whois_rd', 'whois_ed', 'whois_ud', 'class']

data_list = [data_columns]

In [25]:
c = 0
for (i, url) in enumerate(x):
    url_string = url
    url_string = url
    url_len = len_of_url(url)
    url_tld = scrape_tld(url)
    is_tld_known = is_tld_in_known_list(url_tld, known_tlds)
    is_tld_abused = is_tld_in_suspicious_list(url_tld, abused_tlds)
    url_contain_ip = does_url_contain_ip_address(url)
    url_deep_url_len = len_of_deep_url(url)
    url_num_of_gen_delim = number_of_gen_delimiters_in_url(url)
    url_num_of_sub_delim = number_of_sub_delimiters_in_url(url)
    url_num_of_res_delim = number_of_reserved_characters_in_url(url)
    url_num_of_unres_delim = number_of_unreserved_special_characters_in_url(url)
    url_num_of_sub_domains = number_of_sub_domains(url)
    url_contain_http = does_url_contain_http_inside(url)
    number_of_suspicious_words = count_suspicious_words_in_url(url, suspicious_words)
    number_of_percentage_signs = count_percent_character_in_url(url)
    number_of_digits = count_number_of_digits_in_url(url)
    digits_url_length_ratio = number_length_ration_in_url(url)
    url_contain_es_after_qm = does_url_contain_equal_sign_after_question_mark(url)
    url_contain_non_standard_port = does_url_contain_non_standard_port(url)
    url_class = y[i]

    days_since_created = whois_rd[i]
    days_until_expires = whois_ed[i]
    days_since_last_updated = whois_ud[i]

    data_list.append([url_string, url_len, is_tld_known, is_tld_abused, url_contain_ip, url_deep_url_len,
                      url_num_of_gen_delim, url_num_of_sub_delim, url_num_of_res_delim, url_num_of_unres_delim,
                      url_num_of_sub_domains, url_contain_http, number_of_suspicious_words,
                      number_of_percentage_signs, number_of_digits, digits_url_length_ratio,
                      url_contain_es_after_qm, url_contain_non_standard_port, days_since_created,
                      days_until_expires, days_since_last_updated, url_class])

    c += 1
    if(c%10000 == 0):
        print(c)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000


In [26]:
with open('data_for_classification_tmp.csv', 'w', encoding='utf-8') as file:
    writer = csv.writer(file)
    for row in data_list:
        writer.writerow(row)