In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import sklearn as skl

In [3]:
# Load the phishing-link dataset from the working directory.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column's dtype is inferred from all of its values.
raw_data = pd.read_csv("dataset_link_phishing.csv", low_memory=False)

In [4]:
# Show (rows, columns) of the freshly loaded frame.
raw_data.shape

(19431, 87)

In [5]:
# List the column labels to see which fields the dataset provides.
raw_data.columns

Index(['Unnamed: 0', 'url', 'url_length', 'hostname_length', 'ip', 'total_of.',
       'total_of-', 'total_of@', 'total_of?', 'total_of&', 'total_of=',
       'total_of_', 'total_of~', 'total_of%', 'total_of/', 'total_of*',
       'total_of:', 'total_of,', 'total_of;', 'total_of$', 'total_of_www',
       'total_of_com', 'total_of_http_in_path', 'https_token',
       'ratio_digits_url', 'ratio_digits_host', 'punycode', 'port',
       'tld_in_path', 'tld_in_subdomain', 'abnormal_subdomain',
       'nb_subdomains', 'prefix_suffix', 'random_domain', 'shortening_service',
       'path_extension', 'nb_redirection', 'nb_external_redirection',
       'length_words_raw', 'char_repeat', 'shortest_words_raw',
       'shortest_word_host', 'shortest_word_path', 'longest_words_raw',
       'longest_word_host', 'longest_word_path', 'avg_words_raw',
       'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand',
       'brand_in_subdomain', 'brand_in_path', 'suspecious_tld',
       'statist

In [6]:
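# A single `.isnull().sum()` only yields per-column null counts, which did not
# show at a glance whether anything was missing. Summing those counts a second
# time gives one grand total, and that total confirmed the data has no nulls.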

In [7]:
# Count the missing cells in each column, then add the per-column counts
# together to get one total for the whole frame.
null_info = raw_data.isna().sum(axis="index").sum()

In [8]:
# Display the total number of missing cells in raw_data.
null_info

0

In [9]:
# Build a reduced copy of the raw frame without the seven columns listed
# below; raw_data itself is left untouched because drop() returns a new frame.
slightly_cleaned_data = raw_data.drop(
    columns=[
        'Unnamed: 0',
        'url',
        'ratio_nullHyperlinks',
        'ratio_intRedirection',
        'ratio_intErrors',
        'submit_email',
        'sfh',
    ]
)

In [10]:
# Display the reduced frame to inspect the remaining columns and raw values.
slightly_cleaned_data

Unnamed: 0,url_length,hostname_length,ip,total_of.,total_of-,total_of@,total_of?,total_of&,total_of=,total_of_,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,46,20,0,3,0,0,1,0,1,0,...,1,one,0,627,6678,78526,0,0,5,phishing
1,128,120,0,10,0,0,0,0,0,0,...,1,zero,0,300,65,0,0,1,0,phishing
2,52,25,0,3,0,0,0,0,0,0,...,1,zero,0,119,1707,0,0,1,0,phishing
3,21,13,0,2,0,0,0,0,0,0,...,1,one,0,130,1331,0,0,0,0,legitimate
4,28,19,0,2,0,0,0,0,0,0,...,0,zero,0,164,1662,312044,0,0,4,legitimate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19426,45,17,1,2,0,0,0,0,0,0,...,0,0,0,448,5396,3980,0,0,6,legitimate
19427,84,18,1,5,0,1,1,0,1,0,...,1,0,0,211,6728,0,0,1,0,phishing
19428,105,16,1,2,6,0,1,0,1,1,...,0,0,0,2809,8515,8,0,1,10,legitimate
19429,38,30,1,2,0,0,0,0,0,0,...,1,0,0,85,2836,2455493,0,0,4,legitimate


In [16]:
# Normalise 'domain_with_copyright' to integer codes.
#
# The column mixes integer-like entries with the spelled-out words "zero" and
# "one". Each entry is parsed exactly once and the column is rebuilt in a
# single assignment, rather than being patched cell by cell through .loc.
# This keeps the original row labels (no assumption of a 0..n-1 index) and
# leaves the column with a real int64 dtype instead of object.
WORD_TO_CODE = {"zero": 0, "one": 1}
UNPARSED_CODE = -1  # stored for entries that cannot be interpreted


def _parse_copyright_flag(value):
    """Return the integer code for one raw entry, or None if unrecognised.

    Values accepted by ``int`` are converted directly. Anything else is
    looked up case-insensitively in ``WORD_TO_CODE``; a miss yields None so
    the caller can count it before substituting ``UNPARSED_CODE``.
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        # str() guards against non-string leftovers (e.g. NaN), which have
        # no .lower() method and would otherwise raise here.
        return WORD_TO_CODE.get(str(value).lower())


raw_flags = slightly_cleaned_data['domain_with_copyright']
parsed_flags = [_parse_copyright_flag(value) for value in raw_flags]

# Number of entries that were neither integer-like nor "zero"/"one".
error = sum(code is None for code in parsed_flags)

# assign() returns a new frame, so slightly_cleaned_data is not modified and
# the replaced column keeps its original position.
data_clean_2 = slightly_cleaned_data.assign(
    domain_with_copyright=pd.Series(
        [UNPARSED_CODE if code is None else code for code in parsed_flags],
        index=raw_flags.index,
        dtype="int64",
    )
)
error

0

In [28]:
# Inspect the per-column dtypes of the cleaned frame.
data_clean_2.dtypes

url_length          int64
hostname_length     int64
ip                  int64
total_of.           int64
total_of-           int64
                    ...  
web_traffic         int64
dns_record          int64
google_index        int64
page_rank           int64
status             object
Length: 80, dtype: object

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [21]:
# Separate the label from the predictors: 'status' becomes the target vector
# and every remaining column goes into the feature matrix.
y = data_clean_2['status']
X = data_clean_2.drop(columns='status')

In [22]:
# Display the feature matrix to confirm that 'status' is no longer a column.
X

Unnamed: 0,url_length,hostname_length,ip,total_of.,total_of-,total_of@,total_of?,total_of&,total_of=,total_of_,...,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank
0,46,20,0,3,0,0,1,0,1,0,...,0,1,1,0,627,6678,78526,0,0,5
1,128,120,0,10,0,0,0,0,0,0,...,1,1,0,0,300,65,0,0,1,0
2,52,25,0,3,0,0,0,0,0,0,...,0,1,0,0,119,1707,0,0,1,0
3,21,13,0,2,0,0,0,0,0,0,...,0,1,1,0,130,1331,0,0,0,0
4,28,19,0,2,0,0,0,0,0,0,...,0,0,0,0,164,1662,312044,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19426,45,17,1,2,0,0,0,0,0,0,...,0,0,0,0,448,5396,3980,0,0,6
19427,84,18,1,5,0,1,1,0,1,0,...,0,1,0,0,211,6728,0,0,1,0
19428,105,16,1,2,6,0,1,0,1,1,...,0,0,0,0,2809,8515,8,0,1,10
19429,38,30,1,2,0,0,0,0,0,0,...,0,1,0,0,85,2836,2455493,0,0,4


In [29]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Sanity-check the split: shapes of the train/test features and labels.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15544, 79), (3887, 79), (15544,), (3887,))

In [33]:
# Fit a random forest with default hyper-parameters on the training split.
# random_state is pinned (to the same seed used for the train/test split)
# because the forest's bootstrap sampling and per-split feature selection are
# randomised. Without a seed every run trains a different model, so the
# accuracy reported below could not be reproduced.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)

In [34]:
# Predict a class label for every row of the held-out test features.
y_pred = forest.predict(X_test)

In [36]:
# Share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9907383586313352