## Importing the libraries

In [1]:
import numpy as np
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv('malicious_url_dataset.csv')

## Extracting Year from Date Columns

In [3]:
dataset[['WHOIS_COUNTRY', 'WHOIS_STATEPRO', 'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE']] = dataset[['WHOIS_COUNTRY', 'WHOIS_STATEPRO', 'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE']].astype("string")


In [4]:
def extract_year(arg):
    if(arg.item == 'None'):
        return
    else:
        return arg.str.slice(6, 10)

In [5]:
dataset = dataset.apply(lambda x: extract_year(x) if(x.name == 'WHOIS_REGDATE') else x)
dataset = dataset.apply(lambda x: extract_year(x) if(x.name == 'WHOIS_UPDATED_DATE') else x)

In [6]:
dataset = dataset.replace(r'', 'None')

## Preview

In [7]:
dataset.head()

Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,WHOIS_COUNTRY,WHOIS_STATEPRO,WHOIS_REGDATE,WHOIS_UPDATED_DATE,Type
0,16,7,,,2015.0,,1
1,16,6,,,,,0
2,16,6,,,,,0
3,17,6,US,AK,1997.0,2013.0,0
4,17,6,US,TX,1996.0,2017.0,0


In [8]:
dataset.shape

(1768, 7)

In [9]:
dataset.dtypes

URL_LENGTH                    int64
NUMBER_SPECIAL_CHARACTERS     int64
WHOIS_COUNTRY                string
WHOIS_STATEPRO               string
WHOIS_REGDATE                string
WHOIS_UPDATED_DATE           string
Type                          int64
dtype: object

In [10]:
dataset.isnull().sum()

URL_LENGTH                   0
NUMBER_SPECIAL_CHARACTERS    0
WHOIS_COUNTRY                0
WHOIS_STATEPRO               0
WHOIS_REGDATE                0
WHOIS_UPDATED_DATE           0
Type                         0
dtype: int64

In [11]:
dataset['WHOIS_COUNTRY'].value_counts()

US                1103
None               306
CA                  84
ES                  63
AU                  35
PA                  21
GB                  19
UK                  10
IN                  10
CN                  10
JP                  10
FR                   9
CZ                   9
CH                   6
KR                   5
NL                   5
PH                   4
ru                   4
BS                   4
AT                   4
DE                   3
us                   3
KY                   3
BE                   3
SE                   3
TR                   3
SC                   2
RU                   2
KG                   2
IL                   2
HK                   2
NO                   2
UY                   2
BR                   2
Cyprus               2
UG                   1
LU                   1
PK                   1
United Kingdom       1
LV                   1
AE                   1
IE                   1
se                   1
UA         

In [12]:
dataset['WHOIS_STATEPRO'].value_counts()

CA                372
None              362
NY                 75
WA                 65
Barcelona          62
                 ... 
worcs               1
RIX                 1
Andhra Pradesh      1
il                  1
Maharashtra         1
Name: WHOIS_STATEPRO, Length: 176, dtype: Int64

In [13]:
dataset['WHOIS_REGDATE'].value_counts()

2000    163
None    130
1996    120
1995    117
2001    108
2008    107
2005    107
1999    106
2003     97
2006     93
1997     90
2002     84
1994     78
1998     72
2004     72
2007     61
2009     43
2010     33
2016     26
2011     12
2015     10
1993      9
2014      9
2013      8
1990      5
2017      4
1992      3
2012      1
Name: WHOIS_REGDATE, dtype: Int64

## Encoding Categorical Data

In [14]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe.fit(dataset[['WHOIS_COUNTRY','WHOIS_STATEPRO','WHOIS_REGDATE', 'WHOIS_UPDATED_DATE']])

OneHotEncoder()

In [15]:
encoded = pd.DataFrame.sparse.from_spmatrix(ohe.transform(dataset[['WHOIS_COUNTRY', 'WHOIS_STATEPRO', 'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE']]))

In [16]:
dataset.drop(['WHOIS_COUNTRY', 'WHOIS_STATEPRO', 'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE'], axis='columns', inplace=True)

In [17]:
dataset = pd.concat([encoded, dataset], axis=1)

## Splitting the dataset into the Training set and Test set

In [18]:
from sklearn.model_selection import train_test_split
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Feature Scaling

In [19]:
# Scaling URL_LENGTH & NUMBER_SPECIAL_CHARACTERS

from sklearn.preprocessing import StandardScaler
# Scaling X_train
X_train_features = X_train[:, -2:]

scaler = StandardScaler().fit(X_train_features)
X_train_features = scaler.transform(X_train_features)

X_train[:, -2:] = X_train_features

# Scaling X_test
X_test_features = X_test[:, -2:]

scaler = StandardScaler().fit(X_test_features)
X_test_features = scaler.transform(X_test_features)

X_test[:, -2:] = X_test_features

## Training KNN Model

In [39]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 4, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=4)

## Confusion Matrix

In [40]:
y_pred = classifier.predict(X_test)

In [41]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[305   6]
 [ 15  28]]


0.940677966101695