# K Nearest Neighbour, Logistic Regression and Decision Tree (Phishing Dataset)

### Import modules

In [1]:
from sklearn import datasets
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import accuracy_score

### Define Input and Output

In [2]:
# Read the comma-separated data file (path is relative to the working
# directory) into a 2-D NumPy array; np.loadtxt parses values as floats
# by default.
phishing = np.loadtxt(fname='phishing.txt', delimiter=',')

#attribute having_IP_Address  { -1,1 }
#attribute URL_Length   { 1,0,-1 }
#attribute Shortining_Service { 1,-1 }
#attribute having_At_Symbol   { 1,-1 }
#attribute double_slash_redirecting { -1,1 }
#attribute Prefix_Suffix  { -1,1 }
#attribute having_Sub_Domain  { -1,0,1 }
#attribute SSLfinal_State  { -1,1,0 }
#attribute Domain_registeration_length { -1,1 }
#attribute Favicon { 1,-1 }
#attribute port { 1,-1 }
#attribute HTTPS_token { -1,1 }
#attribute Request_URL  { 1,-1 }
#attribute URL_of_Anchor { -1,0,1 }
#attribute Links_in_tags { 1,-1,0 }
#attribute SFH  { -1,1,0 }
#attribute Submitting_to_email { -1,1 }
#attribute Abnormal_URL { -1,1 }
#attribute Redirect  { 0,1 }
#attribute on_mouseover  { 1,-1 }
#attribute RightClick  { 1,-1 }
#attribute popUpWidnow  { 1,-1 }
#attribute Iframe { 1,-1 }
#attribute age_of_domain  { -1,1 }
#attribute DNSRecord   { -1,1 }
#attribute web_traffic  { -1,0,1 }
#attribute Page_Rank { -1,1 }
#attribute Google_Index { 1,-1 }
#attribute Links_pointing_to_page { 1,0,-1 }
#attribute Statistical_report { -1,1 }
#attribute Result  { -1,1 }

### Create X and Y data

In [3]:
# Feature matrix: the first 30 columns of the loaded array.
X = phishing[:, :30]
# Target: column 30 (presumably the "Result" attribute listed with the data).
# It is sliced rather than indexed, so y stays a 2-D column of shape (n, 1).
y = phishing[:, 30:31]
print('Class labels:', np.unique(y))


# Hold out 30% of the rows for testing. stratify=y asks for the same class
# proportions in both parts, and random_state=1 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)
# Collapse the (n, 1) label columns into freshly copied 1-D arrays.
y_train = y_train.flatten()
y_test = y_test.flatten()


Class labels: [-1.  1.]


In [4]:
# Quick look at the feature matrix; NumPy abbreviates large arrays with
# ellipses, so only the corner rows and columns are printed.
print(X)

[[-1.  1.  1. ...  1.  1. -1.]
 [ 1.  1.  1. ...  1.  1.  1.]
 [ 1.  0.  1. ...  1.  0. -1.]
 ...
 [ 1. -1.  1. ...  1.  0.  1.]
 [-1. -1.  1. ...  1.  1.  1.]
 [-1. -1.  1. ... -1.  1. -1.]]


### Train and Evaluate Models

In [5]:

# k-nearest-neighbours classifier: 5 neighbours, Minkowski metric with p=2
# (i.e. Euclidean distance). fit() returns the estimator itself, so
# construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski').fit(X_train, y_train)
knn_pred = knn.predict(X_test)

# Number of held-out samples whose predicted label differs from the true one.
knn_errors = np.count_nonzero(y_test != knn_pred)
print('knn Misclassified samples: %d' % knn_errors)
# Accuracy is reported twice: via the metrics helper and via the estimator's
# own score() method (mean accuracy on the given data).
knn_accuracy = accuracy_score(y_test, knn_pred)
print('Accuracy Score: %.2f' % knn_accuracy)
print('Knn Score: %.2f' % knn.score(X_test, y_test))

knn Misclassified samples: 211
Accuracy Score: 0.94
Knn Score: 0.94


In [6]:
# Logistic-regression classifier with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training are chained.
Log_reg = LogisticRegression().fit(X_train, y_train)
Log_reg_pred = Log_reg.predict(X_test)

# Number of held-out samples whose predicted label differs from the true one.
Log_reg_errors = np.count_nonzero(y_test != Log_reg_pred)
print('Log_reg Misclassified samples: %d' % Log_reg_errors)
# Accuracy is reported twice: via the metrics helper and via the estimator's
# own score() method (mean accuracy on the given data).
Log_reg_accuracy = accuracy_score(y_test, Log_reg_pred)
print('Accuracy Score: %.2f' % Log_reg_accuracy)
print('Log_reg Score: %.2f' % Log_reg.score(X_test, y_test))

Log_reg Misclassified samples: 250
Accuracy Score: 0.92
Log_reg Score: 0.92


In [7]:
# Decision-tree classifier. random_state is pinned (to the same seed used for
# the train/test split) because scikit-learn randomly permutes the candidate
# features at each split; without a seed the fitted tree, and therefore the
# numbers printed below, can vary between runs of the notebook.
# NOTE: figures recorded from earlier unseeded runs may differ slightly.
Dec_tree = tree.DecisionTreeClassifier(random_state=1)
Dec_tree.fit(X_train, y_train)
Dec_tree_pred = Dec_tree.predict(X_test)

# Number of held-out samples whose predicted label differs from the true one.
print('Dec_tree Misclassified samples: %d' % (y_test != Dec_tree_pred).sum())
# Accuracy is reported twice: via the metrics helper and via the estimator's
# own score() method (mean accuracy on the given data).
print('Accuracy Score: %.2f' % accuracy_score(y_test, Dec_tree_pred))
print('Dec_tree Score: %.2f' % Dec_tree.score(X_test, y_test))

Dec_tree Misclassified samples: 163
Accuracy Score: 0.95
Dec_tree Score: 0.95
