In [13]:
import pandas as pd
import numpy as np
from sklearn import *
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import warnings 
warnings.simplefilter('ignore')


In [14]:
from sklearn.model_selection import train_test_split

# Column names for the CSV, in file order. The final entry ('Result') is the
# column used as the prediction target below.
cols = [
    'having_IP_Address',
    'URL_Length',
    'Shortining_Service',
    'having_At_Symbol',
    'double_slash_redirecting',
    'Prefix_Suffix',
    'having_Sub_Domain',
    'SSLfinal_State',
    'Domain_registeration_length',
    'Favicon',
    'port',
    'HTTPS_token',
    'Request_URL',
    'URL_of_Anchor',
    'Links_in_tags',
    'SFH',
    'Submitting_to_email',
    'Abnormal_URL',
    'Redirect',
    'on_mouseover',
    'RightClick',
    'popUpWidnow',
    'Iframe',
    'age_of_domain',
    'DNSRecord',
    'web_traffic',
    'Page_Rank',
    'Google_Index',
    'Links_pointing_to_page',
    'Statistical_report',
    'Result',
]
path = '/content/phishing_dataset.csv'

# Column names are supplied explicitly through names=cols and every value is
# parsed as a 32-bit integer.
phishing_dataset = pd.read_csv(path, delimiter=',', dtype=np.int32, names=cols)

# Features are every column except the last; the last column is the target.
samples = phishing_dataset.iloc[:, :-1]
targets = phishing_dataset.iloc[:, -1]

# Hold out 20% of the rows for testing; random_state=0 keeps the split repeatable.
(training_samples, testing_samples,
 training_targets, testing_targets) = train_test_split(
    samples, targets, test_size=0.2, random_state=0)

In [15]:
# Rank features by the magnitude of their fitted linear-model coefficient
def calc_feat_imp(model, samples):
    """Return ``(column, coefficient)`` pairs ordered by descending ``|coefficient|``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``coef_``. It may be 1-D, or 2-D with a single row; it is
        flattened before being matched against the columns.
    samples : pandas.DataFrame
        Frame whose ``columns`` name the coefficients, in the same order.

    Returns
    -------
    list of tuple
        One ``(column_name, coefficient)`` pair per column, largest absolute
        coefficient first. Ties keep their column order.

    Raises
    ------
    ValueError
        If the number of coefficients differs from the number of columns
        (for example a multi-target model), since the pairing would be wrong.
    """
    # Flatten so a (1, n_features) coef_ is paired element-wise with the
    # columns instead of pairing the first column with the whole row.
    coef = np.ravel(model.coef_)
    col = list(samples.columns)

    if len(coef) != len(col):
        raise ValueError(
            "Expected one coefficient per column, got %d coefficients for %d columns"
            % (len(coef), len(col))
        )

    # Pair directly rather than through a dict, so columns that happen to
    # share a name are not silently collapsed into one entry.
    return sorted(zip(col, coef), key=lambda kv: abs(kv[1]), reverse=True)

In [16]:
# Fit the model, then report train/test accuracy plus the coefficient ranking
def train_score(model, data, thres): # Data is a list
    """Fit ``model`` and score it as a thresholded -1/+1 classifier.

    Parameters
    ----------
    model : estimator with ``fit``, ``predict`` and (after fitting) ``coef_``.
    data : sequence laid out as
        ``[train_samples, train_targets, test_samples, test_targets]``.
    thres : cut-off applied to the raw predictions; values below it become
        ``-1`` and everything else becomes ``1``.

    Returns
    -------
    tuple
        ``([train_accuracy_pct, test_accuracy_pct], feature_ranking)`` where
        the ranking is whatever ``calc_feat_imp`` returns for the training
        samples.
    """
    model.fit(data[0], data[1])

    def _accuracy_pct(features, labels):
        # The raw prediction is continuous; cut it at `thres` to get labels.
        predicted = np.where(model.predict(features) < thres, -1, 1)
        return 100.0 * accuracy_score(labels, predicted)

    # Offsets 0 and 2 are the (samples, targets) pairs for train and test.
    acc = [_accuracy_pct(data[start], data[start + 1]) for start in (0, 2)]

    return acc, calc_feat_imp(model, data[0])

In [17]:
# Baseline: linear regression on the features as they currently stand
model = LinearRegression()

# The 0.40 threshold was picked after a few rounds of trial and error.
# The four datasets are passed together as one list.
acc, feat_sort = train_score(
    model,
    [training_samples, training_targets, testing_samples, testing_targets],
    thres=0.40,
)

# Train accuracy first, then test accuracy (both in percent)
print(*acc)

88.68159203980099 88.2858435097241


## Let's create degree-2 polynomial features from the top 20 features to capture interactions

In [18]:
# Polynomial feature expansion used for the polynomial linear regression
def poly_feat(x, deg):
    """Expand ``x`` into polynomial and interaction terms up to degree ``deg``.

    Parameters
    ----------
    x : pandas.DataFrame (or array-like)
        Input features. When ``x`` has ``columns``, the generated terms are
        named after them; otherwise the transformer's default names are used.
    deg : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    pandas.DataFrame
        The expanded features, one labelled column per generated term. The
        frame is built from the transformed array, so it carries a fresh
        default index rather than ``x``'s index.
    """
    from sklearn.preprocessing import PolynomialFeatures

    poly = PolynomialFeatures(degree=deg)
    expanded = poly.fit_transform(x)

    # Names must come from the frame actually being transformed. Taking them
    # from an unrelated global frame mislabels the terms whenever x is a
    # subset or reordering of that frame's columns.
    input_names = list(x.columns) if hasattr(x, "columns") else None

    # Newer scikit-learn releases only provide get_feature_names_out; fall
    # back to the older accessor when that is all the installed version has.
    if hasattr(poly, "get_feature_names_out"):
        term_names = poly.get_feature_names_out(input_names)
    else:
        term_names = poly.get_feature_names(input_names)

    return pd.DataFrame(expanded, columns=list(term_names))

In [19]:
d = 2
feat_cnt = 20

# feat_sort is already ordered, so its first feat_cnt names are the ones to keep
col_top_cnt = [name for name, _ in feat_sort[:feat_cnt]]

# Restrict both splits to the selected columns.
# NOTE: this overwrites training_samples / testing_samples in place.
training_samples, testing_samples = (
    frame.loc[:, col_top_cnt] for frame in (training_samples, testing_samples)
)

# Expand each split into degree-d polynomial features
training_samples, testing_samples = (
    poly_feat(frame, d) for frame in (training_samples, testing_samples)
)

# Refit on the expanded features and report accuracy.
# The 0.40 threshold was picked after a few rounds of trial and error.
# The four datasets are passed together as one list.
acc, feat_sort = train_score(
    model,
    [training_samples, training_targets, testing_samples, testing_targets],
    thres=0.40,
)

# Train accuracy first, then test accuracy (both in percent)
print(*acc)

92.59384893713252 92.08502939846224


In [20]:
# Keep only the top-ranked features and check that accuracy does not drop
feat_cnt = 50
col_top_cnt = [name for name, _ in feat_sort[:feat_cnt]]

# NOTE: this overwrites training_samples / testing_samples in place.
training_samples, testing_samples = (
    frame.loc[:, col_top_cnt] for frame in (training_samples, testing_samples)
)

# Refit on the reduced feature set and report accuracy.
# The 0.40 threshold was picked after a few rounds of trial and error.
# The four datasets are passed together as one list.
acc, feat_sort = train_score(
    model,
    [training_samples, training_targets, testing_samples, testing_targets],
    thres=0.40,
)

# Train accuracy first, then test accuracy (both in percent)
print(*acc)

92.22071460877432 92.13025780189959


In [21]:
# Closing remark: this bare string is the cell's last expression, so it is echoed as the cell output.
'''Only 50 Features was required for the best score'''

'Only 50 Features was required for the best score'