In [1]:
import pandas as pd
import numpy as np
from sklearn import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings 
warnings.simplefilter('ignore')


In [2]:
cols = ['having_IP_Address','URL_Length','Shortining_Service','having_At_Symbol','double_slash_redirecting','Prefix_Suffix','having_Sub_Domain','SSLfinal_State','Domain_registeration_length','Favicon','port','HTTPS_token','Request_URL','URL_of_Anchor','Links_in_tags','SFH','Submitting_to_email','Abnormal_URL','Redirect','on_mouseover','RightClick','popUpWidnow','Iframe','age_of_domain','DNSRecord','web_traffic','Page_Rank','Google_Index','Links_pointing_to_page','Statistical_report','Result']
path = '/content/phishing_dataset.csv'

phishing_dataset = pd.read_csv( path, delimiter=',', dtype=np.int32,names=cols)

samples = phishing_dataset.iloc[:,:-1]
targets = phishing_dataset.iloc[:, -1]

from sklearn.model_selection import train_test_split

training_samples, testing_samples, training_targets, testing_targets = train_test_split(
         samples, targets, test_size=0.2, random_state=7)

In [3]:
# Check the LR coef to undrstand Features Importance
def calc_feat_imp(model, samples):
    
    coef = model.coef_[0]
    col = list(samples.columns)
    col_dict = dict(list(zip(col,coef)))
    feat_sort =  sorted(col_dict.items(), key=lambda kv: abs(kv[1]), reverse=True)
    return feat_sort

In [4]:
# Train and return Accuracy
def train_score(model, data, threshold=0.5):
    
    acc = []
    model.fit(data[0], data[1])
    
    for index in [0,2]:
        pred = model.predict_proba(data[index],)
        pred = np.where(pred[:,1] > threshold, 1, -1)
        accuracy = 100.0 * accuracy_score(data[index+1], pred)
        acc.append(accuracy)
    feat_imp = calc_feat_imp(model, data[0])
    
    return acc, feat_imp

In [5]:
# Print score
model = LogisticRegression()
acc, feat_sort = train_score(model, [training_samples, training_targets, testing_samples, testing_targets])

print(acc[0], acc[1])

92.48077792853913 93.71325192220714


## Let's create the degree-2 poly features of top 20 featutes to handle Interaction

In [6]:
# Polynomial Linear Regresion 
def poly_feat(x,deg):
    from sklearn.preprocessing import PolynomialFeatures

    poly = PolynomialFeatures(degree = deg) 
    x = pd.DataFrame(poly.fit_transform(x), columns=poly.get_feature_names(samples.columns)) 

    return x

In [7]:
d = 2
feat_cnt = 20
col_top_cnt = [ elem[0] for elem in feat_sort ][:feat_cnt]

# Select top "col_top_cnt" Features
training_samples, testing_samples  = training_samples.loc[:,col_top_cnt], testing_samples.loc[:,col_top_cnt]

# Generate Polynomial Features
training_samples, testing_samples  = poly_feat(training_samples, d), poly_feat(testing_samples, d) # testing_samples.shape

# Print score
THRESHOLD = 0.425
acc, feat_sort = train_score(model, [training_samples, training_targets, testing_samples, testing_targets], threshold=THRESHOLD)
print(acc[0], acc[1])

95.31886024423338 95.16056083220262


In [8]:
# Top Features to reduce Feature count without impacting accuracy
feat_cnt = 140
col_top_cnt = [ elem[0] for elem in feat_sort ][:feat_cnt]
training_samples, testing_samples  = training_samples.loc[:,col_top_cnt], testing_samples.loc[:,col_top_cnt]

# Print score
acc, feat_sort = train_score(model, [training_samples, training_targets, testing_samples, testing_targets]) #testing_samples.shape[0]*(1-.9502487562189054)

print(acc[0], acc[1])

95.27363184079603 95.02487562189054


## **Hyper-parameter tuning**

In [9]:
from sklearn.model_selection import GridSearchCV
# Create regularization penalty space
penalty = ['l1', 'l2']
# Create regularization hyperparameter space
C = np.linspace(0,0.1,20)
# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [10]:
# Create grid search using 5-fold cross validation
clf = GridSearchCV(model, hyperparameters, cv=5, verbose=0)
# Fit grid search
grid_clf = clf.fit(training_samples, training_targets)


In [11]:
# Print score
acc, feat_sort = train_score(model, [training_samples, training_targets, testing_samples, testing_targets])

print(acc[0], acc[1])

95.27363184079603 95.02487562189054


In [12]:
'''No benefit of Hyper-parm tuning'''

'No benefit of Hyper-parm tuning'