In [None]:
'''Feature Selection by Feature Importance With Scaling'''
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.model_selection import train_test_split

#For imbalanced dataset
DATAPATH='/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_prob_freq.csv'
#For balanced dataset
#DATAPATH='/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_prob_freq_5000.csv'

dataset=pd.read_csv(DATAPATH)
scaler=MinMaxScaler(feature_range=[0,1])
data_scaled=scaler.fit_transform(dataset)

X=dataset[['hash_count','at_count','dash_count','dot_count',
           'dol_count','asteric_count','leftparen_count','rightparen_count',
           'plus_count','semicolor_count','tide_count','colon_count',
           'apos_count','slash_count','percentage_count','quest_count',
           'comma_count','equ_count','amper_count','exclam_count',
           'under_count']] # Labels
y=dataset['Result']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#Create a RandomForest Classifier
clf=RandomForestClassifier()

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

feature_imp = pd.Series(clf.feature_importances_,index=['hash_count','at_count','dash_count','dot_count',
                                                        'dol_count','asteric_count','leftparen_count','rightparen_count',
                                                        'plus_count','semicolor_count','tide_count','colon_count',
                                                        'apos_count','slash_count','percentage_count','quest_count',
                                                        'comma_count','equ_count','amper_count','exclam_count',
                                                        'under_count']).sort_values(ascending=False)

# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()

df=pd.DataFrame()
for x in range(10):
    label=str(feature_imp.index[x])
    df.insert(x,label,dataset[label])

export_csv = df.to_csv (r'/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/featselect.csv', index = None, header=True)


In [None]:
#Support Vector Machine W/O Hyperparameter Tuning
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix,roc_auc_score
from sklearn.metrics import classification_report

#imbalanced dataset
print('imbalanced dataset')
DATA_PATH="/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect.csv"
#balanced dataset
#print('balanced dataset')
#DATA_PATH='/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect5000.csv'
dataset=pd.read_csv(DATA_PATH)

y=dataset['Result']
#W/O Entropy
#print('W/O Entropy')
#X=dataset[['IP_Address','EXE','Sensitive_Word','double_slash_redirecting','W3_HTTP_token','who_is','dash_sign','at_sign','dot_sign','free_host','port']]
#W/ Entropy
print('W/ Entropy')
X=dataset.drop('Result',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=None)

clf = SVC(gamma='auto')
clf.fit(X_train, y_train)
prediction=clf.predict(X_test)
rocscore=roc_auc_score(y_test,prediction)
print('-------------------------------------------------------------')
print('Precision   Recall    F_score   Support\n',precision_recall_fscore_support(y_test,prediction))
print('-------------------------------------------------------------')
print('Roc Score',rocscore)
print('Accuracy Score',accuracy_score(y_test,prediction))
print('Confusion metric\n',str(pd.DataFrame(confusion_matrix(y_test, prediction),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos'])))
print('-------------------------------------------------------------')
print('-------------------------------------------------------------')
print("=== Classification Report ===")
print(classification_report(y_test.round, prediction,digits=6))

In [None]:
#Gaussian Naive Bayes W/O Hyperparameter Tuning
import numpy as np
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix,roc_auc_score

#imbalanced dataset
print('imbalanced dataset')
DATA_PATH="/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect.csv"
#balanced dataset
#print('balanced dataset')
#DATA_PATH='/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect5000.csv'
dataset=pd.read_csv(DATA_PATH)

y=dataset['Result']
#W/O Entropy
#print('W/O Entropy')
#X=dataset[['IP_Address','EXE','Sensitive_Word','double_slash_redirecting','W3_HTTP_token','who_is','dash_sign','at_sign','dot_sign','free_host','port']]
#W/ Entropy
print('W/ Entropy')
X=dataset.drop('Result',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=None)

clf = GaussianNB()
clf.fit(X_train, y_train) 
prediction=clf.predict(X_test)
rocscore=roc_auc_score(y_test,prediction)
print('-------------------------------------------------------------')
print('Precision   Recall    F_score   Support\n',precision_recall_fscore_support(y_test,prediction))
print('-------------------------------------------------------------')
print('Roc Score',rocscore)
print('Accuracy Score',accuracy_score(y_test,prediction))
print('Confusion metric\n',str(pd.DataFrame(confusion_matrix(y_test, prediction),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos'])))
print('-------------------------------------------------------------')
print('-------------------------------------------------------------')
print("=== Classification Report ===")
print(classification_report(y_test, prediction,digits=6))

In [None]:
#Random Forest W/O Hyperparameter Tuning
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix,roc_auc_score

#imbalanced dataset
print('imbalanced dataset')
DATA_PATH="/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect.csv"
#balanced dataset
#print('balanced dataset')
#DATA_PATH='/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect5000.csv'
dataset=pd.read_csv(DATA_PATH)

y=dataset['Result']
#W/O Entropy
#print('W/O Entropy')
#X=dataset[['IP_Address','EXE','Sensitive_Word','double_slash_redirecting','W3_HTTP_token','who_is','dash_sign','at_sign','dot_sign','free_host','port']]
#W/ Entropy
print('W/ Entropy')
X=dataset.drop('Result',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=None)

clf=RandomForestClassifier()
clf.fit(X_train, y_train) 
prediction=clf.predict(X_test)
rocscore=roc_auc_score(y_test,prediction)
print('-------------------------------------------------------------')
print('Precision   Recall    F_score   Support\n',precision_recall_fscore_support(y_test,prediction))
print('-------------------------------------------------------------')
print('Roc Score',rocscore)
print('Accuracy Score',accuracy_score(y_test,prediction))
print('Confusion metric\n',str(pd.DataFrame(confusion_matrix(y_test, prediction),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos'])))
print('-------------------------------------------------------------')
print('-------------------------------------------------------------')
print("=== Classification Report ===")
print(classification_report(y_test, prediction,digits=6))

In [4]:
#Random Forest W/ Hyperparameter Tuning Checked
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix,roc_auc_score,f1_score,balanced_accuracy_score
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]

def grid_search_wrapper(refit_score):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    
    print('\nRefit by ',refit_score)
    
    #start model
    start_time  = datetime.datetime.now()
    print('\nStart time',start_time)
    
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    #end model fit
    end_time = datetime.datetime.now()
    print('\nEnd time',end_time)

    return grid_search

#imbalanced dataset
print('imbalanced dataset')
DATA_PATH="/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect.csv"
DATA_PATH="D:/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect.csv"
#balanced dataset
#print('balanced dataset')
#DATA_PATH='/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect5000.csv'
#DATA_PATH="D:/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect5000.csv"
dataset=pd.read_csv(DATA_PATH)

y=dataset['Result']
#W/O Entropy
#print('W/O Entropy')
#X=dataset[['IP_Address','EXE','Sensitive_Word','double_slash_redirecting','W3_HTTP_token','who_is','dash_sign','at_sign','dot_sign','free_host','port']]
#W/ Entropy
print('W/ Entropy')
X=dataset.drop('Result',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

# show the distribution
print('y_train class distribution\n')
print(y_train.value_counts(normalize=True))
print('\ny_test class distribution')
print('\n',y_test.value_counts(normalize=True))

clf = RandomForestClassifier()

'''param_grid = {
    'n_estimators' : [100,150,200,250,300,350,400,450,500],'criterion': ['gini', 'entropy'],'bootstrap': [True, False]
}'''
'''param_grid = {
    'n_estimators' : [100,200,300,400,500,600,700],'criterion': ['gini', 'entropy'],'bootstrap': [True, False]
}'''
param_grid = {
    'n_estimators' : [100,110,120,130,140,150,160,170,180,190,200,300,400,500,600,700],
    #'n_estimators' : [10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,300,400,500,600,700],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

scorers = {
    'precision_micro_score': make_scorer(precision_score, average='micro'),
    'precision_score': make_scorer(precision_score, average='weighted'),
    'recall_micro_score': make_scorer(recall_score, average='micro'),
    'recall_score': make_scorer(recall_score, average='weighted'),
    'accuracy_score': make_scorer(accuracy_score),
    'roc_score_scorer' : make_scorer(roc_auc_score,average='weighted'),
    'metrics_roc_auc_score' : 'roc_auc',
    'f1_score' : make_scorer(f1_score, average='weighted'),
    'tp': make_scorer(tp),
    'tn': make_scorer(tn),
    'fp': make_scorer(fp),
    'fn': make_scorer(fn)
}


refit_scorer='metrics_roc_auc_score'
print('Refit by ',refit_scorer)
grid_search_clf = grid_search_wrapper(refit_score=refit_scorer)

# make the predictions
y_pred = grid_search_clf.predict(X_test)

print('\nBest params for ')
print('\n',grid_search_clf.best_params_)
best_parameters = grid_search_clf.best_estimator_.get_params()
best_result = grid_search_clf.best_score_
print('\nBest params :',best_parameters)
print('\nBest Result :',best_result)

# confusion matrix on the test data.
print('\nConfusion matrix of Random Forest optimized for the test data:')
print('\n',str(pd.DataFrame(confusion_matrix(y_test, y_pred), columns=['pred_neg', 'pred_pos'], index=['neg', 'pos'])))

results = pd.DataFrame(grid_search_clf.cv_results_)
results = results.sort_values(by='mean_test_metrics_roc_auc_score', ascending=False)

print('\n',str(results[['mean_test_precision_score','mean_test_precision_micro_score', 
                        'mean_test_recall_score','mean_test_recall_micro_score',
                        'mean_test_accuracy_score','mean_test_f1_score',
                        'mean_test_roc_score_scorer','mean_test_metrics_roc_auc_score',
                        'param_n_estimators']].round(6)))

print('\n',str(results[['mean_train_tp','mean_train_fp','mean_train_tn','mean_train_fn']].round(6)))

print('\n',str(results[['mean_test_tp','mean_test_fp','mean_test_tn','mean_test_fn']].round(6)))

'''
print('\n',str(results[['split0_train_tp','split1_train_tp','split2_train_tp','split3_train_tp','split4_train_tp','split5_train_tp','split6_train_tp','split7_train_tp','split8_train_tp','split9_train_tp','mean_train_tp']].round(6)))
print('\n',str(results[['split0_test_tp','split1_test_tp','split2_test_tp','split3_test_tp','split4_test_tp','split5_test_tp','split6_test_tp','split7_test_tp','split8_test_tp','split9_test_tp','mean_test_tp']].round(6)))

print('\n',str(results[['split0_train_fp','split1_train_fp','split2_train_fp','split3_train_fp','split4_train_fp','split5_train_fp','split6_train_fp','split7_train_fp','split8_train_fp','split9_train_fp','mean_train_fp']].round(6)))
print('\n',str(results[['split0_test_fp','split1_test_fp','split2_test_fp','split3_test_fp','split4_test_fp','split5_test_fp','split6_test_fp','split7_test_fp','split8_test_fp','split9_test_fp','mean_test_fp']].round(6)))

print('\n',str(results[['split0_train_tn','split1_train_tn','split2_train_tn','split3_train_tn','split4_train_tn','split5_train_tn','split6_train_tn','split7_train_tn','split8_train_tn','split9_train_tn','mean_train_tn']].round(6)))
print('\n',str(results[['split0_test_tn','split1_test_tn','split2_test_tn','split3_test_tn','split4_test_tn','split5_test_tn','split6_test_tn','split7_test_tn','split8_test_tn','split9_test_tn','mean_test_tn']].round(6)))

print('\n',str(results[['split0_train_fn','split1_train_fn','split2_train_fn','split3_train_fn','split4_train_fn','split5_train_fn','split6_train_fn','split7_train_fn','split8_train_fn','split9_train_fn','mean_train_fn']].round(6)))
print('\n',str(results[['split0_test_fn','split1_test_fn','split2_test_fn','split3_test_fn','split4_test_fn','split5_test_fn','split6_test_fn','split7_test_fn','split8_test_fn','split9_test_fn','mean_test_fn']].round(6)))

print('\n',str(results[['std_train_tp','std_train_fp','std_train_tn','std_train_fn']].round(6)))

print('\n',str(results[['std_test_tp','std_test_fp','std_test_tn','std_test_fn']].round(6)))
'''

imbalanced dataset
W/ Entropy
y_train class distribution

-1    0.900766
 1    0.099234
Name: Result, dtype: float64

y_test class distribution

 -1    0.90334
 1    0.09666
Name: Result, dtype: float64
Refit by  metrics_roc_auc_score

Refit by  metrics_roc_auc_score

Start time 2021-09-04 15:13:37.203267

End time 2021-09-04 15:27:23.773697

Best params for 

 {'bootstrap': True, 'criterion': 'gini', 'n_estimators': 150}

Best params : {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 150, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

Best Result : 0.8946969634957629

Confusion matrix of Random Forest optimized for the test data:

      pred_neg  pred_pos
neg     23795 

"\nprint('\n',str(results[['split0_train_tp','split1_train_tp','split2_train_tp','split3_train_tp','split4_train_tp','split5_train_tp','split6_train_tp','split7_train_tp','split8_train_tp','split9_train_tp','mean_train_tp']].round(6)))\nprint('\n',str(results[['split0_test_tp','split1_test_tp','split2_test_tp','split3_test_tp','split4_test_tp','split5_test_tp','split6_test_tp','split7_test_tp','split8_test_tp','split9_test_tp','mean_test_tp']].round(6)))\n\nprint('\n',str(results[['split0_train_fp','split1_train_fp','split2_train_fp','split3_train_fp','split4_train_fp','split5_train_fp','split6_train_fp','split7_train_fp','split8_train_fp','split9_train_fp','mean_train_fp']].round(6)))\nprint('\n',str(results[['split0_test_fp','split1_test_fp','split2_test_fp','split3_test_fp','split4_test_fp','split5_test_fp','split6_test_fp','split7_test_fp','split8_test_fp','split9_test_fp','mean_test_fp']].round(6)))\n\nprint('\n',str(results[['split0_train_tn','split1_train_tn','split2_train_tn','

In [8]:
results = pd.DataFrame(grid_search_clf.cv_results_)
results = results.sort_values(by='mean_test_metrics_roc_auc_score', ascending=False)

print('\n',str(results[['mean_test_precision_score','mean_test_precision_micro_score', 
                        'mean_test_recall_score','mean_test_recall_micro_score',
                        'mean_test_accuracy_score','mean_test_f1_score',
                        'mean_test_roc_score_scorer','mean_test_metrics_roc_auc_score',
                        'param_n_estimators']].round(6)))

print('\n',str(results[['mean_train_tp','mean_train_fp','mean_train_tn','mean_train_fn']].round(6)))

print('\n',str(results[['mean_test_tp','mean_test_fp','mean_test_tn','mean_test_fn']].round(6)))


     mean_test_precision_score  mean_test_precision_micro_score  \
31                   0.937091                         0.940555   
28                   0.937004                         0.940530   
15                   0.937158                         0.940655   
9                    0.937023                         0.940580   
20                   0.937114                         0.940592   
14                   0.937396                         0.940806   
13                   0.937311                         0.940743   
12                   0.937237                         0.940718   
29                   0.937111                         0.940618   
22                   0.937065                         0.940592   
27                   0.937015                         0.940555   
30                   0.937079                         0.940580   
23                   0.937127                         0.940693   
11                   0.937202                         0.940731   
4       

In [3]:
#Random Forest W/ Hyperparameter Tuning Checked
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix,roc_auc_score,f1_score,balanced_accuracy_score
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]

def grid_search_wrapper(refit_score):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    
    print('\nRefit by ',refit_score)
    
    #start model
    start_time  = datetime.datetime.now()
    print('\nStart time',start_time)
    
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    #end model fit
    end_time = datetime.datetime.now()
    print('\nEnd time',end_time)

    return grid_search

#imbalanced dataset
print('imbalanced dataset')
#DATA_PATH="/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect.csv"
DATA_PATH="D:/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect.csv"
#balanced dataset
#print('balanced dataset')
#DATA_PATH='/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect5000.csv'
#DATA_PATH="D:/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect5000.csv"
dataset=pd.read_csv(DATA_PATH)

y=dataset['Result']
#W/O Entropy
print('W/O Entropy')
X=dataset[['IP_Address','EXE','Sensitive_Word','double_slash_redirecting','W3_HTTP_token','who_is','dash_sign','at_sign','dot_sign','free_host','port']]
#W/ Entropy
#print('W/ Entropy')
#X=dataset.drop('Result',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

# show the distribution
print('y_train class distribution\n')
print(y_train.value_counts(normalize=True))
print('\ny_test class distribution')
print('\n',y_test.value_counts(normalize=True))

clf = RandomForestClassifier()

'''param_grid = {
    'n_estimators' : [100,150,200,250,300,350,400,450,500],'criterion': ['gini', 'entropy'],'bootstrap': [True, False]
}'''
'''param_grid = {
    'n_estimators' : [100,200,300,400,500,600,700],'criterion': ['gini', 'entropy'],'bootstrap': [True, False]
}'''
param_grid = {
    'n_estimators' : [100,110,120,130,140,150,160,170,180,190,200,300,400,500,600,700],
    #'n_estimators' : [10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,300,400,500,600,700],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

scorers = {
    'precision_micro_score': make_scorer(precision_score, average='micro'),
    'precision_score': make_scorer(precision_score, average='weighted'),
    'recall_micro_score': make_scorer(recall_score, average='micro'),
    'recall_score': make_scorer(recall_score, average='weighted'),
    'accuracy_score': make_scorer(accuracy_score),
    'roc_score_scorer' : make_scorer(roc_auc_score,average='weighted'),
    'metrics_roc_auc_score' : 'roc_auc',
    'f1_score' : make_scorer(f1_score, average='weighted'),
    'tp': make_scorer(tp),
    'tn': make_scorer(tn),
    'fp': make_scorer(fp),
    'fn': make_scorer(fn)
}


refit_scorer='metrics_roc_auc_score'
print('Refit by ',refit_scorer)
grid_search_clf = grid_search_wrapper(refit_score=refit_scorer)

# make the predictions
y_pred = grid_search_clf.predict(X_test)

print('\nBest params for ')
print('\n',grid_search_clf.best_params_)
best_parameters = grid_search_clf.best_estimator_.get_params()
best_result = grid_search_clf.best_score_
print('\nBest params :',best_parameters)
print('\nBest Result :',best_result)

# confusion matrix on the test data.
print('\nConfusion matrix of Random Forest optimized for the test data:')
print('\n',str(pd.DataFrame(confusion_matrix(y_test, y_pred), columns=['pred_neg', 'pred_pos'], index=['neg', 'pos'])))

results = pd.DataFrame(grid_search_clf.cv_results_)
results = results.sort_values(by='mean_test_metrics_roc_auc_score', ascending=False)

print('\n',str(results[['mean_test_precision_score','mean_test_precision_micro_score', 
                        'mean_test_recall_score','mean_test_recall_micro_score',
                        'mean_test_accuracy_score','mean_test_f1_score',
                        'mean_test_roc_score_scorer','mean_test_metrics_roc_auc_score',
                        'param_n_estimators']].round(6)))

print('\n',str(results[['mean_train_tp','mean_train_fp','mean_train_tn','mean_train_fn']].round(6)))

print('\n',str(results[['mean_test_tp','mean_test_fp','mean_test_tn','mean_test_fn']].round(6)))

'''
print('\n',str(results[['split0_train_tp','split1_train_tp','split2_train_tp','split3_train_tp','split4_train_tp','split5_train_tp','split6_train_tp','split7_train_tp','split8_train_tp','split9_train_tp','mean_train_tp']].round(6)))
print('\n',str(results[['split0_test_tp','split1_test_tp','split2_test_tp','split3_test_tp','split4_test_tp','split5_test_tp','split6_test_tp','split7_test_tp','split8_test_tp','split9_test_tp','mean_test_tp']].round(6)))

print('\n',str(results[['split0_train_fp','split1_train_fp','split2_train_fp','split3_train_fp','split4_train_fp','split5_train_fp','split6_train_fp','split7_train_fp','split8_train_fp','split9_train_fp','mean_train_fp']].round(6)))
print('\n',str(results[['split0_test_fp','split1_test_fp','split2_test_fp','split3_test_fp','split4_test_fp','split5_test_fp','split6_test_fp','split7_test_fp','split8_test_fp','split9_test_fp','mean_test_fp']].round(6)))

print('\n',str(results[['split0_train_tn','split1_train_tn','split2_train_tn','split3_train_tn','split4_train_tn','split5_train_tn','split6_train_tn','split7_train_tn','split8_train_tn','split9_train_tn','mean_train_tn']].round(6)))
print('\n',str(results[['split0_test_tn','split1_test_tn','split2_test_tn','split3_test_tn','split4_test_tn','split5_test_tn','split6_test_tn','split7_test_tn','split8_test_tn','split9_test_tn','mean_test_tn']].round(6)))

print('\n',str(results[['split0_train_fn','split1_train_fn','split2_train_fn','split3_train_fn','split4_train_fn','split5_train_fn','split6_train_fn','split7_train_fn','split8_train_fn','split9_train_fn','mean_train_fn']].round(6)))
print('\n',str(results[['split0_test_fn','split1_test_fn','split2_test_fn','split3_test_fn','split4_test_fn','split5_test_fn','split6_test_fn','split7_test_fn','split8_test_fn','split9_test_fn','mean_test_fn']].round(6)))

print('\n',str(results[['std_train_tp','std_train_fp','std_train_tn','std_train_fn']].round(6)))

print('\n',str(results[['std_test_tp','std_test_fp','std_test_tn','std_test_fn']].round(6)))
'''

imbalanced dataset
W/O Entropy
y_train class distribution

-1    0.900351
 1    0.099649
Name: Result, dtype: float64

y_test class distribution

 -1    0.904583
 1    0.095417
Name: Result, dtype: float64
Refit by  metrics_roc_auc_score

Refit by  metrics_roc_auc_score

Start time 2021-09-04 14:59:59.888222

End time 2021-09-04 15:11:25.924437

Best params for 

 {'bootstrap': True, 'criterion': 'gini', 'n_estimators': 200}

Best params : {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

Best Result : 0.8442415115797715

Confusion matrix of Random Forest optimized for the test data:

      pred_neg  pred_pos
neg     238

"\nprint('\n',str(results[['split0_train_tp','split1_train_tp','split2_train_tp','split3_train_tp','split4_train_tp','split5_train_tp','split6_train_tp','split7_train_tp','split8_train_tp','split9_train_tp','mean_train_tp']].round(6)))\nprint('\n',str(results[['split0_test_tp','split1_test_tp','split2_test_tp','split3_test_tp','split4_test_tp','split5_test_tp','split6_test_tp','split7_test_tp','split8_test_tp','split9_test_tp','mean_test_tp']].round(6)))\n\nprint('\n',str(results[['split0_train_fp','split1_train_fp','split2_train_fp','split3_train_fp','split4_train_fp','split5_train_fp','split6_train_fp','split7_train_fp','split8_train_fp','split9_train_fp','mean_train_fp']].round(6)))\nprint('\n',str(results[['split0_test_fp','split1_test_fp','split2_test_fp','split3_test_fp','split4_test_fp','split5_test_fp','split6_test_fp','split7_test_fp','split8_test_fp','split9_test_fp','mean_test_fp']].round(6)))\n\nprint('\n',str(results[['split0_train_tn','split1_train_tn','split2_train_tn','

In [2]:
#Random Forest W/ Hyperparameter Tuning Checked
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix,roc_auc_score,f1_score,balanced_accuracy_score
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]

def grid_search_wrapper(refit_score):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    
    print('\nRefit by ',refit_score)
    
    #start model
    start_time  = datetime.datetime.now()
    print('\nStart time',start_time)
    
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    #end model fit
    end_time = datetime.datetime.now()
    print('\nEnd time',end_time)

    return grid_search

#imbalanced dataset
#print('imbalanced dataset')
#DATA_PATH="/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect.csv"
#DATA_PATH="D:/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect.csv"
#balanced dataset
print('balanced dataset')
#DATA_PATH='/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect5000.csv'
DATA_PATH="D:/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect5000.csv"
dataset=pd.read_csv(DATA_PATH)

y=dataset['Result']
#W/O Entropy
#print('W/O Entropy')
#X=dataset[['IP_Address','EXE','Sensitive_Word','double_slash_redirecting','W3_HTTP_token','who_is','dash_sign','at_sign','dot_sign','free_host','port']]
#W/ Entropy
print('W/ Entropy')
X=dataset.drop('Result',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

# show the distribution
print('y_train class distribution\n')
print(y_train.value_counts(normalize=True))
print('\ny_test class distribution')
print('\n',y_test.value_counts(normalize=True))

clf = RandomForestClassifier()

'''param_grid = {
    'n_estimators' : [100,150,200,250,300,350,400,450,500],'criterion': ['gini', 'entropy'],'bootstrap': [True, False]
}'''
'''param_grid = {
    'n_estimators' : [100,200,300,400,500,600,700],'criterion': ['gini', 'entropy'],'bootstrap': [True, False]
}'''
param_grid = {
    'n_estimators' : [100,110,120,130,140,150,160,170,180,190,200,300,400,500,600,700],
    #'n_estimators' : [10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,300,400,500,600,700],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

scorers = {
    'precision_micro_score': make_scorer(precision_score, average='micro'),
    'precision_score': make_scorer(precision_score, average='weighted'),
    'recall_micro_score': make_scorer(recall_score, average='micro'),
    'recall_score': make_scorer(recall_score, average='weighted'),
    'accuracy_score': make_scorer(accuracy_score),
    'roc_score_scorer' : make_scorer(roc_auc_score,average='weighted'),
    'metrics_roc_auc_score' : 'roc_auc',
    'f1_score' : make_scorer(f1_score, average='weighted'),
    'tp': make_scorer(tp),
    'tn': make_scorer(tn),
    'fp': make_scorer(fp),
    'fn': make_scorer(fn)
}


refit_scorer='metrics_roc_auc_score'
print('Refit by ',refit_scorer)
grid_search_clf = grid_search_wrapper(refit_score=refit_scorer)

# make the predictions
y_pred = grid_search_clf.predict(X_test)

print('\nBest params for ')
print('\n',grid_search_clf.best_params_)
best_parameters = grid_search_clf.best_estimator_.get_params()
best_result = grid_search_clf.best_score_
print('\nBest params :',best_parameters)
print('\nBest Result :',best_result)

# confusion matrix on the test data.
print('\nConfusion matrix of Random Forest optimized for the test data:')
print('\n',str(pd.DataFrame(confusion_matrix(y_test, y_pred), columns=['pred_neg', 'pred_pos'], index=['neg', 'pos'])))

results = pd.DataFrame(grid_search_clf.cv_results_)
results = results.sort_values(by='mean_test_metrics_roc_auc_score', ascending=False)

print('\n',str(results[['mean_test_precision_score','mean_test_precision_micro_score', 
                        'mean_test_recall_score','mean_test_recall_micro_score',
                        'mean_test_accuracy_score','mean_test_f1_score',
                        'mean_test_roc_score_scorer','mean_test_metrics_roc_auc_score',
                        'param_n_estimators']].round(6)))

print('\n',str(results[['mean_train_tp','mean_train_fp','mean_train_tn','mean_train_fn']].round(6)))

print('\n',str(results[['mean_test_tp','mean_test_fp','mean_test_tn','mean_test_fn']].round(6)))

'''
print('\n',str(results[['split0_train_tp','split1_train_tp','split2_train_tp','split3_train_tp','split4_train_tp','split5_train_tp','split6_train_tp','split7_train_tp','split8_train_tp','split9_train_tp','mean_train_tp']].round(6)))
print('\n',str(results[['split0_test_tp','split1_test_tp','split2_test_tp','split3_test_tp','split4_test_tp','split5_test_tp','split6_test_tp','split7_test_tp','split8_test_tp','split9_test_tp','mean_test_tp']].round(6)))

print('\n',str(results[['split0_train_fp','split1_train_fp','split2_train_fp','split3_train_fp','split4_train_fp','split5_train_fp','split6_train_fp','split7_train_fp','split8_train_fp','split9_train_fp','mean_train_fp']].round(6)))
print('\n',str(results[['split0_test_fp','split1_test_fp','split2_test_fp','split3_test_fp','split4_test_fp','split5_test_fp','split6_test_fp','split7_test_fp','split8_test_fp','split9_test_fp','mean_test_fp']].round(6)))

print('\n',str(results[['split0_train_tn','split1_train_tn','split2_train_tn','split3_train_tn','split4_train_tn','split5_train_tn','split6_train_tn','split7_train_tn','split8_train_tn','split9_train_tn','mean_train_tn']].round(6)))
print('\n',str(results[['split0_test_tn','split1_test_tn','split2_test_tn','split3_test_tn','split4_test_tn','split5_test_tn','split6_test_tn','split7_test_tn','split8_test_tn','split9_test_tn','mean_test_tn']].round(6)))

print('\n',str(results[['split0_train_fn','split1_train_fn','split2_train_fn','split3_train_fn','split4_train_fn','split5_train_fn','split6_train_fn','split7_train_fn','split8_train_fn','split9_train_fn','mean_train_fn']].round(6)))
print('\n',str(results[['split0_test_fn','split1_test_fn','split2_test_fn','split3_test_fn','split4_test_fn','split5_test_fn','split6_test_fn','split7_test_fn','split8_test_fn','split9_test_fn','mean_test_fn']].round(6)))

print('\n',str(results[['std_train_tp','std_train_fp','std_train_tn','std_train_fn']].round(6)))

print('\n',str(results[['std_test_tp','std_test_fp','std_test_tn','std_test_fn']].round(6)))
'''

balanced dataset
W/ Entropy
y_train class distribution

 1    0.500133
-1    0.499867
Name: Result, dtype: float64

y_test class distribution

 -1    0.5004
 1    0.4996
Name: Result, dtype: float64
Refit by  metrics_roc_auc_score

Refit by  metrics_roc_auc_score

Start time 2021-09-04 14:57:02.932887

End time 2021-09-04 14:57:51.561289

Best params for 

 {'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 200}

Best params : {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

Best Result : 0.9631733050631471

Confusion matrix of Random Forest optimized for the test data:

      pred_neg  pred_pos
neg      110

"\nprint('\n',str(results[['split0_train_tp','split1_train_tp','split2_train_tp','split3_train_tp','split4_train_tp','split5_train_tp','split6_train_tp','split7_train_tp','split8_train_tp','split9_train_tp','mean_train_tp']].round(6)))\nprint('\n',str(results[['split0_test_tp','split1_test_tp','split2_test_tp','split3_test_tp','split4_test_tp','split5_test_tp','split6_test_tp','split7_test_tp','split8_test_tp','split9_test_tp','mean_test_tp']].round(6)))\n\nprint('\n',str(results[['split0_train_fp','split1_train_fp','split2_train_fp','split3_train_fp','split4_train_fp','split5_train_fp','split6_train_fp','split7_train_fp','split8_train_fp','split9_train_fp','mean_train_fp']].round(6)))\nprint('\n',str(results[['split0_test_fp','split1_test_fp','split2_test_fp','split3_test_fp','split4_test_fp','split5_test_fp','split6_test_fp','split7_test_fp','split8_test_fp','split9_test_fp','mean_test_fp']].round(6)))\n\nprint('\n',str(results[['split0_train_tn','split1_train_tn','split2_train_tn','

In [1]:
#Random Forest W/ Hyperparameter Tuning Checked
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix,roc_auc_score,f1_score,balanced_accuracy_score
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]

def grid_search_wrapper(refit_score):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    
    print('\nRefit by ',refit_score)
    
    #start model
    start_time  = datetime.datetime.now()
    print('\nStart time',start_time)
    
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    #end model fit
    end_time = datetime.datetime.now()
    print('\nEnd time',end_time)

    return grid_search

#imbalanced dataset
#print('imbalanced dataset')
#DATA_PATH="/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect.csv"
#DATA_PATH="D:/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect.csv"
#balanced dataset
print('balanced dataset')
#DATA_PATH='/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect5000.csv'
DATA_PATH='D:/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect5000.csv'
dataset=pd.read_csv(DATA_PATH)

y=dataset['Result']
#W/O Entropy
print('W/O Entropy')
X=dataset[['IP_Address','EXE','Sensitive_Word','double_slash_redirecting','W3_HTTP_token','who_is','dash_sign','at_sign','dot_sign','free_host','port']]
#W/ Entropy
#print('W/ Entropy')
#X=dataset.drop('Result',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

# show the distribution
print('y_train class distribution\n')
print(y_train.value_counts(normalize=True))
print('\ny_test class distribution')
print('\n',y_test.value_counts(normalize=True))

clf = RandomForestClassifier()

'''param_grid = {
    'n_estimators' : [100,150,200,250,300,350,400,450,500],'criterion': ['gini', 'entropy'],'bootstrap': [True, False]
}'''
'''param_grid = {
    'n_estimators' : [100,200,300,400,500,600,700],'criterion': ['gini', 'entropy'],'bootstrap': [True, False]
}'''
param_grid = {
    'n_estimators' : [100,110,120,130,140,150,160,170,180,190,200,300,400,500,600,700],
    #'n_estimators' : [10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,300,400,500,600,700],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

scorers = {
    'precision_micro_score': make_scorer(precision_score, average='micro'),
    'precision_score': make_scorer(precision_score, average='weighted'),
    'recall_micro_score': make_scorer(recall_score, average='micro'),
    'recall_score': make_scorer(recall_score, average='weighted'),
    'accuracy_score': make_scorer(accuracy_score),
    'roc_score_scorer' : make_scorer(roc_auc_score,average='weighted'),
    'metrics_roc_auc_score' : 'roc_auc',
    'f1_score' : make_scorer(f1_score, average='weighted'),
    'tp': make_scorer(tp),
    'tn': make_scorer(tn),
    'fp': make_scorer(fp),
    'fn': make_scorer(fn)
}


refit_scorer='metrics_roc_auc_score'
print('Refit by ',refit_scorer)
grid_search_clf = grid_search_wrapper(refit_score=refit_scorer)

# make the predictions
y_pred = grid_search_clf.predict(X_test)

print('\nBest params for ')
print('\n',grid_search_clf.best_params_)
best_parameters = grid_search_clf.best_estimator_.get_params()
best_result = grid_search_clf.best_score_
print('\nBest params :',best_parameters)
print('\nBest Result :',best_result)

# confusion matrix on the test data.
print('\nConfusion matrix of Random Forest optimized for the test data:')
print('\n',str(pd.DataFrame(confusion_matrix(y_test, y_pred), columns=['pred_neg', 'pred_pos'], index=['neg', 'pos'])))

results = pd.DataFrame(grid_search_clf.cv_results_)
results = results.sort_values(by='mean_test_metrics_roc_auc_score', ascending=False)

print('\n',str(results[['mean_test_precision_score','mean_test_precision_micro_score', 
                        'mean_test_recall_score','mean_test_recall_micro_score',
                        'mean_test_accuracy_score','mean_test_f1_score',
                        'mean_test_roc_score_scorer','mean_test_metrics_roc_auc_score',
                        'param_n_estimators']].round(6)))

print('\n',str(results[['mean_train_tp','mean_train_fp','mean_train_tn','mean_train_fn']].round(6)))

print('\n',str(results[['mean_test_tp','mean_test_fp','mean_test_tn','mean_test_fn']].round(6)))

'''
print('\n',str(results[['split0_train_tp','split1_train_tp','split2_train_tp','split3_train_tp','split4_train_tp','split5_train_tp','split6_train_tp','split7_train_tp','split8_train_tp','split9_train_tp','mean_train_tp']].round(6)))
print('\n',str(results[['split0_test_tp','split1_test_tp','split2_test_tp','split3_test_tp','split4_test_tp','split5_test_tp','split6_test_tp','split7_test_tp','split8_test_tp','split9_test_tp','mean_test_tp']].round(6)))

print('\n',str(results[['split0_train_fp','split1_train_fp','split2_train_fp','split3_train_fp','split4_train_fp','split5_train_fp','split6_train_fp','split7_train_fp','split8_train_fp','split9_train_fp','mean_train_fp']].round(6)))
print('\n',str(results[['split0_test_fp','split1_test_fp','split2_test_fp','split3_test_fp','split4_test_fp','split5_test_fp','split6_test_fp','split7_test_fp','split8_test_fp','split9_test_fp','mean_test_fp']].round(6)))

print('\n',str(results[['split0_train_tn','split1_train_tn','split2_train_tn','split3_train_tn','split4_train_tn','split5_train_tn','split6_train_tn','split7_train_tn','split8_train_tn','split9_train_tn','mean_train_tn']].round(6)))
print('\n',str(results[['split0_test_tn','split1_test_tn','split2_test_tn','split3_test_tn','split4_test_tn','split5_test_tn','split6_test_tn','split7_test_tn','split8_test_tn','split9_test_tn','mean_test_tn']].round(6)))

print('\n',str(results[['split0_train_fn','split1_train_fn','split2_train_fn','split3_train_fn','split4_train_fn','split5_train_fn','split6_train_fn','split7_train_fn','split8_train_fn','split9_train_fn','mean_train_fn']].round(6)))
print('\n',str(results[['split0_test_fn','split1_test_fn','split2_test_fn','split3_test_fn','split4_test_fn','split5_test_fn','split6_test_fn','split7_test_fn','split8_test_fn','split9_test_fn','mean_test_fn']].round(6)))

print('\n',str(results[['std_train_tp','std_train_fp','std_train_tn','std_train_fn']].round(6)))

print('\n',str(results[['std_test_tp','std_test_fp','std_test_tn','std_test_fn']].round(6)))
'''

balanced dataset
W/O Entropy
y_train class distribution

-1    0.5024
 1    0.4976
Name: Result, dtype: float64

y_test class distribution

  1    0.5072
-1    0.4928
Name: Result, dtype: float64
Refit by  metrics_roc_auc_score

Refit by  metrics_roc_auc_score

Start time 2021-09-04 14:54:28.867489

End time 2021-09-04 14:55:09.776464

Best params for 

 {'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 170}

Best params : {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 170, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

Best Result : 0.8837095335799953

Confusion matrix of Random Forest optimized for the test data:

      pred_neg  pred_pos
neg      1164  

"\nprint('\n',str(results[['split0_train_tp','split1_train_tp','split2_train_tp','split3_train_tp','split4_train_tp','split5_train_tp','split6_train_tp','split7_train_tp','split8_train_tp','split9_train_tp','mean_train_tp']].round(6)))\nprint('\n',str(results[['split0_test_tp','split1_test_tp','split2_test_tp','split3_test_tp','split4_test_tp','split5_test_tp','split6_test_tp','split7_test_tp','split8_test_tp','split9_test_tp','mean_test_tp']].round(6)))\n\nprint('\n',str(results[['split0_train_fp','split1_train_fp','split2_train_fp','split3_train_fp','split4_train_fp','split5_train_fp','split6_train_fp','split7_train_fp','split8_train_fp','split9_train_fp','mean_train_fp']].round(6)))\nprint('\n',str(results[['split0_test_fp','split1_test_fp','split2_test_fp','split3_test_fp','split4_test_fp','split5_test_fp','split6_test_fp','split7_test_fp','split8_test_fp','split9_test_fp','mean_test_fp']].round(6)))\n\nprint('\n',str(results[['split0_train_tn','split1_train_tn','split2_train_tn','

In [23]:
print('\n',str(results[['mean_test_precision_score','mean_test_precision_micro_score', 
                        'mean_test_recall_score','mean_test_recall_micro_score',
                        'mean_test_accuracy_score','mean_test_f1_score',
                        'mean_test_roc_score_scorer','mean_test_metrics_roc_auc_score']].round(6)))

print('\n',str(results[['mean_train_tp','mean_train_fp','mean_train_tn','mean_train_fn']].round(6)))

print('\n',str(results[['mean_test_tp','mean_test_fp','mean_test_tn','mean_test_fn']].round(6)))



    mean_test_precision_score  mean_test_precision_micro_score  \
2                   0.819905                         0.803867   
1                   0.812290                         0.800933   
0                   0.802911                         0.770000   

   mean_test_recall_score  mean_test_recall_micro_score  \
2                0.803867                      0.803867   
1                0.800933                      0.800933   
0                0.770000                      0.770000   

   mean_test_accuracy_score  mean_test_f1_score  mean_test_roc_score_scorer  \
2                  0.803867            0.801308                    0.803783   
1                  0.800933            0.799078                    0.800870   
0                  0.770000            0.763522                    0.769870   

   mean_test_metrics_roc_auc_score  
2                         0.881662  
1                         0.871568  
0                         0.828664  

    mean_train_tp  mean_train_fp  

In [25]:
#SVM W/ Hyperparameter Tuning
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import LabelBinarizer
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix,roc_auc_score,f1_score,balanced_accuracy_score
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]

def grid_search_wrapper(refit_score):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    
    print('\nRefit by ',refit_score)
    
    #start model
    start_time  = datetime.datetime.now()
    print('\nStart time',start_time)
    
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    #end model fit
    end_time = datetime.datetime.now()
    print('\nEnd time',end_time)

    return grid_search

#imbalanced dataset
print('imbalanced dataset')
DATA_PATH="/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect.csv"
#balanced dataset
#print('balanced dataset')
#DATA_PATH='/home/eint/eclipse_prj/eclipse_workspace/PhishUrlDetectionMaster/results_data/data_vec_non_alpha_numericBy_FeatSelect5000.csv'
dataset=pd.read_csv(DATA_PATH)

y=dataset['Result']
#W/O Entropy
#print('W/O Entropy')
#X=dataset[['IP_Address','EXE','Sensitive_Word','double_slash_redirecting','W3_HTTP_token','who_is','dash_sign','at_sign','dot_sign','free_host','port']]
#W/ Entropy
print('W/ Entropy')
X=dataset.drop('Result',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

# show the distribution
print('y_train class distribution\n')
print(y_train.value_counts(normalize=True))
print('\ny_test class distribution')
print('\n',y_test.value_counts(normalize=True))


clf =svm.SVC()

'''param_grid = {
    'n_estimators' : [100,150,200,250,300,350,400,450,500],'criterion': ['gini', 'entropy'],'bootstrap': [True, False]
}'''
'''param_grid = {
    'n_estimators' : [100,200,300,400,500,600,700],'criterion': ['gini', 'entropy'],'bootstrap': [True, False]
}'''
param_grid = {

    'kernel' : ['linear', 'rbf', 'poly']

}

scorers = {
    'precision_micro_score': make_scorer(precision_score, average='micro'),
    'precision_score': make_scorer(precision_score, average='weighted'),
    'recall_micro_score': make_scorer(recall_score, average='micro'),
    'recall_score': make_scorer(recall_score, average='weighted'),
    'accuracy_score': make_scorer(accuracy_score),
    'roc_score_scorer' : make_scorer(roc_auc_score,average='weighted'),
    'metrics_roc_auc_score' : 'roc_auc',
    'f1_score' : make_scorer(f1_score, average='weighted'),
    'tp': make_scorer(tp),
    'tn': make_scorer(tn),
    'fp': make_scorer(fp),
    'fn': make_scorer(fn)
}


refit_scorer='metrics_roc_auc_score'
print('Refit by ',refit_scorer)
grid_search_clf = grid_search_wrapper(refit_score=refit_scorer)


# make the predictions
y_pred = grid_search_clf.predict(X_test)

print('\nBest params for ')
print('\n',grid_search_clf.best_params_)
best_parameters = grid_search_clf.best_estimator_.get_params()
best_result = grid_search_clf.best_score_
print('\nBest params :',best_parameters)
print('\nBest Result :',best_result)


# confusion matrix on the test data.
print('\nConfusion matrix of Random Forest optimized for the test data:')
print('\n',str(pd.DataFrame(confusion_matrix(y_test, y_pred), columns=['pred_neg', 'pred_pos'], index=['neg', 'pos'])))

results = pd.DataFrame(grid_search_clf.cv_results_)
results = results.sort_values(by='mean_test_metrics_roc_auc_score', ascending=False)

print('\n',str(results[['mean_test_precision_score','mean_test_precision_micro_score', 
                        'mean_test_recall_score','mean_test_recall_micro_score',
                        'mean_test_accuracy_score','mean_test_f1_score',
                        'mean_test_roc_score_scorer','mean_test_metrics_roc_auc_score']].round(6)))

print('\n',str(results[['mean_train_tp','mean_train_fp','mean_train_tn','mean_train_fn']].round(6)))

print('\n',str(results[['mean_test_tp','mean_test_fp','mean_test_tn','mean_test_fn']].round(6)))


imbalanced dataset
W/ Entropy
y_train class distribution

-1    0.901908
 1    0.098092
Name: Result, dtype: float64

y_test class distribution

 -1    0.899913
 1    0.100087
Name: Result, dtype: float64
Refit by  metrics_roc_auc_score

Refit by  metrics_roc_auc_score

Start time 2019-08-07 09:19:56.656995


KeyboardInterrupt: 

In [29]:

print('\n',str(results[['mean_test_precision_score','mean_test_precision_micro_score', 
                        'mean_test_recall_score','mean_test_recall_micro_score',
                        'mean_test_accuracy_score','mean_test_f1_score',
                        'mean_test_roc_score_scorer','mean_test_metrics_roc_auc_score']].round(6)))

print('\n',str(results[['mean_train_tp','mean_train_fp','mean_train_tn','mean_train_fn']].round(6)))

print('\n',str(results[['mean_test_tp','mean_test_fp','mean_test_tn','mean_test_fn']].round(6)))


     mean_test_precision_score  mean_test_precision_micro_score  \
10                   0.814708                         0.801067   
1                    0.820866                         0.804000   
5                    0.819768                         0.802400   
13                   0.808430                         0.800400   
14                   0.819251                         0.798800   
6                    0.818188                         0.802400   
2                    0.819572                         0.804533   
9                    0.809635                         0.800533   
21                   0.810787                         0.799733   
22                   0.811910                         0.801067   
18                   0.812975                         0.801867   
17                   0.813349                         0.801200   
0                    0.806567                         0.773467   
4                    0.806567                         0.773467   
12      