In [17]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np

X = pd.read_csv("datasets/feature_updated_dataset_X2.csv")
y = pd.read_csv("datasets/feature_updated_dataset_y.csv")

#rus = RandomOverSampler(random_state=69)
#X_rus, y_rus = rus.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)
y_train, y_test = y_train.values.ravel(), y_test.values.ravel()
print(X_train.columns)

Index(['www', 'url_length', 'digit_count', 'percentage_count', 'dot_count',
       'bs_count', 'dash_count', 'url_entropy', 'params_count',
       'subdomains_count', 'domain_extension', 'semicolon_count',
       'underscores_count', 'questionmarks_count', 'equals_count',
       'digit_letter_ratio', 'pd_num_count', 'pd_non_alphanumeric_count',
       'pd_at_count', 'pd_hyphen_count', 'pd_in_alex_top_1m',
       'path_double_slash_count', 'percent20_presence', 'uppercase_dirs',
       'single_char_dirs', 'path_count_special_chars', 'path_zeroes_count',
       'path_uppercase_to_lowercase_ratio', 'params_length', 'queries_count'],
      dtype='object')


In [76]:
X_feature1_train = X_train[['http', 'https', 'www', 'url_length', 'url_entropy',  'domain_extension']]
X_feature1_test = X_test[['http', 'https', 'www', 'url_length', 'url_entropy', 'domain_extension']]

X_feature2_train = X_train[['digit_count', 'percentage_count', 'dot_count', 'bs_count', 'dash_count', 'params_count', 'subdomains_count']]
X_feature2_test = X_test[['digit_count', 'percentage_count', 'dot_count', 'bs_count', 'dash_count', 'params_count', 'subdomains_count']]

X_features_boot_train = X_train[['http', 'https', 'www', 'url_length', 'url_entropy',  'domain_extension', 'digit_count', 'percentage_count', 'dot_count', 'dash_count', 'subdomains_count']]
X_features_boot_test = X_test[['http', 'https', 'www', 'url_length', 'url_entropy',  'domain_extension', 'digit_count', 'percentage_count', 'dot_count', 'dash_count', 'subdomains_count']]

In [18]:
# Modelling
# 1) Logistic Regression
model_lr1 = LogisticRegression(max_iter=500)
# 2) SVM
#model_svm = svm.SVC()
# 3) Random Forest Classifier
model_rfc1 = RandomForestClassifier(verbose=0, n_estimators=50, max_depth=20, min_samples_leaf=2, min_samples_split=2, random_state=69)
model_rfc2 = RandomForestClassifier(verbose=0, n_estimators=50, max_depth=20, min_samples_leaf=2, min_samples_split=2, random_state=69)
data = [(model_rfc1, X_train, X_test)]

#data = [(model_rfc1, X_features_boot_train, X_features_boot_test)]
#data = [(model_rfc1, X_train, X_test), (model_rfc2, X_train, X_test), (model_rfc3, X_train, X_test)]
#data = [(model_rfc1, X_train, X_test), (model_rfc2, X_features_boot_train, X_features_boot_test)]

for data_set in data:
    model, X_train, X_test = data_set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Verifying model fit
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"Accuracy: {accuracy} | F1 score: {f1} | {X_train.columns.values} | criterion: {model.criterion}")

Accuracy: 0.9531169618931349 | F1 score: 0.9520877108577438 | ['www' 'url_length' 'digit_count' 'percentage_count' 'dot_count'
 'bs_count' 'dash_count' 'url_entropy' 'params_count' 'subdomains_count'
 'domain_extension' 'semicolon_count' 'underscores_count'
 'questionmarks_count' 'equals_count' 'digit_letter_ratio' 'pd_num_count'
 'pd_non_alphanumeric_count' 'pd_at_count' 'pd_hyphen_count'
 'pd_in_alex_top_1m' 'path_double_slash_count' 'percent20_presence'
 'uppercase_dirs' 'single_char_dirs' 'path_count_special_chars'
 'path_zeroes_count' 'path_uppercase_to_lowercase_ratio' 'params_length'
 'queries_count'] | criterion: gini


In [60]:
import joblib

# Export the model to a file
joblib.dump(model_rfc1, 'model_rfc1.pkl')


['model_rfc1.pkl']

In [101]:
import numpy as np

for data_set in data:
  model, X_train, X_test = data_set
  rf_model = model
  importances = rf_model.feature_importances_
  scaled_importances = importances / np.max(importances)
  values = X_train.columns
  scaled_importances.sort()

  for i in range(len(importances)):
    print(values[i], ":" ,f"{scaled_importances[i]:.6f}")
  print("=====================================")  

www : 0.046649
url_length : 0.145228
digit_count : 0.246565
percentage_count : 0.248241
dot_count : 0.260830
bs_count : 0.294922
dash_count : 0.301534
url_entropy : 0.594746
params_count : 0.914384
subdomains_count : 0.914924
domain_extension : 1.000000


In [None]:
from sklearn.inspection import permutation_importance

result = permutation_importance(model_rfc1, X, y, n_repeats=10, random_state=42)
importances = result.importances_mean

for feature, importance in zip(X.columns, importances):
    print(f"{feature}: {importance}")


In [63]:
scaled_importances = importances / np.max(importances)
for feature, importance in zip(X.columns, scaled_importances):
    print(f"{feature}: {importance}")



http: 1.0
https: 0.18423200705173653
www: 0.7532694408682497
url_length: 0.2621509177106623
digit_count: 0.20198611303530561
percentage_count: 0.4408051768822412
dot_count: 0.23032560666569807
bs_count: 0.04422447770349619
dash_count: 0.22617372240744488
url_entropy: 0.13130205850696972
params_count: 0.6064734314995082


In [None]:
from sklearn.tree import export_graphviz
import graphviz

for i in range(3):
    tree = model_rfc1.estimators_[i]
    dot_data = export_graphviz(tree,
                               feature_names=X_train.columns,  
                               filled=True,  
                               max_depth=2, 
                               impurity=False, 
                               proportion=True)
    graph = graphviz.Source(dot_data)
    display(graph)

In [19]:
phish_urls = pd.read_csv("datasets/phishtank_phish_urls.csv")
malicious_urls = pd.read_csv("datasets/malicious_phish.csv")['url']

phish_urls_in_malicious = phish_urls[phish_urls.isin(malicious_urls)]

print(phish_urls_in_malicious.nunique())

url    0
dtype: int64


In [20]:
benign_url = pd.read_csv("datasets/benign_urls.csv")

benign_urls_in_malicious = benign_url[benign_url.isin(malicious_urls)]

print(benign_urls_in_malicious.nunique())

url    0
dtype: int64


In [22]:
phish_set = pd.read_csv("datasets\phish_set_X2.csv")
phish_pred = model_rfc1.predict(phish_set)
print(len(phish_pred))
print(np.count_nonzero(phish_pred == 0)/len(phish_pred))
print(np.count_nonzero(phish_pred == 1)/len(phish_pred))
print(np.count_nonzero(phish_pred == 2)/len(phish_pred))
print(np.count_nonzero(phish_pred == 3)/len(phish_pred))


Feature names unseen at fit time:
- at_count
- non_alphanumeric_count
Feature names seen at fit time, yet now missing:
- pd_at_count
- pd_hyphen_count
- pd_in_alex_top_1m
- pd_non_alphanumeric_count
- pd_num_count



ValueError: X has 27 features, but RandomForestClassifier is expecting 30 features as input.

In [21]:
benign_set = pd.read_csv("datasets/benign_dataset_X2.csv")
benign_pred = model_rfc1.predict(benign_set)
print(len(benign_pred))
count = np.count_nonzero(benign_pred == 0)
print(np.count_nonzero(benign_pred == 0)/len(benign_pred))
print(np.count_nonzero(benign_pred == 1)/len(benign_pred))
print(np.count_nonzero(benign_pred == 2)/len(benign_pred))
print(np.count_nonzero(benign_pred == 3)/len(benign_pred))


345738
0.04075051050217216
0.29994388814651557
0.6527052276579375
0.006600373693374752


In [9]:
malicious_phish = pd.read_csv("datasets/malicious_phish.csv")
# Filter the DataFrame based on the conditions
filtered_rows = malicious_phish[(malicious_phish['url'].str.contains("https://")) & (malicious_phish['type'] == "phishing")]

# Print the filtered rows
print(len(filtered_rows))


7091
