# Notebook for experimental feature analysis and validation

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import numpy as np

# Load the feature table (X) and the label table (y) from their CSV exports.
X = pd.read_csv("datasets/feature_updated_dataset_X.csv")
y = pd.read_csv("datasets/feature_updated_dataset_y.csv")

# Oversampling is currently disabled; the two lines below are kept for reference.
#rus = RandomOverSampler(random_state=69)
#X_rus, y_rus = rus.fit_resample(X, y)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)
# Flatten both label tables into 1-D arrays before they are used for fitting and scoring.
y_train, y_test = (labels.to_numpy().ravel() for labels in (y_train, y_test))
# print(X_train.columns)

In [None]:
# X_feature1_train = X_train[['http', 'https', 'www', 'url_length', 'url_entropy',  'domain_extension']]
# X_feature1_test = X_test[['http', 'https', 'www', 'url_length', 'url_entropy', 'domain_extension']]

# X_feature2_train = X_train[['digit_count', 'percentage_count', 'dot_count', 'bs_count', 'dash_count', 'params_count', 'subdomains_count']]
# X_feature2_test = X_test[['digit_count', 'percentage_count', 'dot_count', 'bs_count', 'dash_count', 'params_count', 'subdomains_count']]

# X_features_boot_train = X_train[['http', 'https', 'www', 'url_length', 'url_entropy',  'domain_extension', 'digit_count', 'percentage_count', 'dot_count', 'dash_count', 'subdomains_count']]
# X_features_boot_test = X_test[['http', 'https', 'www', 'url_length', 'url_entropy',  'domain_extension', 'digit_count', 'percentage_count', 'dot_count', 'dash_count', 'subdomains_count']]

In [None]:
# Modelling
# Candidate estimators. Only the estimators listed in `data` below are fitted.
# 1) Logistic Regression
model_lr1 = LogisticRegression(max_iter=500)
# 2) SVM
#model_svm = svm.SVC()
# 3) Random Forest Classifier
model_rfc1 = RandomForestClassifier(verbose=0, n_estimators=50, max_depth=20, min_samples_leaf=2, min_samples_split=2, random_state=69)
model_rfc2 = RandomForestClassifier(verbose=0, n_estimators=50, max_depth=20, min_samples_leaf=2, min_samples_split=2, random_state=69)
model_rfc3 = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=69)

# Each entry is (estimator, training features, test features).
# model_rfc1 is included because later cells export it, compute its permutation
# importance, draw its trees and predict with it; leaving it out means those
# cells hit an unfitted estimator after "Restart & Run All".
# model_rfc3 stays last so `model` / `y_pred` end up referring to it, as before.
data = [(model_rfc1, X_train, X_test), (model_rfc3, X_train, X_test)]

# Alternative experiment configurations:
# data = [(model_rfc1, X_train, X_test)]
# data = [(model_rfc3, X_train, X_test)]
#data = [(model_rfc1, X_features_boot_train, X_features_boot_test)]
#data = [(model_rfc1, X_train, X_test), (model_rfc2, X_train, X_test), (model_rfc3, X_train, X_test)]
#data = [(model_rfc1, X_train, X_test), (model_rfc2, X_features_boot_train, X_features_boot_test)]

# Loop-local names are used for the feature frames so that the global
# X_train / X_test split is not overwritten when an entry uses a feature subset.
for model, train_features, test_features in data:
    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    # Verifying model fit
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Not every estimator exposes `criterion` (e.g. LogisticRegression does not).
    criterion = getattr(model, "criterion", "n/a")
    print(f"Accuracy: {accuracy} | F1 score: {f1} | {train_features.columns.values} | criterion: {criterion}")

In [None]:
from sklearn.metrics import confusion_matrix

def calc_FNR_accuracy(y_true, y_pred, labels=None):
  """Print the one-vs-rest accuracy and false-negative rate (FNR) of every class.

  For each class the confusion matrix is collapsed into TP / FN / FP / TN counts
  (that class against all others), then two lines are printed:
  accuracy = (TP + TN) / total and FNR = FN / (FN + TP).

  Parameters:
    y_true: array-like of ground-truth labels.
    y_pred: array-like of predicted labels, same length as `y_true`.
    labels: optional list of class labels to report, in the desired order.
      When omitted, the sorted set of labels occurring in `y_true` or `y_pred`
      is used, so the function works for any number of classes instead of
      assuming exactly four classes.

  Returns:
    None. Results are written to stdout. A class with no true samples
    (FN + TP == 0) is reported with an FNR of -1.
  """
  if labels is None:
    # Same label set and ordering confusion_matrix would infer by itself;
    # computing it here lets us print the real label instead of a matrix index.
    labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))

  conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)
  total = conf_matrix.sum()

  # Rows are true classes, columns are predicted classes.
  true_pos = np.diag(conf_matrix)
  false_neg = conf_matrix.sum(axis=1) - true_pos
  false_pos = conf_matrix.sum(axis=0) - true_pos
  true_neg = total - true_pos - false_neg - false_pos

  for idx, label_class in enumerate(labels):
    accuracy = (true_pos[idx] + true_neg[idx]) / total
    print("Accuracy for class", label_class, ":", accuracy)

    support = false_neg[idx] + true_pos[idx]
    FNR = false_neg[idx] / support if support > 0 else -1
    print("FNR for class", label_class, ":", FNR)

# Smoke-test the helper on a small hand-written example (true labels first, predictions second).
calc_FNR_accuracy([2, 0, 2, 2, 0, 1, 3, 3], [0, 0, 2, 2, 0, 2, 3, 3])

In [None]:
import joblib
from sklearn.utils.validation import check_is_fitted

# Fail fast instead of silently pickling an untrained estimator: model_rfc1 is
# only fitted when the modelling cell lists it in `data`.
check_is_fitted(model_rfc1)

# Export the model to a file
joblib.dump(model_rfc1, 'model_rfc1.pkl')


In [None]:
import numpy as np

# For every (model, train features, test features) entry in `data`, print the
# model's feature importances scaled so that the largest one equals 1.0,
# listed from least to most important.
# Loop-local names keep the global X_train / X_test split untouched.
for model, train_features, test_features in data:
  importances = model.feature_importances_
  scaled_importances = importances / np.max(importances)
  feature_names = train_features.columns

  # Order by importance via argsort so each value stays paired with its own
  # column name (sorting the values in place would detach them from the names).
  ascending_order = np.argsort(scaled_importances)

  for i in ascending_order:
    print(feature_names[i], ":" ,f"{scaled_importances[i]:.6f}")
  print("=====================================")

In [None]:
from sklearn.inspection import permutation_importance

# Score the permutations on the held-out split only. The full X / y is 80% rows
# the model was trained on, which mostly measures how the model fits its
# training data rather than which features matter for generalisation.
result = permutation_importance(model_rfc1, X_test, y_test, n_repeats=10, random_state=42)
# Mean drop in score over the n_repeats shuffles of each feature.
importances = result.importances_mean

for feature, importance in zip(X_test.columns, importances):
    print(f"{feature}: {importance}")


In [None]:
# Express every entry of `importances` (as left by the preceding cell) as a
# fraction of the largest one.
largest_importance = importances.max()
scaled_importances = importances / largest_importance

# One "<column of X>: <relative importance>" line per feature.
for feature, importance in zip(X.columns, scaled_importances):
    print(f"{feature}: {importance}")



In [None]:
from sklearn.tree import export_graphviz
import graphviz

# Rendering options shared by every tree: label nodes with the training
# column names, colour the nodes, draw only the top two levels, hide impurity
# values and show proportions instead of raw sample counts.
graphviz_options = dict(
    feature_names=X_train.columns,
    filled=True,
    max_depth=2,
    impurity=False,
    proportion=True,
)

# Draw the first three trees of the model_rfc1 forest.
for i in range(3):
    tree = model_rfc1.estimators_[i]
    dot_data = export_graphviz(tree, **graphviz_options)
    graph = graphviz.Source(dot_data)
    display(graph)

In [None]:
phish_urls = pd.read_csv("datasets/phishtank_phish_urls.csv")
malicious_urls = pd.read_csv("datasets/malicious_phish.csv")['url']

# DataFrame.isin(Series) aligns on the index, i.e. it only checks whether row i
# of one file equals row i of the other. Passing the plain values turns it into
# the intended membership test ("does this URL occur anywhere in malicious_urls?").
phish_urls_in_malicious = phish_urls[phish_urls.isin(malicious_urls.to_numpy())]

# Per column: number of distinct values that also occur in the malicious dataset.
print(phish_urls_in_malicious.nunique())

In [None]:
benign_url = pd.read_csv("datasets/benign_urls.csv")

# DataFrame.isin(Series) aligns on the index (row i against row i). Passing the
# plain values makes this a real membership test against every malicious URL.
# `malicious_urls` is the 'url' column loaded in the preceding cell.
benign_urls_in_malicious = benign_url[benign_url.isin(malicious_urls.to_numpy())]

# Per column: number of distinct values that also occur in the malicious dataset.
print(benign_urls_in_malicious.nunique())

In [None]:
# Forward slash on purpose: "\p" is an invalid escape sequence, and a backslash
# separator only resolves on Windows. "/" works on every platform.
phish_set = pd.read_csv("datasets/phish_set_X2.csv")
phish_pred = model_rfc1.predict(phish_set)
print(len(phish_pred))
# Fraction of the rows predicted as each of the class labels 0-3.
for label in range(4):
    print(np.count_nonzero(phish_pred == label)/len(phish_pred))


In [None]:
benign_set = pd.read_csv("datasets/benign_dataset_X2.csv")
benign_pred = model_rfc1.predict(benign_set)
print(len(benign_pred))
# Number of rows predicted as class 0.
count = np.count_nonzero(benign_pred == 0)
# Fraction of the rows predicted as each of the class labels 0-3, one per line.
print(
    *(np.count_nonzero(benign_pred == label) / len(benign_pred) for label in range(4)),
    sep="\n",
)


In [None]:
malicious_phish = pd.read_csv("datasets/malicious_phish.csv")
# Build the two row conditions separately, then keep the rows meeting both:
# the URL contains "https://" and the row's type is "phishing".
has_https = malicious_phish['url'].str.contains("https://")
is_phishing = malicious_phish['type'].eq("phishing")
filtered_rows = malicious_phish.loc[has_https & is_phishing]

# Report how many rows matched.
print(len(filtered_rows))
