In [1]:
# Colab-only helpers: `drive` mounts Google Drive; `runtime` is used further
# down by the `runtime.unassign()` cell.
from google.colab import drive, runtime
# Mount Drive so the dataset and saved artifacts under /content/drive/ are reachable.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


**<h1>IMPORT LIBRARIES</h1>**

In [8]:
import ipaddress
import pickle
import re
import warnings
from urllib.parse import urlparse

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [23]:
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)

**<h1>DATA PREPARATION</h1>**

In [2]:
# Load the pre-extracted URL features; the first CSV column is used as the row index.
data = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Phising_Detection_Dataset.csv',
    index_col=0,
)

In [3]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,NumDots,UrlLength,NumDash,AtSymbol,IpAddress,HttpsInHostname,PathLevel,PathLength,NumNumericChars,Phising
0,3,72,0,0,0,0,5,44,0,1.0
1,3,144,0,0,0,0,3,16,41,1.0
2,3,58,0,0,0,0,2,24,0,1.0
3,3,79,1,0,0,0,6,50,0,1.0
4,3,46,0,0,0,0,4,29,2,1.0


In [4]:
# Drop every row that has at least one missing value; `data` itself is left untouched.
data_cleaned = data.dropna(axis=0, how='any')

In [5]:
# Class balance of the target column after cleaning.
label_counts = data_cleaned.loc[:, 'Phising'].value_counts()
label_counts

Unnamed: 0_level_0,count
Phising,Unnamed: 1_level_1
0.0,530060
1.0,100011


**<h1>DATA SPLIT</h1>**

In [6]:
# The last column ('Phising') is the target; every column before it is a feature.
y = data_cleaned.iloc[:, -1]
X = data_cleaned.iloc[:, :-1]

In [7]:
# Hold out 20% of the rows; stratifying on y keeps the class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [8]:
# Balance the classes by random under-sampling with a fixed seed.
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
# NOTE(review): the held-out split is under-sampled as well, so every metric
# below is measured on a 50/50 class mix rather than the dataset's original
# ratio (roughly 5:1 per the label counts above) — confirm this is intended.
X_test_resampled, y_test_resampled = rus.fit_resample(X_test, y_test)



**<h1>TRAIN MODEL (#1 attempt)</h1>**

In [11]:
# First attempt: a size-constrained forest (depth, leaf count and split size
# are all capped) with balanced class weights.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    max_leaf_nodes=100,
    min_samples_split=15,
    class_weight='balanced',
    random_state=42,
)

In [12]:
# 5-fold cross-validated accuracy on the under-sampled training split.
scores = cross_val_score(
    rf_model,
    X_train_resampled,
    y_train_resampled,
    scoring='accuracy',
    cv=5,
)
print('Cross Validation accruacy:', scores.mean())

Cross Validation accruacy: 0.7974665247352158


In [13]:
# Fit the first-attempt forest on the balanced training data.
rf_model.fit(X=X_train_resampled, y=y_train_resampled)

In [14]:
# Hard class predictions at the model's default cut-off, scored on the balanced test split.
y_pred = rf_model.predict(X_test_resampled)
print(classification_report(y_true=y_test_resampled, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.83      0.75      0.79     20002
         1.0       0.77      0.84      0.81     20002

    accuracy                           0.80     40004
   macro avg       0.80      0.80      0.80     40004
weighted avg       0.80      0.80      0.80     40004



**<h1>APPLYING THRESHOLD (#1 attempt)</h1>**

In [15]:
# Keep column 1 of predict_proba — presumably the score for label 1.0
# (phishing), given the 0.0/1.0 labels shown in the class counts above.
y_pred_proba = rf_model.predict_proba(X_test_resampled)[:, 1]

In [16]:
# Precision/recall for each candidate score cut-off on the balanced test split.
# Per the scikit-learn docs, `precision` and `recall` are one element longer
# than `threshold` (the final point has no associated threshold).
precision, recall, threshold = precision_recall_curve(y_test_resampled, y_pred_proba)

In [17]:
# `precision`/`recall` carry one extra trailing point that has no matching
# entry in `threshold`; drop it so the argmax below always indexes a real
# threshold instead of potentially running one past the end.
precision_at_threshold = precision[:-1]
recall_at_threshold = recall[:-1]
pr_sum = precision_at_threshold + recall_at_threshold
# Define F1 as 0 where precision + recall == 0 rather than letting 0/0 yield
# NaN, which would corrupt the argmax.
f1_scores = np.divide(
    2 * precision_at_threshold * recall_at_threshold,
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
# NOTE(review): the threshold is tuned on the same split that is reported
# below, so the "adjusted" metrics are optimistic — consider tuning on a
# separate validation split.
optimal_idx = f1_scores.argmax()
optimal_threshold = threshold[optimal_idx]
print(f'Optimal Threshold: {optimal_threshold:.2f}')

Optimal Threshold: 0.44


In [18]:
# Re-label using the F1-optimal cut-off instead of the default one.
y_pred_adjusted = np.where(y_pred_proba >= optimal_threshold, 1, 0)
print(classification_report(y_true=y_test_resampled, y_pred=y_pred_adjusted))

              precision    recall  f1-score   support

         0.0       0.87      0.69      0.77     20002
         1.0       0.74      0.90      0.81     20002

    accuracy                           0.79     40004
   macro avg       0.80      0.79      0.79     40004
weighted avg       0.80      0.79      0.79     40004



**<h1>SEARCHING FOR BEST PARAMETER</h1>**

In [10]:
# Search space for the random-forest grid search: every combination of the
# values below is tried.
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[10, 15, 20, None],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 5, 10, 15],
    max_leaf_nodes=[50, 100, None],
    class_weight=['balanced', {0: 1, 1: 2}],
)

In [11]:
# Base estimator for the grid search; only the seed is fixed here, all other
# hyper-parameters come from param_grid. Note this rebinds `rf_model`.
rf_model = RandomForestClassifier(random_state=42)

In [12]:
# 5-fold grid search scored by F1, using all available cores.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [13]:
# Run the exhaustive search on the balanced training data. This is the
# expensive cell of the notebook (1152 candidates x 5 folds per the log below).
grid_search.fit(X_train_resampled, y_train_resampled)
print('Best Parameters:', grid_search.best_params_)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits




Best Parameters: {'class_weight': {0: 1, 1: 2}, 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}


**<h1>TRAIN MODEL (#2 attempt)</h1>**

In [9]:
# Second attempt: hyper-parameters copied from the grid-search result printed above.
rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    max_leaf_nodes=None,
    min_samples_leaf=1,
    min_samples_split=10,
    class_weight={0: 1, 1: 2},
    random_state=42,
)

In [11]:
# 5-fold cross-validated accuracy of the tuned forest on the balanced training split.
scores = cross_val_score(
    rf_model,
    X_train_resampled,
    y_train_resampled,
    scoring='accuracy',
    cv=5,
)
print('Cross Validation accruacy:', scores.mean())

Cross Validation accruacy: 0.8190390982553921


In [10]:
# Fit the tuned forest on the balanced training data.
rf_model.fit(X=X_train_resampled, y=y_train_resampled)

In [11]:
# Default-threshold predictions of the tuned forest on the balanced test split.
y_pred = rf_model.predict(X_test_resampled)
print(classification_report(y_true=y_test_resampled, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.72      0.80     20002
         1.0       0.77      0.93      0.84     20002

    accuracy                           0.82     40004
   macro avg       0.84      0.82      0.82     40004
weighted avg       0.84      0.82      0.82     40004



**<h1>APPLYING THRESHOLD (#2 attempt)</h1>**

In [12]:
# Scores from the tuned forest; column 1 is presumably the phishing
# (label 1.0) class, given the 0.0/1.0 labels in the training data.
y_pred_proba = rf_model.predict_proba(X_test_resampled)[:, 1]

In [13]:
# Precision/recall per candidate cut-off for the tuned forest. Per the
# scikit-learn docs, `precision`/`recall` have one more element than `threshold`.
precision, recall, threshold = precision_recall_curve(y_test_resampled, y_pred_proba)

In [14]:
# Drop the trailing precision/recall point, which has no entry in
# `threshold`, so that the argmax index is always valid for `threshold`.
precision_at_threshold = precision[:-1]
recall_at_threshold = recall[:-1]
pr_sum = precision_at_threshold + recall_at_threshold
# F1 is taken as 0 where precision + recall == 0; a plain division would give
# NaN there and corrupt the argmax.
f1_scores = np.divide(
    2 * precision_at_threshold * recall_at_threshold,
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
# NOTE(review): this threshold is later saved with the model but is tuned on
# the split used for reporting — a separate validation split would give an
# unbiased estimate.
optimal_idx = f1_scores.argmax()
optimal_threshold = threshold[optimal_idx]
print(f'Optimal Threshold: {optimal_threshold:.2f}')

Optimal Threshold: 0.56


In [15]:
# Apply the F1-optimal cut-off of the tuned forest and report the result.
y_pred_adjusted = np.where(y_pred_proba >= optimal_threshold, 1, 0)
print(classification_report(y_true=y_test_resampled, y_pred=y_pred_adjusted))

              precision    recall  f1-score   support

         0.0       0.89      0.75      0.81     20002
         1.0       0.78      0.90      0.84     20002

    accuracy                           0.83     40004
   macro avg       0.84      0.83      0.83     40004
weighted avg       0.84      0.83      0.83     40004



In [None]:
# Kill runtime
# NOTE(review): this cell sits before the "SAVE MODEL" cells, so running the
# notebook top to bottom would release the Colab runtime before the model and
# threshold are written to Drive. Run it manually, and only after saving.
runtime.unassign()

**<h1>SAVE MODEL</h1>**

**<h3>Save individually</h3>**

In [16]:
# Persist only the fitted classifier (the threshold is written separately below).
joblib.dump(
    rf_model,
    '/content/drive/MyDrive/Colab Notebooks/phishing_detection_model.pkl',
)

['/content/drive/MyDrive/Colab Notebooks/phishing_detection_model.pkl']

In [17]:
# Store the tuned decision threshold as plain text next to the model file.
with open('/content/drive/MyDrive/Colab Notebooks/threshold.txt', mode='w') as f:
    threshold_text = str(optimal_threshold)
    f.write(threshold_text)

**<h3>Save as one</h3>**

In [18]:
def save_model(model, threshold, file_name):
  """Persist a model together with its decision threshold in one file.

  Args:
    model: The estimator object to store.
    threshold: The decision threshold to store alongside it.
    file_name: Destination path handed to `joblib.dump`.

  The two objects are written as a single dict with the keys 'model' and
  'threshold'; a confirmation line is printed afterwards.
  """
  joblib.dump({'model': model, 'threshold': threshold}, file_name)
  print(f'Model and threshold saved to {file_name}')

In [19]:
# Write the tuned forest and its threshold as one bundle.
save_model(
    rf_model,
    optimal_threshold,
    '/content/drive/MyDrive/Colab Notebooks/phising_detection_model_and_threshold.pkl',
)

Model and threshold saved to /content/drive/MyDrive/Colab Notebooks/phising_detection_model_and_threshold.pkl


**<h1>LOAD MODEL</h1>**

**<h4>Load individually</h4>**

In [None]:
# Load the bare classifier written by the "Save individually" cell. The
# combined '..._model_and_threshold.pkl' file holds a dict, not an estimator,
# so loading it here would leave `model` without predict/predict_proba.
model = joblib.load('/content/drive/MyDrive/Colab Notebooks/phishing_detection_model.pkl')

In [None]:
# Read back the decision threshold stored as plain text.
with open('/content/drive/MyDrive/Colab Notebooks/threshold.txt') as f:
  threshold_text = f.read()
  threshold = float(threshold_text)

**<h4>Load as one</h4>**

In [9]:
def load_model(file_name):
  """Load a model/threshold bundle from `file_name`.

  Args:
    file_name: Path of a joblib file holding a dict with the keys 'model'
      and 'threshold'.

  Returns:
    A `(model, threshold)` tuple taken from that dict.

  Note: joblib files are pickle-based, so only load files from a trusted source.
  """
  bundle = joblib.load(file_name)
  print(f'Model and threshold loaded from {file_name}')

  model, threshold = bundle['model'], bundle['threshold']
  return model, threshold

In [10]:
# Restore the classifier and its decision threshold from the combined bundle.
model, threshold = load_model(
    '/content/drive/MyDrive/Colab Notebooks/phising_detection_model_and_threshold.pkl'
)

Model and threshold loaded from /content/drive/MyDrive/Colab Notebooks/phising_detection_model_and_threshold.pkl


In [11]:
def ip_address(url):
  """Return 1 if the URL's host is a literal IPv4/IPv6 address, otherwise 0.

  The host is taken from `urlparse(...).hostname`, which strips userinfo, the
  port and IPv6 brackets, and is validated with the standard `ipaddress`
  module. Hosts such as '192.168.1.1:8080' are therefore recognised, while
  hex-only names such as 'cafe' or out-of-range octets are not.

  Args:
    url: URL string, with or without a scheme.

  Returns:
    1 when the host parses as an IP address, else 0.
  """
  parsed_url = urlparse(url)
  if not parsed_url.netloc and '://' not in url:
    # Scheme-less input such as '192.168.1.1/login' is parsed entirely as a
    # path; re-parse it as '//host/path' so the host becomes visible.
    try:
      parsed_url = urlparse('//' + url)
    except ValueError:
      return 0  # malformed host part: cannot be an IP literal

  host = parsed_url.hostname
  if not host:
    return 0

  try:
    ipaddress.ip_address(host)
  except ValueError:
    return 0
  return 1

In [34]:
def path_len(url):
  """Return the PathLength feature: the number of characters in the URL path.

  The training data's PathLength column holds character counts (its first row
  pairs PathLevel 5 with PathLength 44), so the path's length in characters is
  returned rather than its number of segments.

  Args:
    url: URL string, with or without a scheme.

  Returns:
    Length of the path component, 0 when there is no path.
  """
  parsed_url = urlparse(url)
  if not parsed_url.netloc and '://' not in url:
    # Scheme-less input ('example.com/a/b') is parsed as one big path;
    # re-parse it as '//host/path' so the host is not counted as path.
    try:
      parsed_url = urlparse('//' + url)
    except ValueError:
      pass  # malformed host part: keep the plain parse from above

  return len(parsed_url.path)

In [35]:
def extract_features(url):
  """Build the single-row feature matrix for one URL.

  The values are emitted in the order noted inline (NumDots, UrlLength,
  NumDash, AtSymbol, IpAddress, HttpsInHostname, PathLevel, PathLength,
  NumNumericChars), which is the column order of the training data.

  Args:
    url: URL string, with or without a scheme.

  Returns:
    A numpy array of shape (1, 9).
  """
  parsed_url = urlparse(url)
  if not parsed_url.netloc and '://' not in url:
    # Scheme-less input ('example.com/a') would otherwise have an empty host
    # and the host counted as path (giving PathLevel -1 for a bare domain).
    try:
      parsed_url = urlparse('//' + url)
    except ValueError:
      pass  # malformed host part: keep the plain parse from above

  path = parsed_url.path
  features = [
      url.count('.'),  # NumDots
      len(url),  # UrlLength
      url.count('-'),  # NumDash
      1 if '@' in url else 0,  # AtSymbol
      ip_address(url),  # IpAddress
      # Host names are case-insensitive, so compare in lower case.
      1 if 'https' in parsed_url.netloc.lower() else 0,  # HttpsInHostname
      # Slashes in the path minus one, never negative.
      # NOTE(review): '/login' yields 0 here — confirm this matches how the
      # training PathLevel column was derived.
      max(path.count('/') - 1, 0),  # PathLevel
      path_len(url),  # PathLength
      sum(char.isdigit() for char in url)  # NumNumericChars
  ]

  return np.array(features).reshape(1, -1)

In [36]:
def predict_url(url, model, threshold):
  """Classify a single URL as 'Phishing' or 'Not Phishing'.

  Args:
    url: URL string to classify.
    model: Fitted classifier exposing `predict_proba`.
    threshold: Minimum column-1 probability required for a 'Phishing' verdict.

  Returns:
    'Phishing' when the predicted probability reaches `threshold`,
    otherwise 'Not Phishing'.
  """
  phishing_proba = model.predict_proba(extract_features(url))[:, 1]

  if phishing_proba[0] >= threshold:
    return 'Phishing'
  return 'Not Phishing'

In [37]:
# Smoke-test input: a URL whose host is a raw IPv4 address.
test_url = 'http://192.168.1.1/login'

In [38]:
# Classify the smoke-test URL with the loaded model and threshold.
status = predict_url(url=test_url, model=model, threshold=threshold)
print(f"The URL '{test_url}' is classified as: {status}")

The URL 'http://192.168.1.1/login' is classified as: Phishing


In [39]:
# External URL list used as an independent check of the saved model.
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/phishing_site_urls.csv')
# Encode the string labels with an explicit mapping. `map` yields a numeric
# column directly, whereas `replace` on a string column relies on pandas'
# deprecated implicit downcasting to turn the result into integers.
test_data['Label'] = test_data['Label'].map({'bad': 1, 'good': 0})

In [40]:
# Class balance of the external URL list.
label_counts = test_data.loc[:, 'Label'].value_counts()
label_counts

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,393424
1,156422


In [41]:
# Last column is the label; everything before it (the URL column) is the input.
# NOTE(review): this rebinds X and y, replacing the training-set objects
# defined earlier in the notebook.
y = test_data.iloc[:, -1]
X = test_data.iloc[:, :-1]

In [42]:
# Balance the external URL list the same way as the training data (fixed seed).
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X=X, y=y)

In [43]:
# Re-join the balanced URLs and labels column-wise and preview the result.
test_data_resampled = pd.concat([X_resampled, y_resampled], axis='columns')
test_data_resampled.head()

Unnamed: 0,URL,Label
241368,spong.com/article/22522/Japanese-Video-Game-Ch...,0
170062,en.wikipedia.org/wiki/1966_NCAA_Men%27s_Divisi...,0
319481,ecy.wa.gov/programs/wq/grndwtr/LowerYak-gw.html,0
218874,muwaa.org/,0
456835,usedautopartdepot.com/OLDSMOBILE/STARFIRE.php,0


In [44]:
# Build the feature matrix once and score it with a single predict_proba call.
# Calling predict_url per row invokes the whole forest separately for every
# URL, which is far slower and yields the same labels.
url_features = np.vstack([extract_features(url) for url in test_data_resampled['URL']])
phishing_proba = model.predict_proba(url_features)[:, 1]
test_data_resampled['Predicted'] = np.where(phishing_proba >= threshold, 'Phishing', 'Not Phishing')

In [45]:
# Preview the first five predictions next to their true labels.
test_data_resampled.head(n=5)

Unnamed: 0,URL,Label,Predicted
241368,spong.com/article/22522/Japanese-Video-Game-Ch...,0,Not Phishing
170062,en.wikipedia.org/wiki/1966_NCAA_Men%27s_Divisi...,0,Not Phishing
319481,ecy.wa.gov/programs/wq/grndwtr/LowerYak-gw.html,0,Phishing
218874,muwaa.org/,0,Not Phishing
456835,usedautopartdepot.com/OLDSMOBILE/STARFIRE.php,0,Not Phishing


In [48]:
# Show the full (untruncated) URL of the misclassified row seen in the preview above.
test_data.loc[319481, 'URL']

'ecy.wa.gov/programs/wq/grndwtr/LowerYak-gw.html'

In [49]:
# Export URLs, true labels and predictions; the row index is not written.
test_data_resampled.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/predicted_phishing.csv',
    index=False,
)