In [8]:
import pandas as pd
# Load the labelled URL dataset from the CSV that sits next to this notebook.
DATA_PATH = 'phishing_site_urls.csv'
df = pd.read_csv(DATA_PATH)

# Optional sanity checks -- uncomment one at a time to inspect the raw data:
#   df.info()
#   df.isnull().sum()
#   df['Label'].value_counts()
df.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


## Feature Engineering

### Getting the Length of a URL

In [9]:
def get_length(url: str) -> int:
    """Return the total number of characters in ``url``."""
    n_chars = len(url)
    return n_chars

# One integer per row: the character length of the URL string.
df['url_length'] = df['URL'].map(get_length)

##### Explanation:
We care about the length of the URL because phishing URLs are often long.

### Getting the Number of Periods (.) in URL

In [10]:
def period_count(url: str) -> int:
    """Return how many '.' characters appear in ``url``."""
    return sum(1 for ch in url if ch == '.')

# One integer per row: the number of '.' characters in the URL.
df['period_count'] = df['URL'].map(period_count)

##### Explanation:
We care about the number of periods in the URL because phishing URLs often chain extra subdomains in front of the attacker's real domain, so that the URL appears to belong to a trusted site.

For Example:
'login.bofa.com.verify-user.com'

This looks like a real bofa (Bank of America) URL, but it's actually a phishing site hosted on verify-user.com

### Getting the Number of Hyphens/Dashes in a URL

In [11]:
def dash_count(url: str) -> int:
    """Return how many '-' characters appear in ``url``."""
    # Splitting on '-' yields one more piece than there are dashes.
    pieces = url.split('-')
    return len(pieces) - 1

# One integer per row: the number of '-' characters in the URL.
df['dash_count'] = df['URL'].map(dash_count)

##### Explanation:
We care about the number of dashes in the URL because dashes are used to spoof real domains.

For Example:
'secure-login-bofa.com'

### Getting the Number of @'s in a URL

In [12]:
def at_count(url: str) -> int:
    """Return how many '@' characters appear in ``url``."""
    total = 0
    for ch in url:
        if ch == '@':
            total += 1
    return total

# One integer per row: the number of '@' characters in the URL.
df['at_count'] = df['URL'].map(at_count)


##### Explanation:
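We care about the number of @s in the URL because browsers treat everything between the scheme and the @ as login credentials (userinfo), not as the destination; the real host is whatever comes after the @.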

For Example:
'https://paypal.com@phishingsite.com'

It looks like the URL goes to PayPal, but it goes to phishingsite.com.

### Getting the Number of Digits in a URL

In [14]:
def digit_count(url: str) -> int:
    """Return how many characters of ``url`` are digits (per ``str.isdigit``)."""
    return sum(1 for ch in url if ch.isdigit())

# One integer per row: the number of digit characters in the URL.
df['digit_count'] = df['URL'].map(digit_count)

##### Explanation:
We care about the number of digits in the URL because 
phishing URLs can include numbers which bypass filters.

### Has HTTPS or Not

In [15]:
def has_https(url: str) -> int:
    """Return 1 if ``url`` begins with the ``https://`` scheme, otherwise 0.

    The check is case-insensitive, because URL schemes are case-insensitive,
    and it requires the full ``https://`` prefix. A scheme-less host that
    merely starts with the letters "https" (for example
    ``httpsecure-login.com``) is therefore not reported as HTTPS.
    """
    return int(url.lower().startswith('https://'))
    
# One 0/1 flag per row, computed by has_https.
df['has_https'] = df['URL'].map(has_https)


##### Explanation:
We care about whether the URL uses HTTPS because legitimate websites should use HTTPS (a secure connection). Phishing sites often lack HTTPS.

### Has an IP Address or Not

In [16]:
import re

# A single IPv4 octet: 0-255, with optional leading zeros (e.g. "7", "07", "007").
_IPV4_OCTET = r'(?:25[0-5]|2[0-4]\d|[01]?\d?\d)'
# Four dot-separated octets delimited by word boundaries; compiled once and reused.
_IPV4_PATTERN = re.compile(r'\b(?:' + _IPV4_OCTET + r'\.){3}' + _IPV4_OCTET + r'\b')


def has_ip_address(url: str) -> int:
    """Return 1 if ``url`` contains a dotted-quad IPv4 address, otherwise 0.

    Each of the four octets must be in the range 0-255, so digit groups that
    only look like an address (for example ``999.999.999.999``) are rejected.
    The address may appear anywhere in the string, not only in the host part.
    """
    return int(_IPV4_PATTERN.search(url) is not None)

# One 0/1 flag per row, computed by has_ip_address.
df['has_ip_address'] = df['URL'].map(has_ip_address)

##### Explanation:
We care about whether the URL has an IP address because legitimate websites normally use domain names. Phishing sites often use raw IPs.

### Does the URL Have Suspicious Words?

In [None]:
def has_suspicious_word(url: str) -> int:
    """Return 1 if ``url`` contains any suspicious keyword, otherwise 0.

    Matching is a case-insensitive substring test, so a keyword embedded in a
    longer token (e.g. "accounts") also counts.

    NOTE(review): no visible cell assigns this function's result to a ``df``
    column the way the other feature helpers do -- confirm whether a
    ``has_suspicious_word`` feature was meant to be added before modelling.
    """
    keywords = ('login', 'verify', 'secure', 'banking', 'update', 'account')
    lowered = url.lower()
    return int(any(keyword in lowered for keyword in keywords))

##### Explanation:
We care about whether the URL has certain words because phishing URLs often use urgent or security-related words to manipulate users emotionally.

In [17]:
# Preview the first rows to confirm the engineered feature columns were added.
df.head()

Unnamed: 0,URL,Label,url_length,period_count,dash_count,at_count,digit_count,has_https,has_ip_address
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,225,6,4,0,58,0,0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,81,5,2,0,1,0,0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,177,7,1,0,47,0,0
3,mail.printakid.com/www.online.americanexpress....,bad,60,6,0,0,0,0,0
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,116,1,1,0,21,0,0


In [18]:
# Encode the target as integers: 'good' -> 0, 'bad' -> 1.
# The guard makes this cell safe to re-run: mapping the already-encoded 0/1
# values through the dict a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['Label']):
    df['Label'] = df['Label'].map({'good': 0, 'bad': 1})

In [20]:
# Feature matrix: every column except the raw URL text and the target.
NON_FEATURE_COLUMNS = ['URL', 'Label']
X = df.drop(columns=NON_FEATURE_COLUMNS)

# Target vector.
y = df['Label']

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)


In [24]:
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a single decision tree with only the random seed set explicitly.
model = DecisionTreeClassifier(random_state = 42)
# fit() is the last expression in the cell, so the fitted estimator is displayed.
model.fit(X_train, y_train)


0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the decision tree on the held-out test split.
y_pred = model.predict(X_test)

# Print each metric under its heading; heading and value are passed to print()
# as separate arguments, so they are joined by a single space.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))


Accuracy: 0.8165832347319559

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.95      0.88     78585
           1       0.79      0.48      0.60     31285

    accuracy                           0.82    109870
   macro avg       0.81      0.72      0.74    109870
weighted avg       0.81      0.82      0.80    109870


Confusion Matrix:
 [[74578  4007]
 [16145 15140]]


In [28]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees. class_weight='balanced' asks scikit-learn to
# weight classes by inverse frequency; fit() returns the estimator itself,
# so it can be chained onto the constructor.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
).fit(X_train, y_train)

# Predictions on the held-out test split.
y_pred_rf = rf_model.predict(X_test)


In [30]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print each random-forest metric under its heading; heading and value are
# passed to print() as separate arguments, so they are joined by a single space.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred_rf))


Accuracy: 0.7882042413761718

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85     78585
           1       0.63      0.63      0.63     31285

    accuracy                           0.79    109870
   macro avg       0.74      0.74      0.74    109870
weighted avg       0.79      0.79      0.79    109870


Confusion Matrix:
 [[66951 11634]
 [11636 19649]]


In [None]:
# NOTE: the `imblearn` module is provided by the third-party `imbalanced-learn`
# package. The recorded output of this cell is a ModuleNotFoundError, so that
# package must be installed in the kernel's environment before this cell can run.
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic samples are reproducible.
smote = SMOTE(random_state=42)

# Resample the training split only; X_test / y_test are not touched here.
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

from collections import Counter
# Show how many samples each class has after resampling.
print("Resampled class distribution:", Counter(y_train_sm))


ModuleNotFoundError: No module named 'imblearn'