In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the phishing-URL dataset (path is relative to the current working
# directory) and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='dataset_phishing.csv')
df.head(n=10)

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate
5,http://appleid.apple.com-app.es/,32,24,0,3,1,0,0,0,0,...,1,1,1,0,-1,0,0,1,0,phishing
6,http://www.mutuo.it,19,12,0,2,0,0,0,0,0,...,0,1,0,170,7529,0,0,0,1,legitimate
7,http://www.shadetreetechnology.com/V4/validati...,81,27,1,2,0,0,0,0,0,...,1,0,0,76,5767,0,0,1,2,phishing
8,http://vamoaestudiarmedicina.blogspot.com/,42,34,0,2,0,0,0,0,0,...,1,1,0,371,7298,0,0,0,5,legitimate
9,https://parade.com/425836/joshwigler/the-amazi...,104,10,0,1,10,0,0,0,0,...,1,0,0,128,9368,6774,0,0,5,legitimate


#### Check For Missing Values

In [5]:
# Count the missing entries in each column (`isna` is the same method as `isnull`).
df.isna().sum()

url                0
length_url         0
length_hostname    0
ip                 0
nb_dots            0
                  ..
web_traffic        0
dns_record         0
google_index       0
page_rank          0
status             0
Length: 89, dtype: int64

#### Re-Check For Missing Values

In [29]:
# Collapse the null mask to one boolean: True if any cell in the frame is missing.
df.isna().to_numpy().any()


False

### Missing Value Analysis and Handling Summary

I checked the dataset for missing values using `df.isnull().sum()` (per-column counts) and confirmed the result with `df.isnull().values.any()`, which returned `False`. There are **no missing values** in any of the 89 columns, so no imputation or deletion was necessary.


#### Check for duplicates

In [7]:
# Number of rows that exactly repeat an earlier row; the first occurrence of
# each row is not counted as a duplicate.
df.duplicated(keep='first').sum()

0

#### Feature Encoding

In [9]:
from urllib.parse import urlparse
from sklearn.preprocessing import LabelEncoder

# Extract the network-location part of every URL (the text between the scheme
# and the path) into a temporary `domain` column, then replace it by an integer
# id in `domain_encoded`.
# NOTE(review): the encoder is fitted on every row, before any train/test
# split, and the resulting ids have no numeric meaning beyond identifying the
# netloc string — confirm this column is really wanted as a model input.
df['domain'] = df['url'].map(lambda raw_url: urlparse(raw_url).netloc)
le = LabelEncoder()
df['domain_encoded'] = le.fit_transform(df['domain'])

# Remove the two raw text columns in place so that only numeric columns
# (plus the still-textual `status`) remain in `df`.
df.drop(columns=['url', 'domain'], inplace=True)

In [10]:
# Replace the text labels in `status` with integer codes. The existing encoder
# object `le` is refitted here, so from now on it holds the `status` classes.
status_codes = le.fit_transform(df['status'])
df['status'] = status_codes
df

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status,domain_encoded
0,37,19,0,3,0,0,0,0,0,0,...,1,0,45,-1,0,1,1,4,0,5135
1,77,23,1,1,0,0,0,0,0,0,...,0,0,77,5767,0,0,1,2,1,3442
2,126,50,1,4,1,0,1,2,0,3,...,0,0,14,4004,5828815,0,1,0,1,3701
3,18,11,0,2,0,0,0,0,0,0,...,0,0,62,-1,107721,0,0,3,0,3225
4,55,15,0,2,2,0,0,0,0,0,...,1,0,224,8175,8725,0,0,6,0,6065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11425,45,17,0,2,0,0,0,0,0,0,...,0,0,448,5396,3980,0,0,6,0,5582
11426,84,18,0,5,0,1,1,0,0,1,...,0,0,211,6728,0,0,1,0,1,4832
11427,105,16,1,2,6,0,1,0,0,1,...,0,0,2809,8515,8,0,1,10,0,5508
11428,38,30,0,2,0,0,0,0,0,0,...,0,0,85,2836,2455493,0,0,4,0,6665


### Explanation Of Encoding Techniques

To prepare categorical features for machine learning models, I applied **Label Encoding**, which converts categorical text values into numeric codes:

* **Domain Encoding**:
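  The `url` column was parsed using `urlparse` to extract the **domain name** (more precisely the URL's network location, `netloc`). This new `domain` feature was then label-encoded using `LabelEncoder`, assigning a unique integer to each distinct domain and storing it in `domain_encoded`. After encoding, both `url` and `domain` were dropped to prevent data leakage from the raw text. Note that `domain_encoded` still identifies each domain, so it should itself be reviewed as a possible source of leakage.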

* **Status Encoding**:
  The `status` column, which contains the categorical labels `legitimate` and `phishing`, was also label-encoded to transform its values into numerical format (`legitimate` → 0, `phishing` → 1).

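Label encoding was used because it is a compact way to turn these unordered categories into numbers. The integer codes carry no ordinal meaning: this is harmless for the two-class `status` target, but for `domain_encoded` the numeric order is arbitrary, so models that treat inputs as ordered quantities may read structure into it that is not there.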




#### Normalization / Scaling Report

In [12]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every numeric feature to the [0, 1] range, leaving the two
# integer-coded columns untouched: `status` (the target) and `domain_encoded`
# (an identifier produced by LabelEncoder).
#
# Why the columns are excluded by name: selecting on the literal dtype names
# 'int64'/'float64' makes the result depend on which integer width
# LabelEncoder produced. With 32-bit codes both columns are skipped; with
# 64-bit codes they would be scaled as well, target included. Naming them
# explicitly gives the same outcome on every platform.
#
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split, so held-out rows influence the min/max values. Consider
# fitting on the training rows only.
unscaled_cols = {'status', 'domain_encoded'}
num_cols = [
    col
    for col in df.select_dtypes(include='number').columns
    if col not in unscaled_cols
]
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status,domain_encoded
0,0.015347,0.071429,0.0,0.086957,0.000000,0.00,0.000000,0.000000,0.0,0.000000,...,1.0,0.0,0.001542,0.000854,0.000000e+00,1.0,1.0,0.4,0,5135
1,0.039902,0.090476,1.0,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.002615,0.448471,0.000000e+00,0.0,1.0,0.2,1,3442
2,0.069982,0.219048,1.0,0.130435,0.023256,0.00,0.333333,0.105263,0.0,0.157895,...,0.0,0.0,0.000503,0.311656,5.413097e-01,0.0,1.0,0.0,1,3701
3,0.003683,0.033333,0.0,0.043478,0.000000,0.00,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.002112,0.000854,1.000382e-02,0.0,0.0,0.3,0,3225
4,0.026397,0.052381,0.0,0.043478,0.046512,0.00,0.000000,0.000000,0.0,0.000000,...,1.0,0.0,0.007543,0.635341,8.102722e-04,0.0,0.0,0.6,0,6065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11425,0.020258,0.061905,0.0,0.043478,0.000000,0.00,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.015052,0.419680,3.696142e-04,0.0,0.0,0.6,0,5582
11426,0.044199,0.066667,0.0,0.173913,0.000000,0.25,0.333333,0.000000,0.0,0.052632,...,0.0,0.0,0.007107,0.523048,0.000000e+00,0.0,1.0,0.0,1,4832
11427,0.057090,0.057143,1.0,0.043478,0.139535,0.00,0.333333,0.000000,0.0,0.052632,...,0.0,0.0,0.094200,0.661726,7.429430e-07,0.0,1.0,1.0,0,5508
11428,0.015961,0.123810,0.0,0.043478,0.000000,0.00,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.002883,0.221015,2.280364e-01,0.0,0.0,0.4,0,6665


#### Data Splitting 

In [14]:
from sklearn.model_selection import train_test_split
# Separate the target from the predictors: `y` is the `status` column and `X`
# is a copy of the frame with that column removed.
y = df['status']
X = df.drop(columns=['status'])

In [15]:
# Hold out 20% of the rows for testing. The split is stratified on `y` and
# uses a fixed seed so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report (rows, columns) of each feature matrix.
print("Train Set Size:", X_train.shape)
print("Test Set Size:", X_test.shape)

Train Set Size: (9144, 88)
Test Set Size: (2286, 88)
