In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler


In [2]:
df = pd.read_csv(r"D:\InternShip\dataset_phishing.csv")  

df.shape


(11430, 89)

In [3]:
df.head()


Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11430 entries, 0 to 11429
Data columns (total 89 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   url                         11430 non-null  object 
 1   length_url                  11430 non-null  int64  
 2   length_hostname             11430 non-null  int64  
 3   ip                          11430 non-null  int64  
 4   nb_dots                     11430 non-null  int64  
 5   nb_hyphens                  11430 non-null  int64  
 6   nb_at                       11430 non-null  int64  
 7   nb_qm                       11430 non-null  int64  
 8   nb_and                      11430 non-null  int64  
 9   nb_or                       11430 non-null  int64  
 10  nb_eq                       11430 non-null  int64  
 11  nb_underscore               11430 non-null  int64  
 12  nb_tilde                    11430 non-null  int64  
 13  nb_percent                  114

In [6]:
df.isnull().sum().sort_values(ascending=False)


url                0
length_url         0
length_hostname    0
ip                 0
nb_dots            0
                  ..
web_traffic        0
dns_record         0
google_index       0
page_rank          0
status             0
Length: 89, dtype: int64

In [7]:
# Replace invalid values with NaN
df['domain_age'] = df['domain_age'].apply(lambda x: np.nan if x < 0 else x)
df['domain_registration_length'] = df['domain_registration_length'].apply(lambda x: np.nan if x < 0 else x)


In [8]:
df[['domain_age', 'domain_registration_length']].isnull().sum()


domain_age                    1837
domain_registration_length      46
dtype: int64

In [9]:
# Median imputation for skewed numerical features
df['domain_age'].fillna(df['domain_age'].median(), inplace=True)
df['domain_registration_length'].fillna(df['domain_registration_length'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['domain_age'].fillna(df['domain_age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['domain_registration_length'].fillna(df['domain_registration_length'].median(), inplace=True)


In [10]:
# Full row duplicates
df.duplicated().sum()


np.int64(0)

In [11]:
# Duplicate URLs
df['url'].duplicated().sum()


np.int64(1)

In [12]:
df = df.drop_duplicates(subset='url')
df.shape


(11429, 89)

In [13]:
binary_features = [
    'ip', 'dns_record', 'google_index', 'https_token'
]

for col in binary_features:
    df[col] = df[col].astype(int)


In [14]:
df['status'] = df['status'].map({'legitimate': 0, 'phishing': 1})


In [15]:
X = df.drop(columns=['status', 'url'])
y = df['status']


In [16]:
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

X_scaled = pd.DataFrame(X_scaled, columns=X.columns)


In [17]:
X.describe().loc[['mean', 'std']]


Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank
mean,61.127308,21.089859,0.150582,2.480619,0.997638,0.022224,0.14122,0.162306,0.0,0.293202,...,0.12477,0.775833,0.439496,0.072885,493.54528,4878.447458,856831.6,0.020124,0.533905,3.185581
std,55.299697,10.777545,0.357656,1.369672,2.087157,0.155507,0.364469,0.821372,0.0,0.998357,...,0.330473,0.417051,0.496347,0.259959,814.350312,2550.286972,1995677.0,0.140431,0.498871,2.53701


In [18]:
X_scaled.describe().loc[['mean', 'std']]


Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank
mean,0.371771,0.232207,0.150582,0.480619,0.997638,0.022224,0.14122,0.162306,0.0,0.293202,...,0.12477,-0.224167,0.439496,0.072885,0.686589,-0.049191,2.286448,0.020124,-0.466095,0.046395
std,1.455255,1.197505,0.357656,1.369672,2.087157,0.155507,0.364469,0.821372,0.0,0.998357,...,0.330473,0.417051,0.496347,0.259959,2.249586,0.635032,5.335736,0.140431,0.498871,0.634252


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [20]:
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (9143, 87)
Test shape: (2286, 87)


In [21]:
y_train.value_counts(normalize=True)


status
0    0.500055
1    0.499945
Name: proportion, dtype: float64

In [22]:
y_test.value_counts(normalize=True)


status
1    0.5
0    0.5
Name: proportion, dtype: float64

In [23]:
X_train.to_csv("X_train.csv", index=False)
X_test.to_csv("X_test.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)
