In [2]:
import pandas as pd
from scipy.io import arff

# Load ARFF file
data, meta = arff.loadarff('Data\Training Dataset.arff')

# Convert to DataFrame
df = pd.DataFrame(data)

# Save DataFrame to CSV
df.to_csv('Data\PhishingWebsites.csv', index=False)

In [3]:
# Remove the 'b' prefix from the byte format
df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

  df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)


In [4]:
# Convert columns to numeric where applicable
df = df.apply(pd.to_numeric, errors='coerce')

In [5]:
# Drop irrelevant or less useful columns
columns_to_drop = ['port', 'Favicon', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe', 'Redirect']
df_cleaned = df.drop(columns=columns_to_drop)

In [7]:
# Check for missing values in the entire DataFrame
missing_values = df_cleaned.isnull().sum()
# Print columns with missing values and the count of missing values
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 having_IP_Address              0
URL_Length                     0
Shortining_Service             0
having_At_Symbol               0
double_slash_redirecting       0
Prefix_Suffix                  0
having_Sub_Domain              0
SSLfinal_State                 0
Domain_registeration_length    0
HTTPS_token                    0
Request_URL                    0
URL_of_Anchor                  0
Links_in_tags                  0
SFH                            0
Submitting_to_email            0
Abnormal_URL                   0
age_of_domain                  0
DNSRecord                      0
web_traffic                    0
Page_Rank                      0
Google_Index                   0
Links_pointing_to_page         0
Statistical_report             0
Result                         0
dtype: int64


In [8]:
# Check the data types of each column
print("Data types of each column:\n", df_cleaned.dtypes)

Data types of each column:
 having_IP_Address              int64
URL_Length                     int64
Shortining_Service             int64
having_At_Symbol               int64
double_slash_redirecting       int64
Prefix_Suffix                  int64
having_Sub_Domain              int64
SSLfinal_State                 int64
Domain_registeration_length    int64
HTTPS_token                    int64
Request_URL                    int64
URL_of_Anchor                  int64
Links_in_tags                  int64
SFH                            int64
Submitting_to_email            int64
Abnormal_URL                   int64
age_of_domain                  int64
DNSRecord                      int64
web_traffic                    int64
Page_Rank                      int64
Google_Index                   int64
Links_pointing_to_page         int64
Statistical_report             int64
Result                         int64
dtype: object


In [9]:
# Check unique values for each column
for col in df_cleaned.columns:
    unique_values = df_cleaned[col].unique()
    print(f"Column '{col}' unique values: {unique_values}")


Column 'having_IP_Address' unique values: [-1  1]
Column 'URL_Length' unique values: [ 1  0 -1]
Column 'Shortining_Service' unique values: [ 1 -1]
Column 'having_At_Symbol' unique values: [ 1 -1]
Column 'double_slash_redirecting' unique values: [-1  1]
Column 'Prefix_Suffix' unique values: [-1  1]
Column 'having_Sub_Domain' unique values: [-1  0  1]
Column 'SSLfinal_State' unique values: [-1  1  0]
Column 'Domain_registeration_length' unique values: [-1  1]
Column 'HTTPS_token' unique values: [-1  1]
Column 'Request_URL' unique values: [ 1 -1]
Column 'URL_of_Anchor' unique values: [-1  0  1]
Column 'Links_in_tags' unique values: [ 1 -1  0]
Column 'SFH' unique values: [-1  1  0]
Column 'Submitting_to_email' unique values: [-1  1]
Column 'Abnormal_URL' unique values: [-1  1]
Column 'age_of_domain' unique values: [-1  1]
Column 'DNSRecord' unique values: [-1  1]
Column 'web_traffic' unique values: [-1  0  1]
Column 'Page_Rank' unique values: [-1  1]
Column 'Google_Index' unique values: [ 

In [10]:
# Step 1: Identify binary columns (those that only have -1 and 1)
binary_columns = [col for col in df.columns if df[col].nunique() == 2]

# Step 2: Encode binary columns
for col in binary_columns:
    df[col] = df[col].replace({-1: 0, 1: 1})

# Step 3: Identify multi-class columns (if needed, for example)
# Here we assume 'URL_Length' has multiple values which should be one-hot encoded.
multi_class_columns = ['URL_Length', 'having_Sub_Domain', 'SSLfinal_State', 'URL_of_Anchor', 'Links_in_tags', 'web_traffic']

# Step 4: Apply one-hot encoding for multi-class columns
df = pd.get_dummies(df, columns=multi_class_columns, drop_first=True)

# Display the updated DataFrame
print(df.head())

   having_IP_Address  Shortining_Service  having_At_Symbol  \
0                  0                   1                 1   
1                  1                   1                 1   
2                  1                   1                 1   
3                  1                   1                 1   
4                  1                   0                 1   

   double_slash_redirecting  Prefix_Suffix  Domain_registeration_length  \
0                         0              0                            0   
1                         1              0                            0   
2                         1              0                            0   
3                         1              0                            1   
4                         1              0                            0   

   Favicon  port  HTTPS_token  Request_URL  ...  having_Sub_Domain_0  \
0        1     1            0            1  ...                False   
1        1     1            0   

In [11]:
# Assuming your DataFrame is named 'df'
column_names = df.columns.tolist()  # Convert to a list for better readability
print("Column Names:")
for name in column_names:
    print(name)

Column Names:
having_IP_Address
Shortining_Service
having_At_Symbol
double_slash_redirecting
Prefix_Suffix
Domain_registeration_length
Favicon
port
HTTPS_token
Request_URL
SFH
Submitting_to_email
Abnormal_URL
Redirect
on_mouseover
RightClick
popUpWidnow
Iframe
age_of_domain
DNSRecord
Page_Rank
Google_Index
Links_pointing_to_page
Statistical_report
Result
URL_Length_0
URL_Length_1
having_Sub_Domain_0
having_Sub_Domain_1
SSLfinal_State_0
SSLfinal_State_1
URL_of_Anchor_0
URL_of_Anchor_1
Links_in_tags_0
Links_in_tags_1
web_traffic_0
web_traffic_1


In [12]:
import pandas as pd

# Set pandas options to display all columns
pd.set_option('display.max_columns', None)  # Set to None to display all columns

# Now print all column names
print("Column Names:")
print(df.columns.tolist())


Column Names:
['having_IP_Address', 'Shortining_Service', 'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix', 'Domain_registeration_length', 'Favicon', 'port', 'HTTPS_token', 'Request_URL', 'SFH', 'Submitting_to_email', 'Abnormal_URL', 'Redirect', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe', 'age_of_domain', 'DNSRecord', 'Page_Rank', 'Google_Index', 'Links_pointing_to_page', 'Statistical_report', 'Result', 'URL_Length_0', 'URL_Length_1', 'having_Sub_Domain_0', 'having_Sub_Domain_1', 'SSLfinal_State_0', 'SSLfinal_State_1', 'URL_of_Anchor_0', 'URL_of_Anchor_1', 'Links_in_tags_0', 'Links_in_tags_1', 'web_traffic_0', 'web_traffic_1']


In [4]:
# Save the cleaned DataFrame to a new CSV file
cleaned_file_path = 'cleaned_Phishing.csv'  # Specify the path and filename
df.to_csv(cleaned_file_path, index=False)  # Set index=False to avoid writing row indices