**Mount Google Drive**

In [None]:
# Mount Google Drive at /content/drive so that the later cells can read the raw
# dataset from, and write the filtered CSV to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Import Necessary Libraries**

In [None]:
import pandas as pd

1. Load the CSV file

In [None]:
# Location of the raw, imbalanced phishing/legitimate URL dataset on Drive.
raw_dataset_path = '/content/drive/MyDrive/Thesis - Undergraduate Ch./Dataset/Legitimate/Raw/PhishDataset - Imbalanced.csv'

# Read the whole CSV into a DataFrame; later cells filter it by label.
df = pd.read_csv(raw_dataset_path)

2. Inspect the DataFrame to confirm column names

In [None]:
# Show the column names first, then the first five rows, to confirm that the
# frame has the 'Labels' and 'URLs' columns the later cells rely on.
column_names = list(df.columns)
print("Columns available", column_names)
print(df.head(5))

Columns available ['Labels', 'URLs']
   Labels                                               URLs
0       1   https://www.ujhyjhujhyjhyuj.ga/CC_POSTALE/f2a83/
1       1  https://stitch-statichosting-prod.s3.amazonaws...
2       0            https://www.reservoirgroup.com/careers/
3       0  https://www.camosy.com/themes/juicy/images/doo...
4       0                       https://www.liveapps.com.au/


3. Filter the data with label 0 (legitimate)

In [None]:
# Boolean mask that is True for rows whose label is 0 (legitimate),
# then keep only those rows.
is_legitimate = df['Labels'].eq(0)
filtered_df = df.loc[is_legitimate]

4. Keep well-formed URLs, clean them, and retrieve the URLs column only

In [None]:
# Filter only the URLs with normal patterns (must start with http:// or https://).
# na=False makes missing / non-string URLs count as "no match"; without it the
# mask contains NaN and the boolean indexing below raises a ValueError.
has_http_scheme = filtered_df['URLs'].str.contains(r'^https?://', na=False)

# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the original DataFrame (avoids SettingWithCopyWarning).
filtered_df = filtered_df[has_http_scheme].copy()

# Clean unwanted characters: strip everything outside the allowed URL character set
filtered_df['URLs'] = filtered_df['URLs'].str.replace(r'[^a-zA-Z0-9/:.?&=%#_\-]', '', regex=True)

# Keep only the URL column, cap the row count, and give the column its output name
url_only_df = filtered_df[['URLs']].head(10000)  # Adjust the number of URLs based on the research purpose
url_only_df = url_only_df.rename(columns={'URLs': 'Legitimate_URL'})

5. Save the filtered data

In [None]:
# Destination on Drive for the filtered legitimate-URL list.
filtered_csv_path = '/content/drive/MyDrive/Thesis - Undergraduate Ch./Dataset/Legitimate/Filtered/filtered_legitimate_url_10000_again.csv'

# Write the single 'Legitimate_URL' column without the DataFrame index.
url_only_df.to_csv(filtered_csv_path, index=False)

print("Filtered CSV file has been created.")

Filtered CSV file has been created.


7. [Additional] Sample legitimate and phishing URLs from the dataset and concatenate them

In [None]:
import pandas as pd
import random
import numpy as np

# Load dataset and drop rows that are exact duplicates
imbalanced_df = pd.read_csv('/content/drive/MyDrive/Thesis - Undergraduate Ch./Dataset/Legitimate/Raw/PhishDataset - Imbalanced.csv')
imbalanced_df = imbalanced_df.drop_duplicates()

# Rename the column if necessary
if 'URLs' in imbalanced_df.columns:
    imbalanced_df = imbalanced_df.rename(columns={'URLs': 'URL'})

# Filter based on the 'Labels' column
df_0 = imbalanced_df[imbalanced_df['Labels'] == 0]  # Legitimate
df_1 = imbalanced_df[imbalanced_df['Labels'] == 1]  # Phishing

# Sampling configuration
N_RUNS = 10        # number of sample files to produce
N_PER_CLASS = 50   # URLs drawn from each class in every run
BASE_SEED = 42     # fixed seed so re-running the cell reproduces the same samples

# Make N_RUNS runs of sampling
for i in range(1, N_RUNS + 1):
    # Each run and each class gets its own seed: runs differ from one another,
    # yet every run is reproducible on its own.
    df_0_sample = df_0.sample(n=N_PER_CLASS, random_state=BASE_SEED + i)
    df_1_sample = df_1.sample(n=N_PER_CLASS, random_state=BASE_SEED + N_RUNS + i)

    # Concatenate without randomization (legitimate rows first, then phishing)
    df_sample = pd.concat([df_0_sample, df_1_sample]).reset_index(drop=True)

    # Save the labeled and unlabeled samples
    df_sample.to_csv(f'sampled_urls_run{i}.csv', index=False)
    df_sample[['URL']].to_csv(f'sampled_urls_run{i}_nolabel.csv', index=False)

print("✅ All ordered samples saved! Legitimate URLs first, then Phishing.")

✅ All ordered samples saved! Legitimate URLs first, then Phishing.
