## Data Preprocessing for URL dataset pulled from the Mendeley website. 

### This data will be used in a final course project in CS 549 Fall 2025, San Diego State University.

Author: Jia Gapuz

1. Import the dataset

In [6]:
import pandas as pd
from pathlib import Path

# Load the main URL dataset (expects at least the columns "url" and "type").
csv_path = Path("URL dataset.csv")

# Read with fallback encoding: try pandas' default (UTF-8) first and fall back to
# latin-1 only when decoding fails.
try:
    df = pd.read_csv(csv_path)
except UnicodeDecodeError:
    df = pd.read_csv(csv_path, encoding="latin-1")

# The CSV carries a header-less first column that pandas names "Unnamed: 0"
# (it looks like a saved row index: 0, 1, 2, ...). It is not a feature, and because
# it appears to be different on every row it would stop later exact-duplicate checks
# from ever matching, so drop any such column right after loading.
index_artifacts = [col for col in df.columns if str(col).startswith("Unnamed:")]
df = df.drop(columns=index_artifacts)

# Fail early with a clear message if the columns the following cells rely on are absent.
missing_cols = [col for col in ("url", "type") if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing columns in '{csv_path}': {missing_cols}. Found: {df.columns.tolist()}")

print(f"Loaded {len(df):,} rows")
df.head()

Loaded 450,176 rows


Unnamed: 0,url,type
0,https://www.google.com,legitimate
1,https://www.youtube.com,legitimate
2,https://www.facebook.com,legitimate
3,https://www.baidu.com,legitimate
4,https://www.wikipedia.org,legitimate


2. Drop exact duplicates in the dataset, then print the ratio of legitimate to malicious URLs in the dataset.

In [7]:
# Log number of rows before removing duplicates
original_rows = len(df)

# Drop exact duplicates, comparing only real data columns.
# A header-less CSV column is loaded by pandas as "Unnamed: <n>"; in this dataset it
# looks like a saved row index (0, 1, 2, ...). Including it in the comparison makes
# every row distinct, so duplicates would never be detected.
content_cols = [col for col in df.columns if not str(col).startswith("Unnamed:")]
df = df.drop_duplicates(subset=content_cols or None)  # None -> compare all columns

# Log current number of rows and print removed count
new_rows = len(df)
removed = original_rows - new_rows
removed_share = removed / original_rows if original_rows else 0.0  # avoid 0/0 on an empty frame
print(f"Removed {removed} duplicate rows ({removed_share:.2%} of original).")

# Count each type (missing labels included) and compute each type's share of all rows
counts = df['type'].value_counts(dropna=False)
ratio = (counts / counts.sum()).rename('ratio')
summary = pd.concat([counts.rename('count'), ratio], axis=1)

print("\nCounts and ratios by type:")
summary

Removed 0 duplicate rows (0.00% of original).

Counts and ratios by type:


Unnamed: 0_level_0,count,ratio
type,Unnamed: 1_level_1,Unnamed: 2_level_1
legitimate,345738,0.768006
phishing,104438,0.231994


Since the current dataset's class ratio (about 77-23 legitimate to phishing) is unacceptable (we defined acceptable as no more imbalanced than 60-40), we will pull phishing URLs from another dataset to reach a 50-50 split if possible.

In [9]:
from pathlib import Path

# Balance the dataset by appending phishing URLs from a second CSV.
#
# NOTE: this cell reassigns `df`. Re-running it after a successful append changes the
# "requested"/"appended" numbers (already-appended URLs are no longer candidates), so
# use Restart Kernel -> Run All when the reported output must match the code.

PHISH_FILE = "Phishing URLs.csv"  # set to your exact filename
PHISH_URL_COLUMN = "url"          # <-- set to your exact URL column name
PHISH_TYPE_COLUMN = "Type"        # <-- set to your exact label column name
PHISH_LABELS = {"phishing", "malicious"}  # lower-cased labels accepted as phishing
SAMPLE_SEED = 42                  # fixed seed so the sampled subset is reproducible

phish_path = Path(PHISH_FILE)
if not phish_path.exists():
    raise FileNotFoundError(f"Phishing dataset not found at '{PHISH_FILE}'. Place it in the same folder or update PHISH_FILE.")

# Load the dataset (same UTF-8 -> latin-1 fallback as the main dataset)
try:
    phish_df = pd.read_csv(phish_path)
except UnicodeDecodeError:
    phish_df = pd.read_csv(phish_path, encoding="latin-1")

# Validate user-provided columns
missing = [c for c in [PHISH_URL_COLUMN, PHISH_TYPE_COLUMN] if c not in phish_df.columns]
if missing:
    print("Available columns:", phish_df.columns.tolist())
    raise ValueError(f"Missing columns in phishing dataset: {missing}. Update PHISH_URL_COLUMN/PHISH_TYPE_COLUMN.")

# Rows without a URL are unusable; drop them before casting to str, otherwise the
# missing value would become the literal string "nan" and could be appended as a URL.
phish_rows = phish_df.dropna(subset=[PHISH_URL_COLUMN])

# Normalize labels and keep ONLY rows explicitly labelled phishing/malicious.
# Unrecognized or missing labels are skipped (and reported) rather than silently
# relabelled as phishing, which would inject mislabelled rows into the training data.
label_series = phish_rows[PHISH_TYPE_COLUMN].astype(str).str.strip().str.lower()
is_phish = label_series.isin(PHISH_LABELS)
if not is_phish.all():
    print("Skipped rows whose label is not in PHISH_LABELS (label: rows):")
    print(label_series[~is_phish].value_counts().to_string())

# Every kept row is stored under the single label "phishing" ('malicious' is merged into it).
phish_norm = pd.DataFrame({
    "url": phish_rows.loc[is_phish, PHISH_URL_COLUMN].astype(str).str.strip()
})
phish_norm["type"] = "phishing"
phish_norm = phish_norm[phish_norm["url"] != ""]  # whitespace-only URLs are empty after strip

# Remove duplicates inside phishing dataset and vs current df
phish_norm = phish_norm.drop_duplicates(subset=["url"])  # self-dedupe
existing_urls = set(df["url"].astype(str).str.strip())
phish_candidates = phish_norm[~phish_norm["url"].isin(existing_urls)]

# A header-less CSV column is loaded as "Unnamed: <n>" (here it looks like a saved row
# index). Appended rows have no value for it, so drop it to keep the columns aligned
# instead of leaving NaN placeholders behind.
df = df.loc[:, [c for c in df.columns if not str(c).startswith("Unnamed:")]]

# Compute how many phishing/malicious we need to append to reach 50-50
counts = df["type"].value_counts()
legit = counts.get("legitimate", 0)
mal_total = counts.get("malicious", 0) + counts.get("phishing", 0)
needed = int(max(0, legit - mal_total))

appended = 0
if needed == 0:
    print("No append needed: dataset already at or above 50% phishing/malicious.")
else:
    # Take as many as are needed, capped by how many unique new URLs are available.
    take = min(needed, len(phish_candidates))
    to_append = phish_candidates.sample(n=take, random_state=SAMPLE_SEED) if take > 0 else phish_candidates.head(0)
    df = (
        pd.concat([df, to_append], ignore_index=True)
        .drop_duplicates(subset=["url"])  # ensure no duplicates
        .reset_index(drop=True)           # keep a gap-free 0..n-1 index after the dedupe
    )
    appended = len(to_append)

# Report
new_counts = df["type"].value_counts()
new_ratio = (new_counts / new_counts.sum()).rename("ratio")
summary_after = pd.concat([new_counts.rename("count"), new_ratio], axis=1)

print(f"Requested append: {needed}; actually appended: {appended} unique phishing URLs.")
if appended < needed:
    print("Warning: Not enough unique new phishing URLs available to reach exact 50-50.")

print("\nCounts and ratios after append:")
summary_after

Requested append: 186543; actually appended: 0 unique phishing URLs.

Counts and ratios after append:


Unnamed: 0_level_0,count,ratio
type,Unnamed: 1_level_1,Unnamed: 2_level_1
legitimate,345738,0.684721
phishing,104438,0.206835
malicious,54757,0.108444
