In [9]:
import pandas as pd
from datasets import DatasetDict, Dataset

In [10]:
# Read the raw URL table from the local CSV file into a DataFrame.
# data source: https://www.kaggle.com/datasets/taruntiwarihp/phishing-site-urls/data
csv_path = "data/phishing_site_urls.csv"
df = pd.read_csv(csv_path)

In [13]:
# Build a small class-balanced dataset from the raw table and split it into
# train / validation / test Hugging Face datasets.

# Drop every row that has a missing value before filtering by class
df = df.dropna()

df_safe = df[df['Label']=="good"]
df_not_safe = df[df['Label']=="bad"]

# NOTE(review): rows are not de-duplicated before sampling, so a value that
# appears more than once in the CSV could land in more than one split.
# Confirm whether the source contains repeats.

num_samples = 1500

# Fail early with a readable message if either class is too small to sample from
smallest_class_size = min(len(df_safe), len(df_not_safe))
if smallest_class_size < num_samples:
    raise ValueError(
        f"Cannot draw {num_samples} rows per class: "
        f"the smallest class only has {smallest_class_size} rows"
    )

# Sample num_samples rows from each class to ensure a 50-50 split
df_safe_sample = df_safe.sample(num_samples, random_state=42)
df_not_safe_sample = df_not_safe.sample(num_samples, random_state=42)

# Replace the string "Label" column with a Boolean flag "isPhishing"
df_safe_sample = df_safe_sample.assign(isPhishing=False)
df_safe_sample = df_safe_sample.drop('Label',axis=1)
df_not_safe_sample = df_not_safe_sample.assign(isPhishing=True)
df_not_safe_sample = df_not_safe_sample.drop('Label',axis=1)

# Concatenate the samples to create a new balanced dataset
balanced_df = pd.concat([df_safe_sample, df_not_safe_sample])
# Positional rename: requires exactly two remaining columns, with the
# isPhishing flag last (it was appended by .assign above)
balanced_df.columns = ['text', 'labels']

# Convert labels column to int (False -> 0, True -> 1)
balanced_df['labels'] = balanced_df['labels'].astype(int)

# Shuffle the balanced dataset so the two classes are interleaved
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

train_frac = 0.7
valid_frac = 0.15
test_frac = 0.15

# The test split takes whatever is left after train and validation, so
# test_frac is only meaningful if the three fractions add up to 1
if abs(train_frac + valid_frac + test_frac - 1.0) > 1e-9:
    raise ValueError("train_frac, valid_frac and test_frac must sum to 1")

# Define train and validation size (test size is the remainder)
train_size = int(train_frac * len(balanced_df))
valid_size = int(valid_frac * len(balanced_df))

# Create train, validation, and test datasets as consecutive positional slices
train_df = balanced_df.iloc[:train_size]
valid_df = balanced_df.iloc[train_size:train_size + valid_size]
test_df = balanced_df.iloc[train_size + valid_size:]

# Convert the pandas DataFrames to Hugging Face Datasets; preserve_index=False
# makes it explicit that the DataFrame index must not become an extra column
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
valid_ds = Dataset.from_pandas(valid_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

dataset_dict = DatasetDict({
    'train': train_ds,
    'validation': valid_ds,
    'test': test_ds
})

In [14]:
# Display the DatasetDict repr to check the features and row count of each split
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
})

In [20]:
# Upload the dataset dict to the Hub under the repo id below
hub_repo_id = "Arnav0805/phishing-site-classification"
dataset_dict.push_to_hub(hub_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Arnav0805/phishing-site-classification/commit/8438abb9b383d3508f46ab06a3fdc6f84f3a17dc', commit_message='Upload dataset', commit_description='', oid='8438abb9b383d3508f46ab06a3fdc6f84f3a17dc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Arnav0805/phishing-site-classification', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Arnav0805/phishing-site-classification'), pr_revision=None, pr_num=None)

Bad pipe message: %s [b'0.9,image/avif,image/webp,image/apng,*/*;q=0.8\r\nHost: localhost:46843\r\nUser-Agent: Mozilla/5.0 (Macintosh; Intel', b'ac OS X 10_15_7) AppleWebKit/53', b'36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36\r']
Bad pipe message: %s [b'ccept-Enc', b'ing: gzip, deflate, br, zstd\r\nAccept-Language: en-GB,en;q=0.8\r\nCache-Control: max-age=0\r\nReferer: https://gith']
Bad pipe message: %s [b'.com/\r\nX-Request-ID: 6bbb8ed961905defcc51a49ded33e255\r\nX-Real-IP: 10.240.0.69\r\nX-Forwarded-Port: 443\r\nX-Forwarded-Sc']
Bad pipe message: %s [b'me: https\r\nX-Original-URI: /\r\nX-Scheme: https\r\nsec-gpc: 1\r\nsec-fetch-site: cross-site\r\nsec-fetch-mode: ', b'vigate\r\nsec-fetch-dest: document\r\nsec-ch-ua: "Brave";v="131", "Chromium";v="131", "Not_A Brand";v="24"\r\nsec-c', b'ua-mobile: ?0\r\nsec-ch-ua-platform: "macOS"\r\npriority: u=0, i\r\nX-Original-Proto: https\r\nX-Forwarded-Prot', b' https\r\nX-Forwarded-Host: super-parakeet-pjr9799xvxgw2rpgr-46843.app.gith