In [50]:
import pandas as pd
# Read the raw URL dataset and keep only the benign rows.
# Change the path below if the CSV lives somewhere else.
#
# The frame is expected to have a 'url' column and a 'type' column
# (e.g. "benign", "defacement", "phishing"). No binary label is created here:
# every non-benign row is simply dropped.
df = pd.read_csv("malicious_phish.csv").loc[lambda frame: frame['type'] == 'benign']

In [None]:
# Display the (rows, columns) tuple of df.
df.shape

In [52]:
import random 
import string

def generate_dummy_url():
    """Return a minimal fake URL: ``http://`` + 8 random lowercase letters + ``.com``.

    Draws from the module-level ``random`` state.
    """
    letters = random.choices(string.ascii_lowercase, k=8)
    label = ''.join(letters)
    return f"http://{label}.com"

# URL building blocks. Defined once at module level so they are not rebuilt
# on every call (the generator is invoked millions of times).
_URL_PROTOCOLS = ('http://', 'https://')
_URL_TLDS = ('.com', '.org', '.net', '.ru', '.us', '.co', '.site', '.app')
_URL_FILE_EXTENSIONS = ('.php', '.html', '.aspx', '')
_URL_PATH_PATTERNS = (
    'checkpoint/login',
    'images/secure',
    'admin/panel',
    'verify/account',
    'secure/login',
    'manager/auth',
)
_URL_PARAM_NAMES = ('id', 'session', 'token', 'auth', 'cmd', 'user')
_URL_ALNUM_LOWER = string.ascii_lowercase + string.digits
_URL_ALNUM_MIXED = string.ascii_letters + string.digits


def _random_token(alphabet, min_len, max_len):
    """Return a random string over ``alphabet`` with a length drawn from [min_len, max_len]."""
    return ''.join(random.choices(alphabet, k=random.randint(min_len, max_len)))


def generate_realistic_url():
    """Build one random, structurally plausible URL.

    Layout of the result:
      * protocol: ``http://`` or ``https://``
      * host: optional 2-8 letter subdomain (30% chance), a 6-12 character
        lowercase/digit domain, and a TLD from a fixed list
      * path (70% chance): either a fixed login/admin-style pattern or a random
        4-12 character segment, followed by an optional file extension
      * query (30% chance, only when a path is present): 1-3 parameters with
        distinct names and 16-32 character alphanumeric values

    Returns:
        str: the generated URL.

    Note:
        Uses the module-level ``random`` state; call ``random.seed(...)``
        beforehand for reproducible output.
    """
    protocol = random.choice(_URL_PROTOCOLS)

    # Host: optional subdomain, then the main domain label.
    host = ""
    if random.random() < 0.3:
        host = _random_token(string.ascii_lowercase, 2, 8) + "."
    host += _random_token(_URL_ALNUM_LOWER, 6, 12)

    url = f"{protocol}{host}{random.choice(_URL_TLDS)}"

    # 30% of URLs are bare hosts with no path and therefore no query string.
    if random.random() >= 0.7:
        return url

    if random.random() < 0.5:
        path = random.choice(_URL_PATH_PATTERNS)
    else:
        path = _random_token(_URL_ALNUM_LOWER, 4, 12)
    url += f"/{path}{random.choice(_URL_FILE_EXTENSIONS)}"

    if random.random() < 0.3:
        # sample() draws without replacement, so a name can never repeat
        # within one query string (choice() in a loop could emit id=..&id=..).
        names = random.sample(_URL_PARAM_NAMES, k=random.randint(1, 3))
        url += '?' + '&'.join(
            f"{name}={_random_token(_URL_ALNUM_MIXED, 16, 32)}" for name in names
        )

    return url

In [53]:
# Target number of benign URLs (chosen to match the phishing dataset size).
desired_size = 2_000_000

# Seed the stdlib RNG so the synthetic URLs are the same after a kernel restart.
random.seed(42)

# Top up the real benign URLs with synthetic ones until desired_size is reached.
# max(..., 0) makes the "already have enough real URLs" case explicit.
num_dummies = max(desired_size - df.shape[0], 0)
dummies = [generate_realistic_url() for _ in range(num_dummies)]

In [None]:
# Wrap the synthetic URLs in a frame with the same two columns as the real
# data; every synthetic row is labelled 'benign'.
dummy_df = pd.DataFrame({'url': dummies, 'type': 'benign'})

# Append the synthetic rows to the existing frame, then shuffle all rows with a
# fixed seed and renumber the index from 0.
df = (
    pd.concat([df, dummy_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Positional split: the first 1.4M shuffled rows vs. everything after them.
df_1_4M = df.iloc[:1_400_000]
df_remaining = df.iloc[1_400_000:]

# Report the sizes of both parts and of the whole.
print(f"First DataFrame size: {len(df_1_4M)}")
print(f"Second DataFrame size: {len(df_remaining)}")
print(f"Total URLs: {len(df)}")

# Persist each part to its own CSV (index column omitted).
df_1_4M.to_csv('benign_urls_1_4M.csv', index=False)
df_remaining.to_csv('benign_urls_remaining.csv', index=False)

In [55]:
# Write df_remaining (the rows after the first 1.4M of the shuffled frame) to
# 'negative_dataset.csv'. Same contents as 'benign_urls_remaining.csv'.
df_remaining.to_csv('negative_dataset.csv', index=False)

In [56]:
# From here on, df refers to the first-1.4M-row slice only.
df = df_1_4M

In [None]:
# Read the phishing links from the file: one URL per line, surrounding
# whitespace stripped, blank lines skipped.
with open('phishing_links.lst', 'r') as file:
    phishing_links = [line.strip() for line in file if line.strip()]

# Create a DataFrame for phishing links, labelling every row 'phishing'
phishing_df = pd.DataFrame({
    'url': phishing_links,
    'type': 'phishing'
})

# Combine with the benign URLs currently held in df
full_dataset = pd.concat([df, phishing_df], ignore_index=True)

# Shuffle the dataset. A fixed seed (the same one used for the benign shuffle)
# keeps full_dataset.csv identical across re-runs.
full_dataset = full_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the combined dataset to CSV
full_dataset.to_csv('full_dataset.csv', index=False)

# Count each label once instead of building a filtered copy of the frame per label.
type_counts = full_dataset['type'].value_counts()

print(f"Full dataset created with {len(full_dataset)} URLs")
print(f"Phishing URLs: {type_counts.get('phishing', 0)}")
print(f"Benign URLs: {type_counts.get('benign', 0)}")
print("Dataset saved to 'full_dataset.csv'")


In [58]:
# Reload the file written from df_remaining so its contents can be checked.
test = pd.read_csv("negative_dataset.csv")

In [None]:
# Sanity check: True only if every row of the reloaded file is labelled "benign".
bool((test["type"] == "benign").all())

In [None]:
# Display the (rows, columns) tuple of the reloaded frame.
test.shape

In [None]:
import pandas as pd
# Reload the combined dataset from disk.
t = pd.read_csv("full_dataset.csv")
    

In [None]:
# Number of rows per 'type' label in the reloaded dataset.
t["type"].value_counts()