**Mount to Google Drive**

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Import Necessary Libraries**

In [None]:
import pandas as pd
import re

1. Load the CSV file

In [None]:
# Read the raw batch-1 crawl export from Drive into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/Thesis - Undergraduate Ch./Dataset/Phishing/Raw/Crawled-Phishing-URLs-Batch1.csv')

2. Inspect the DataFrame to confirm column names

In [None]:
# List the column names and preview the first rows, to confirm that the
# 'full_text' column used in the next step is present.
print("Columns available:", df.columns.tolist())
print(df.head())

Columns available: ['conversation_id_str', 'created_at', 'favorite_count', 'full_text', 'id_str', 'image_url', 'in_reply_to_screen_name', 'lang', 'location', 'quote_count', 'reply_count', 'retweet_count', 'tweet_url', 'user_id_str', 'username']
   conversation_id_str                      created_at  favorite_count  \
0  1875784676901842949  Sun Jan 05 06:01:32 +0000 2025               0   
1  1877217545263968616  Thu Jan 09 04:55:14 +0000 2025               1   
2  1877214318065766421  Thu Jan 09 04:42:25 +0000 2025               0   
3  1877215200186708314  Thu Jan 09 04:45:55 +0000 2025               2   
4  1877217204954911165  Thu Jan 09 04:53:53 +0000 2025               0   

                                           full_text               id_str  \
0  #phishing ALERT https://sigwgb[.]net https://t...  1875784676901842949   
1  #phishing ALERT https://authorizedqfsbond[.]co...  1877217545263968616   
2  #phishing ALERT https://airdrop-tronnetwork[.]...  1877214318065766421   
3 

3. Select the column containing the tweet text

In [None]:
# Pull out the tweet-text column. Fail early, quoting the exact column name
# that was looked up, when the CSV does not have it.
if 'full_text' in df.columns:
    tweets = df['full_text']
else:
    raise ValueError("The expected column 'full_text' was not found in your CSV.")

4. Define a function to extract the suspected phishing URL

In [None]:
def extract_phishing_url(tweet):
    """
    Extract the suspected phishing URL from a tweet's text.

    The first http(s) URL whose host is not the 't.co' shortener is
    returned, with the '[.]' obfuscation replaced by '.'.

    Parameters
    ----------
    tweet : object
        The tweet text. Any non-string value (for example NaN for a
        missing cell) yields an empty string.

    Returns
    -------
    str
        The cleaned URL, or '' when the tweet holds no qualifying URL.
    """
    if not isinstance(tweet, str):
        return ''

    # Candidates: everything from http:// or https:// up to the next
    # whitespace character or comma.
    for raw_url in re.findall(r'https?://[^\s,]+', tweet):
        # Undo the '[.]' obfuscation first so that 't[.]co' is recognised too.
        url = raw_url.replace('[.]', '.')

        # Compare the host exactly. A plain substring test for "t.co" would
        # also discard unrelated hosts such as 'microsoft.com'.
        authority = re.match(r'https?://([^/?#]*)', url).group(1)
        host = authority.rsplit('@', 1)[-1].split(':', 1)[0].lower()
        if host != 't.co':
            return url

    # No URL other than shortener links was found.
    return ''

5. Apply the function to each tweet to create a new column

In [None]:
# Run the extractor over every tweet and store the result as a new column.
# Tweets without a qualifying URL get an empty string.
df['Phishing_URL'] = tweets.apply(extract_phishing_url)

# Debug: Print the first few cleaned URLs to check results
print("Sample Phishing URLs:")
print(df['Phishing_URL'].head(10))

Sample Phishing URLs:
0                 https://sigwgb.net
1      https://authorizedqfsbond.com
2    https://airdrop-tronnetwork.com
3       https://amlwalletreports.com
4                 https://robiox.top
5             https://thefarm.today/
6                https://gekkoai.xyz
7       https://stake-stone.digital/
8                                   
9       https://cryptomainettapps.in
Name: Phishing_URL, dtype: object


6. Filter out rows where no phishing URL was found and keep the cleaned URLs

In [None]:
# Discard the rows for which the extractor returned an empty string
df = df.loc[df['Phishing_URL'].ne('')]

# Reduce the frame to the single column holding the cleaned URLs
cleaned_df = df.loc[:, ['Phishing_URL']]

**Save the cleaned phishing URLs to a new CSV file**

In [None]:
# Write the single-column frame of cleaned URLs to Drive, without the index column.
cleaned_df.to_csv('/content/drive/MyDrive/Thesis - Undergraduate Ch./Dataset/Phishing/Cleaned/cleaned_phishing_url_batch1.csv', index=False)

print("Cleaned CSV file has been created.")

Cleaned CSV file has been created.


[Additional] Concatenate the cleaned data from different datasets

In [None]:
import pandas as pd
import random
import re
import string
from urllib.parse import urlparse, parse_qs
import matplotlib.pyplot as plt
import seaborn as sns

# Load the two cleaned URL datasets from Drive. Both are read below through
# their 'Phishing_URL' column.
# NOTE(review): the earlier cell writes 'cleaned_phishing_url_batch1.csv', while
# this reads 'cleaned_phishing_url.csv' - confirm this is the intended file.
crawled_df = pd.read_csv('/content/drive/MyDrive/Thesis - Undergraduate Ch./Dataset/Phishing/Cleaned/cleaned_phishing_url.csv')
urlscan_df = pd.read_csv('/content/drive/MyDrive/Thesis - Undergraduate Ch./Dataset/Phishing/Cleaned/cleaned_URLScanio.csv')

# Lookup lists read by the filtering functions below.

# Substrings searched for (case-insensitively) by contains_phishy_keywords.
phishy_keywords = [
    'login', 'signin', 'verify', 'secure', 'account', 'update', 'submit',
    'invoice', 'bank', 'password', 'confirmation', 'security-check', 'validate'
]

# Brand names looked up (in lower-cased URLs) by contains_brand_typo.
impersonated_brands = ['paypal', 'google', 'facebook', 'amazon', 'apple', 'microsoft']

# Suffixes, leading dot included, tested by uses_suspicious_tld.
suspicious_tlds = ['.xyz', '.club', '.top', '.gq', '.tk', '.ml', '.cf', '.ga']

# Filtering Functions
def contains_phishy_keywords(url):
    """Return True when any entry of `phishy_keywords` occurs in the lower-cased URL."""
    lowered = url.lower()
    for keyword in phishy_keywords:
        if keyword in lowered:
            return True
    return False

def contains_suspicious_characters(url):
    """Return True when the URL holds more than one of the characters @$%&!^*()<>?;"""
    flagged = '@$%&!^*()<>?;'
    hits = sum(1 for ch in url if ch in flagged)
    # A single occurrence is not enough; at least two are required.
    return hits > 1

def is_short_and_suspicious(url):
    """Return True for a URL under 40 characters that passes contains_suspicious_characters."""
    if len(url) >= 40:
        return False
    return contains_suspicious_characters(url)

def is_long_and_suspicious(url):
    """Return True for a URL of 40 or more characters that passes contains_suspicious_characters."""
    if len(url) < 40:
        return False
    return contains_suspicious_characters(url)

def is_short_with_sneaky_pattern(url):
    """
    Return True for a URL under 40 characters that passes both
    contains_suspicious_characters and contains_phishy_keywords.
    """
    if len(url) >= 40:
        return False
    if not contains_suspicious_characters(url):
        return False
    return contains_phishy_keywords(url)

def starts_suspicious_path(url):
    """
    Report whether the URL's path begins with a login/account-style segment.

    The path is the text after the host (the part following '//') up to,
    but not including, the first '?'. It is lower-cased before comparison.

    Parameters
    ----------
    url : str
        The URL to inspect.

    Returns
    -------
    bool
        True when the path starts with '/login', '/account', '/secure' or
        '/update'; False otherwise, including when the URL has no path.
    """
    suspicious_starts = ('/login', '/account', '/secure', '/update')
    path_match = re.search(r'//[^/]+(/[^?]*)', url)
    if path_match is None:
        return False
    # Prefix test: a marker deeper in the path ('/blog/login') must not count
    # as the path *starting* with it.
    return path_match.group(1).lower().startswith(suspicious_starts)

def has_weird_subdomains(url):
    """
    Return True when the text between '//' and the next '/' is made of four
    or more dot-separated parts; False otherwise or when the URL has no '//'.
    """
    found = re.search(r'//([^/]+)', url)
    if found is None:
        return False
    # Four dot-separated parts means at least three dots.
    return found.group(1).count('.') >= 3

def contains_obscure_unicode(url):
    """
    Return True when the URL contains the substring 'xn--' or a '%'
    followed by two hexadecimal digits.
    """
    if 'xn--' in url:
        return True
    return re.search(r'%[0-9a-fA-F]{2}', url) is not None

def contains_brand_typo(url):
    """
    Flag URLs that mention a brand from `impersonated_brands` in a suspect way.

    Returns True when at least one brand occurs in the lower-cased URL and
    either the URL contains the digit 0 or 1, or one of the occurring brands
    is not a standalone token once the URL is split on non-word characters.
    Returns False otherwise.
    """
    lowered = url.lower()
    mentioned = [brand for brand in impersonated_brands if brand in lowered]
    if not mentioned:
        return False

    # Any '0' or '1' anywhere in the URL is enough once a brand is mentioned.
    if re.search(r'[01]', lowered):
        return True

    # Otherwise the brand must be embedded in a longer token to count.
    tokens = re.split(r'\W+', lowered)
    return any(brand not in tokens for brand in mentioned)

def uses_suspicious_tld(url, tlds=None):
    """
    Report whether the URL's host ends with one of the suspicious TLDs.

    Only the host is tested, so a trailing slash, path, query string,
    fragment, port or user-info part does not hide the TLD, and a file
    extension in the path is not mistaken for one.

    Parameters
    ----------
    url : str
        The URL to inspect. Input without a scheme (e.g. 'evil.xyz/a') is
        treated as starting with the host.
    tlds : iterable of str, optional
        Suffixes to test, leading dot included. Defaults to the
        module-level `suspicious_tlds` list.

    Returns
    -------
    bool
        True when the lower-cased host ends with any of the suffixes.
    """
    if tlds is None:
        tlds = suspicious_tlds

    lowered = url.lower()

    # Authority: the text after '<scheme>://' (or a bare leading '//') up to
    # the next '/', '?' or '#'.
    with_scheme = re.match(r'(?:[a-z][a-z0-9+.\-]*:)?//([^/?#]*)', lowered)
    if with_scheme:
        authority = with_scheme.group(1)
    else:
        authority = re.split(r'[/?#]', lowered, maxsplit=1)[0]

    # Strip 'user:pass@' and a trailing ':port' to leave the bare host.
    host = authority.rsplit('@', 1)[-1]
    host = re.sub(r':\d*$', '', host)

    return host.endswith(tuple(tld.lower() for tld in tlds))

def has_many_paths_or_params(url):
    """
    Report whether the URL has an unusually deep path or many query parameters.

    Parameters
    ----------
    url : str
        The URL to inspect.

    Returns
    -------
    bool
        True when the path contains more than five '/' characters or the
        query string holds more than five distinct parameter names (as
        counted by `parse_qs`, which ignores blank values).
    """
    try:
        parsed = urlparse(url)
        path, query = parsed.path, parsed.query
    except ValueError:
        # urlparse rejects a malformed bracketed host (e.g. 'http://[bad/x').
        # Split by hand so one bad row does not abort a whole `.apply` run.
        without_fragment = url.split('#', 1)[0]
        before_query, _, query = without_fragment.partition('?')
        after_scheme = before_query.split('//', 1)[-1]
        first_slash = after_scheme.find('/')
        path = after_scheme[first_slash:] if first_slash != -1 else ''

    num_paths = path.count('/')
    num_params = len(parse_qs(query))
    return num_paths > 5 or num_params > 5

# Master Filter Function 
def url_is_suspicious(url):
    """
    Master filter: return True when at least one of the individual
    heuristics below flags the URL.

    Every heuristic is evaluated, in the listed order, before the results
    are combined.
    """
    heuristics = (
        is_short_with_sneaky_pattern,
        starts_suspicious_path,
        has_weird_subdomains,
        contains_obscure_unicode,
        contains_brand_typo,
        uses_suspicious_tld,
        has_many_paths_or_params,
    )
    verdicts = [heuristic(url) for heuristic in heuristics]
    return any(verdicts)

# Apply Filtering
TOP_N = 5000        # upper bound on the number of URLs written to the output file
SHUFFLE_SEED = 42   # fixed seed so a fresh run reproduces the same shuffle

# URLScan rows flagged by at least one of the custom heuristics
urlscan_df_custom_filtered = urlscan_df[
    urlscan_df['Phishing_URL'].apply(url_is_suspicious)
]

# URLScan rows with more than one suspicious character, bucketed by length
# (under 40 characters vs. 40 and over)
urlscan_df_filtered_short = urlscan_df[urlscan_df['Phishing_URL'].apply(is_short_and_suspicious)]
urlscan_df_filtered_long = urlscan_df[urlscan_df['Phishing_URL'].apply(is_long_and_suspicious)]

# Only the first half of each length bucket is kept
short_limit = len(urlscan_df_filtered_short) // 2
long_limit = len(urlscan_df_filtered_long) // 2

# Rows selected by more than one filter appear once after drop_duplicates()
urlscan_df_filtered = pd.concat([
    urlscan_df_filtered_short.head(short_limit),
    urlscan_df_filtered_long.head(long_limit),
    urlscan_df_custom_filtered
]).drop_duplicates().reset_index(drop=True)

# Combine with Crawled Dataset
# The same URL can be present in both sources, so repeats are dropped before
# the shuffle; this keeps the retained row independent of the shuffle order.
combined_df = (
    pd.concat([crawled_df, urlscan_df_filtered], ignore_index=True)
    .drop_duplicates(subset='Phishing_URL')
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Sort and Select Most Suspicious URLs
# Sorting without adding a new column
# NOTE(review): this orders the URLs in descending string order, which is not a
# suspiciousness score, so the head(TOP_N) cut is alphabetical - confirm intent.
sorted_combined_df = combined_df[combined_df['Phishing_URL'].apply(url_is_suspicious)].sort_values(by='Phishing_URL', ascending=False)

# Select the top suspicious URLs
top_suspicious_df = sorted_combined_df.head(TOP_N)

# Save the filtered dataset
top_suspicious_df.to_csv('/content/drive/MyDrive/Thesis - Undergraduate Ch./Dataset/Phishing/Cleaned/combined_phishing_url_5000.csv', index=False)

print(f"The suspicious phishing URLs have been saved! Total samples: {len(top_suspicious_df)}")


The 1500 most suspicious phishing URLs have been saved! Total samples: 2785
