In [53]:
url = "http://www.facebook.com"

In [54]:
import requests
from bs4 import BeautifulSoup
import joblib  # For loading your trained model
import re

# Load your pre-trained model and feature extraction functions here
# ...
# Initialize the features dictionary
features = {}

# Function to extract features from a URL
def extract_features(url):
    try:
        # Fetch HTML content
        response = requests.get(url)
        html_content = response.text

        # Parse HTML content
        parsed_content = BeautifulSoup(html_content, "html.parser")

        # Extract features based on your feature extraction functions
        # Example: Feature extraction logic for length_url
        length_url = len(url)

        # Add more feature extraction logic for other features...
        # Example:
        nb_dots = url.count('.')
        nb_hyphens = url.count('-')
        ip = int(bool(re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url)))  # Check for the presence of an IP address in the URL
        nb_dots = url.count('.')
        nb_hyphens = url.count('-')
        nb_at = url.count('@')
        nb_qm = url.count('?')
        nb_and = url.count('&')
        nb_or = url.count('|')
        nb_eq = url.count('=')
        nb_underscore = url.count('_')
        nb_tilde = url.count('~')
        nb_percent = url.count('%')
        nb_slash = url.count('/')
        nb_star = url.count('*')
        nb_colon = url.count(':')
        nb_comma = url.count(',')
        nb_semicolon = url.count(';')
        nb_dollar = url.count('$')
        nb_space = url.count(' ')
        nb_www = url.count('www')
        nb_com = url.count('.com')
        nb_dslash = url.count('//')
        http_in_path = int('http' in url)
        https_token = int('https' in url)
#         ratio_digits_url = sum(c.isdigit() for c in url) / length_url
#         ratio_digits_host = sum(c.isdigit() for c in parsed_content.netloc) / length_hostname

        # Return a dictionary of features
        features["length_url"] = length_url
        features["nb_dots"] = nb_dots
        features["nb_hyphens"] = nb_hyphens
        features["nb_at"] = nb_at
        features["nb_qm"] = nb_qm
        features["nb_and"] = nb_and
        features["nb_or"] = nb_or
        features["nb_eq"] = nb_eq
        features["nb_underscore"] = nb_underscore
        features["nb_tilde"] = nb_tilde
        features["nb_percent"] = nb_percent
        features["nb_slash"] = nb_slash
        features["nb_star"] = nb_star
        features["nb_colon"] = nb_colon
        features["nb_comma"] = nb_comma
        features["nb_semicolon"] = nb_semicolon
        features["nb_dollar"] = nb_dollar
        features["nb_space"] = nb_space
        features["nb_www"] = nb_www
        features["nb_com"] = nb_com
        features["nb_dslash"] = nb_dslash
        features["http_in_path"] = http_in_path
        features["https_token"] = https_token

        return features

    except Exception as e:
        print(f"Error: {str(e)}")
        return None

In [55]:
extract_features(url)
print(features)

{'length_url': 23, 'nb_dots': 2, 'nb_hyphens': 0, 'nb_at': 0, 'nb_qm': 0, 'nb_and': 0, 'nb_or': 0, 'nb_eq': 0, 'nb_underscore': 0, 'nb_tilde': 0, 'nb_percent': 0, 'nb_slash': 2, 'nb_star': 0, 'nb_colon': 1, 'nb_comma': 0, 'nb_semicolon': 0, 'nb_dollar': 0, 'nb_space': 0, 'nb_www': 1, 'nb_com': 1, 'nb_dslash': 1, 'http_in_path': 1, 'https_token': 0}


In [13]:
# pip install tldextract

In [56]:
import requests
from bs4 import BeautifulSoup
import re
import tldextract

# Fetch HTML content
response = requests.get(url)
html_content = response.text

# Parse HTML content
parsed_content = BeautifulSoup(html_content, "html.parser")

# Function to check for the presence of Punycode in the domain
def extract_punycode(url):
    extracted = tldextract.extract(url)
    return 1 if any(ord(char) > 127 for char in extracted.domain) else 0

# Function to extract the 'port' feature
def extract_port(url):
    return 1 if ":" in url else 0

# Function to extract the 'tld_in_path' feature
def extract_tld_in_path(url):
#     path = tldextract.extract(url).path
    return 1 if url.endswith((".com", ".org", ".net", ".edu", ".gov", ".mil")) else 0


# Function to extract the 'tld_in_subdomain' feature
def extract_tld_in_subdomain(url):
    subdomain = tldextract.extract(url).subdomain
    return 1 if tldextract.extract(subdomain).suffix else 0

# Function to extract the 'abnormal_subdomain' feature (example check)
def extract_abnormal_subdomain(url):
    subdomain = tldextract.extract(url).subdomain
    # Example check: Consider "www" as not abnormal, everything else as abnormal
    return 0 if subdomain == "www" else 1

# Function to extract the 'nb_subdomains' feature
def extract_nb_subdomains(url):
    subdomains = tldextract.extract(url).subdomain
    return len(subdomains.split('.'))

# Function to extract the 'prefix_suffix' feature
def extract_prefix_suffix(url):
    domain = tldextract.extract(url).domain
    return 1 if domain.startswith('-') or domain.endswith('-') else 0

# Function to extract the 'random_domain' feature (example check)
def extract_random_domain(url):
    domain = tldextract.extract(url).domain
    # Example check: Consider "example" as not random, everything else as random
    return 0 if domain == "example" else 1

# Function to extract the 'shortening_service' feature (example check)
def extract_shortening_service(url):
    # Example check: Detect common URL shortening services like "bit.ly"
    shortening_services = ["bit.ly", "t.co", "tinyurl"]
    return 1 if any(service in url for service in shortening_services) else 0

# Function to extract the 'nb_redirection' feature
def extract_nb_redirections(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    redirections = soup.find_all("meta", attrs={"http-equiv": "refresh"})
    return len(redirections)

# Function to extract the 'nb_external_redirection' feature (example check)
def extract_nb_external_redirections(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    redirections = soup.find_all("meta", attrs={"http-equiv": "refresh"})
    # Example check: Consider external redirections if they contain "http" or "https"
    return sum(1 for redirection in redirections if "http" in redirection["content"])

# Function to extract the 'length_words_raw' feature (example calculation)
def extract_length_words_raw(html_content):
    words = re.findall(r'\w+', html_content)
    return len(words)

# Function to extract the 'char_repeat' feature (example calculation)
def extract_char_repeat(html_content):
    repeated_chars = re.findall(r'((\w)\2{2,})', html_content)
    return len(repeated_chars)

# Function to extract the 'shortest_words_raw' feature (example calculation)
def extract_shortest_words_raw(html_content):
    words = re.findall(r'\w+', html_content)
    shortest_word = min(words, key=len, default='')
    return len(shortest_word)
# Function to extract the 'longest_words_raw' feature (example calculation)
def extract_longest_words_raw(html_content):
    words = re.findall(r'\w+', html_content)
    longest_word = max(words, key=len, default='')
    return len(longest_word)

# Function to extract the 'avg_words_raw' feature (example calculation)
def extract_avg_words_raw(html_content):
    words = re.findall(r'\w+', html_content)
    total_word_length = sum(len(word) for word in words)
    return total_word_length / len(words) if len(words) > 0 else 0

# Function to extract the 'phish_hints' feature
def extract_phish_hints(html_content):
    # Example check: Search for phishing-related keywords in the content
    phishing_keywords = ["phish", "fraud", "scam", "spoof"]
    return 1 if any(keyword in html_content for keyword in phishing_keywords) else 0

In [57]:
features['punycode'] = extract_punycode(url)
features['port'] = extract_port(url)
features['tld_in_path'] = extract_tld_in_path(url)
features['tld_in_subdomain'] = extract_tld_in_subdomain(url)
features['abnormal_subdomain'] = extract_abnormal_subdomain(url)
features['nb_subdomains'] = extract_nb_subdomains(url)
features['prefix_suffix'] = extract_prefix_suffix(url)
features['random_domain'] = extract_random_domain(url)
features['shortening_service'] = extract_shortening_service(url)
# features['path_extension'] = extract_path_extension(url)
features['nb_redirection'] = extract_nb_redirections(html_content)
features['nb_external_redirection'] = extract_nb_external_redirections(html_content)
features['length_words_raw'] = extract_length_words_raw(html_content)
features['char_repeat'] = extract_char_repeat(html_content)
features['shortest_words_raw'] = extract_shortest_words_raw(html_content)
features['longest_words_raw'] = extract_longest_words_raw(html_content)
features['avg_words_raw'] = extract_avg_words_raw(html_content)
features['phish_hints'] = extract_phish_hints(html_content)

In [58]:
# Function to extract the 'domain_in_brand' feature
def extract_domain_in_brand(url, brand_name):
    return 1 if brand_name in url else 0

# Function to extract the 'brand_in_subdomain' feature
def extract_brand_in_subdomain(url, brand_name):
    subdomain = tldextract.extract(url).subdomain
    return 1 if brand_name in subdomain else 0

# Function to extract the 'brand_in_path' feature
def extract_brand_in_path(url, brand_name):
    path = tldextract.extract(url).path
    return 1 if brand_name in path else 0

# Function to extract the 'suspecious_tld' feature (example check)
def extract_suspecious_tld(url):
    tld = tldextract.extract(url).suffix
    # Example check: Detect suspicious TLDs like ".tk", ".ml", ".ga", ".cf", ".gq"
    return 1 if tld in [".tk", ".ml", ".ga", ".cf", ".gq"] else 0

# Function to extract the 'statistical_report' feature (example check)
def extract_statistical_report(html_content):
    # Example check: Detect the presence of statistical reports in the content
    statistical_keywords = ["statistics", "report", "data", "analysis"]
    return 1 if any(keyword in html_content for keyword in statistical_keywords) else 0

# Function to extract the 'nb_hyperlinks' feature
def extract_nb_hyperlinks(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    hyperlinks = soup.find_all("a")
    return len(hyperlinks)

# Function to extract the 'ratio_intHyperlinks' feature
def extract_ratio_intHyperlinks(html_content):
    # Example calculation: Calculate the ratio of internal hyperlinks to total hyperlinks
    soup = BeautifulSoup(html_content, "html.parser")
    hyperlinks = soup.find_all("a")
    int_hyperlinks = sum(1 for link in hyperlinks if url_is_internal(link["href"]))
    return int_hyperlinks / len(hyperlinks) if len(hyperlinks) > 0 else 0

# Function to extract the 'ratio_extHyperlinks' feature
def extract_ratio_extHyperlinks(html_content):
    # Example calculation: Calculate the ratio of external hyperlinks to total hyperlinks
    soup = BeautifulSoup(html_content, "html.parser")
    hyperlinks = soup.find_all("a")
    ext_hyperlinks = sum(1 for link in hyperlinks if not url_is_internal(link["href"]))
    return ext_hyperlinks / len(hyperlinks) if len(hyperlinks) > 0 else 0

# Function to extract the 'ratio_nullHyperlinks' feature
def extract_ratio_nullHyperlinks(html_content):
    # Example calculation: Calculate the ratio of hyperlinks with null (empty) href attributes
    soup = BeautifulSoup(html_content, "html.parser")
    hyperlinks = soup.find_all("a")
    null_hyperlinks = sum(1 for link in hyperlinks if not link.has_attr("href") or not link["href"])
    return null_hyperlinks / len(hyperlinks) if len(hyperlinks) > 0 else 0

# Function to extract the 'nb_extCSS' feature
def extract_nb_extCSS(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    ext_css = soup.find_all("link", attrs={"rel": "stylesheet"})
    return len(ext_css)

# ... Add functions for the remaining features similarly ...

# Add the extracted features to the dictionary
features['domain_in_brand'] = extract_domain_in_brand(url, "example_brand")
features['brand_in_subdomain'] = extract_brand_in_subdomain(url, "example_brand")
# features['brand_in_path'] = extract_brand_in_path(url, "example_brand")
features['suspecious_tld'] = extract_suspecious_tld(url)
features['statistical_report'] = extract_statistical_report(html_content)
features['nb_hyperlinks'] = extract_nb_hyperlinks(html_content)
# features['ratio_intHyperlinks'] = extract_ratio_intHyperlinks(html_content)
# features['ratio_extHyperlinks'] = extract_ratio_extHyperlinks(html_content)
# features['ratio_nullHyperlinks'] = extract_ratio_nullHyperlinks(html_content)
features['nb_extCSS'] = extract_nb_extCSS(html_content)

# Print the extracted features
for feature_name, feature_value in features.items():
    print(f"{feature_name}: {feature_value}")

# Extract the remaining features using similar functions...

length_url: 23
nb_dots: 2
nb_hyphens: 0
nb_at: 0
nb_qm: 0
nb_and: 0
nb_or: 0
nb_eq: 0
nb_underscore: 0
nb_tilde: 0
nb_percent: 0
nb_slash: 2
nb_star: 0
nb_colon: 1
nb_comma: 0
nb_semicolon: 0
nb_dollar: 0
nb_space: 0
nb_www: 1
nb_com: 1
nb_dslash: 1
http_in_path: 1
https_token: 0
punycode: 0
port: 1
tld_in_path: 1
tld_in_subdomain: 0
abnormal_subdomain: 0
nb_subdomains: 1
prefix_suffix: 0
random_domain: 1
shortening_service: 0
nb_redirection: 1
nb_external_redirection: 0
length_words_raw: 7152
char_repeat: 105
shortest_words_raw: 1
longest_words_raw: 104
avg_words_raw: 5.862975391498882
phish_hints: 0
domain_in_brand: 0
brand_in_subdomain: 0
suspecious_tld: 0
statistical_report: 1
nb_hyperlinks: 46
nb_extCSS: 6


In [59]:
import statistics  # For computing mode
import pandas as pd  # Assuming you have a DataFrame with your dataset

# Create a DataFrame with your dataset (replace this with your actual dataset)
data = pd.read_csv("dataset_phishing.csv")
# Define a function to compute mean for numerical features and mode for categorical features
def compute_mean_or_mode(feature_values):
    if pd.api.types.is_numeric_dtype(feature_values):
        return statistics.mean(feature_values)
    else:
        return statistics.mode(feature_values)

# Create a dictionary 'features1' with computed mean/mode values
features1 = {column: compute_mean_or_mode(data[column]) for column in data.columns}
features1 = {key: features.get(key, value) for key, value in features1.items()}

# # Print the updated 'features' dictionary
print(features1)

{'url': 'http://e710z0ear.du.r.appspot.com/c:/users/user/downlo', 'length_url': 23, 'length_hostname': 21.090288713910763, 'ip': 0.15056867891513562, 'nb_dots': 2, 'nb_hyphens': 0, 'nb_at': 0, 'nb_qm': 0, 'nb_and': 0, 'nb_or': 0, 'nb_eq': 0, 'nb_underscore': 0, 'nb_tilde': 0, 'nb_percent': 0, 'nb_slash': 2, 'nb_star': 0, 'nb_colon': 1, 'nb_comma': 0, 'nb_semicolumn': 0.06229221347331584, 'nb_dollar': 0, 'nb_space': 0, 'nb_www': 1, 'nb_com': 1, 'nb_dslash': 1, 'http_in_path': 1, 'https_token': 0, 'ratio_digits_url': 0.0531373016455818, 'ratio_digits_host': 0.02502370947016623, 'punycode': 0, 'port': 1, 'tld_in_path': 1, 'tld_in_subdomain': 0, 'abnormal_subdomain': 0, 'nb_subdomains': 1, 'prefix_suffix': 0, 'random_domain': 1, 'shortening_service': 0, 'path_extension': 0.00017497812773403323, 'nb_redirection': 1, 'nb_external_redirection': 0, 'length_words_raw': 7152, 'char_repeat': 105, 'shortest_words_raw': 1, 'shortest_word_host': 5.019772528433946, 'shortest_word_path': 2.39895013123

In [60]:
features1.pop('url')
features1.pop('status')
print(features1)

{'length_url': 23, 'length_hostname': 21.090288713910763, 'ip': 0.15056867891513562, 'nb_dots': 2, 'nb_hyphens': 0, 'nb_at': 0, 'nb_qm': 0, 'nb_and': 0, 'nb_or': 0, 'nb_eq': 0, 'nb_underscore': 0, 'nb_tilde': 0, 'nb_percent': 0, 'nb_slash': 2, 'nb_star': 0, 'nb_colon': 1, 'nb_comma': 0, 'nb_semicolumn': 0.06229221347331584, 'nb_dollar': 0, 'nb_space': 0, 'nb_www': 1, 'nb_com': 1, 'nb_dslash': 1, 'http_in_path': 1, 'https_token': 0, 'ratio_digits_url': 0.0531373016455818, 'ratio_digits_host': 0.02502370947016623, 'punycode': 0, 'port': 1, 'tld_in_path': 1, 'tld_in_subdomain': 0, 'abnormal_subdomain': 0, 'nb_subdomains': 1, 'prefix_suffix': 0, 'random_domain': 1, 'shortening_service': 0, 'path_extension': 0.00017497812773403323, 'nb_redirection': 1, 'nb_external_redirection': 0, 'length_words_raw': 7152, 'char_repeat': 105, 'shortest_words_raw': 1, 'shortest_word_host': 5.019772528433946, 'shortest_word_path': 2.3989501312335957, 'longest_words_raw': 104, 'longest_word_host': 10.46797900

In [61]:
import pickle
import numpy as np
input_features = features1
# Load the pre-trained model from the pickle file
with open('trained_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Prepare the input data as a NumPy array
input_data = np.array(list(input_features.values())).reshape(1, -1)  # Assuming 1 sample
# print(input_data)
# Make predictions using the loaded model
predicted_class = loaded_model.predict(input_data)

# Interpret the predicted class (e.g., 0 for not phishing, 1 for phishing)
if predicted_class == 0:
    print("The URL maybe phishing url.")
else:
    print("The URL not phishing url.")

The URL not phishing url.
