## Git Clone for Dataset & Pickle Files

In [None]:
# Clone the GitHub repository that contains the dataset and the pickled models
!git clone https://github.com/Yogesh0903/Phishing_URL.git
!ls Phishing_URL/Model_7  # We checked if models exist


Cloning into 'Phishing_URL'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (36/36), done.[K
remote: Total 38 (delta 8), reused 3 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (38/38), 31.44 MiB | 9.85 MiB/s, done.
Resolving deltas: 100% (8/8), done.
decision_tree.pkl	 logistic_regression.pkl  neural_network.pkl  svm.pkl
k-nearest_neighbors.pkl  naive_bayes.pkl	  random_forest.pkl


## Model

https://www.somaiya.edu/en/


https://www.meesho.com/

https://github.com/kamranahmedse

https://www.jnj.com/

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import pickle
import numpy as np
import statistics
import time
import os

def extract_features(url):
    """Download ``url`` and derive the seven page-level features.

    The returned dict keeps a fixed key order (LineOfCode, LargestLineLength,
    NoOfSelfRef, IsHTTPS, NoOfExternalRef, NoOfImage, NoOfJS); ``predict_url``
    feeds ``features.values()`` to the models in that order.

    The page is requested up to three times (5 s timeout, 2 s pause after each
    non-200 answer). If no attempt returns HTTP 200, or ``requests`` raises,
    a message is printed and every page-derived feature stays 0.
    """
    features = dict.fromkeys(
        ('LineOfCode', 'LargestLineLength', 'NoOfSelfRef', 'IsHTTPS',
         'NoOfExternalRef', 'NoOfImage', 'NoOfJS'),
        0,
    )
    # Known from the URL alone, so it is filled in before any network access.
    features['IsHTTPS'] = int(url.startswith("https"))

    try:
        attempts_left = 3
        while True:
            response = requests.get(url, timeout=5)
            if response.status_code == 200:
                break
            time.sleep(2)
            attempts_left -= 1
            if attempts_left == 0:
                print(f"Error fetching the webpage: {response.status_code} {response.reason}")
                return features

        response.raise_for_status()

        html = response.text
        line_lengths = [len(row) for row in html.split('\n')]
        features['LineOfCode'] = len(line_lengths)
        features['LargestLineLength'] = max(line_lengths)

        soup = BeautifulSoup(html, 'html.parser')
        page_host = urlparse(url).netloc

        # An anchor is a self-reference when its resolved host equals the page host.
        for anchor in soup.find_all('a', href=True):
            target_host = urlparse(urljoin(url, anchor['href'])).netloc
            bucket = 'NoOfSelfRef' if target_host == page_host else 'NoOfExternalRef'
            features[bucket] += 1

        features['NoOfImage'] = len(soup.find_all('img'))
        features['NoOfJS'] = len(soup.find_all('script'))

    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")

    return features

def load_models():
    """Unpickle the seven classifiers stored under ``Phishing_URL/Model_7/``.

    Each display name maps to a file by lower-casing it and replacing spaces
    with underscores (e.g. "Naive Bayes" -> ``naive_bayes.pkl``). Missing or
    unreadable files are reported with ``print`` and skipped.

    Returns:
        dict mapping display name -> unpickled object, for the files that loaded.
    """
    # NOTE: pickle.load can execute arbitrary code; only use model files from a trusted source.
    model_folder = "Phishing_URL/Model_7/"
    display_names = ("Logistic Regression", "Decision Tree", "Random Forest",
                     "K-Nearest Neighbors", "Naive Bayes", "Neural Network", "SVM")

    loaded = {}
    for display_name in display_names:
        stem = display_name.replace(' ', '_').lower()
        filename = f"{model_folder}{stem}.pkl"

        if not os.path.exists(filename):
            print(f"Warning: {filename} not found. We skipped this model.")
            continue

        try:
            with open(filename, 'rb') as handle:
                loaded[display_name] = pickle.load(handle)
        except Exception as e:
            print(f"Error loading {filename}: {e}")

    return loaded

def predict_url(url, models):
    """Print every model's verdict for ``url`` and the majority decision.

    The feature dict from ``extract_features`` is flattened, in key order, into
    a single-row array and passed to each model's ``predict``. A truthy label is
    reported as 'Legitimate', a falsy one as 'Phishing'. Models that raise are
    reported and left out of the vote. Nothing is returned.

    Note: when the page cannot be fetched, ``extract_features`` still returns
    its zero-filled defaults, so the models vote on that placeholder row.
    """
    extracted = extract_features(url)
    row = np.array(list(extracted.values())).reshape(1, -1)

    votes = []
    for name, model in models.items():
        try:
            vote = model.predict(row)[0]
            votes.append(vote)
            label = 'Legitimate' if vote else 'Phishing'
            print(f"{name} Prediction: {label}")
        except Exception as e:
            print(f"Error predicting with {name}: {e}")

    if not votes:
        print("No models available for prediction.")
        return

    mean_vote = np.mean(votes)
    majority = statistics.mode(votes)
    verdict = 'Legitimate' if majority == 1 else 'Phishing'

    print(f"\nAverage Prediction: {mean_vote:.4f}")
    print(f"Mode Prediction: {majority}")
    print(f"Final Prediction (based on mode): {verdict}")

# Load the pickled classifiers once so they can be reused for the prediction below.
models = load_models()

# Prompt for the URL to classify, then print each model's verdict and the majority decision.
url = input("Enter URL to classify: ")
predict_url(url, models)


Enter URL to classify: https://www.somaiya.edu/en/
Logistic Regression Prediction: Legitimate
Decision Tree Prediction: Legitimate
Random Forest Prediction: Legitimate
K-Nearest Neighbors Prediction: Legitimate
Naive Bayes Prediction: Legitimate
Neural Network Prediction: Legitimate
SVM Prediction: Legitimate

Average Prediction: 1.0000
Mode Prediction: 1
Final Prediction (based on mode): Legitimate




## Rough Code

### Working Rough

#### 12 Features

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import pickle
import numpy as np
import statistics
import time
import os

def extract_features(url):
    """Download ``url`` and derive the twelve features used by the Model_12 classifiers.

    The returned dict keeps a fixed key order (LineOfCode, LargestLineLength,
    NoOfSelfRef, IsHTTPS, NoOfExternalRef, NoOfImage, NoOfJS, NoOfCSS,
    HasSocialNet, DigitRatioInURL, NoOfSubDomain, HasCopyrightInfo);
    ``predict_url`` feeds ``features.values()`` to the models in that order.

    IsHTTPS, DigitRatioInURL and NoOfSubDomain come from the URL string itself.
    The remaining features come from the downloaded page, which is requested up
    to three times (5 s timeout, 2 s pause after each non-200 answer). If no
    attempt returns HTTP 200, or ``requests`` raises, a message is printed and
    the page-derived features stay 0.

    Args:
        url: Address of the page to analyse. An empty string is tolerated
            (DigitRatioInURL is then 0.0 instead of raising ZeroDivisionError).

    Returns:
        dict of feature name -> numeric value.
    """
    features = {
        'LineOfCode': 0,
        'LargestLineLength': 0,
        'NoOfSelfRef': 0,
        'IsHTTPS': 1 if url.startswith("https") else 0,
        'NoOfExternalRef': 0,
        'NoOfImage': 0,
        'NoOfJS': 0,
        'NoOfCSS': 0,
        'HasSocialNet': 0,
        # Guard the division so an empty URL does not crash before the fetch is attempted.
        'DigitRatioInURL': (sum(c.isdigit() for c in url) / len(url)) if url else 0.0,
        # Counts the dots in the host part of the URL.
        'NoOfSubDomain': urlparse(url).netloc.count('.'),
        'HasCopyrightInfo': 0
    }

    try:
        for _ in range(3):  # Retry up to 3 times
            response = requests.get(url, timeout=5)
            if response.status_code == 200:
                break
            time.sleep(2)
        else:
            # All three attempts returned a non-200 status.
            print(f"Error fetching the webpage: {response.status_code} {response.reason}")
            return features

        response.raise_for_status()

        lines = response.text.split('\n')
        features['LineOfCode'] = len(lines)
        features['LargestLineLength'] = max(len(line) for line in lines)

        soup = BeautifulSoup(response.text, 'html.parser')
        domain = urlparse(url).netloc

        social_sites = ['facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com', 'youtube.com']
        # Keywords are compared against lower-cased page text, so they must be
        # lower-case themselves. The copyright sign is written as an escape so it
        # cannot be mis-encoded again (the previous literal was mojibake and never matched).
        copyright_keywords = ['copyright', '\u00a9', 'all rights reserved']

        for link in soup.find_all('a', href=True):
            href = link['href']
            full_url = urljoin(url, href)
            if urlparse(full_url).netloc == domain:
                features['NoOfSelfRef'] += 1
            else:
                features['NoOfExternalRef'] += 1
                # Only external links are checked for social-network hosts.
                if any(site in full_url for site in social_sites):
                    features['HasSocialNet'] = 1

        features['NoOfImage'] = len(soup.find_all('img'))
        features['NoOfJS'] = len(soup.find_all('script'))
        features['NoOfCSS'] = len(soup.find_all('link', {'rel': 'stylesheet'}))

        page_text = soup.get_text().lower()
        features['HasCopyrightInfo'] = 1 if any(word in page_text for word in copyright_keywords) else 0

    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")

    return features

def load_models():
    """Unpickle the seven classifiers stored under ``Phishing_URL/Model_12/``.

    File names are derived from the display names by lower-casing them and
    replacing spaces with underscores. Files that are absent or fail to load
    are reported with ``print`` and left out of the result.

    Returns:
        dict mapping display name -> unpickled object.
    """
    # NOTE: pickle.load can execute arbitrary code; only use model files from a trusted source.
    model_folder = "Phishing_URL/Model_12/"
    wanted = ("Logistic Regression", "Decision Tree", "Random Forest",
              "K-Nearest Neighbors", "Naive Bayes", "Neural Network", "SVM")

    available = {}
    for label in wanted:
        filename = model_folder + label.replace(' ', '_').lower() + ".pkl"

        if not os.path.exists(filename):
            print(f"Warning: {filename} not found. Skipping this model.")
            continue

        try:
            with open(filename, 'rb') as source:
                available[label] = pickle.load(source)
        except Exception as e:
            print(f"Error loading {filename}: {e}")

    return available

def predict_url(url, models):
    """Classify ``url`` with every loaded model and print the majority decision.

    The values of the dict returned by ``extract_features`` are passed, in key
    order, as a single-row array to each model's ``predict``. A truthy label is
    printed as 'Legitimate', a falsy one as 'Phishing'; a model that raises is
    reported and excluded from the vote. The function only prints.
    """
    feature_map = extract_features(url)
    sample = np.array(list(feature_map.values())).reshape(1, -1)

    collected = []
    for model_name, estimator in models.items():
        try:
            outcome = estimator.predict(sample)[0]
            collected.append(outcome)
            if outcome:
                label = 'Legitimate'
            else:
                label = 'Phishing'
            print(f"{model_name} Prediction: {label}")
        except Exception as e:
            print(f"Error predicting with {model_name}: {e}")

    if not collected:
        print("No models available for prediction.")
        return

    average = np.mean(collected)
    most_common = statistics.mode(collected)
    if most_common == 1:
        decision = 'Legitimate'
    else:
        decision = 'Phishing'

    print(f"\nAverage Prediction: {average:.4f}")
    print(f"Mode Prediction: {most_common}")
    print(f"Final Prediction (based on mode): {decision}")

# Load the pickled Model_12 classifiers once so they can be reused for the prediction below.
models = load_models()

# Prompt for the URL to classify, then print each model's verdict and the majority decision.
url = input("Enter URL to classify: ")
predict_url(url, models)


Enter URL to classify: https://www.netflix.com/browse
Logistic Regression Prediction: Phishing
Decision Tree Prediction: Legitimate
Random Forest Prediction: Phishing
K-Nearest Neighbors Prediction: Legitimate
Naive Bayes Prediction: Legitimate
Neural Network Prediction: Phishing
SVM Prediction: Phishing

Average Prediction: 0.4286
Mode Prediction: 0
Final Prediction (based on mode): Phishing




### Naive Bayes Model

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import pickle
import numpy as np
import statistics
import time
import os

def extract_features(url):
    """Download ``url`` and compute the seven features expected by the Model_7 classifiers.

    Keys are inserted in the order LineOfCode, LargestLineLength, NoOfSelfRef,
    IsHTTPS, NoOfExternalRef, NoOfImage, NoOfJS; ``predict_url`` relies on that
    order when it builds the model input.

    Up to three GET requests are made (5 s timeout each, 2 s pause after every
    non-200 reply). When none returns HTTP 200, or ``requests`` raises, an error
    line is printed and the page-derived features remain 0.
    """
    feature_names = ('LineOfCode', 'LargestLineLength', 'NoOfSelfRef', 'IsHTTPS',
                     'NoOfExternalRef', 'NoOfImage', 'NoOfJS')
    features = {name: 0 for name in feature_names}
    if url.startswith("https"):
        features['IsHTTPS'] = 1

    try:
        for _attempt in range(3):
            response = requests.get(url, timeout=5)
            succeeded = response.status_code == 200
            if succeeded:
                break
            time.sleep(2)
        if not succeeded:
            print(f"Error fetching the webpage: {response.status_code} {response.reason}")
            return features

        response.raise_for_status()

        body = response.text
        rows = body.split('\n')
        features['LineOfCode'] = len(rows)
        features['LargestLineLength'] = len(max(rows, key=len))

        soup = BeautifulSoup(body, 'html.parser')
        own_host = urlparse(url).netloc

        # Resolve every anchor against the page URL, then split by host.
        link_hosts = [urlparse(urljoin(url, tag['href'])).netloc
                      for tag in soup.find_all('a', href=True)]
        same_site = sum(host == own_host for host in link_hosts)
        features['NoOfSelfRef'] = same_site
        features['NoOfExternalRef'] = len(link_hosts) - same_site

        features['NoOfImage'] = len(soup.find_all('img'))
        features['NoOfJS'] = len(soup.find_all('script'))

    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")

    return features

def load_models():
    """Load the pickled classifiers found in ``Phishing_URL/Model_7/``.

    All seven model files are attempted. A file that does not exist, or that
    raises while being unpickled, is reported with ``print`` and skipped.

    Returns:
        dict mapping display name (e.g. "Naive Bayes") -> unpickled object.
    """
    # NOTE: pickle.load can execute arbitrary code; only use model files from a trusted source.
    model_folder = "Phishing_URL/Model_7/"
    model_names = ("Logistic Regression", "Decision Tree", "Random Forest",
                   "K-Nearest Neighbors", "Naive Bayes", "Neural Network", "SVM")

    # Display name -> expected pickle path, in the same order as model_names.
    paths = {
        name: f"{model_folder}{name.replace(' ', '_').lower()}.pkl"
        for name in model_names
    }

    models = {}
    for name, filename in paths.items():
        if os.path.exists(filename):
            try:
                with open(filename, 'rb') as stream:
                    models[name] = pickle.load(stream)
            except Exception as e:
                print(f"Error loading {filename}: {e}")
        else:
            print(f"Warning: {filename} not found. We skipped this model.")

    return models

def predict_url(url, models):
    """Print the Naive Bayes verdict for ``url``.

    Features are extracted first (even when the model is missing), flattened in
    key order into a single-row array and passed to ``models["Naive Bayes"]``.
    A truthy label prints as 'Legitimate', a falsy one as 'Phishing'. Errors
    raised by the model are printed instead of propagated. Returns nothing.
    """
    feature_row = np.array(list(extract_features(url).values())).reshape(1, -1)

    if "Naive Bayes" not in models:
        print("Naive Bayes model not found.")
        return

    try:
        verdict = models["Naive Bayes"].predict(feature_row)[0]
        label = 'Legitimate' if verdict else 'Phishing'
        print(f"Naive Bayes Prediction: {label}")
    except Exception as e:
        print(f"Error predicting with Naive Bayes: {e}")


# Load the pickled Model_7 classifiers; only the "Naive Bayes" entry is used by predict_url here.
models = load_models()

# Prompt for the URL to classify and print the Naive Bayes verdict.
url = input("Enter URL to classify: ")
predict_url(url, models)


Enter URL to classify: https://meet.google.com/btu-wpjo-jie
Naive Bayes Prediction: Phishing




### Others

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import pickle
import numpy as np
import statistics
import time

def extract_features(url):
    """Download ``url`` and derive the seven page-level features.

    The returned dict keeps a fixed key order (LineOfCode, LargestLineLength,
    NoOfSelfRef, IsHTTPS, NoOfExternalRef, NoOfImage, NoOfJS); ``predict_url``
    feeds ``features.values()`` to the models in that order.

    The page is requested up to three times with a 2 s pause after each non-200
    answer. Every request carries a 5 s timeout so an unresponsive host cannot
    block the cell forever; a timeout surfaces as a ``requests.RequestException``
    and is handled like any other fetch error. If no attempt returns HTTP 200,
    or ``requests`` raises, a message is printed and the page-derived features
    stay 0.

    Args:
        url: Address of the page to analyse.

    Returns:
        dict of feature name -> int.
    """
    features = {
        'LineOfCode': 0,
        'LargestLineLength': 0,
        'NoOfSelfRef': 0,
        'IsHTTPS': 1 if url.startswith("https") else 0,
        'NoOfExternalRef': 0,
        'NoOfImage': 0,
        'NoOfJS': 0
    }

    try:
        for _ in range(3):  # Retry up to 3 times
            # Without a timeout requests waits indefinitely on a silent server.
            response = requests.get(url, timeout=5)
            if response.status_code == 200:
                break
            time.sleep(2)
        else:
            # All three attempts returned a non-200 status.
            print(f"Error fetching the webpage: {response.status_code} {response.reason}")
            return features

        response.raise_for_status()

        lines = response.text.split('\n')
        features['LineOfCode'] = len(lines)
        features['LargestLineLength'] = max(len(line) for line in lines)

        soup = BeautifulSoup(response.text, 'html.parser')
        domain = urlparse(url).netloc

        # An anchor is a self-reference when its resolved host equals the page host.
        for link in soup.find_all('a', href=True):
            href = link['href']
            full_url = urljoin(url, href)
            if urlparse(full_url).netloc == domain:
                features['NoOfSelfRef'] += 1
            else:
                features['NoOfExternalRef'] += 1

        features['NoOfImage'] = len(soup.find_all('img'))
        features['NoOfJS'] = len(soup.find_all('script'))

    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")

    return features

def load_models():
    """Unpickle the seven classifiers from ``.pkl`` files in the current working directory.

    File names are derived from the display names by lower-casing them and
    replacing spaces with underscores (e.g. "Naive Bayes" -> ``naive_bayes.pkl``).
    A missing file prints a warning; any other failure while opening or
    unpickling (corrupt file, incompatible library version, ...) prints an error.
    In both cases the model is skipped so the remaining ones still load.

    Returns:
        dict mapping display name -> unpickled object, for the files that loaded.
    """
    models = {}
    model_names = ["Logistic Regression", "Decision Tree", "Random Forest", "K-Nearest Neighbors", "Naive Bayes", "Neural Network", "SVM"]

    for name in model_names:
        filename = f"{name.replace(' ', '_').lower()}.pkl"
        try:
            with open(filename, 'rb') as f:
                # NOTE: pickle.load can execute arbitrary code; only use trusted model files.
                models[name] = pickle.load(f)
        except FileNotFoundError:
            print(f"Warning: {filename} not found. Skipping this model.")
        except Exception as e:
            # One unreadable pickle must not prevent the other models from loading.
            print(f"Error loading {filename}: {e}")

    return models

def predict_url(url, models):
    """Run every model on the features of ``url`` and print the combined result.

    The dict from ``extract_features`` is flattened in key order into a
    single-row array. Each model's first predicted label is printed
    ('Legitimate' when truthy, 'Phishing' otherwise); models that raise are
    reported and ignored. The mean and the mode of the collected labels are
    printed, and the mode decides the final verdict. Returns nothing.
    """
    page_features = extract_features(url)
    model_input = np.array(list(page_features.values())).reshape(1, -1)

    labels = []
    for name, clf in models.items():
        try:
            predicted = clf.predict(model_input)[0]
            labels.append(predicted)
            readable = 'Legitimate' if predicted else 'Phishing'
            print(f"{name} Prediction: {readable}")
        except Exception as e:
            print(f"Error predicting with {name}: {e}")

    if len(labels) == 0:
        print("No models available for prediction.")
        return

    avg_label = np.mean(labels)
    mode_label = statistics.mode(labels)
    outcome = 'Legitimate' if mode_label == 1 else 'Phishing'

    print(f"\nAverage Prediction: {avg_label:.4f}")
    print(f"Mode Prediction: {mode_label}")
    print(f"Final Prediction (based on mode): {outcome}")

# Prompt for the URL first, then load the pickled models from the working directory
# and print each model's verdict together with the majority decision.
url = input("Enter URL to classify: ")
models = load_models()
predict_url(url, models)


Enter URL to classify: https://www.google.com/search?q=google&rlz=1C1RXQR_enIN979IN980&oq=google+&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIMCAEQIxgnGIAEGIoFMhIIAhAAGEMYgwEYsQMYgAQYigUyDQgDEAAYkQIYgAQYigUyEggEEAAYQxiDARixAxiABBiKBTITCAUQABiDARiRAhixAxiABBiKBTIPCAYQABhDGLEDGIAEGIoFMhIIBxAAGEMYgwEYsQMYgAQYigUyBwgIEAAYjwIyBwgJEAAYjwLSAQoxMDgxN2owajE1qAIJsAIB8QXe6bonfUVCnQ&sourceid=chrome&ie=UTF-8
Logistic Regression Prediction: Phishing
Decision Tree Prediction: Phishing
Random Forest Prediction: Phishing
K-Nearest Neighbors Prediction: Phishing
Naive Bayes Prediction: Phishing
Neural Network Prediction: Phishing
SVM Prediction: Phishing

Average Prediction: 0.0000
Mode Prediction: 0
Final Prediction (based on mode): Phishing




In [None]:
# import requests
# from bs4 import BeautifulSoup
# from urllib.parse import urlparse, urljoin
# import pickle
# import numpy as np
# import re

# def extract_features(url):
#     features = {
#         'LineOfCode': 0,
#         'LargestLineLength': 0,
#         'NoOfSelfRef': 0,
#         'IsHTTPS': 1 if url.startswith("https") else 0,
#         'NoOfExternalRef': 0,
#         'NoOfImage': 0,
#         'NoOfJS': 0
#     }

#     try:
#         response = requests.get(url)
#         response.raise_for_status()

#         lines = response.text.split('\n')
#         features['LineOfCode'] = len(lines)
#         features['LargestLineLength'] = max(len(line) for line in lines)

#         soup = BeautifulSoup(response.text, 'html.parser')
#         domain = urlparse(url).netloc

#         for link in soup.find_all('a', href=True):
#             href = link['href']
#             full_url = urljoin(url, href)
#             if urlparse(full_url).netloc == domain:
#                 features['NoOfSelfRef'] += 1
#             else:
#                 features['NoOfExternalRef'] += 1

#         features['NoOfImage'] = len(soup.find_all('img'))
#         features['NoOfJS'] = len(soup.find_all('script'))

#     except requests.RequestException as e:
#         print(f"Error fetching the webpage: {e}")

#     return features

# # Load models from pickle files
# def load_models():
#     models = {}
#     model_names = ["logistic_regression.pkl", "decision_tree.pkl", "random_forest.pkl","k-nearest_neighbors.pkl","naive_bayes.pkl", "neural_network.pkl", "svm.pkl"]

#     for model_name in model_names:
#         try:
#             with open(model_name, 'rb') as f:
#                 models[model_name] = pickle.load(f)
#         except Exception as e:
#             print(f"Error loading {model_name}: {e}")

#     return models

# # Get user input
# url = input("Enter the URL: ")
# features = extract_features(url)
# feature_values = np.array([list(features.values())]).reshape(1, -1)

# models = load_models()

# # Make predictions
# for name, model in models.items():
#     prediction = model.predict(feature_values)
#     print(f"{name}: {'Phishing' if prediction[0] == 0 else 'Legitimate'}")


In [None]:
# import requests
# from bs4 import BeautifulSoup
# from urllib.parse import urlparse, urljoin
# import pickle
# import numpy as np
# import statistics
# import time

# def extract_features(url):
#     features = {
#         'LineOfCode': 0,
#         'LargestLineLength': 0,
#         'NoOfSelfRef': 0,
#         'IsHTTPS': 1 if url.startswith("https") else 0,
#         'NoOfExternalRef': 0,
#         'NoOfImage': 0,
#         'NoOfJS': 0
#     }

#     try:
#         for _ in range(3):  # Retry up to 3 times
#             response = requests.get(url)
#             if response.status_code == 200:
#                 break
#             time.sleep(2)
#         else:
#             print(f"Error fetching the webpage: {response.status_code} {response.reason}")
#             return features

#         response.raise_for_status()

#         lines = response.text.split('\n')
#         features['LineOfCode'] = len(lines)
#         features['LargestLineLength'] = max(len(line) for line in lines)

#         soup = BeautifulSoup(response.text, 'html.parser')
#         domain = urlparse(url).netloc

#         for link in soup.find_all('a', href=True):
#             href = link['href']
#             full_url = urljoin(url, href)
#             if urlparse(full_url).netloc == domain:
#                 features['NoOfSelfRef'] += 1
#             else:
#                 features['NoOfExternalRef'] += 1

#         features['NoOfImage'] = len(soup.find_all('img'))
#         features['NoOfJS'] = len(soup.find_all('script'))

#     except requests.RequestException as e:
#         print(f"Error fetching the webpage: {e}")

#     return features

# def load_models():
#     models = {}
#     model_names = ["Logistic Regression", "Decision Tree", "Random Forest", "K-Nearest Neighbors", "Naive Bayes", "Neural Network", "SVM"]

#     for name in model_names:
#         filename = f"{name.replace(' ', '_').lower()}.pkl"
#         try:
#             with open(filename, 'rb') as f:
#                 models[name] = pickle.load(f)
#         except FileNotFoundError:
#             print(f"Warning: {filename} not found. Skipping this model.")

#     return models

# def predict_url(url, models):
#     features = extract_features(url)

#     # Display extracted features
#     print("\nExtracted Features:")
#     for key, value in features.items():
#         print(f"{key}: {value}")

#     feature_values = np.array(list(features.values())).reshape(1, -1)

#     predictions = []
#     for name, model in models.items():
#         try:
#             pred = model.predict(feature_values)[0]
#             predictions.append(pred)
#             print(f"{name} Prediction: {'Legitimate' if pred else 'Phishing'}")
#         except Exception as e:
#             print(f"Error predicting with {name}: {e}")

#     if predictions:
#         avg_prediction = np.mean(predictions)
#         mode_prediction = statistics.mode(predictions)

#         final_prediction = 'Legitimate' if mode_prediction == 1 else 'Phishing'
#         print(f"\nAverage Prediction: {avg_prediction:.4f}")
#         print(f"Mode Prediction: {mode_prediction}")
#         print(f"Final Prediction (based on mode): {final_prediction}")
#     else:
#         print("No models available for prediction.")

# # Get URL from user and predict
# url = input("Enter URL to classify: ")
# models = load_models()
# predict_url(url, models)


Enter URL to classify: https://www.amazon.in/

Extracted Features:
LineOfCode: 3011
LargestLineLength: 7131
NoOfSelfRef: 86
IsHTTPS: 1
NoOfExternalRef: 17
NoOfImage: 18
NoOfJS: 60
Logistic Regression Prediction: Legitimate
Decision Tree Prediction: Legitimate
Random Forest Prediction: Legitimate
K-Nearest Neighbors Prediction: Legitimate
Naive Bayes Prediction: Legitimate
Neural Network Prediction: Legitimate
SVM Prediction: Legitimate

Average Prediction: 1.0000
Mode Prediction: 1
Final Prediction (based on mode): Legitimate


