In [43]:
import pandas as pd 
import numpy as np
from thefuzz import fuzz,process
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
# Read only the "Domain" column and lower-case it in one chained expression,
# so `df` is always rebuilt from the file in a single step.
df = pd.read_csv(
    '5.urldata.csv', usecols=["Domain"], low_memory=False
).assign(Domain=lambda frame: frame["Domain"].str.lower())

In [3]:
def find_best_match(phishing_domain, legit_domains):
    """Return the candidate in ``legit_domains`` most similar to ``phishing_domain``.

    Candidates are scored with ``fuzz.ratio`` through ``process.extractOne``;
    whatever that call yields is handed back unchanged. The caller in this
    notebook reads element 0 as the matched domain and element 1 as its score.
    """
    return process.extractOne(phishing_domain, legit_domains, scorer=fuzz.ratio)

# Query to look up: a deliberately misspelled domain
phishing_input = "gogle.orf"

# Candidate pool: the underlying array of the "Domain" column.
# NOTE(review): `df` was loaded without filtering on "Label", so this pool holds
# every domain in the file, not only legitimate ones; confirm that is intended.
legit_domains = df["Domain"].values

# Closest candidate for the query, as returned by find_best_match
best_match = find_best_match(phishing_input, legit_domains)

# Report the query followed by its best match and score
print(f"Phishing Domain: {phishing_input}")
print(f"Suggested Legitimate Domain: {best_match[0]} (Score: {best_match[1]})")

Phishing Domain: gogle.orf
Suggested Legitimate Domain: google.com (Score: 74)


In [16]:
# Reload the full dataset; the "Domain" text column is not used as a model input
df2 = pd.read_csv('5.urldata.csv')
df2_cleaned = df2.drop(columns=["Domain"])

# "Label" is the target; every other remaining column is a feature
y = df2_cleaned["Label"]
X = df2_cleaned.drop(columns=["Label"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Construct and fit the logistic regression in one step (fit() returns the estimator)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict on the held-out rows and score against the true labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7925

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.94      0.82      1012
           1       0.92      0.64      0.75       988

    accuracy                           0.79      2000
   macro avg       0.82      0.79      0.79      2000
weighted avg       0.82      0.79      0.79      2000


Confusion Matrix:
 [[955  57]
 [358 630]]


In [28]:
# Display the held-out feature rows from the most recent train_test_split call.
# NOTE(review): this renders the whole frame (truncated by pandas); X_test.head() may be enough.
X_test

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards
6252,0,0,0,2,0,0,0,0,0,1,0,1,0,0,1,0
4684,0,0,1,3,0,0,0,0,0,1,0,1,0,0,1,0
1731,0,1,1,2,0,0,0,0,0,1,0,1,0,0,1,1
4742,0,0,1,10,0,0,0,0,0,1,0,1,0,0,1,0
4521,0,0,1,7,0,0,0,0,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6412,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0
8285,0,0,0,0,0,0,0,1,1,1,1,1,0,0,1,0
7853,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0
1095,0,0,1,2,0,0,0,0,0,1,0,0,0,0,1,0


In [23]:
# Display the labels returned by the most recent model.predict(X_test) call.
y_pred

array([1, 0, 0, ..., 1, 0, 0], dtype=int64)

In [38]:
# Spot-check: predict the label for a single domain taken from the dataset itself.
# NOTE(review): `model` is whichever classifier was fitted last in this kernel, and
# the selected row may have been part of the training split, so this is a smoke
# test rather than an independent evaluation.
df3 = pd.read_csv('5.urldata.csv')
test_case = df3[df3['Domain'] == 'techcrose.com']

# Fail with a clear message instead of handing zero rows to model.predict
if test_case.empty:
    raise ValueError("No rows with Domain == 'techcrose.com' in 5.urldata.csv")

# Remove the target and the raw domain text so the columns match the training features
g = test_case.drop(columns=["Label"])
f = g.drop(columns=["Domain"])
prediction = model.predict(f)
prediction

array([1], dtype=int64)

In [40]:
# Reload the dataset so this cell does not depend on frames left over from other cells
df2 = pd.read_csv('5.urldata.csv')

# df5 is the dataset without the "Domain" text column.
# drop() already returns a new frame, so no separate copy of df2 is needed.
df5 = df2.drop(columns=["Domain"])

# Separate features (X) and target (y) from df5
X = df5.drop(columns=["Label"])  # Features
y = df5["Label"]  # Target variable

# Split df5 into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier.
# `use_label_encoder` is intentionally not passed: the recorded run of this cell
# reported it as an unused parameter, so it only produced a warning.
model = XGBClassifier(eval_metric="logloss", max_depth=6, n_estimators=100, learning_rate=0.1)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.85

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.97      0.87      1012
           1       0.96      0.72      0.83       988

    accuracy                           0.85      2000
   macro avg       0.87      0.85      0.85      2000
weighted avg       0.87      0.85      0.85      2000


Confusion Matrix:
 [[985  27]
 [273 715]]


Parameters: { "use_label_encoder" } are not used.



In [45]:
# Fresh read of the dataset; df6 is a copy of it with the "Domain" column removed
df2 = pd.read_csv('5.urldata.csv')
df6 = df2.copy().drop(columns=["Domain"])

# "Label" is the prediction target; the remaining columns are the inputs
y = df6["Label"]
X = df6.drop(columns=["Label"])

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# CatBoost classifier, one hyper-parameter per line for easy tuning
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=8,
    verbose=0,
)
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.86

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.96      0.87      1012
           1       0.95      0.76      0.84       988

    accuracy                           0.86      2000
   macro avg       0.87      0.86      0.86      2000
weighted avg       0.87      0.86      0.86      2000


Confusion Matrix:
 [[969  43]
 [237 751]]
