In [3]:
import pandas as pd

# load the dataset
df = pd.read_csv("dataset_phishing.csv")

# show first 5 rows
df.head()




Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [4]:
df["status"].value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
legitimate,5715
phishing,5715


In [5]:
# convert target labels to numeric
df["status"] = df["status"].map({
    "legitimate": 1,
    "phishing": 0
})

# verify conversion
df["status"].value_counts()


Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
1,5715
0,5715


In [6]:
# separate features and target
X = df.drop("status", axis=1)
y = df["status"]

# check shapes
X.shape, y.shape


((11430, 88), (11430,))

In [7]:
from sklearn.model_selection import train_test_split

# split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# check shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((9144, 88), (2286, 88), (9144,), (2286,))

In [9]:
# keep only numeric columns for training
X_numeric = X.select_dtypes(include=['int64', 'float64'])

# check remaining columns
X_numeric.head()


Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank
0,37,19,0,3,0,0,0,0,0,0,...,0,0,1,0,45,-1,0,1,1,4
1,77,23,1,1,0,0,0,0,0,0,...,0,1,0,0,77,5767,0,0,1,2
2,126,50,1,4,1,0,1,2,0,3,...,0,1,0,0,14,4004,5828815,0,1,0
3,18,11,0,2,0,0,0,0,0,0,...,0,1,0,0,62,-1,107721,0,0,3
4,55,15,0,2,2,0,0,0,0,0,...,0,0,1,0,224,8175,8725,0,0,6


In [11]:
from sklearn.model_selection import train_test_split

# split numeric features
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((9144, 87), (2286, 87), (9144,), (2286,))

In [12]:
from sklearn.linear_model import LogisticRegression

# create the model
logistic_model = LogisticRegression(
    max_iter=1000,
    solver="liblinear"
)

# train the model
logistic_model.fit(X_train, y_train)

print("Logistic Regression training completed")



Logistic Regression training completed


In [13]:
from sklearn.metrics import accuracy_score, classification_report

# predict on test set
y_pred = logistic_model.predict(X_test)

# accuracy
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", acc)

# full classification report
print(classification_report(y_test, y_pred))


Test Accuracy: 0.799650043744532
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      1143
           1       0.80      0.80      0.80      1143

    accuracy                           0.80      2286
   macro avg       0.80      0.80      0.80      2286
weighted avg       0.80      0.80      0.80      2286



In [14]:
from sklearn.tree import DecisionTreeClassifier

# create the model
tree_model = DecisionTreeClassifier(max_depth=5, random_state=42)

# train the model
tree_model.fit(X_train, y_train)

print("Decision Tree training completed")


Decision Tree training completed


In [15]:
# predict on test set
y_pred_tree = tree_model.predict(X_test)

# accuracy
acc_tree = accuracy_score(y_test, y_pred_tree)
print("Decision Tree Test Accuracy:", acc_tree)

# full classification report
print(classification_report(y_test, y_pred_tree))


Decision Tree Test Accuracy: 0.9282589676290464
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      1143
           1       0.92      0.94      0.93      1143

    accuracy                           0.93      2286
   macro avg       0.93      0.93      0.93      2286
weighted avg       0.93      0.93      0.93      2286



In [16]:
import joblib

# save Logistic Regression
joblib.dump(logistic_model, "logistic_model.joblib")

# save Decision Tree
joblib.dump(tree_model, "decision_tree_model.joblib")

print("Models saved successfully")


Models saved successfully


In [17]:
from google.colab import files

# Download Logistic Regression model
files.download("logistic_model.joblib")

# Download Decision Tree model
files.download("decision_tree_model.joblib")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>