<a href="https://colab.research.google.com/github/aleng13/phish-and-destroy/blob/main/Final_Enhanced_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INITIAL CONFIGURATION

In [None]:
#Install the Kaggle API client
!pip install -q kaggle

In [None]:
# Put the Kaggle API token where the Kaggle CLI looks for it.
# Expects kaggle.json to be present in the current working directory
# (e.g. uploaded to the Colab session beforehand).
# 1) create ~/.kaggle if it does not exist yet,
# 2) move the token there,
# 3) restrict it to owner read/write only, since it is a credential.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the phishing e-mail dataset archive (phishing-email-dataset.zip)
# from Kaggle into the current working directory.
!kaggle datasets download naserabdullahalam/phishing-email-dataset

Dataset URL: https://www.kaggle.com/datasets/naserabdullahalam/phishing-email-dataset
License(s): CC-BY-SA-4.0
Downloading phishing-email-dataset.zip to /content
 95% 73.0M/77.1M [00:00<00:00, 764MB/s]
100% 77.1M/77.1M [00:00<00:00, 759MB/s]


In [None]:
#To access the CSV file, you need to unzip the downloaded file
!unzip phishing-email-dataset.zip

Archive:  phishing-email-dataset.zip
  inflating: CEAS_08.csv             
  inflating: Enron.csv               
  inflating: Ling.csv                
  inflating: Nazario.csv             
  inflating: Nigerian_Fraud.csv      
  inflating: SpamAssasin.csv         
  inflating: phishing_email.csv      


In [None]:
import pandas as pd

# Read the combined corpus that was extracted from the Kaggle archive.
df = pd.read_csv('phishing_email.csv')

# Sanity check: show the first five rows of the freshly loaded frame.
print(df.head(n=5))

                                       text_combined  label
0  hpl nom may 25 2001 see attached file hplno 52...      0
1  nom actual vols 24 th forwarded sabrae zajac h...      0
2  enron actuals march 30 april 1 201 estimated a...      0
3  hpl nom may 30 2001 see attached file hplno 53...      0
4  hpl nom june 1 2001 see attached file hplno 60...      0


In [None]:
# List the column names to identify which column holds the class label
# (e.g. phishing / not_phishing, or 0/1) and which holds the e-mail text.
df.columns

Index(['text_combined', 'label'], dtype='object')

# PREPROCESSING THE DATASET

In [None]:
# ==============================
# 🧹 2. PREPROCESSING THE DATASET
# ==============================
# Reloads the raw CSV and derives, per e-mail:
#   - cleaned_text         lower-cased, whitespace-stripped body (TF-IDF input)
#   - email_length         number of characters in cleaned_text
#   - num_exclamations     count of '!' in cleaned_text
#   - num_links            occurrences of 'http' plus occurrences of 'www'
#   - num_uppercase_words  whitespace-separated tokens that are all upper-case
#   - num_special_chars    characters that are neither alphanumeric nor whitespace

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the phishing dataset (fresh read, so re-running this cell always starts
# from the raw file rather than from an already-modified frame).
df = pd.read_csv('phishing_email.csv')

# Give the text column a shorter name. Reassigning instead of `inplace=True`
# keeps the step explicit and chain-friendly.
df = df.rename(columns={"text_combined": "text"})

# Coerce the body to `str` ONCE and reuse it for every feature. Previously only
# `cleaned_text` was coerced, so a missing body (NaN, i.e. a float) would raise
# AttributeError in the upper-case / special-character features below.
# Missing bodies become the empty string, so they contribute zero to all counts.
raw_text = df['text'].fillna('').astype(str)

# Basic text preprocessing: lowercase, strip surrounding whitespace.
df['cleaned_text'] = raw_text.str.lower().str.strip()

# Feature engineering

df['email_length'] = df['cleaned_text'].str.len()
df['num_exclamations'] = df['cleaned_text'].str.count('!')
# NOTE: a URL such as "http://www.example.com" contributes 2 here (one for
# 'http', one for 'www'). Kept as-is so the feature definition is unchanged.
df['num_links'] = df['cleaned_text'].str.count('http') + df['cleaned_text'].str.count('www')
# These two use the original-case text, because case and punctuation are what
# they measure.
df['num_uppercase_words'] = raw_text.apply(lambda x: sum(1 for word in x.split() if word.isupper()))
df['num_special_chars'] = raw_text.apply(lambda x: sum(not c.isalnum() and not c.isspace() for c in x))


# VECTORIZING THE TEXT DATA

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# Hand-crafted numeric features produced by the preprocessing cell.
numeric_columns = ['email_length', 'num_exclamations', 'num_links',
                   'num_uppercase_words', 'num_special_chars']

y = df['label']

# Train/test split FIRST. Fitting the vectorizer and the scaler on the full
# dataset before splitting leaks test-set statistics (vocabulary, IDF weights,
# feature means/variances) into training and inflates the reported scores.
# Same test_size and random_state as before, so the same rows end up in the
# held-out set.
df_train, df_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42
)

# TF-IDF: learn vocabulary and IDF weights from the training rows only, then
# apply that fixed mapping to the test rows.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(df_train['cleaned_text'])
X_test_tfidf = vectorizer.transform(df_test['cleaned_text'])

# Scale numeric features with mean/variance estimated on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(df_train[numeric_columns])
X_test_scaled = scaler.transform(df_test[numeric_columns])

# Combine all features: TF-IDF columns first, then the scaled numeric columns.
# The same column order must be used when building inputs at prediction time.
X_train = hstack([X_train_tfidf, X_train_scaled]).tocsr()
X_test = hstack([X_test_tfidf, X_test_scaled]).tocsr()


# MODEL TRAINING & SAVING

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pickle

# Fit a logistic-regression classifier on the combined TF-IDF + numeric
# features. max_iter is raised to 1000 to give the solver more iterations
# to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# Save model, vectorizer, and scaler.
# All three artifacts use the same "_enhanced" suffix; these are exactly the
# file names the download cell requests. (Previously the vectorizer and scaler
# were written as 'tfidf_vectorizer_enhance.pkl' and 'feature_scaler.pkl',
# which the download step could not find.)
artifacts = {
    'phishing_detector_model_enhanced.pkl': model,
    'tfidf_vectorizer_enhanced.pkl': vectorizer,
    'feature_scaler_enhanced.pkl': scaler,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("\n✅ All files saved successfully. Ready for Streamlit deployment!")



Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      7935
           1       0.98      0.99      0.98      8563

    accuracy                           0.98     16498
   macro avg       0.98      0.98      0.98     16498
weighted avg       0.98      0.98      0.98     16498


✅ All files saved successfully. Ready for Streamlit deployment!


# Downloading the Files

In [12]:
from google.colab import files

# Trigger a browser download for each serialized artifact, one after another.
for artifact_name in (
    "phishing_detector_model_enhanced.pkl",
    "tfidf_vectorizer_enhanced.pkl",
    "feature_scaler_enhanced.pkl",
):
    files.download(artifact_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>