In [None]:
import os

import pandas as pd

# Location of the Enron spam CSV. Set the ENRON_SPAM_CSV environment variable
# to point somewhere else; when it is unset the original path is used, so
# existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "ENRON_SPAM_CSV",
    "/Users/ahmed/repositories/enron_spam_data.csv",
)

# The file is decoded as latin-1, as in the original load.
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Preview the first rows (rich display as the cell's last expression).
df.head()

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14


In [4]:
## Text Preprocessing (Minimal, Intentional)
import re

def basic_preprocess(text):
    """Normalise one message for downstream vectorisation.

    The text is lowercased, then every character that is not an ASCII
    lowercase letter, a digit, or whitespace is deleted (not replaced by a
    space, so neighbouring characters are joined). Whitespace, including
    newlines, is left untouched.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example a
            missing value) is treated as an empty message.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if isinstance(text, str):
        return re.sub(r'[^a-z0-9\s]', '', text.lower())
    return ""

In [5]:
# Show original and cleaned messages for comparison
# Read the CSV afresh into its own frame so the comparison always shows the
# untouched text, regardless of what has been done to `df`.
# The path honours the ENRON_SPAM_CSV environment variable and falls back to
# the original location when it is unset.
compare_path = os.environ.get(
    "ENRON_SPAM_CSV",
    "/Users/ahmed/repositories/enron_spam_data.csv",
)
df_compare = pd.read_csv(compare_path, encoding='latin-1')

# Keep the raw column and add the cleaned text next to it.
df_compare['Message_clean'] = df_compare['Message'].apply(basic_preprocess)

# Side-by-side preview of the first ten messages.
df_compare[['Message', 'Message_clean']].head(10)

Unnamed: 0,Message,Message_clean
0,,
1,"gary , production from the high island larger ...",gary production from the high island larger b...
2,- calpine daily gas nomination 1 . doc,calpine daily gas nomination 1 doc
3,fyi - see note below - already done .\nstella\...,fyi see note below already done \nstella\n ...
4,fyi .\n- - - - - - - - - - - - - - - - - - - -...,fyi \n forwarded by lauri...
5,"jackie ,\nsince the inlet to 3 river plant is ...",jackie \nsince the inlet to 3 river plant is s...
6,"george ,\ni need the following done :\njan 13\...",george \ni need the following done \njan 13\nz...
7,fyi\n- - - - - - - - - - - - - - - - - - - - -...,fyi\n forwarded by gary l...
8,there are two fields of gas that i am having d...,there are two fields of gas that i am having d...
9,thanks so much for the memo . i would like to ...,thanks so much for the memo i would like to r...


In [6]:
# Replace the raw message text with its cleaned form, element by element.
# From here on df['Message'] holds cleaned text; the raw text is no longer in `df`.
df['Message'] = df['Message'].map(basic_preprocess)

In [7]:
## Train / test Split (Security-Correct)
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation. Stratifying on the label keeps
# the spam/ham ratio the same in both splits, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["Message"], df["Spam/Ham"],
    stratify=df["Spam/Ham"],
    test_size=0.2,
    random_state=42,
)
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

Training samples: 26972, Testing samples: 6744


In [8]:
## TF-IDF Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram + bigram TF-IDF features, capped at 5000 terms.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(X_train_tfidf)

  (0, 30)	0.10040518161944313
  (0, 4592)	0.08910106415716591
  (0, 1745)	0.07818139274401767
  (0, 2589)	0.06500895449666307
  (0, 2399)	0.06375963434787166
  (0, 3195)	0.07270194459540937
  (0, 2052)	0.04555557573832732
  (0, 2162)	0.10798289032842123
  (0, 1852)	0.08320421016571056
  (0, 2047)	0.09094282425833758
  (0, 4886)	0.07180399928998381
  (0, 4347)	0.05179325572329931
  (0, 4719)	0.09812658595618652
  (0, 4446)	0.04936502754612546
  (0, 4887)	0.09746698572748717
  (0, 477)	0.0768464575934019
  (0, 1319)	0.0676296408036416
  (0, 534)	0.10500186173417936
  (0, 4785)	0.10375969300504383
  (0, 3117)	0.10633066059117573
  (0, 4157)	0.11179111444693796
  (0, 2897)	0.08466829972262878
  (0, 3812)	0.0831429550155204
  (0, 4097)	0.10813356735416646
  (0, 2098)	0.04457406773589702
  :	:
  (26971, 223)	0.04307762623375375
  (26971, 1698)	0.012030744022234743
  (26971, 1265)	0.031235404982427398
  (26971, 151)	0.031950862269613335
  (26971, 3934)	0.03962747372681928
  (26971, 2138)	0.02

In [None]:
## Handling Class Imbalance and Model Training

from sklearn.linear_model import LogisticRegression
# class_weight='balanced' reweights samples inversely to class frequency;
# max_iter is raised to 1000 iterations.
model = LogisticRegression(class_weight='balanced', max_iter=1000)

# Train on the TF-IDF features of the training split.
model.fit(X_train_tfidf, y_train)


In [None]:
## Model Evaluation

from sklearn.metrics import classification_report, confusion_matrix

# Predict labels for the held-out split and print per-class
# precision / recall / F1 alongside the overall accuracy.
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         ham       0.99      0.98      0.98      3309
        spam       0.98      0.99      0.98      3435

    accuracy                           0.98      6744
   macro avg       0.98      0.98      0.98      6744
weighted avg       0.98      0.98      0.98      6744



In [20]:
## Feature Explainability
import numpy as np

# Vocabulary terms, in the same order as the columns of the TF-IDF matrix.
feature_names = vectorizer.get_feature_names_out()
# The single row of per-term weights from the fitted model.
coefficients = model.coef_[0]

# The 30 terms with the largest weights, biggest first. Sorting ascending on
# the negated weight gives a descending order and keeps ties in vocabulary order.
top_phishing = sorted(
    zip(feature_names, coefficients),
    key=lambda pair: -pair[1],
)[:30]

for term, weight in top_phishing:
    print(term, round(weight, 4))

http 4.1803
2005 3.8482
2004 3.7847
no 3.6168
your 3.4963
money 3.4141
remove 3.1786
mobile 3.0851
site 2.8812
life 2.8253
here 2.683
our 2.6669
you 2.6591
now 2.5688
software 2.5014
info 2.4956
only 2.4549
account 2.4315
de 2.4291
more 2.3933
man 2.3678
hello 2.3656
quality 2.3627
ca 2.3319
statements 2.246
investment 2.2028
php 2.1834
here to 2.1827
men 2.1824
of 2.1731


In [19]:
# The 20 terms with the most negative weights, i.e. the ones that push a
# prediction away from the positive class, most negative first.
top_nonphishing = sorted(
    zip(feature_names, coefficients),
    key=lambda pair: pair[1],
)[:20]

for term, weight in top_nonphishing:
    print(term, round(weight, 4))

enron -10.359
vince -5.9895
thanks -5.462
attached -5.0898
louise -4.9603
2001 -4.6856
713 -4.352
2000 -3.8236
doc -3.7182
will -3.668
energy -3.4293
houston -3.3142
enron com -3.2759
on -3.2595
deal -3.2011
please -3.1766
here is -3.145
daren -3.145
meeting -3.1216
gas -3.1066


In [22]:
## Save Model and Vectorizer
import joblib

# Write the fitted classifier and the fitted TF-IDF vectorizer to the current
# working directory. Both are saved because the model's inputs are the
# vectorizer's features, so they must be loaded together at inference time.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(model, "model.pkl")
# As the cell's last expression, this call's return value (the list of written
# file names) is what the notebook displays.
joblib.dump(vectorizer, "vectorizer.pkl")


['vectorizer.pkl']