In [None]:
pip install voila nltk tensorflow scikit-learn ipywidgets


In [None]:
# Cell 1: Imports & Setup
import re
import pickle
import nltk
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.sequence import pad_sequences
import ipywidgets as widgets
from IPython.display import display, Markdown

# Fetch the NLTK stop-word corpus that `stopwords.words` reads below.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Sequence length handed to pad_sequences at inference time. It has to agree
# with the `maxlen` used when the model was trained (300 in the training cell
# of this notebook); the former value of 100 made inference see a different
# token window than training did.
MAX_LEN = 300

# Substrings that detect_phishing() looks for in the lower-cased email text.
phishing_keywords = ['verify', 'login', 'click', 'password', 'urgent', 'account', 'security', 'update']

# Load model and tokenizer
# The file name matches what the save cell of this notebook writes
# ("spam_detector_lstm.h5"); the previous name 'spam_model.h5' is never
# produced anywhere in the notebook. Both artifacts must exist on disk, so the
# training and save cells have to be run once before this cell on a fresh
# checkout.
model = tf.keras.models.load_model('spam_detector_lstm.h5')

# SECURITY: pickle.load can execute arbitrary code while deserialising. Only
# load a tokenizer.pkl that this notebook (or another trusted source) wrote.
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)


In [None]:
# Cell 2: Functions

def clean_text(text, stopword_set=None):
    """Normalise raw email text into a space-separated string of tokens.

    Steps, in order: lower-case the text, delete every run of non-whitespace
    characters that starts with "http" (URLs), collapse every run of non-word
    characters into a single space, then drop stop words.

    Args:
        text: The raw email text (str).
        stopword_set: Optional collection of words to remove. When omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text with tokens joined by single spaces; an empty string
        if nothing survives the filtering.
    """
    if stopword_set is None:
        stopword_set = stop_words

    text = text.lower()
    # Single backslashes inside the raw strings: the earlier doubled form
    # matched a literal backslash character, so neither URLs nor punctuation
    # were ever removed.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\W+', ' ', text)
    return " ".join(word for word in text.split() if word not in stopword_set)

def detect_phishing(text, keywords=None):
    """Collect simple phishing indicators from an email.

    Args:
        text: The raw email text (str).
        keywords: Optional iterable of lower-case substrings to look for.
            When omitted, the module-level ``phishing_keywords`` list is used.

    Returns:
        A ``(urls, found_keywords)`` tuple. ``urls`` lists every http/https
        URL found in ``text`` in order of appearance. ``found_keywords`` lists
        the keywords that occur as substrings of the lower-cased text, in the
        order they appear in ``keywords``.
    """
    if keywords is None:
        keywords = phishing_keywords

    # Single backslash in the raw string: the earlier doubled form required a
    # literal backslash after "://" and therefore never matched a real URL.
    urls = re.findall(r'https?://\S+', text)

    # Lower-case once instead of once per keyword.
    lowered = text.lower()
    found_keywords = [kw for kw in keywords if kw in lowered]
    return urls, found_keywords

def predict_spam(email_text):
    """Score a single email with the loaded spam model.

    The text is passed through clean_text(), encoded with the module-level
    ``tokenizer``, padded/truncated to ``MAX_LEN`` and fed to ``model``.

    Args:
        email_text: The raw email text (str).

    Returns:
        The first element of the first row of ``model.predict``'s output for
        this one-email batch.
    """
    # NOTE(review): the training cell in this notebook tokenises the raw text
    # without clean_text(); confirm the loaded model expects cleaned input.
    token_ids = tokenizer.texts_to_sequences([clean_text(email_text)])
    model_input = pad_sequences(token_ids, maxlen=MAX_LEN)
    scores = model.predict(model_input)
    return scores[0][0]


In [1]:
from datasets import load_dataset
import pandas as pd

# Load SMS Spam dataset from Hugging Face
# (the captured output below shows the files being downloaded on first run).
dataset = load_dataset("sms_spam")

# Convert to pandas DataFrame
# NOTE(review): `df` is reassigned by the SpamAssassin cell further down and
# the training cell reads its data from CSV, so this frame appears to be used
# only for the preview below — confirm whether this cell is still needed.
df = dataset["train"].to_pandas()

# Preview the dataset
# Per the captured output the columns are `sms` (message text) and `label`.
print(df.head())

 

README.md:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/359k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5574 [00:00<?, ? examples/s]

                                                 sms  label
0  Go until jurong point, crazy.. Available only ...      0
1                    Ok lar... Joking wif u oni...\n      0
2  Free entry in 2 a wkly comp to win FA Cup fina...      1
3  U dun say so early hor... U c already then say...      0
4  Nah I don't think he goes to usf, he lives aro...      0


In [6]:
# Install the Hugging Face datasets library
!pip install datasets pandas

# Load the dataset
from datasets import load_dataset
import pandas as pd

# Pull the SpamAssassin corpus. Per the preview output of this cell its
# columns are 'label', 'data' (the message text) and 'filename'.
ds = load_dataset("bvk/SpamAssassin-spam", split="train")
df = ds.to_pandas()

# A rename of "text" -> "email" used to sit here. The frame has no "text"
# column and pandas ignores unknown keys by default, so it was a silent no-op.
# It is removed rather than "fixed" because the training cell reads the
# column under its original name, "data".
print("Rows:", len(df))
print(df.head())

# Persist the raw corpus; the preprocessing cell reloads it from this CSV.
df.to_csv("spamassassin_raw.csv", index=False)





[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Rows: 6046
   label                                               data  \
0      0  ['Subject: Re: New Sequences Window  \n    Dat...   
1      0  ["Subject: [zzzzteana] RE: Alexander  \nMartin...   
2      0  ["Subject: [zzzzteana] Moscow bomber  \nMan Th...   
3      0  ["Subject: [IRR] Klez: The Virus That  Won't D...   
4      0  ["Subject: Re: [zzzzteana] Nothing like mama u...   

                                 filename  
0  00001.7c53336b37003a9286aba55d2945844c  
1  00002.9c4069e25e1ef370c078db7ee85ff9ac  
2  00003.860e3c3cee1b42ead714c5c874fe25f7  
3  00004.864220c5b6930b209cc287c361c99af1  
4  00005.bf27cdeaf0b8c4647ecd61b1d09da613  


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tunables for text encoding. The model cell must use the same vocabulary
# size for its Embedding layer, and inference must pad to the same length.
VOCAB_SIZE = 10000
SEQUENCE_LENGTH = 300

# Load the dataset
df = pd.read_csv("spamassassin_raw.csv")

# Basic cleanup: drop rows missing text or label and force the text column to
# str. Written as one chained expression (no inplace=True) so re-running the
# cell always rebuilds `df` from the freshly read CSV.
df = df.dropna(subset=["data", "label"]).astype({"data": str})

# Tokenization
# NOTE(review): the tokenizer is fitted on the full corpus, including the rows
# that later become the test split, so the vocabulary has seen test text.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df["data"])

# Convert to sequences, padded/truncated to a fixed length
sequences = tokenizer.texts_to_sequences(df["data"])
X = pad_sequences(sequences, maxlen=SEQUENCE_LENGTH)

# Labels
y = df["label"].astype(int).values

# Train/test split (fixed seed for a repeatable partition)
# NOTE(review): the split is not stratified; consider stratify=y if the class
# balance between train and test matters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Binary spam classifier: token embedding -> LSTM -> dropout -> dense head
# with a sigmoid output (one score per email).
model = Sequential([
    # input_dim matches the tokenizer's num_words (10000). The `input_length`
    # argument is omitted: it is deprecated in Keras 3 and the layer takes the
    # sequence length from the data it is fed.
    Embedding(input_dim=10000, output_dim=64),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train
# The History object is bound to a name so the per-epoch metrics stay
# available and its repr is not emitted as the cell's output.
# NOTE(review): validation_data is the held-out test split, so the val_*
# metrics are the only out-of-sample figures; there is no separate test set.
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))




Epoch 1/5
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 268ms/step - accuracy: 0.7604 - loss: 0.4628 - val_accuracy: 0.9760 - val_loss: 0.0821
Epoch 2/5
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 236ms/step - accuracy: 0.9777 - loss: 0.0801 - val_accuracy: 0.9405 - val_loss: 0.1551
Epoch 3/5
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 239ms/step - accuracy: 0.9839 - loss: 0.0564 - val_accuracy: 0.9736 - val_loss: 0.1089
Epoch 4/5
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 247ms/step - accuracy: 0.9804 - loss: 0.0606 - val_accuracy: 0.9860 - val_loss: 0.0527
Epoch 5/5
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 248ms/step - accuracy: 0.9919 - loss: 0.0255 - val_accuracy: 0.9851 - val_loss: 0.0514


<keras.src.callbacks.history.History at 0x2aea1a241d0>

In [12]:
import pickle

# Persist the trained network to the working directory. Any cell that reloads
# the model has to use this exact file name.
model.save("spam_detector_lstm.h5")

# Persist the fitted tokenizer alongside the model so inference can encode
# text with the same vocabulary. The file is a pickle: only load it back from
# a trusted location.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)




In [17]:
# Smoke test: score one obviously benign message with the in-memory model.
email = ["Hi Team, Just a reminder we have a meeting tomorrow at 10 AM."]
# Encode the raw text directly (no clean_text step), as the training cell does.
seq = tokenizer.texts_to_sequences(email)
# Same maxlen as the training cell's pad_sequences call.
padded = pad_sequences(seq, maxlen=300)
pred = model.predict(padded)
# pred has one row per input email; [0][0] is the sigmoid output for the
# single message above.
print("Prediction score:", pred[0][0])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Prediction score: 0.13983499
