In [18]:
!pip install pandas nltk scikit-learn numpy



In [19]:
# Download the dataset directly from GitHub as 'spam.csv'
!wget https://raw.githubusercontent.com/babar-a11y/AI-BS_AI-3-1-/main/spam.csv

--2025-11-30 17:06:55--  https://raw.githubusercontent.com/babar-a11y/AI-BS_AI-3-1-/main/spam.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 480130 (469K) [text/plain]
Saving to: ‘spam.csv’


2025-11-30 17:06:55 (13.0 MB/s) - ‘spam.csv’ saved [480130/480130]



In [20]:
import pandas as pd

# Replace 'spam.csv' with your file's name
df = pd.read_csv('spam.csv', encoding='latin-1')  # Use 'latin-1' if UTF-8 fails
df.head()  # Preview the first 5 rows

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
df = df[['Category', 'Message']]  # Keep only relevant columns

In [22]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

def preprocess_text(text):
    if pd.isna(text):
        text = ''
    text = text.lower()
    text = re.sub(r'http[s]?://\S+', '', text)  # Remove URLs
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Combine columns if needed (e.g., subject + message)
df['cleaned_text'] = df['Message'].apply(preprocess_text)  # Adjust 'text' to your column name
df['Category'] = df['Category'].map({'spam': 1, 'ham': 0})  # Encode labels
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Category,Message,cleaned_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [23]:
df['cleaned_text'] = (df['Message'].fillna('')).apply(preprocess_text)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['Category']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.97847533632287
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [25]:
import nltk
nltk.download('punkt_tab')

new_email = "Free offer! Click to win $1000 now!"  # Adjust based on your dataset's format
new_email_cleaned = preprocess_text(new_email)
new_email_vector = vectorizer.transform([new_email_cleaned])
prediction = model.predict(new_email_vector)
print("Prediction (1=spam, 0=ham):", prediction[0])

Prediction (1=spam, 0=ham): 1


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [26]:
import joblib
joblib.dump(model, 'spam_detector_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']