In [11]:
# Import libraries
import pandas as pd
import numpy as np
import nltk
# import matplotlib.pyplot as plt
# import seaborn as sns
from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from pathlib import Path

BASE_DIR = Path.cwd()
DATASET_DIR = BASE_DIR / "dataset"


# Download stopwords
nltk.download("stopwords")

# Load datasets
# df_sms = pd.read_csv("/ai_model/dataset/Spam_SMS.csv")
#
#
# df_email = pd.read_csv("/ai_model/dataset/email.csv")
#
# df_uci_sms = pd.read_csv("/ai_model/dataset/SMSSpamCollection", sep="\t", header=None, names=["Category", "Message"])
df_sms = pd.read_csv(DATASET_DIR / "Spam_SMS.csv")
df_sms.columns = ["Category", "Message"]
df_email = pd.read_csv(DATASET_DIR / "email.csv")
df_email.columns = ["Category", "Message"]

df_uci_sms = pd.read_csv(DATASET_DIR / "SMSSpamCollection", sep="\t", header=None,
                          names=["Category", "Message"])
# Load phishing email datasets
paths = [
    DATASET_DIR/"CEAS_08.csv",
    DATASET_DIR/"Ling.csv",
    DATASET_DIR/"Enron.csv",
    DATASET_DIR/"Nazario.csv",
    DATASET_DIR/"Nigerian_Fraud.csv",
    DATASET_DIR/"SpamAssasin.csv",
    DATASET_DIR/"phishing_email.csv"
]

phishing_dfs = []
for path in paths:
    try:
        df_temp = pd.read_csv(path)
        df_temp = df_temp.iloc[:, :2]
        df_temp.columns = ["Category", "Message"]
        phishing_dfs.append(df_temp)
    except Exception as e:
        print(f"‚ùå Error loading {path}: {e}")

# Combine all datasets
df = pd.concat([df_sms, df_email, df_uci_sms] + phishing_dfs, ignore_index=True)
df.dropna(subset=["Category", "Message"], inplace=True)

# Clean and encode labels
df["Category"] = df["Category"].astype(str).str.lower().str.strip()
df = df[df["Category"].isin(["spam", "ham"])]
df["Category"] = df["Category"].map({"spam": 0, "ham": 1})
df.dropna(inplace=True)

print(f"üì¶ Total Samples Before Balancing: {df.shape[0]}")

# # Visualize original distribution
# sns.countplot(data=df, x="Category")
# plt.title("Original Distribution of Spam vs Ham")
# plt.show()

# Balance the dataset (undersampling)
spam_df = df[df["Category"] == 0]
ham_df = df[df["Category"] == 1].sample(len(spam_df), random_state=42)
df_balanced = pd.concat([spam_df, ham_df]).sample(frac=1, random_state=42)

print(f"üì¶ Total Samples After Balancing: {df_balanced.shape[0]}")

# # Visualize balanced data
# sns.countplot(data=df_balanced, x="Category")
# plt.title("Balanced Spam vs Ham Distribution")
# plt.show()

# Train/test split on balanced data
X = df_balanced["Message"]
Y = df_balanced["Category"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# Train model
model = LogisticRegression()
model.fit(X_train_features, Y_train)

# Evaluate model
train_pred = model.predict(X_train_features)
test_pred = model.predict(X_test_features)

print("üìà Accuracy on training data:", accuracy_score(Y_train, train_pred))
print("üìä Accuracy on testing data:", accuracy_score(Y_test, test_pred))

# Confusion matrix
cm = confusion_matrix(Y_test, test_pred)
# sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
# plt.title("Confusion Matrix")
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.show()

[nltk_data] Downloading package stopwords to /home/sahana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


üì¶ Total Samples Before Balancing: 16718
üì¶ Total Samples After Balancing: 4482
üìà Accuracy on training data: 0.9896792189679219
üìä Accuracy on testing data: 0.9743589743589743


In [17]:
# Test the model
new_messages = ["Congratulations! You have won a free ticket to Bahamas! Click here.",
                "Here is the **Marathi translation** of the given Kannada text: ***Malade:** Your bank account has recently encountered a writedown. Please click on the link below and check your information."]
new_features = vectorizer.transform(new_messages)
predictions = model.predict(new_features)

for msg, pred in zip(new_messages, predictions):
    label = "Ham" if pred == 1 else "Spam"
    print(f"Message: {msg}\nPrediction: {label}\n")

Message: Congratulations! You have won a free ticket to Bahamas! Click here.
Prediction: Spam

Message: Here is the **Marathi translation** of the given Kannada text: ***Malade:** Your bank account has recently encountered a writedown. Please click on the link below and check your information.
Prediction: Spam



In [14]:
!nvidia-smi

Tue Feb  3 19:27:52 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.108                Driver Version: 581.83         CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...    On  |   00000000:01:00.0 Off |                  N/A |
| N/A   49C    P8              3W /   70W |       0MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------

In [16]:
import joblib

joblib.dump(model, DATASET_DIR / "spam_model.pkl")
joblib.dump(vectorizer, DATASET_DIR / "tfidf_vectorizer.pkl")

['/mnt/d/python_venv/agent-pot/ScamNest/app/ai_model/dataset/tfidf_vectorizer.pkl']