loading and setting up data

In [12]:
import pandas as pd

In [13]:
df=pd.read_csv('/content/spam.csv', encoding='ISO-8859-1')

In [14]:
df=df[['v1','v2']]

In [15]:
df.columns=['label','text']

In [16]:
df['label']=df['label'].map({'ham':0,'spam':1})

In [17]:
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Preprocessing

In [18]:
import nltk
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [19]:
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [20]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [21]:
Stopwords=set(stopwords.words('english'))
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [22]:
def clean_text(text):
  text=text.lower()
  text=text.translate(str.maketrans('','',string.punctuation))
  tokens=word_tokenize(text)
  tokens=[lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens if word not in Stopwords]
  return " ".join(tokens)

df['clean_text']=df['text'].apply(clean_text)

Splitting data

In [23]:
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
        df['clean_text'], df['label'], test_size=test_size, random_state=42)

Vectorize

In [24]:
vectorizer = CountVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [25]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train_tfidf, y_train)

y_pred_lr = log_reg.predict(X_test_tfidf)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.9533632286995516

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.96      0.68      0.80       150

    accuracy                           0.95      1115
   macro avg       0.96      0.84      0.89      1115
weighted avg       0.95      0.95      0.95      1115



LSTM

In [30]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Tokenizer
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_text'])

X_seq = tokenizer.texts_to_sequences(df['clean_text'])
X_pad = pad_sequences(X_seq, padding='post', maxlen=100)

X_train_pad, X_test_pad, y_train_dl, y_test_dl = train_test_split(
    X_pad, df['label'], test_size=0.2, random_state=42)

# Build LSTM
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=100),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train
history = model.fit(X_train_pad, y_train_dl, validation_data=(X_test_pad, y_test_dl),
                    epochs=10, batch_size=64)

# Evaluate
loss, acc = model.evaluate(X_test_pad, y_test_dl)
print("LSTM Accuracy:", acc)


Epoch 1/10




[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 103ms/step - accuracy: 0.8555 - loss: 0.4914 - val_accuracy: 0.8655 - val_loss: 0.3950
Epoch 2/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 103ms/step - accuracy: 0.8601 - loss: 0.4126 - val_accuracy: 0.8655 - val_loss: 0.3952
Epoch 3/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 81ms/step - accuracy: 0.8627 - loss: 0.4075 - val_accuracy: 0.8655 - val_loss: 0.3956
Epoch 4/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 98ms/step - accuracy: 0.8641 - loss: 0.4038 - val_accuracy: 0.8655 - val_loss: 0.3951
Epoch 5/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 82ms/step - accuracy: 0.8704 - loss: 0.3904 - val_accuracy: 0.8655 - val_loss: 0.3982
Epoch 6/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 93ms/step - accuracy: 0.8626 - loss: 0.4040 - val_accuracy: 0.8655 - val_loss: 0.3980
Epoch 7/10
[1m70/70[0m [32m━━━━━━━━━

Testing

In [28]:
# print("Final Results:")
# print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
# print("LSTM Accuracy:", acc)

# best_model = "LSTM" if acc > accuracy_score(y_test, y_pred_lr) else "Logistic Regression"
# print("Best Model:", best_model)

# # Test on new messages
# sample = ["Congratulations! You won a free ticket to Bahamas. Call now!",
#           "Hey, are we meeting at 6 pm for coffee?"]

# if best_model == "Logistic Regression":
#     sample_tfidf = vectorizer.transform(sample)
#     preds = log_reg.predict(sample_tfidf)
# else:
#     sample_seq = tokenizer.texts_to_sequences(sample)
#     sample_pad = pad_sequences(sample_seq, padding='post', maxlen=100)
#     preds = (model.predict(sample_pad) > 0.5).astype("int32")

# for text, pred in zip(sample, preds):
#     print(f"Message: {text} --> {'SPAM' if pred==1 else 'HAM'}")


In [29]:
print("Final Results on Test Data:")
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("LSTM Accuracy:", acc)

best_model = "LSTM" if acc > accuracy_score(y_test, y_pred_lr) else "Logistic Regression"
print("\nBest Model Based on Test Data:", best_model)

user_msg = input("\nEnter a message to classify (spam/ham): ")

user_msg_clean = clean_text(user_msg)

user_msg_tfidf = vectorizer.transform([user_msg_clean])
pred_lr = log_reg.predict(user_msg_tfidf)[0]

user_msg_seq = tokenizer.texts_to_sequences([user_msg_clean])
user_msg_pad = pad_sequences(user_msg_seq, padding='post', maxlen=100)
pred_lstm = (model.predict(user_msg_pad) > 0.5).astype("int32")[0][0]

print("\nMessage:", user_msg)
print("Logistic Regression says -->", "SPAM" if pred_lr==1 else "HAM")
print("LSTM says -->", "SPAM" if pred_lstm==1 else "HAM")

print(f"\n✅ Proposed Best Model (based on accuracy): {best_model}")


Final Results on Test Data:
Logistic Regression Accuracy: 0.9533632286995516
LSTM Accuracy: 0.865470826625824

Best Model Based on Test Data: Logistic Regression

Enter a message to classify (spam/ham): not goog
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 226ms/step

Message: not goog
Logistic Regression says --> HAM
LSTM says --> HAM

✅ Proposed Best Model (based on accuracy): Logistic Regression
