In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset and keep only the two columns that are used:
# 'v1' (class name) becomes 'label', 'v2' (SMS text) becomes 'message'.
# Selecting and renaming in one chain builds a fresh frame instead of
# mutating the loaded one step by step.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Encode the labels as integers (ham -> 0, spam -> 1).
# Series.map yields NaN for any value missing from the mapping, so check for
# that explicitly and fail here with a clear message rather than later inside
# the split or a model fit.
encoded_labels = df['label'].map({'ham': 0, 'spam': 1})
unmapped = df.loc[encoded_labels.isna(), 'label'].unique().tolist()
if unmapped:
    raise ValueError(f"Unexpected label values in column 'v1': {unmapped}")
df['label'] = encoded_labels

# Split the data into training (80%) and testing (20%) sets; the fixed
# random_state makes the split reproducible.
# NOTE(review): the split is not stratified. Passing stratify=df['label'] would
# keep the ham/spam ratio equal in both sets, but it changes which rows land in
# the test set and therefore every metric reported further down.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


In [2]:
# Preview the first five rows of the frame (columns: label, message).
df.head(n=5)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Create the TF-IDF vectorizer with its default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vectorizer to encode the test messages. The tuple is
# evaluated left to right, so fit_transform runs before transform.
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [4]:
# Show the non-zero TF-IDF entries of the first five test messages.
print(X_test_tfidf[:5])


  (0, 7482)	0.2755095016014955
  (0, 7040)	0.37452799320205127
  (0, 4776)	0.31491292100771273
  (0, 4773)	0.1639052845916638
  (0, 4685)	0.357091702346315
  (0, 3873)	0.16066113692640557
  (0, 3567)	0.32728416624914575
  (0, 3337)	0.3206551961224227
  (0, 3067)	0.3206551961224227
  (0, 2759)	0.32728416624914575
  (0, 1808)	0.2908476600252535
  (1, 7679)	0.07245791194421201
  (1, 6906)	0.1391939644063715
  (1, 6819)	0.21252091630280584
  (1, 6816)	0.15314259442370007
  (1, 6786)	0.08597927139460872
  (1, 5994)	0.173529265786496
  (1, 5922)	0.2791775831288673
  (1, 5917)	0.2791775831288673
  (1, 5916)	0.20088795134690807
  (1, 5709)	0.26618036629009734
  (1, 4919)	0.23473980524482632
  (1, 4918)	0.14561479233537142
  (1, 4912)	0.11039868655458097
  (1, 4869)	0.10739626420319198
  :	:
  (3, 1071)	0.37014377711174457
  (3, 955)	0.11790389269484952
  (4, 7611)	0.17275254473565038
  (4, 7412)	0.21930649622271095
  (4, 7298)	0.23261456021985846
  (4, 7170)	0.13444078299099177
  (4, 7065)	0.1

In [5]:
# Build and train the Naive Bayes classifier in one step
# (fit() returns the fitted estimator itself).
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out messages
y_pred_nb = nb_classifier.predict(X_test_tfidf)

# Report the title, overall accuracy and the per-class metrics table,
# one item per line.
print(
    "Naive Bayes Classifier",
    f"Accuracy: {accuracy_score(y_test, y_pred_nb)}",
    classification_report(y_test, y_pred_nb),
    sep="\n",
)


Naive Bayes Classifier
Accuracy: 0.9623318385650225
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [6]:
# Build and train the Logistic Regression classifier in one step
# (fit() returns the fitted estimator itself). max_iter is raised to 1000.
lr_classifier = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict labels for the held-out messages
y_pred_lr = lr_classifier.predict(X_test_tfidf)

# Report the title, overall accuracy and the per-class metrics table,
# one item per line.
print(
    "\nLogistic Regression Classifier",
    f"Accuracy: {accuracy_score(y_test, y_pred_lr)}",
    classification_report(y_test, y_pred_lr),
    sep="\n",
)



Logistic Regression Classifier
Accuracy: 0.9659192825112107
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       0.99      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [7]:
# Build and train the Support Vector Machine classifier in one step
# (fit() returns the fitted estimator itself); SVC is used with its defaults.
svm_classifier = SVC().fit(X_train_tfidf, y_train)

# Predict labels for the held-out messages
y_pred_svm = svm_classifier.predict(X_test_tfidf)

# Report the title, overall accuracy and the per-class metrics table,
# one item per line.
print(
    "\nSupport Vector Machine Classifier",
    f"Accuracy: {accuracy_score(y_test, y_pred_svm)}",
    classification_report(y_test, y_pred_svm),
    sep="\n",
)



Support Vector Machine Classifier
Accuracy: 0.9820627802690582
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.87      0.93       150

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# Function to predict whether a message is spam or not
def predict_message(message, classifier=None):
    """Classify a single SMS message as ``'spam'`` or ``'ham'``.

    Args:
        message: Raw message text to classify.
        classifier: Optional fitted model exposing ``predict``. It must have
            been trained on features produced by the notebook-level
            ``vectorizer`` with labels encoded as 0 = ham, 1 = spam. When
            omitted, the notebook-level ``nb_classifier`` is used, which is
            the behaviour of the original one-argument function.

    Returns:
        ``'spam'`` if the predicted label equals 1, otherwise ``'ham'``.
    """
    # Resolve the default at call time so a re-trained nb_classifier is picked up.
    model = nb_classifier if classifier is None else classifier
    # Wrap the message in a list: the vectorizer expects an iterable of documents.
    message_tfidf = vectorizer.transform([message])
    prediction = model.predict(message_tfidf)
    return 'spam' if prediction[0] == 1 else 'ham'

# Loop to take user input and predict.
# The loop ends when the user types 'exit' (case-insensitive, surrounding
# whitespace ignored), when the input stream is closed, or when the cell is
# interrupted.
while True:
    try:
        user_input = input("Enter an SMS message (or type 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # No more input or the user interrupted: stop cleanly instead of
        # ending the cell with a traceback.
        break
    command = user_input.strip()
    if command.lower() == 'exit':
        break
    if not command:
        # Nothing to classify on a blank line; ask again.
        continue
    prediction = predict_message(user_input)
    print(f"The message is: {prediction}")


Enter an SMS message (or type 'exit' to quit):  Hi babes


The message is: ham


Enter an SMS message (or type 'exit' to quit):  Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


The message is: spam


Enter an SMS message (or type 'exit' to quit):  Free entry


The message is: spam


Enter an SMS message (or type 'exit' to quit):  Happy days back


The message is: ham
