In [29]:
import pandas as pd

# Step 1: Load the dataset
input_file = "SMSSpamCollection"  # Adjust path to your downloaded file
output_file = "sms_spam_dataset.csv"

# Step 2: Read the data and split into labels and messages
data = []
with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        # Split the label and message by the first whitespace
        label, message = line.split('\t', 1)
        data.append((label, message.strip()))

# Step 3: Convert to a DataFrame
df = pd.DataFrame(data, columns=["label", "message"])

# Step 4: Save to CSV
df.to_csv(output_file, index=False)

print("CSV file has been created:", output_file)
print(df.head())


CSV file has been created: sms_spam_dataset.csv
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [30]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import string

In [33]:
data = pd.read_csv("sms_spam_dataset.CSV")
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [34]:
#convert label into binary (spam:1 spam:0)
data['label']=data['label'].map({'spam':1, 'ham':0})


In [35]:
print(data.head())

   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [41]:
!pip install nltk


In [43]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [47]:
# preprocess text
def preprocess_text(text):
    #lower case
    text=text.lower()
    #Remove punctuation and number
    text = re.sub('[[^a-zA-Z]', ' ', text)
    #Tokenize and remove stopwords
    text = text.split()
    text = [PorterStemmer().stem(word) for word in text if word not in stopwords.words('english')]
    return " ".join(text)
# Apply preprocessing
data['cleaned_message'] = data['message'].apply(preprocess_text)

  text = re.sub('[[^a-zA-Z]', ' ', text)


In [51]:
# Split data
X = data['cleaned_message']
Y = data['label']
# Convert text data into numerical vectors
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Train Test Split

X_train, X_test, Y_train, Y_test = train_test_split(X_tfidf, Y, test_size=0.3, random_state=42)




In [60]:
# Train the Multinominal naive bayes data
model = MultinomialNB()
model.fit(X_train, Y_train)

#Predict one test set
Y_predict = model.predict(X_test)

# Evaluate the model

print("Accuracy:", accuracy_score(Y_test, Y_predict))
print("\nClassification Report:\n", classification_report(Y_test, Y_predict))
print("\nConfusion Matrix:\n", confusion_matrix(Y_test, Y_predict))
 

Accuracy: 0.8810520023909145

Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.94      1447
           1       1.00      0.12      0.21       226

    accuracy                           0.88      1673
   macro avg       0.94      0.56      0.57      1673
weighted avg       0.90      0.88      0.84      1673


Confusion Matrix:
 [[1447    0]
 [ 199   27]]
