In [None]:
import pandas as pd

# 1. Data Preparation

In [None]:
# Read the raw SMS dataset from disk, decoding the file as Latin-1

raw_path = 'spam.csv'
df = pd.read_csv(raw_path, encoding='latin-1')

In [None]:
# Inspect the freshly loaded frame (raw columns, before any selection or renaming)
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [None]:
# Keep only the label (v1) and message text (v2) columns; the remaining
# "Unnamed" columns appear empty in the preview of the raw frame.
# Take an explicit copy so that later column assignments operate on an
# independent frame instead of a selection of the original, which avoids
# pandas' SettingWithCopyWarning.
df = df[['v1', 'v2']].copy()

In [None]:
# Confirm that only the v1 and v2 columns remain
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Give the two retained columns descriptive names (v1 -> label, v2 -> message)
new_column_names = ['label', 'message']
df.columns = new_column_names

In [None]:
# Verify the new column names
df.columns

Index(['label', 'message'], dtype='object')

In [None]:
# Print the first five rows as a quick sanity check of the renamed frame

preview = df.head(n=5)
print(preview)

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


# 2. Text Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import re

In [None]:
# Function to clean text

def preprocess_text(text):
    """Normalise a raw message string for vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. delete every run of digits;
      3. replace every run of non-word characters (anything other than
         letters, digits or underscore) with a single space.

    Underscores are kept, and a leading or trailing space may remain when the
    input starts or ends with non-word characters.
    """
    without_digits = re.sub(r'\d+', '', text.lower())
    return re.sub(r'\W+', ' ', without_digits)

In [None]:
# Store a normalised version of each message next to the original text
df['cleaned_message'] = df['message'].map(preprocess_text)

In [None]:
# Inspect the cleaned text produced by preprocess_text
df['cleaned_message']

0       go until jurong point crazy available only in ...
1                                ok lar joking wif u oni 
2       free entry in a wkly comp to win fa cup final ...
3            u dun say so early hor u c already then say 
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the nd time we have tried contact u u ...
5568                will ì_ b going to esplanade fr home 
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: cleaned_message, Length: 5572, dtype: object

In [None]:
# Split the data into training and testing sets

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
messages = df['cleaned_message']
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
)

# 3. Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer with scikit-learn's default settings
# (no constructor arguments are overridden here)
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Display the vectorizer object's repr
tfidf_vectorizer

In [None]:
# Fit and transform the training data: the vectorizer is fitted on X_train
# only, and the resulting TF-IDF matrix for those same messages is kept.

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [None]:
# Inspect the training TF-IDF matrix (the repr reports its shape, dtype and storage format)
X_train_tfidf

<4457x6961 sparse matrix of type '<class 'numpy.float64'>'
	with 57419 stored elements in Compressed Sparse Row format>

# 4. Model Training

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [None]:
# Create a multinomial Naive Bayes model and fit it on the training
# TF-IDF features and their labels

nb_classifier = MultinomialNB()
nb_classifier.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Predict on the test set
# The held-out messages must pass through the vectorizer that was fitted on
# the training data (transform only, no refit) so that they are encoded with
# the same vocabulary. Predicting on X_train_tfidf would only score the model
# on data it has already seen and would not line up with y_test.

X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = nb_classifier.predict(X_test_tfidf)

In [None]:
# Preview the labels predicted by the Naive Bayes classifier
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [None]:
from sklearn.linear_model import LogisticRegression

# Create a logistic-regression model (no constructor arguments overridden)
# and fit it on the training TF-IDF features and their labels
lr_classifier = LogisticRegression()
lr_classifier.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Predict on the test set
# Encode the held-out messages with the already-fitted vectorizer (transform
# only, no refit) and predict on those, so that y_pred lines up with y_test
# instead of scoring the model on its own training data.

X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = lr_classifier.predict(X_test_tfidf)

In [None]:
# Preview the labels predicted by the Logistic Regression classifier
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

# 5. Model Evaluation

In [None]:
from sklearn.svm import SVC

# Create a support vector classifier (no constructor arguments overridden)
# and fit it on the training TF-IDF features and their labels
svm_classifier = SVC()
svm_classifier.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Predict on the test set
# Encode the held-out messages with the already-fitted vectorizer (transform
# only, no refit) and predict on those, so that y_pred lines up with y_test
# instead of scoring the model on its own training data.

X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = svm_classifier.predict(X_test_tfidf)

In [None]:
# Preview the labels predicted by the SVM classifier
y_pred

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)