In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

# Loading Dataset

In [14]:
path = '../Data_Source/spam.csv'

# Read the CSV file (decoded as latin-1), keep only the class tag (v1) and the
# message body (v2), and rename them by name so the mapping cannot be
# mis-assigned if the column order ever changes.
df = (
    pd.read_csv(path, encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# ham and spam encoding: ham -> 0, spam -> 1.
# Series.map is used instead of Series.replace: replacing strings with ints
# relies on implicit object->int downcasting, which pandas has deprecated
# (FutureWarning since 2.2). map also turns any unexpected tag into NaN, so
# it is caught right here instead of silently remaining a string.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
if df['label'].isna().any():
    raise ValueError(
        f"{int(df['label'].isna().sum())} rows have a label other than 'ham' or 'spam'"
    )
# Pin the target dtype explicitly so downstream code always sees integers.
df['label'] = df['label'].astype('int64')

print(df.head())

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


# Data Inspection

In [15]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(5572, 2)

In [16]:
# Column dtypes and non-null counts, to confirm there are no missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   int64 
 1   text    5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


# Feature Engineering

In [17]:
# Derive three numeric descriptors from the raw message text.

# Number of characters in each message
df['text_length'] = df['text'].map(len)

# Number of whitespace-separated tokens in each message
df['num_words'] = df['text'].map(lambda message: len(message.split()))

# Number of digit characters in each message (str.isdigit, applied per character)
df['num_digits'] = df['text'].map(lambda message: sum(map(str.isdigit, message)))

# Define Target and Features

In [18]:
# Feature matrix: the three engineered numeric columns
feature_columns = ['text_length', 'num_words', 'num_digits']
X = df[feature_columns]

# Target vector: 0 = ham, 1 = spam
y = df['label']

# Train Test Split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,     # hold out 20% of the rows for evaluation
    random_state=42,   # fixed seed so the split is reproducible
)

# Feature Standardization

In [21]:
# Standardize features.
# The scaler is fitted on the training rows only; the same fitted scaler is
# then applied to both partitions, so the test rows never influence the fit.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gaussian Naive Bayes Model

In [23]:
# Gaussian Naive Bayes classifier with default settings
gnb = GaussianNB()

# Fit on the standardized training features; the call is left as the last
# expression so the fitted estimator is displayed as the cell output.
gnb.fit(X=X_train_scaled, y=y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [24]:
# Prediction: classify the standardized held-out test rows with the fitted model
y_pred_gnb = gnb.predict(X_test_scaled)

## Evaluation

In [26]:
# Accuracy of the GaussianNB predictions on the held-out test set
accuracy_score(y_true=y_test, y_pred=y_pred_gnb)

0.9802690582959641

In [27]:
# Confusion matrix of true vs. predicted labels (0 = ham, 1 = spam)
confusion_matrix(y_true=y_test, y_pred=y_pred_gnb)

array([[953,  12],
       [ 10, 140]], dtype=int64)

In [None]:
# classification report
# Per-class precision / recall / F1 for the GaussianNB predictions.
# The predictions live in `y_pred_gnb`; no variable named `y_pred` is ever
# defined in this notebook, so referencing it fails on a fresh kernel.
# The report is a preformatted string, so it is printed to render as a table
# instead of being shown as an escaped repr.
print(
    classification_report(
        y_test,
        y_pred_gnb,
        labels=[0, 1],
        target_names=['ham', 'spam'],  # matches the 0/1 label encoding
    )
)