# Library

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

import joblib

# Import Data

In [2]:
# Read the preprocessed SMS dataset. Rows that pandas cannot parse are
# skipped instead of raising (on_bad_lines='skip').
DATA_PATH = '../data/cleaned_spam_data.csv'
df = pd.read_csv(DATA_PATH, on_bad_lines='skip')
df

Unnamed: 0,label,cleaned_message
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri 2 wkli comp win fa cup final tkt 21...
3,0,u dun say earli hor u c alreadi say
4,0,nah dont think goe usf live around though
...,...,...
5567,1,2nd time tri 2 contact u u å£750 pound prize 2...
5568,0,ì b go esplanad fr home
5569,0,piti mood soani suggest
5570,0,guy bitch act like id interest buy someth els ...


In [3]:
# Print the column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   label            5572 non-null   int64 
 1   cleaned_message  5567 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


#### The 'cleaned_message' column has missing values because some messages consisted only of stopwords and punctuation, which left an empty string after the preprocessing step

In [4]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [5]:
# Sanity check: count the remaining missing values per column (all should be 0).
# Counting NaNs directly yields a non-zero number if any survive; summing the
# NaN-filtered rows would add up their labels instead of counting the rows, so
# leftover NaN rows with label 0 would still print 0.
df.isna().sum()

label              0
cleaned_message    0
dtype: object

# Features and Label

In [6]:
# X holds the cleaned message text, y the integer class label.
X, y = df['cleaned_message'], df['label']

# Vectorization

In [7]:
# Turn the message text into TF-IDF features, keeping at most 3000 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split in a later cell, so test messages influence the vocabulary and IDF
# weights - consider fitting on the training portion only.
tfidf = TfidfVectorizer(max_features=3000)
X_sparse = tfidf.fit_transform(X)
# Densify the sparse TF-IDF matrix into a regular array for the later cells.
X_vectorized = X_sparse.toarray()

In [8]:
# Persist the fitted vectorizer itself (not the transformed feature matrix),
# so it can be reloaded later to transform new messages with the same
# vocabulary and IDF weights the model was trained on.
joblib.dump(tfidf, '../outputs/models/vectorizer.pkl')

['../outputs/models/vectorizer.pkl']

# Train-Test Split

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Training

In [10]:
# Fit a multinomial Naive Bayes classifier on the training features
# (fit returns the estimator), then predict labels for the held-out set.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation

In [11]:
# Score the held-out predictions: per-class precision/recall/F1 report
# plus overall accuracy.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print(report)

Accuracy: 0.981149012567325
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       959
           1       0.99      0.87      0.93       155

    accuracy                           0.98      1114
   macro avg       0.99      0.93      0.96      1114
weighted avg       0.98      0.98      0.98      1114



# Save Model Report

In [12]:
# Combine the accuracy (6 decimal places) and the classification report into
# one text blob and write it to the reports directory.
report_text = f"Accuracy: {accuracy:.6f}\n\n{report}"

with open('../outputs/reports/classification_report.txt', mode='w') as report_file:
    report_file.write(report_text)

# Save model

In [13]:
# Serialize the trained classifier to disk for later reuse.
MODEL_PATH = '../outputs/models/multinomial_nb_model.pkl'
joblib.dump(model, MODEL_PATH)

['../outputs/models/multinomial_nb_model.pkl']