In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import seaborn as sns

In [None]:
# Load the raw SMS dataset; the file is decoded as latin-1.
df_sms = pd.read_csv('spam.csv', encoding='latin-1')

In [None]:
# Peek at the first five rows of the freshly loaded frame.
df_sms.head(n=5)

In [None]:
# Keep only the label and message columns and give them readable names.
# errors="ignore" makes the cell safe to re-run: once the placeholder
# "Unnamed" columns are gone, a second drop would otherwise raise KeyError.
# (rename already ignores keys that are no longer present.)
df_sms = (
    df_sms
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v1": "label", "v2": "sms"})
)
df_sms.head()

In [None]:
# Class balance: how many messages carry each label.
df_sms['label'].value_counts()

In [None]:
# Character count of every message, stored as a new 'length' column.
df_sms['length'] = df_sms['sms'].map(len)

In [None]:
# Histograms of message length, one panel per label value.
df_sms.hist(by='label', column='length', figsize=(10, 4), bins=50)

In [None]:
# Encode the target as integers: ham -> 0, spam -> 1.
#
# Plain column assignment plus an explicit cast is used instead of
# `df_sms.loc[:, 'label'] = ...`: on pandas >= 2.0 the .loc form writes into
# the existing column in place, so the column keeps its object dtype, and
# scikit-learn classifiers/metrics reject object-dtype integer targets as an
# unknown label type.
#
# 0 and 1 map to themselves so that re-running this cell is harmless; any
# unexpected label becomes NaN and makes astype(int) fail loudly.
df_sms['label'] = (
    df_sms['label']
    .map({'ham': 0, 'spam': 1, 0: 0, 1: 1})
    .astype(int)
)
df_sms.head()

## Implementation of Bag of Words

In [None]:
# Toy corpus used to walk through the bag-of-words steps by hand.
documents = [
    'Hello',
    'how are you',
    'win money',
    'call me now',
    'Hello, call me tomorrow',
    'Win an iphone 15 pro max',
]

# Step 1: normalise case so that 'Hello' and 'hello' count as the same token.
lower_case_documents = list(map(str.lower, documents))
print(lower_case_documents)

In [None]:
import string

# Step 2: delete every ASCII punctuation character from each document.
# str.maketrans("", "", chars) builds a table that maps those chars to None.
sans_punctuation_documents = [
    doc.translate(str.maketrans("", "", string.punctuation))
    for doc in lower_case_documents
]
sans_punctuation_documents

In [None]:
# Step 3: tokenise - split each cleaned document on whitespace into a list of words.
preprocessed_documents = [doc.split() for doc in sans_punctuation_documents]
preprocessed_documents

In [None]:
import pprint
from collections import Counter

# Step 4: count how often each token occurs, one Counter per document.
frequency_list = list(map(Counter, preprocessed_documents))
pprint.pprint(frequency_list)

## Implementing Bag of Words in scikit-learn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Create a CountVectorizer with its default settings (no arguments are passed).
count_vector = CountVectorizer()

## Data Preprocessing with CountVectorizer

In [None]:
# Learn the vocabulary of the toy corpus and list the resulting feature names.
# fit() returns the vectorizer itself, so the two calls can be chained.
count_vector.fit(documents).get_feature_names_out()

In [None]:
# Turn the toy documents into a dense document-term count matrix
# (one row per document, one column per vocabulary entry).
doc_array = (
    count_vector
    .transform(documents)
    .toarray()
)
doc_array

In [None]:
# Label the count matrix columns with the vocabulary terms for readability.
frequency_matrix = pd.DataFrame(
    data=doc_array,
    columns=count_vector.get_feature_names_out(),
)
frequency_matrix

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; random_state pins the shuffle
# so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    df_sms['sms'],
    df_sms['label'],
    test_size=0.20,
    random_state=1,
)

In [None]:
# Rebind count_vector to a fresh vectorizer for the SMS data.
count_vector = CountVectorizer()
# Learn the vocabulary from the training messages and build their count matrix.
training_data = count_vector.fit_transform(x_train)
# Encode the test messages with that same vocabulary (transform only, no refit).
testing_data = count_vector.transform(x_test)

## Implementation of the Naive Bayes Machine Learning Algorithm

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training count matrix.
# fit() returns the estimator, so construction and training are chained;
# the bare name on the last line displays the fitted model.
naive_bayes = MultinomialNB().fit(training_data, y_train)
naive_bayes

In [None]:
# Show the hyperparameters of the fitted classifier. Inspecting `naive_bayes`
# itself avoids constructing a second, unfitted MultinomialNB that would be
# discarded immediately and says nothing about the trained model.
naive_bayes.get_params()

In [None]:
# Predict a label for every held-out message with the fitted classifier.
predictions = naive_bayes.predict(testing_data)

## Evaluating our SMS Spam Detection Model

In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# Compare the predictions with the true test labels using four standard
# classification metrics.
print(f'Accuracy_score: {accuracy_score(y_test, predictions)}')
print(f'Precision Score: {precision_score(y_test, predictions)}')
print(f'recall_score: {recall_score(y_test, predictions)}')
print(f'F1 score: {f1_score(y_test, predictions)}')