In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read only the two columns used below (v1, v2), decoding the file as ISO-8859-1.
csv_path = '../input/sms-spam-collection-dataset/spam.csv'
data = pd.read_csv(csv_path, usecols=['v1', 'v2'], encoding='ISO-8859-1')
data.head()

In [None]:
# Give the raw columns descriptive names: v1 -> label, v2 -> sms_text.
column_names = {'v1': 'label', 'v2': 'sms_text'}
data = data.rename(columns=column_names)
data.head()

In [None]:
# Bar chart of how many messages carry each label.
# With x='label' the categories sit on the x-axis and the counts on the y-axis,
# so the axis captions must be assigned accordingly.
sns.countplot(x='label', data=data)
plt.title('Count of labels')
plt.xlabel('SMS Category')
plt.ylabel('Total Count')
plt.show()

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm

In [None]:
ps = PorterStemmer()

# Build the stop-word set once. Calling stopwords.words('english') for every
# token repeats the lookup and scans a list each time; a set gives constant-time
# membership checks with the same filtering result.
english_stopwords = set(stopwords.words('english'))

corpus = []

# Iterate over the column values directly so the loop does not depend on the
# frame having a default 0..n-1 index.
for sms_text in tqdm(data['sms_text']):
    # Replace every non-letter with a space, lowercase, and split into words.
    words = re.sub('[^a-zA-Z]', ' ', sms_text).lower().split()
    # Drop stop words and stem what remains.
    stemmed_words = [ps.stem(word) for word in words if word not in english_stopwords]
    # Store the cleaned message as a single space-separated string.
    corpus.append(' '.join(stemmed_words))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: learn a vocabulary capped at 2500 terms from the
# corpus, then encode every message as a dense row of term counts.
# NOTE(review): the vocabulary is fitted on the full corpus, before the
# train/test split further down - confirm this is intended.
cv = CountVectorizer(max_features=2500)
cv.fit(corpus)
X = cv.transform(corpus).toarray()

In [None]:
# Show the first five rows of the frame again before encoding the labels.
data.head(n=5)

In [None]:
# One-hot encode the label column, then discard the message text and the
# 'label_ham' indicator so only the remaining indicator column is left as the target.
y = (
    pd.get_dummies(data, columns=['label'])
    .drop(columns=['sms_text', 'label_ham'])
)
y.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state=0 fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training counts.
# y_train is a single-column frame; flatten it to 1-D because scikit-learn
# estimators expect a 1-D target and warn when handed a column vector.
model = MultinomialNB().fit(X_train, np.ravel(y_train))

In [None]:
# Predict labels for the held-out messages with the fitted classifier `model`.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# scikit-learn metrics take (y_true, y_pred) in that order.
print('Accuracy score: ', accuracy_score(y_test, y_pred))

In [None]:
# Pass (y_true, y_pred) so rows are the true labels and columns the predictions;
# the reversed order yields the transposed matrix.
print(confusion_matrix(y_test, y_pred))

In [None]:
import pickle

model_pkl_file = "email_spam_classifier_model.pkl"

# Serialize the fitted classifier (bound to `model`) to disk.
with open(model_pkl_file, 'wb') as file:
    pickle.dump(model, file)

In [None]:
# Persist the fitted CountVectorizer with pickle.
vectorizer_pkl_file = "email_spam_vectorizer_model.pkl"

with open(vectorizer_pkl_file, 'wb') as vectorizer_out:
    pickle.dump(cv, vectorizer_out)