**Binary classifier**

In [11]:
# Dataset

from google.colab import drive
import pandas as pd

drive.mount("/content/drive")

y_names = ["label", "message"]

# SMS spam collection (Department of Telematics, School of Electrical and Computer Engineering at University of Campinas, Brazil)
# http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/
dataset = pd.read_csv("/content/drive/My Drive/data/sms_spam/SMSSpamCollection.txt", sep="\t", names=y_names)

y = dataset.get(y_names[0]).tolist()
raw_dataset = dataset.get(y_names[1]).tolist()

print(dataset.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
# Text preprocessing

import nltk
from nltk.tokenize import regexp_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
nltk.download("stopwords")

dataset = []
stemmer = SnowballStemmer("english")
stopwords_en = stopwords.words("english")

for i in range(0, len(raw_dataset)):  
  tokens = regexp_tokenize(str(raw_dataset[i]), r"\w+")
  stems = [stemmer.stem(token) for token in tokens]
  words_no_stopwords = [word for word in stems if word not in stopwords_en]
  document = ' '.join(words_no_stopwords)
  dataset.append(document)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Feature extraction (converting text to vectors)

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset).toarray()

In [5]:
# Split training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [6]:
# Train model

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [8]:
# Model evaluation

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[955   0]
 [ 33 127]]
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       955
        spam       1.00      0.79      0.89       160

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115

0.9704035874439462


In [9]:
# Predict unseen data

unseen_sentence = input("Enter a message: ")
X_unseen = vectorizer.transform([unseen_sentence]).toarray()
y_unseen = classifier.predict(X_unseen)

print("The pedicted class for that message is:", y_unseen)

Enter a message: You win the first prize
The pedicted class for that message is: ['spam']


In [10]:
# Model serialization

import pickle

with open("binary_classifier.pickle", "wb") as pickle_file:
  pickle.dump(classifier, pickle_file)

with open("binary_classifier.pickle", "rb") as serialized_model:
  loaded_model = pickle.load(serialized_model)

y_pred2 = loaded_model.predict(X_test)

print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2)) 

[[955   0]
 [ 33 127]]
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       955
        spam       1.00      0.79      0.89       160

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115

0.9704035874439462
