<a href="https://colab.research.google.com/github/bfeijoj/SMS-Ham-or-Spam/blob/main/SMS_Ham_or_Spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing import sequence

In [2]:
# -------------------------------------------------------------------------- Get data files ------------------------------------------------------------------------------

# Source TSV files; the held-out ("test") split is published as valid-data.tsv.
train_data_url = "https://raw.githubusercontent.com/beaucarnes/fcc_python_curriculum/master/sms/train-data.tsv"
test_data_url = "https://raw.githubusercontent.com/beaucarnes/fcc_python_curriculum/master/sms/valid-data.tsv"

# get_file downloads each URL (re-using a previously cached copy when one
# exists) and returns the local path of the file.
train_file_path = tf.keras.utils.get_file(
    fname="train-data.tsv",
    origin=train_data_url,
)
test_file_path = tf.keras.utils.get_file(
    fname="valid-data.tsv",
    origin=test_data_url,
)

Downloading data from https://raw.githubusercontent.com/beaucarnes/fcc_python_curriculum/master/sms/train-data.tsv
Downloading data from https://raw.githubusercontent.com/beaucarnes/fcc_python_curriculum/master/sms/valid-data.tsv


In [3]:
# -------------------------------------------------------------------------- Data Management ------------------------------------------------------------------------------

import string

def _load_sms_frame(file_path):
  """Read one SMS TSV file and return it cleaned and tokenised.

  The file is expected to have no header row and two tab-separated columns:
  the label ('ham' or 'spam') and the message text.

  Args:
    file_path: path of the TSV file to read.

  Returns:
    DataFrame with columns
      'type'    -- int64 label, 0 for ham and 1 for spam
      'message' -- list of lower-cased tokens with ASCII punctuation removed

  Raises:
    ValueError: if a label other than 'ham' / 'spam' is present.
  """
  frame = pd.read_csv(file_path, sep='\t', header=None,
                      names=["type", "message"])

  # Assign the mapped labels back explicitly. Calling
  # `frame['type'].replace(..., inplace=True)` operates on a column selection
  # and is not guaranteed to update `frame` (it does not under pandas
  # Copy-on-Write), which would leave string labels in the training data.
  labels = frame['type'].map({'ham': 0, 'spam': 1})
  if labels.isna().any():
    raise ValueError("Unexpected label in {!r}; expected only 'ham' or 'spam'."
                     .format(file_path))
  frame['type'] = labels.astype('int64')

  # Same normalisation predict_message applies to new text: lower-case,
  # strip string.punctuation, split on whitespace.
  frame['message'] = (
      frame['message']
      .str.lower()
      .str.translate(str.maketrans('', '', string.punctuation))
      .str.split()
  )
  return frame


raw_train_data = _load_sms_frame(train_file_path)
raw_test_data = _load_sms_frame(test_file_path)

raw_train_data.tail()

Unnamed: 0,type,message
4174,0,"[just, woke, up, yeesh, its, late, but, i, did..."
4175,0,"[what, do, u, reckon, as, need, 2, arrange, tr..."
4176,1,"[free, entry, into, our, £250, weekly, competi..."
4177,1,"[pls, stop, bootydelious, 32f, is, inviting, y..."
4178,0,"[tell, my, bad, character, which, u, dnt, lik,..."


In [4]:
# --------------------------------------------------------------------------- Get Index Function --------------------------------------------------------------------------

def get_index(df_column):
  """Build word <-> integer lookup tables from tokenised messages.

  Ids are assigned in order of first appearance, starting at 1, so id 0 is
  never given to a word.

  Args:
    df_column: iterable of token lists (e.g. a pandas Series whose values are
      lists of words, or a plain list of lists). Values are iterated in order;
      the Series index labels are not used.

  Returns:
    (bag_of_words_dict, index_dict) where bag_of_words_dict maps word -> id
    and index_dict maps id -> word.
  """
  bag_of_words_dict = {}
  index_dict = {}

  for tokens in df_column:
    for word in tokens:
      # Dict membership is O(1); checking against a growing list of words
      # made the whole build quadratic in the vocabulary size.
      if word not in bag_of_words_dict:
        count = len(bag_of_words_dict) + 1
        bag_of_words_dict[word] = count
        index_dict[count] = word

  return bag_of_words_dict, index_dict

# Stack both splits (rows renumbered 0..n-1) so the vocabulary covers every
# word that appears in either file.
data_index = pd.concat(
    [raw_train_data, raw_test_data],
    axis=0,
    ignore_index=True,
)

# index: word -> id, inverse_index: id -> word
index, inverse_index = get_index(data_index['message'])

# Number of distinct words found across both splits.
vocabulary_size = len(index)

In [5]:
# ---------------------------------------------------------------------------- Encoding Function --------------------------------------------------------------------------

def encoding(data, index):
  """Return a copy of `data` whose 'message' tokens are replaced by their ids.

  The input frame is left untouched, so the cell that calls this function can
  be re-run safely (the previous in-place version overwrote the words inside
  the caller's frame and failed with KeyError on a second run).

  Args:
    data: DataFrame with a 'message' column holding lists of words.
    index: mapping word -> integer id.

  Returns:
    A new DataFrame with the same columns and row index as `data`; each
    'message' value is a new list of integer ids.

  Raises:
    KeyError: if a word is missing from `index`.
  """
  encoded = data.copy()
  # Build fresh lists instead of assigning through data['message'][ii][jj]:
  # that chained indexing mutated the lists shared with the input frame and
  # relied on the row labels being exactly 0..n-1.
  encoded['message'] = data['message'].map(
      lambda tokens: [index[word] for word in tokens])

  return encoded

train_data_encoded = encoding(raw_train_data, index)
test_data_encoded = encoding(raw_test_data, index)

# Read the label column instead of pop()-ing it: pop() deletes 'type' from the
# frame, so the labels are lost to any later cell and this cell cannot be
# run a second time.
train_data = np.array(train_data_encoded['message'])
train_data_labels = np.array(train_data_encoded['type'])

test_data = np.array(test_data_encoded['message'])
test_data_labels = np.array(test_data_encoded['type'])

In [8]:
# ------------------------------------------------------------------------ Regularizing data length -------------------------------------------------------------------------

# Every message is padded / truncated to this many word ids.
max_length = 50

train_data = sequence.pad_sequences(train_data, maxlen=max_length)
test_data = sequence.pad_sequences(test_data, maxlen=max_length)

In [9]:
# ------------------------------------------------------------------------- Building the model ------------------------------------------------------------------------------

model = keras.Sequential()

# vocabulary_size + 1 rows: word ids start at 1, so one extra row is needed
# for id 0.
model.add(keras.layers.Embedding(vocabulary_size + 1, 32))
model.add(keras.layers.LSTM(32))
# Single sigmoid unit: output is a value in (0, 1) for the binary label.
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          308864    
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 317,217
Trainable params: 317,217
Non-trainable params: 0
_________________________________________________________________


In [10]:
# ----------------------------------------------------------------------------- Training ------------------------------------------------------------------------------------

# Fit for 10 epochs; validation_split=0.2 holds back 20% of the training
# arrays for per-epoch validation.
history = model.fit(
    x=train_data,
    y=train_data_labels,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
# ----------------------------------------------------------------------- Evaluating the model -----------------------------------------------------------------------------

# With the compile settings used for this model, evaluate returns
# [loss, accuracy] measured on the held-out file.
results = model.evaluate(x=test_data, y=test_data_labels)
print(results)

[0.06339622288942337, 0.9856321811676025]


In [13]:
def predict_message(message):
  """Classify a single SMS text as ham or spam with the trained model.

  The text goes through the same steps as the training data: lower-casing,
  removal of string.punctuation, whitespace split, word -> id lookup in the
  notebook-level `index`, and padding to `max_length`.

  Args:
    message: the raw message text (str).

  Returns:
    A string of the form 'Spam, with P% accuracy.' or 'Ham, with P% accuracy.'
    where P is the model's output probability for the chosen class, as a
    percentage rounded to two decimals.
  """
  cleaned_text = message.lower().translate(
      str.maketrans('', '', string.punctuation))

  # Words that never occurred in the dataset are not in `index`; map them to
  # 0 (an id no real word receives) instead of raising KeyError.
  token_ids = [index.get(word, 0) for word in cleaned_text.split()]

  encoded_text = sequence.pad_sequences([token_ids], maxlen=max_length)

  # model.predict returns a (1, 1) array; take the scalar so the percentage
  # is formatted as a plain number rather than '[[99.99]]'.
  spam_probability = float(model.predict(encoded_text)[0][0])

  if spam_probability > 0.5:
    prediction = 'Spam, with {}% accuracy.'.format(
        round(spam_probability * 100, 2))
  else:
    prediction = 'Ham, with {}% accuracy.'.format(
        round((1 - spam_probability) * 100, 2))
  return (prediction)

# Quick manual check of predict_message on an everyday sentence.
message = 'How are you doing today?'

# predict_message returns a ready-to-print string with the label and a percentage.
prediction = predict_message(message)
print(prediction)

Ham, with [[99.993965]]% accuracy.
