# **Setup**

In [None]:

import pandas as pd
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Embedding, Dense, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping



# **Set up input pipeline**

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2023-07-26 14:50:08--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv.1’


2023-07-26 14:50:09 (11.6 MB/s) - ‘train-data.tsv.1’ saved [358233/358233]

--2023-07-26 14:50:09--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv.1’


2023-07-26 14:50:09 (1.75 MB/s) - ‘valid-data.tsv.1’ saved [118774/118774]



In [None]:
# Read the tab-separated training split. Column names are supplied explicitly,
# so the first line of the file is treated as data rather than as a header.
train_dataset = pd.read_csv(
    train_file_path,
    sep="\t",
    header=None,
    names=["class", "messages"],
)
train_dataset.head(3)

Unnamed: 0,class,messages
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."


In [None]:
# Read the tab-separated held-out split. Column names are supplied explicitly,
# so the first line of the file is treated as data rather than as a header.
test_dataset = pd.read_csv(
    test_file_path,
    sep="\t",
    header=None,
    names=["class", "messages"],
)
test_dataset.head(3)

Unnamed: 0,class,messages
0,ham,i am in hospital da. . i will return home in e...
1,ham,"not much, just some textin'. how bout you?"
2,ham,i probably won't eat at all today. i think i'm...


In [None]:
# Split each frame into model inputs and targets:
#   x_* : list of raw message strings
#   y_* : integer array, 0 where the class is "ham" and 1 for any other value
x_train = train_dataset["messages"].tolist()
y_train = train_dataset["class"].ne("ham").astype(int).to_numpy()

x_test = test_dataset["messages"].tolist()
y_test = test_dataset["class"].ne("ham").astype(int).to_numpy()

# **Create the text encoder**

In [None]:
# Tally how often each whitespace-delimited token occurs across the training
# messages (token -> occurrence count).
vocabulary_dict = {}
for message in x_train:
  for token in message.split():
    vocabulary_dict[token] = vocabulary_dict.get(token, 0) + 1

In [None]:
# Number of distinct whitespace tokens in the training messages; passed to
# one_hot as the size of its hashing space and to the Embedding layer.
vocab_size = len(vocabulary_dict)

# Token count of the longest training message (whitespace split); used as the
# padded sequence length.
# NOTE(review): one_hot tokenizes with its own punctuation filtering, so an
# encoded message may contain more ids than this whitespace-based count and
# would then be truncated by pad_sequences — confirm whether that matters.
max_length = max(len(message.split()) for message in x_train)

In [None]:
def _hash_encode(messages):
  """Return, for every message, its list of one_hot integer word ids."""
  # NOTE(review): per the Keras docs one_hot relies on Python's built-in hash
  # by default, which is not stable across interpreter runs — the ids are only
  # meaningful together with a model trained in the same session; confirm.
  return [one_hot(text, vocab_size) for text in messages]


# Hash both splits into integer id sequences ...
encoded_x_train = _hash_encode(x_train)
encoded_x_test = _hash_encode(x_test)

# ... then bring every sequence to max_length, padding at the end.
padded_x_train = pad_sequences(encoded_x_train, padding='post', maxlen=max_length)
padded_x_test = pad_sequences(encoded_x_test, padding='post', maxlen=max_length)

# **Create the model**

In [None]:
# Architecture: word ids -> 100-dimensional embeddings (mask_zero=True, so id 0
# is treated as padding) -> Dense(64, relu) -> Flatten -> one sigmoid unit.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=100,
    input_length=max_length,
    mask_zero=True,
)
model = Sequential([
    embedding_layer,
    Dense(64, activation='relu'),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

# Early stopping on validation accuracy: give up after 25 epochs without an
# improvement of at least 1e-4 and roll back to the best weights seen. The
# large epoch count below is therefore only an upper bound.
monitor = EarlyStopping(
    monitor='val_acc',
    mode='max',
    min_delta=1e-4,
    patience=25,
    restore_best_weights=True,
    verbose=1,
)
model.fit(
    padded_x_train,
    y_train,
    validation_data=(padded_x_test, y_test),
    epochs=10000,
    callbacks=[monitor],
    verbose=2,
)

Epoch 1/10000
131/131 - 4s - loss: 0.2684 - acc: 0.8978 - val_loss: 0.0844 - val_acc: 0.9770 - 4s/epoch - 34ms/step
Epoch 2/10000
131/131 - 4s - loss: 0.0458 - acc: 0.9876 - val_loss: 0.0444 - val_acc: 0.9856 - 4s/epoch - 28ms/step
Epoch 3/10000
131/131 - 3s - loss: 0.0156 - acc: 0.9955 - val_loss: 0.0388 - val_acc: 0.9878 - 3s/epoch - 22ms/step
Epoch 4/10000
131/131 - 3s - loss: 0.0063 - acc: 0.9981 - val_loss: 0.0369 - val_acc: 0.9885 - 3s/epoch - 21ms/step
Epoch 5/10000
131/131 - 3s - loss: 0.0029 - acc: 0.9998 - val_loss: 0.0384 - val_acc: 0.9892 - 3s/epoch - 23ms/step
Epoch 6/10000
131/131 - 4s - loss: 0.0019 - acc: 0.9998 - val_loss: 0.0421 - val_acc: 0.9892 - 4s/epoch - 29ms/step
Epoch 7/10000
131/131 - 3s - loss: 0.0012 - acc: 0.9998 - val_loss: 0.0417 - val_acc: 0.9892 - 3s/epoch - 21ms/step
Epoch 8/10000
131/131 - 3s - loss: 9.0411e-04 - acc: 0.9998 - val_loss: 0.0442 - val_acc: 0.9885 - 3s/epoch - 21ms/step
Epoch 9/10000
131/131 - 3s - loss: 5.4440e-04 - acc: 0.9998 - val_lo

<keras.callbacks.History at 0x78f9b1eaa740>

# **Model evaluation**

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  """Classify a single message as ham or spam with the trained model.

  The text is encoded with the same one_hot / pad_sequences settings that were
  used for the training data, then passed through the model once.

  Args:
    pred_text: the message text to classify.

  Returns:
    A two-element list [score, label]: score is the model's sigmoid output for
    the message, label is "spam" when the score rounds to 1 and "ham" when it
    rounds to 0.
  """
  class_dict = {
      0 : "ham",
      1 : "spam",
      }
  encoded_message = [one_hot(pred_text, vocab_size)]
  padded_message = pad_sequences(encoded_message, maxlen=max_length, padding='post')
  # Run inference a single time and reuse the score for both list entries
  # (previously the model was invoked twice per message).
  spam_score = model.predict(padded_message)[0][0]
  # np.round yields a float; convert to int so the lookup uses the dict's
  # actual integer keys instead of relying on float/int key equality.
  return [spam_score, class_dict[int(np.round(spam_score))]]

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[0.0014723837, 'ham']


In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages and print the verdict.

  Only the label (second element) returned by predict_message is compared with
  the expected answer; the numeric score is ignored. Prints a success message
  when every label matches, otherwise a "keep trying" message.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected labels, in the same order as test_messages.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # A single mismatch fails the whole check; the loop still visits every message.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


You passed the challenge. Great job!
