In [39]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [40]:
# Load the training and validation splits from their tab-separated files
def _read_labelled_tsv(path):
  """Read a header-less, tab-separated file and name its two columns.

  Args:
    path: Location of the .tsv file.

  Returns:
    A DataFrame whose columns are named 'spam' (the label) and 'text'.
  """
  frame = pd.read_csv(path, sep='\t', header=None)
  frame.columns = ['spam', 'text']
  return frame

train_df = _read_labelled_tsv('train-data.tsv')
valid_df = _read_labelled_tsv('valid-data.tsv')

train_df

Unnamed: 0,spam,text
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...
...,...,...
4174,ham,just woke up. yeesh its late. but i didn't fal...
4175,ham,what do u reckon as need 2 arrange transport i...
4176,spam,free entry into our £250 weekly competition ju...
4177,spam,-pls stop bootydelious (32/f) is inviting you ...


In [41]:
# Encode the string labels in the 'spam' column as integers: ham -> 0, spam -> 1.
# (This is a single binary column, not a one-hot encoding.)
label_to_int = {'ham': 0, 'spam': 1}

# Only map while the column still holds the raw strings. Mapping an
# already-encoded column would turn every label into NaN, so this guard
# keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(train_df['spam']):
  train_df['spam'] = train_df['spam'].map(label_to_int)
if not pd.api.types.is_numeric_dtype(valid_df['spam']):
  valid_df['spam'] = valid_df['spam'].map(label_to_int)

# map() yields NaN for any value missing from the dict; fail loudly instead
# of training on missing labels.
assert train_df['spam'].notna().all(), "unexpected label in train_df['spam']"
assert valid_df['spam'].notna().all(), "unexpected label in valid_df['spam']"

train_df

Unnamed: 0,spam,text
0,0,ahhhh...just woken up!had a bad dream about u ...
1,0,you can never do nothing
2,0,"now u sound like manky scouse boy steve,like! ..."
3,0,mum say we wan to go then go... then she can s...
4,0,never y lei... i v lazy... got wat? dat day ü ...
...,...,...
4174,0,just woke up. yeesh its late. but i didn't fal...
4175,0,what do u reckon as need 2 arrange transport i...
4176,1,free entry into our £250 weekly competition ju...
4177,1,-pls stop bootydelious (32/f) is inviting you ...


In [44]:
# Every message is padded (or truncated) to this many tokens
max_sequence_length = 100

# Fit a single tokenizer on the messages of both splits.
# NOTE(review): this puts validation-only words into the vocabulary;
# consider fitting on the training split only - confirm before changing,
# since it alters vocab_size and the Embedding layer size.
combined_df = pd.concat([train_df, valid_df])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_df['text'])

word_index = tokenizer.word_index
vocab_size = len(word_index)
print("Number of unique words:", vocab_size)

# Integer-encode each split and pad it to the common length in one step
train_sequences = pad_sequences(
  tokenizer.texts_to_sequences(train_df['text']),
  maxlen=max_sequence_length,
)
valid_sequences = pad_sequences(
  tokenizer.texts_to_sequences(valid_df['text']),
  maxlen=max_sequence_length,
)

Number of unique words: 8995


In [43]:
# We get the train and validation labels.
# Select the column instead of pop()-ing it: pop() removes 'spam' from the
# dataframe, so running this cell a second time raised a KeyError.
train_labels = train_df['spam']
valid_labels = valid_df['spam']

train_labels

0       0
1       0
2       0
3       0
4       0
       ..
4174    0
4175    0
4176    1
4177    1
4178    0
Name: spam, Length: 4179, dtype: int64

In [48]:
# Model definition: an Embedding layer turns each word index into a
# 32-dimensional vector, an LSTM layer reads the padded sequence, and a single
# sigmoid unit produces the score (1 = spam, 0 = ham).
# input_dim is vocab_size + 1 - presumably to leave room for index 0, the
# default padding value used by pad_sequences.
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size + 1, output_dim=32, input_length=max_sequence_length),
  tf.keras.layers.LSTM(32),
  tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 32)           287872    
                                                                 
 lstm_2 (LSTM)               (None, 32)                8320      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 296225 (1.13 MB)
Trainable params: 296225 (1.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [68]:
# Finally, we train the model.
# `epochs` is the single source of truth for the number of passes over the
# training data (it used to be set to 10 while fit() hard-coded 20).
epochs = 20
history = model.fit(
  train_sequences,
  train_labels,
  epochs=epochs,
  validation_data=(valid_sequences, valid_labels),
  batch_size=32,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [114]:
# function to predict messages based on model
# (returns the predicted label as a string, ex. 'ham')
def predict_message(pred_text):
  """Classify a single text message as 'spam' or 'ham'.

  Args:
    pred_text: The raw message text (a single string).

  Returns:
    'spam' when the model's sigmoid output is above 0.5, otherwise 'ham'.
  """
  # Encode and pad the message with the same tokenizer and length used for
  # the training sequences.
  input_sequences = tokenizer.texts_to_sequences([pred_text])
  input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length)

  # For one message the model returns a (1, 1) array; extract the scalar so
  # the branch below does not rely on the truth value of an array comparison.
  spam_probability = float(model.predict(input_sequences)[0][0])

  return 'spam' if spam_probability > 0.5 else 'ham'

# Smoke-test the classifier on an everyday message
pred_text = "how are you doing today?"
prediction = predict_message(pred_text)
print(prediction)

ham


In [115]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages with known labels.

  Each message is passed to predict_message and the returned value is compared
  directly with the expected 'ham'/'spam' string. A success message is printed
  only when every prediction matches; otherwise a retry message is printed.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # A single mismatch fails the whole check; the loop still visits every message.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()

You passed the challenge. Great job!
