In [3]:
df_train = pd.read_csv(train_file_path, sep = '\t', header = None, names = ['Class', 'Message'])
df_test = pd.read_csv(test_file_path, sep = '\t', header = None, names = ['Class', 'Message'])

In [4]:
print(df_train.shape)
print(df_test.shape)

(4179, 2)
(1392, 2)


In [5]:
df_train.head(10)

Unnamed: 0,Class,Message
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...
5,ham,in xam hall boy asked girl tell me the startin...
6,ham,genius what's up. how your brother. pls send h...
7,ham,they finally came to fix the ceiling.
8,spam,urgent! call 09066350750 from your landline. y...
9,ham,now that you have started dont stop. just pray...


In [6]:
df_test.head(10)

Unnamed: 0,Class,Message
0,ham,i am in hospital da. . i will return home in e...
1,ham,"not much, just some textin'. how bout you?"
2,ham,i probably won't eat at all today. i think i'm...
3,ham,don‘t give a flying monkeys wot they think and...
4,ham,who are you seeing?
5,ham,your opinion about me? 1. over 2. jada 3. kusr...
6,ham,yesterday its with me only . now am going home.
7,ham,yes. it's all innocent fun. o:-)
8,ham,a boy was late 2 home. his father: power of fr...
9,ham,is ur changes 2 da report big? cos i've alread...


In [7]:
print(df_train.isna().sum())
print(df_test.isna().sum())

Class      0
Message    0
dtype: int64
Class      0
Message    0
dtype: int64


In [8]:
print(df_train.dtypes)
print(df_test.dtypes)

Class      object
Message    object
dtype: object
Class      object
Message    object
dtype: object


In [9]:
# Create dummy variables

df_train = pd.get_dummies(df_train, columns = ['Class'], drop_first = True)
df_test = pd.get_dummies(df_test, columns = ['Class'], drop_first = True)
print(df_train.head())
print(df_test.head())

                                             Message  Class_spam
0  ahhhh...just woken up!had a bad dream about u ...       False
1                           you can never do nothing       False
2  now u sound like manky scouse boy steve,like! ...       False
3  mum say we wan to go then go... then she can s...       False
4  never y lei... i v lazy... got wat? dat day ü ...       False
                                             Message  Class_spam
0  i am in hospital da. . i will return home in e...       False
1         not much, just some textin'. how bout you?       False
2  i probably won't eat at all today. i think i'm...       False
3  don‘t give a flying monkeys wot they think and...       False
4                                who are you seeing?       False


In [10]:
df_train['Class_spam'] = df_train['Class_spam'].astype(int)
df_test['Class_spam'] = df_test['Class_spam'].astype(int)
print(df_train.head())
print(df_test.head())

                                             Message  Class_spam
0  ahhhh...just woken up!had a bad dream about u ...           0
1                           you can never do nothing           0
2  now u sound like manky scouse boy steve,like! ...           0
3  mum say we wan to go then go... then she can s...           0
4  never y lei... i v lazy... got wat? dat day ü ...           0
                                             Message  Class_spam
0  i am in hospital da. . i will return home in e...           0
1         not much, just some textin'. how bout you?           0
2  i probably won't eat at all today. i think i'm...           0
3  don‘t give a flying monkeys wot they think and...           0
4                                who are you seeing?           0


In [11]:
# Data processing

def text_processing(df):

  import string
  import re

  # Eliminate punctuation signs and number

  punctuation = string.punctuation
  spaces = ' ' * len(punctuation)
  df['Message'] = df['Message'].apply(lambda x: x.translate(str.maketrans(punctuation, spaces)))
  df['Message'] = df['Message'].apply(lambda x: re.sub(r'\d+', '', x))

  # Tokenize the messages

  df['Message'] = df['Message'].astype(str)
  df['Message'] = df['Message'].apply(lambda x: x.split())

  return df

In [12]:
df_train = text_processing(df_train)
df_test = text_processing(df_test)

In [13]:
df_train.head()

Unnamed: 0,Message,Class_spam
0,"[ahhhh, just, woken, up, had, a, bad, dream, a...",0
1,"[you, can, never, do, nothing]",0
2,"[now, u, sound, like, manky, scouse, boy, stev...",0
3,"[mum, say, we, wan, to, go, then, go, then, sh...",0
4,"[never, y, lei, i, v, lazy, got, wat, dat, day...",0


In [14]:
df_test.head()

Unnamed: 0,Message,Class_spam
0,"[i, am, in, hospital, da, i, will, return, hom...",0
1,"[not, much, just, some, textin, how, bout, you]",0
2,"[i, probably, won, t, eat, at, all, today, i, ...",0
3,"[don‘t, give, a, flying, monkeys, wot, they, t...",0
4,"[who, are, you, seeing]",0


In [15]:
# Create the vocabulary

vocabulary = [word for message in df_train['Message'] for word in message]
vocabulary = list(set(vocabulary))
total_words = len(vocabulary) + 1
print(total_words)

6782


In [16]:
# Getting the max length of a message

max_length = max(len(message) for message in df_train['Message'])
print(max_length)

190


In [17]:
# Create an encoder dictionary

encoder_dict = {}

for word in vocabulary:
    if word not in encoder_dict:
        encoder_dict[word] = len(encoder_dict) + 1

print(encoder_dict)



In [18]:
def encode(df, encoder):
  df['Message'] = df['Message'].apply(lambda x: [encoder.get(word, -1) for word in x])
  return df

In [19]:
encode(df_train, encoder_dict)
encode(df_test, encoder_dict)
print(df_train.head())
print(df_test.head())

                                             Message  Class_spam
0  [2439, 945, 4054, 1005, 6371, 269, 2348, 1781,...           0
1                       [807, 3622, 779, 6537, 1477]           0
2  [4753, 2492, 6760, 958, 3441, 2039, 933, 3465,...           0
3  [2025, 5057, 4480, 5359, 3111, 2305, 2087, 230...           0
4  [779, 5942, 1051, 1868, 2184, 1633, 1966, 2063...           0
                                             Message  Class_spam
0  [1868, 1634, 2695, 3652, 443, 1868, 2324, 1244...           0
1       [2853, 1588, 945, 5074, -1, 1314, 2251, 807]           0
2  [1868, 206, 2620, 2418, 2783, 336, 2453, 3205,...           0
3  [2366, 3698, 269, -1, -1, 6026, 4148, 637, 240...           0
4                            [4778, 1513, 807, 6645]           0


In [20]:
# Create train and test labels and convert them to numpy arrays

train_labels = df_train.pop('Class_spam').to_numpy()
test_labels = df_test.pop('Class_spam').to_numpy()


df_train = np.squeeze(df_train.to_numpy())
df_test = np.squeeze(df_test.to_numpy())

print(df_train)
print(df_test)
print(train_labels)
print(test_labels)

[list([2439, 945, 4054, 1005, 6371, 269, 2348, 1781, 1522, 2492, 831, 2488, 1868, 6050, 958, 2492, 1568, 4753, 1868, 1199, 4389, 2027, 1522, 1712, 869, 4970, 1868, 2831, 2173, 1005, 5985, 818])
 list([807, 3622, 779, 6537, 1477])
 list([4753, 2492, 6760, 958, 3441, 2039, 933, 3465, 958, 1868, 4903, 5641, 2088, 443, 5222, 4238, 6026, 6594, 2492, 1863, 4323, 5085, 1688])
 ...
 list([1180, 4634, 4584, 2053, 3056, 3202, 4841, 945, 2137, 468, 4197, 2902, 3111, 4753, 2418, 5872, 5956, 2703, 1027, 801])
 list([1686, 201, 5418, 623, 4903, 3750, 807, 3111, 3116, 1986, 4065, 569, 3808, 6085, 471, 4856, 1986, 5956, 1351, 6659, 2492, 5418, 201, 181, 201, 1660, 3111])
 list([788, 3228, 2348, 3517, 5820, 2492, 52, 2914, 2695, 6049, 1868, 1859, 5750, 3111, 1627, 2695, 3118, 2673, 1868, 1859, 3182, 1100, 3228, 1139, 1852, 1855, 6636, 5985, 5843, 569, 3116, 2697, 3879, 3245])]
[list([1868, 1634, 2695, 3652, 443, 1868, 2324, 1244, 4238, 2695, 4943])
 list([2853, 1588, 945, 5074, -1, 1314, 2251, 807])
 l

In [21]:
print(df_train[0])

[2439, 945, 4054, 1005, 6371, 269, 2348, 1781, 1522, 2492, 831, 2488, 1868, 6050, 958, 2492, 1568, 4753, 1868, 1199, 4389, 2027, 1522, 1712, 869, 4970, 1868, 2831, 2173, 1005, 5985, 818]


In [22]:
# Data homogeneity

data_length = max_length // 2
from keras.preprocessing import sequence

df_train = sequence.pad_sequences(df_train, data_length)
df_test = sequence.pad_sequences(df_test, data_length)

In [23]:
print(df_train[0])
print(df_test[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0 2439  945 4054 1005 6371  269 2348
 1781 1522 2492  831 2488 1868 6050  958 2492 1568 4753 1868 1199 4389
 2027 1522 1712  869 4970 1868 2831 2173 1005 5985  818]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
 1868 1634 2695 3652  443 1868 2324 1244 4238 2695 4943]


In [24]:
# Building the model

model = keras.Sequential([
    keras.layers.Embedding(total_words, 62),
    keras.layers.LSTM(62),
    keras.layers.Dense(1, activation = 'sigmoid')
])
model.summary()

In [25]:
# Training the model

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['acc'])
history = model.fit(df_train, train_labels, epochs = 10)

Epoch 1/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 61ms/step - acc: 0.8814 - loss: 0.3311
Epoch 2/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 71ms/step - acc: 0.9889 - loss: 0.0425
Epoch 3/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 49ms/step - acc: 0.9963 - loss: 0.0171
Epoch 4/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 65ms/step - acc: 0.9988 - loss: 0.0099
Epoch 5/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 50ms/step - acc: 0.9971 - loss: 0.0144
Epoch 6/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 51ms/step - acc: 0.9996 - loss: 0.0027
Epoch 7/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 50ms/step - acc: 0.9992 - loss: 0.0042
Epoch 8/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 60ms/step - acc: 0.9995 - loss: 0.0021
Epoch 9/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [26]:
results = model.evaluate(df_test, test_labels)
print(results)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - acc: 0.9888 - loss: 0.0637
[0.05511484295129776, 0.9885057210922241]


In [27]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])

def predict_message(pred_text):

  if isinstance(pred_text, str):
    pred_text = [pred_text]

  predictions_list = []
  for text in pred_text:
    prediction = []

    # Data preprocessing

    data_dict = {'Message': [text]}
    data = pd.DataFrame(data_dict)

    data = text_processing(data)
    data = encode(data, encoder_dict)
    data = np.squeeze(data.to_numpy())
    data = np.expand_dims(data, axis = 0)
    data = sequence.pad_sequences(data, data_length)

    pred = model.predict(data)
    prediction.append(pred[0][0])

    if pred[0][0] <= 0.5:
      label = "ham"
      prediction.append(label)
    else:
      label = "spam"
      prediction.append(label)

    predictions_list.append(prediction)

  predictions = []

  for list_value in predictions_list:
    for element in list_value:
      predictions.append(element)

  return predictions

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 296ms/step
[5.7279718e-05, 'ham']


In [28]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
You passed the challenge. Great job!
