### Importing Libraries

In [29]:
import tensorflow as tf
import pandas as pd

import tensorflow_datasets as tfds
from tensorflow import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import numpy as np
import matplotlib.pyplot as plt

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


[nltk_data] Downloading package stopwords to /home/thiago/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Downloading Data

In [30]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

--2023-08-06 13:55:13--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 2606:4700:20::ac43:4695, 2606:4700:20::681a:321, 2606:4700:20::681a:221, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|2606:4700:20::ac43:4695|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2023-08-06 13:55:15 (516 KB/s) - ‘train-data.tsv’ saved [358233/358233]

--2023-08-06 13:55:15--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 2606:4700:20::681a:221, 2606:4700:20::681a:321, 2606:4700:20::ac43:4695, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|2606:4700:20::681a:221|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2023-08-06 13:55:16 (325 KB/s) -

### Data reading

In [31]:
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

train_df = pd.read_csv(train_file_path, sep="\t", header=None, names=["type", "msg"]).dropna()

test_df = pd.read_csv(test_file_path, sep="\t", header=None, names=["type", "msg"]).dropna()

In [32]:
train_df.head()

Unnamed: 0,type,msg
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...


### Turning classes into numerical values

In [33]:
train_df["type"] = pd.factorize(train_df["type"])[0]
test_df["type"] = pd.factorize(test_df["type"])[0]

### Writing a function for removing unnecessary words and stemming the text

In [34]:
from typing import Union
def clean_and_stem(messages:Union[str,list,pd.Series]):
    corpus = []
    if isinstance(messages, list) or isinstance(messages, pd.Series):
        for i in range(0, len(messages)):
            review = re.sub('[^a-zA-Z0-9]', ' ', messages[i]).lower().split()
            ps = PorterStemmer()
            review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
            review = ' '.join(review)
            corpus.append(review)
    elif isinstance(messages, str):
        review = re.sub('[^a-zA-Z0-9]', ' ', messages).lower().split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
        review = ' '.join(review)
        corpus = [review]
    return corpus

In [35]:
corpus_train = clean_and_stem(train_df["msg"])
corpus_test = clean_and_stem(test_df["msg"])

### Turning words into numerical values

In [36]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(corpus_train)

X_train = tokenizer.texts_to_sequences(corpus_train)
X_test = tokenizer.texts_to_sequences(corpus_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

print(corpus_train[2])
print(X_train[2])

u sound like manki scous boy steve like travel da bu home wot u inmind 4 recreat di eve
[1, 311, 15, 3049, 3050, 220, 3051, 15, 812, 43, 334, 37, 401, 1, 3052, 7, 3053, 312, 429]


### Writing the phrases into sequences of the same size

In [37]:
maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

y_train = train_df["type"]
y_test = test_df["type"]

print(X_train[0, :])

[3048 2071  324  242    1  591   42   15    1   99  370   14  114 1375
   55  219   92    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


### Writing the model

In [38]:
from keras.models import Sequential
from keras import layers

embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))

model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPool1D())

model.add(layers.Dense(10, activation='relu'))

model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 50)           311350    
                                                                 
 conv1d_1 (Conv1D)           (None, 96, 128)           32128     
                                                                 
 global_max_pooling1d_1 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_2 (Dense)             (None, 10)                1290      
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 344779 (1.32 MB)
Trainable params: 344779 (1.32 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

### Fitting the model

In [39]:
history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [40]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 1.0000
Testing Accuracy:  0.9892


In [41]:
def predict_message(pred_text):
  corpus = clean_and_stem(pred_text)

  sequence = tokenizer.texts_to_sequences(corpus)
  # pad the sequence
  sequence = pad_sequences(sequence, maxlen=maxlen)

  prediction = model.predict(sequence)

  if prediction >= 0.5:
    prediction = ([prediction[0], 'spam'])
  else:
    prediction = ([prediction[0], 'ham'])

  return (prediction)

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[array([0.00104446], dtype=float32), 'ham']


In [42]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    print(prediction)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


[array([0.00104446], dtype=float32), 'ham']
[array([0.99780947], dtype=float32), 'spam']
[array([1.15783e-05], dtype=float32), 'ham']
[array([0.9999987], dtype=float32), 'spam']
[array([0.9999951], dtype=float32), 'spam']
[array([0.00105725], dtype=float32), 'ham']
[array([1.2022678e-05], dtype=float32), 'ham']
You passed the challenge. Great job!
