In [None]:
# Import libraries. The `!pip` lines are IPython shell escapes, so this cell
# only runs inside a notebook kernel, not as a plain Python script.
try:
  # Install the nightly TensorFlow build before importing it.
  # NOTE(review): the previous comment here mentioned `%tensorflow_version`,
  # which this block never calls. A failing shell escape probably does not
  # raise a Python exception either, so this guard may never trigger -- confirm.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

# Record which TensorFlow version the kernel actually loaded.
print(tf.__version__)

In [None]:
# get data files
# Download the two tab-separated SMS files into the current working directory.
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Local file names of the two downloads. Note that `test_file_path` points at
# the file published as "valid-data.tsv".
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# Load the training split. The path comes from `train_file_path` (set in the
# download cell) instead of repeating the file name, so the location is defined
# in exactly one place. The TSV has no header row, so columns are named here.
df = pd.read_csv(train_file_path, sep="\t", header=None)
df.columns = ['label', 'message']
df.tail()

In [None]:
# Summary statistics for the raw training frame.
df.describe()

In [None]:
# Summary statistics computed separately per label, transposed for display.
df.groupby('label').describe().T

In [None]:
# Split the training frame by class.
ham_msg = df[df.label == 'ham']
spam_msg = df[df.label == 'spam']

# Concatenate every message of a class into one space-separated string; the
# word-cloud cells consume these strings. Joining the Series directly yields
# the same text as going through `.to_numpy().tolist()` first.
ham_msg_text = " ".join(ham_msg.message)
spam_msg_text = " ".join(spam_msg.message)

# Show only a short preview of each corpus: printing the full strings floods
# the cell output without adding information.
PREVIEW_CHARS = 500
print(ham_msg_text[:PREVIEW_CHARS])
print(spam_msg_text[:PREVIEW_CHARS])

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Word cloud built from the concatenated ham messages: blue palette on a black
# background, with the library's stop-word list filtered out.
ham_cloud_options = dict(
    width=320,
    height=160,
    stopwords=STOPWORDS,
    max_font_size=50,
    background_color="black",
    colormap='Blues',
)
ham_msg_cloud = WordCloud(**ham_cloud_options).generate(ham_msg_text)

plt.figure(figsize=(10, 8))
plt.imshow(ham_msg_cloud, interpolation='bilinear')
plt.axis('off')  # hide the axes around the image
plt.show()

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Word cloud built from the concatenated spam messages: autumn palette on a
# white background. It is stored as `spam_msg_cloud`; previously this cell
# reused the name `ham_msg_cloud` and silently overwrote the ham cloud.
spam_msg_cloud = WordCloud(
    width=320,
    height=160,
    stopwords=STOPWORDS,
    max_font_size=50,
    background_color="white",
    colormap='autumn',
).generate(spam_msg_text)

plt.figure(figsize=(10, 8))
plt.imshow(spam_msg_cloud, interpolation='bilinear')
plt.axis('off')  # hide the axes around the image
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Store the label column as a pandas categorical.
df['label'] = df['label'].astype('category')

# Bar chart with one bar per label, showing how many messages carry it.
plt.figure(figsize=(8, 6))
sns.countplot(x='label', data=df)
plt.show()


def _label_percentage(label):
    """Return the share of rows in `df` whose label equals `label`, in percent."""
    return (len(df[df['label'] == label]) / len(df)) * 100


# Class shares, to quantify the imbalance visible in the chart.
spam_percentage = _label_percentage('spam')
ham_percentage = _label_percentage('ham')

print(f"Percentage of spam messages: {spam_percentage:.2f}%")
print(f"Percentage of ham messages: {ham_percentage:.2f}%")


In [None]:
# Balance the two classes by downsampling: keep every spam row and draw an
# equally sized random sample of ham rows (fixed seed, so the draw repeats).
spam_msg_df = spam_msg
ham_msg_df = ham_msg.sample(n=spam_msg_df.shape[0], random_state=0)

# Both shapes should report the same number of rows.
# NOTE(review): an earlier inline comment quoted (747, 2) for both frames;
# that figure cannot be checked from the notebook source -- confirm on a run.
print(ham_msg_df.shape, spam_msg_df.shape)


In [None]:
# Stack the downsampled ham rows and the spam rows into one frame with a fresh
# 0..n-1 index. `pd.concat` replaces `DataFrame.append`, which was removed in
# pandas 2.0.
msg_df = pd.concat([ham_msg_df, spam_msg_df]).reset_index(drop=True)

# Plot the balanced frame (`msg_df`). The chart previously plotted the full
# `df`, so it showed the original imbalance under an "after downsampling" title.
plt.figure(figsize=(8, 6))
sns.countplot(x='label', data=msg_df)
plt.title('Distribution of ham and spam email messages (after downsampling)')

In [None]:
# Add the character count of every message as a new column.
msg_df['text_length'] = msg_df['message'].apply(len)

# Average the numeric columns per label. `numeric_only=True` is required on
# pandas 2.x, where averaging a frame that still contains the text column
# raises instead of silently dropping it.
labels = msg_df.groupby('label').mean(numeric_only=True)
labels

In [None]:
# Load the held-out split. The path comes from `test_file_path` (set in the
# download cell) instead of repeating the file name. The TSV has no header
# row, so columns are named here.
df_test = pd.read_csv(test_file_path, sep='\t', header=None)
df_test.columns = ['label', 'message']
df_test.tail()

In [None]:
# Encode the string labels as integers (ham -> 0, spam -> 1) in a new
# `msg_type` column, using one shared mapping for both frames.
label_to_id = {'ham': 0, 'spam': 1}
for frame in (msg_df, df_test):
    frame['msg_type'] = frame['label'].map(label_to_id)

print(msg_df.tail())
print(df_test.tail())


In [None]:
# Pull the model inputs (message text) and targets (integer label) out of the
# balanced training frame and the held-out frame.
train_msg, train_label = msg_df['message'], msg_df['msg_type']
test_msg, test_label = df_test['message'], df_test['msg_type']

print(test_msg)
print(train_msg)

In [None]:
# Defining pre-processing hyperparameters
max_len = 50            # every sequence is padded or truncated to this length
trunc_type = "post"     # cut over-long sequences at the end
padding_type = "post"   # pad short sequences at the end
# Placeholder token for out-of-vocabulary words. This was an empty string,
# which looks like a markup-stripped "<OOV>"; a visible token keeps the
# tokenizer's word index readable.
oov_tok = "<OOV>"
vocab_size = 500        # passed to the tokenizer as `num_words`

In [None]:
# Upgrade TensorFlow and Keras to the latest published releases.
# NOTE(review): tensorflow was already imported in the first cell, so the
# upgraded packages presumably only take effect after a kernel restart -- confirm.
!pip install --upgrade tensorflow
!pip install --upgrade keras

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word-level tokenizer configured from the pre-processing hyperparameters.
# Its vocabulary is learned from the training messages only.
tokenizer = Tokenizer(
    num_words=vocab_size,
    char_level=False,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(train_msg)

In [None]:
# Word -> integer index mapping the tokenizer learned from the training messages.
word_index = tokenizer.word_index
# Number of entries in that mapping.
tot_words = len(word_index)
print('There are %s unique tokens in training data. ' % tot_words)

In [None]:
def _encode(texts):
    """Convert raw texts to integer sequences and a fixed-length padded array.

    Returns a `(sequences, padded)` pair, where `padded` has every sequence
    padded or truncated to `max_len` using the configured padding/truncation sides.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_len,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


# Same tokenizer and same padding settings for both splits.
training_sequences, training_padded = _encode(train_msg)
testing_sequences, testing_padded = _encode(test_msg)

In [None]:
# Sanity check: report the shapes of the two padded arrays.
print('Shape of training tensor: ', training_padded.shape)
print('Shape of testing tensor: ', testing_padded.shape)

In [None]:
# Inspect the first encoded and padded training message.
print(training_padded[0])


In [None]:
# Model hyperparameters.
vocab_size = 500     # repeats the value from the pre-processing settings
embedding_dim = 16   # output dimension of the embedding layer
n_dense = 24         # intended width of the hidden dense layer
drop_value = 0.2     # dropout rate

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, LSTM, Bidirectional

# Classifier: embed each token, average the embeddings over the sequence,
# pass the result through one hidden dense layer with dropout, and finish
# with a single sigmoid unit.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(GlobalAveragePooling1D())
# Use the `n_dense` hyperparameter instead of a second hard-coded 24, so the
# hyperparameter cell actually controls the layer width.
model.add(Dense(n_dense, activation='relu'))
model.add(Dropout(drop_value))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Configure training: binary cross-entropy loss, the Adam optimizer, and
# accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Upper bound on the number of training epochs.
num_epochs = 30

# Watch the validation loss and stop early, with a patience of 3 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=3)

# Train on the balanced training set; the held-out messages are passed as the
# validation data that early stopping monitors.
history = model.fit(
    training_padded,
    train_label,
    epochs=num_epochs,
    validation_data=(testing_padded, test_label),
    callbacks=[early_stop],
    verbose=2,
)

In [None]:
# Evaluate the trained model on the padded held-out messages and their labels.
model.evaluate(testing_padded, test_label)

In [None]:
# Put the training history recorded by `fit` into a DataFrame for plotting.
metrics = pd.DataFrame(history.history)

In [None]:
# Give the history columns readable names for the plot legends.
metrics = metrics.rename(
    columns={
        'loss': 'Training_Loss',
        'accuracy': 'Training_Accuracy',
        'val_loss': 'Validation_Loss',
        'val_accuracy': 'Validation_Accuracy',
    }
)

In [None]:
def plot_graphs1(var1, var2, string):
    """Plot two columns of the global `metrics` frame on one chart.

    Args:
        var1: Name of the first `metrics` column to draw.
        var2: Name of the second `metrics` column to draw.
        string: Quantity name, used in the title and as the y-axis label.
    """
    axes = metrics[[var1, var2]].plot()
    axes.set_title('Training and Validation ' + string)
    axes.set_xlabel('Number of epochs')
    axes.set_ylabel(string)
    axes.legend([var1, var2])

In [None]:
# Training loss against validation loss.
plot_graphs1('Training_Loss', 'Validation_Loss', 'loss')

In [None]:
# Training accuracy against validation accuracy.
plot_graphs1('Training_Accuracy', 'Validation_Accuracy', 'accuracy')

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text1):
  """Classify a single message as ham or spam with the trained model.

  Args:
    pred_text1: The message text, as one string.

  Returns:
    A two-item list ``[probability, label]``: the model output as a Python
    float, and ``"spam"`` when that value is above 0.5, otherwise ``"ham"``.
  """
  # The tokenizer and the model work on batches, so wrap the single message
  # in a one-element list.
  new_seq = tokenizer.texts_to_sequences([pred_text1])
  padded = pad_sequences(new_seq, maxlen=max_len, padding=padding_type, truncating=trunc_type)
  prediction = model.predict(padded)

  # Read the score by explicit index instead of calling float() on a whole
  # row: converting a size-1 array to a scalar is deprecated in NumPy >= 1.25.
  # Assumes the model returns one row with one value for a one-message batch.
  probability = float(prediction[0][0])
  label = "spam" if probability > 0.5 else "ham"
  return [probability, label]

# Try the helper on one example message and show what it returns.
pred_text = " you have won £1000 cash! call to claim"

prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check `predict_message` against seven fixed messages.

  Prints a pass message when the label at index 1 of every prediction matches
  the expected answer, and a "keep trying" message otherwise. Returns nothing.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # Only the label (index 1) is compared; the probability is ignored.
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
