In [None]:
import pandas as pd
import numpy as np 
import sklearn
import tensorflow_datasets as tfds
import tensorflow as tf 
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw SMS dataset, decoding it explicitly as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    "/content/spam.csv",
    encoding="ISO-8859-1",
)

In [None]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
print(df.columns)
# Drop the three unnamed columns. errors="ignore" keeps this cell idempotent:
# re-running it after the columns are already gone no longer raises KeyError.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [None]:
df.head()
# "ham" messages are those that are not spam

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Give both columns descriptive names in a single, non-mutating rename
# (avoids inplace=True so the cell simply rebinds `df`).
df = df.rename(columns={"v1": "type", "v2": "text"})

In [None]:
df.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
(df["type"]).value_counts()

0    4825
1     747
Name: type, dtype: int64

In [None]:
# Encode column "type" as integers: 0 for "ham" (non-spam), 1 for anything else (spam).
# The dtype guard keeps the cell idempotent: without it, re-running the cell on the
# already-encoded column would turn every label into 1, because 0 != "ham".
if not pd.api.types.is_numeric_dtype(df["type"]):
    df["type"] = (df["type"] != "ham").astype(int)

In [None]:
# Per-class weight = (1 / class_count) * n_rows / 2, so the rarer class weighs more.
class_counts = df["type"].value_counts()
inverse_frequency = 1 / class_counts
scaled = inverse_frequency * len(df) / 2
# Plain dict {class_label: weight}; passed later as class_weight to model.fit.
weights = dict(zip(scaled.index, scaled.values))
weights

{0: 0.5774093264248704, 1: 3.72958500669344}

In [None]:
df.head()

Unnamed: 0,type,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
df['type'].nunique()

2

In [None]:
df['text'].nunique()

5169

In [None]:
# checking if there are missing values in the dataset
df.isnull().sum() 

type    0
text    0
dtype: int64

In [None]:
!python -m spacy download en_core_web_lg -q

2022-12-06 16:03:21.526018: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[K     |████████████████████████████████| 587.7 MB 16 kB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
# Import the en_core_web_lg package downloaded in the previous cell and load it;
# `nlp` is the pipeline object called on each message during text cleaning.
import en_core_web_lg
nlp = en_core_web_lg.load()

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
def _strip_to_alnum(text):
    """Keep only alphanumeric characters and spaces, lowercase, and collapse whitespace.

    Splitting and re-joining collapses runs of any length (the previous
    replace("  ", " ") only handled exactly-double spaces) and trims both ends.
    """
    kept = "".join(ch for ch in text if ch.isalnum() or ch == " ")
    return " ".join(kept.lower().split())


def _lemmatize_without_stop_words(doc):
    """Join the lemmas of a spaCy doc, skipping tokens whose text or lemma is a stop word."""
    return " ".join(
        token.lemma_
        for token in doc
        if token.lemma_ not in STOP_WORDS and token.text not in STOP_WORDS
    )


# Step 1: character-level normalisation of the raw messages.
normalized_text = df["text"].apply(_strip_to_alnum)
# Step 2: lemmatise and drop stop words. nlp.pipe processes the texts as a stream,
# which yields the same docs as calling nlp(x) on each row individually.
df["text_clean"] = [
    _lemmatize_without_stop_words(doc) for doc in nlp.pipe(normalized_text)
]

In [None]:
df.head()

Unnamed: 0,type,text,text_clean
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun early hor u c
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think usf live


In [None]:
# Show the label next to the raw message for the rows labelled 0 through 14.
for row_label in range(15):
    print(df.at[row_label, "type"], df.at[row_label, "text"])

0 Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
0 Ok lar... Joking wif u oni...
1 Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
0 U dun say so early hor... U c already then say...
0 Nah I don't think he goes to usf, he lives around here though
1 FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv
0 Even my brother is not like to speak with me. They treat me like aids patent.
0 As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
1 WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
1 Had your mobile 11 mont

In [None]:
# Show the cleaned version of the same rows (labels 0 through 14) for comparison.
for row_label in range(15):
    print(df.at[row_label, "text_clean"])

jurong point crazy available bugis n great world la e buffet cine amore wat
ok lar joke wif u oni
free entry 2 wkly comp win fa cup final tkts 21st 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s
u dun early hor u c
nah think usf live
freemsg hey darle 3 week word d like fun tb ok xxx std chgs send å150 rcv
brother like speak treat like aids patent
request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune
winner value network customer select receivea å900 prize reward claim 09061701461 claim code kl341 valid 12 hour
mobile 11 month u r entitle update late colour mobile camera free mobile update co free 08002986030
m home soon want talk stuff anymore tonight k ve cry today
chance win cash 100 20000 pound txt csh11 send 87575 cost 150pday 6days 16 tsandcs apply reply hl 4 info
urgent win 1 week free membership å100000 prize jackpot txt word claim 81010 tc wwwdbuknet lccltd pobox 4403ldnw1a7rw18
ve search 

In [None]:
# Create a separate frame holding only the spam messages, to look at them more closely.
# By this point "type" has been encoded to 0/1, so comparing against the string "spam"
# alone selected no rows; accept both the encoded (1) and the raw ("spam") label.
is_spam = df["type"].isin([1, "spam"])
spam_df = df[is_spam].copy()
spam_df = spam_df.reset_index(drop=True)

In [None]:
"""for i in range(0, 15):
  print(spam_df["text"].loc[i])
  print()"""

KeyError: ignored

In [None]:
# Fit a Keras tokenizer on the cleaned messages, then store each message as a
# list of integer token ids in a new "text_encoded" column.
clean_texts = df["text_clean"]
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(clean_texts)
df["text_encoded"] = tokenizer.texts_to_sequences(clean_texts)

In [None]:
df.head()

Unnamed: 0,type,text,text_clean,text_encoded
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,"[3602, 232, 446, 464, 943, 35, 51, 205, 944, 7..."
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni,"[9, 193, 465, 287, 1, 1450]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[11, 296, 3, 534, 659, 33, 1451, 849, 423, 145..."
3,0,U dun say so early hor... U c already then say...,u dun early hor u c,"[1, 124, 150, 2363, 1, 83]"
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think usf live,"[704, 22, 660, 130]"


In [None]:
#df["text_padded_encoded"] = df["text_encoded"].apply(lambda x: tf.keras.preprocessing.sequence.pad_sequences(x, padding="post"))

In [None]:
df.head()

Unnamed: 0,type,text,text_clean,text_encoded
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,"[3602, 232, 446, 464, 943, 35, 51, 205, 944, 7..."
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni,"[9, 193, 465, 287, 1, 1450]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[11, 296, 3, 534, 659, 33, 1451, 849, 423, 145..."
3,0,U dun say so early hor... U c already then say...,u dun early hor u c,"[1, 124, 150, 2363, 1, 83]"
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think usf live,"[704, 22, 660, 130]"


In [None]:
# Pad every encoded message at the end ("post") so all sequences share one length.
text_pad = tf.keras.preprocessing.sequence.pad_sequences(
    df["text_encoded"],
    padding="post",
)

In [None]:
# Pair each padded sequence with its label and wrap the pairs in a tf.data.Dataset.
features_and_labels = (text_pad, df["type"])
full_ds = tf.data.Dataset.from_tensor_slices(features_and_labels)

In [None]:
df.shape

(5572, 4)

In [None]:
# The first 70% of the rows form the training set; the remainder is the test set.
TAKE_SIZE = int(0.7 * df.shape[0])

# Training examples are shuffled (buffer covers the whole split) and batched by 64.
train_data = full_ds.take(TAKE_SIZE).shuffle(TAKE_SIZE).batch(64)

# Test examples keep their original order and are batched by 64.
test_data = full_ds.skip(TAKE_SIZE).batch(64)

In [None]:
type(train_data.take(1))

tensorflow.python.data.ops.dataset_ops.TakeDataset

In [None]:
# Pull a single batch from the training set and print it.
# `text` and `text_type` stay bound afterwards; later cells inspect `text`.
text, text_type = next(iter(train_data.take(1)))
print(text, text_type)

tf.Tensor(
[[ 175   38    0 ...    0    0    0]
 [1030    3   10 ...    0    0    0]
 [ 116 6400 3383 ...    0    0    0]
 ...
 [ 107   34  105 ...    0    0    0]
 [   2  387 1320 ...    0    0    0]
 [ 220  158  177 ...    0    0    0]], shape=(64, 72), dtype=int32) tf.Tensor(
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0], shape=(64,), dtype=int64)


In [None]:
type(text)

tensorflow.python.framework.ops.EagerTensor

In [None]:
text.shape

TensorShape([64, 72])

In [None]:
text.shape[1]

72

In [None]:
text

In [None]:
df.shape[1]

4

In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM

# Number of distinct tokens learned by the tokenizer; the embedding gets one extra
# row (+1) because Keras Tokenizer ids start at 1 and 0 is used as the padding value.
vocab_size = len(tokenizer.word_index)

# Take the sequence length from the padded array itself rather than from `text`,
# a variable left over from an earlier inspection loop that may not exist
# (or may be stale) on a fresh run.
sequence_length = text_pad.shape[1]

model_lstm = tf.keras.Sequential([
    Embedding(vocab_size + 1, 64, input_shape=[sequence_length], name="embedding"),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    # (Layer name fixed from the typo "ltsm_1" to match "lstm_2".)
    LSTM(units=64, return_sequences=True, name="lstm_1"),
    # Second LSTM returns only the last output.
    LSTM(units=64, return_sequences=False, name="lstm_2"),
    Dense(16, activation="relu", name="dense_1"),
    Dense(8, activation="relu"),
    # Single sigmoid unit for the binary spam / non-spam output.
    Dense(1, activation="sigmoid", name="last"),
])

In [None]:
model_lstm.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 72, 64)            525504    
                                                                 
 ltsm_1 (LSTM)               (None, 72, 64)            33024     
                                                                 
 lstm_2 (LSTM)               (None, 64)                33024     
                                                                 
 dense_1 (Dense)             (None, 16)                1040      
                                                                 
 dense_9 (Dense)             (None, 8)                 136       
                                                                 
 last (Dense)                (None, 1)                 9         
                                                                 
Total params: 592,737
Trainable params: 592,737
Non-tr

In [None]:
# Adam optimizer with default settings, binary cross-entropy loss,
# and binary accuracy as the tracked metric.
optimizer = tf.keras.optimizers.Adam()
bce_loss = tf.keras.losses.BinaryCrossentropy()
accuracy_metric = tf.keras.metrics.BinaryAccuracy()

model_lstm.compile(
    optimizer=optimizer,
    loss=bce_loss,
    metrics=[accuracy_metric],
)

In [None]:
# Train for 100 epochs, validating on the test split after each epoch and
# applying the per-class weights computed earlier.
fit_options = dict(
    epochs=100,
    validation_data=test_data,
    class_weight=weights,
)
model_lstm.fit(train_data, **fit_options)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f3af393f9a0>

In [None]:
# Save to the same absolute path the model is later loaded from
# ("/content/model_lstm.h5"), so the round trip does not depend on the
# current working directory.
model_lstm.save("/content/model_lstm.h5")

In [None]:
import json

# Persist the training history (per-epoch metric lists) as JSON. The context
# manager guarantees the file is flushed and closed, which a bare open() passed
# straight to json.dump does not.
with open("/content/LSTM_history.json", "w") as history_file:
    json.dump(model_lstm.history.history, history_file)

In [None]:
# Reload the saved training history; the context manager closes the file handle
# that a bare open() passed to json.load would leave open.
with open("/content/LSTM_history.json", "r") as history_file:
    LSTM_history = json.load(history_file)
# Reload the trained model from disk.
model_lstm = tf.keras.models.load_model("/content/model_lstm.h5")

In [None]:
import plotly.graph_objects as go

# Plot training loss and validation loss per epoch as two line traces.
fig = go.Figure()
for curve_name in ("loss", "val_loss"):
    fig.add_trace(
        go.Scatter(
            y=LSTM_history[curve_name],
            mode='lines',
            name=curve_name,
        )
    )
fig.show()
