# Binary Classification RNN

In [3]:
!ls

sample_data


In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, GlobalMaxPooling1D, Dense
from tensorflow.keras.models import Model

from sklearn import metrics
import seaborn as sn
import matplotlib.pyplot as plt

In [9]:
df = pd.read_csv('/content/spam.csv',encoding='latin-1')

In [10]:
# convert labels to binary numbers
df['class'] = df['v1'].map({'ham': 0, 'spam': 1})
Y = df['class'].values
df_train, df_test, Ytrain, Ytest = train_test_split(df['v2'], Y, test_size=0.2)

In [11]:

# Convert string into vector of integers
MAX_VOCAB_SIZE = 20000
MAX_SEQUENCE_SIZE = 160

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(df_train)
sequences_train = tokenizer.texts_to_sequences(df_train)
sequences_test = tokenizer.texts_to_sequences(df_test)

# pad the sequences so each vector in the matrix has same length
data_train = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_SIZE)
data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_SIZE)

print('Shape of data train tensor:', data_train.shape)
print('Shape of data test tensor:', data_test.shape)

Shape of data train tensor: (4457, 160)
Shape of data test tensor: (1115, 160)


In [12]:
# Create the model

x = Input(shape=(data_train.shape[1],))
y = Embedding(len(tokenizer.word_index) + 1, 20)(x)
y = LSTM(15, return_sequences=True)(y)
y = GlobalMaxPooling1D()(y)
y = Dense(1, activation='sigmoid')(y)

model = Model(inputs=x, outputs=y)

In [14]:
# Compile and fit
model.compile(
  loss='binary_crossentropy',
  optimizer='adam',
  metrics=['accuracy']
)

fitted_model = model.fit(
  x=data_train,
  y=Ytrain,
  epochs=30,
  validation_split=0.2,
)

Epoch 1/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.9997 - loss: 0.0050 - val_accuracy: 0.9888 - val_loss: 0.0540
Epoch 2/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9989 - loss: 0.0054 - val_accuracy: 0.9899 - val_loss: 0.0546
Epoch 3/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9998 - loss: 0.0027 - val_accuracy: 0.9888 - val_loss: 0.0576
Epoch 4/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9999 - loss: 0.0016 - val_accuracy: 0.9877 - val_loss: 0.0615
Epoch 5/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9996 - loss: 0.0028 - val_accuracy: 0.9877 - val_loss: 0.0656
Epoch 6/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9997 - loss: 0.0021 - val_accuracy: 0.9877 - val_loss: 0.0692
Epoch 7/30
[1m112/112[

In [15]:
Ypreds = (model.predict(data_test) > 0.5).astype("int32")
print('Model Accuracy score:', metrics.accuracy_score(Ytest, Ypreds))

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Model Accuracy score: 0.9838565022421525
