In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, GlobalMaxPool1D, Conv1D, MaxPooling1D, LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.utils import set_random_seed
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the raw table. The file is decoded as latin-1 explicitly rather than
# with pandas' default encoding.
df = pd.read_csv(
    'data',
    encoding='latin-1',
)
# Bare last expression -> rich notebook preview of the raw frame.
df

In [None]:
# Build the modelling frame in a single chain that returns a new DataFrame
# (no in-place mutation):
#   * 'Labels'  - numeric target derived from the text label in 'v1'
#                 (ham -> 0, spam -> 1)
#   * 'Message' - the text column, renamed from 'v2'
#   * the three 'Unnamed: N' columns are dropped.
# errors='ignore' on the drop makes the cell safe to re-run: on a second run
# the 'Unnamed' columns are already gone and a plain drop would raise KeyError.
df = (
    df
    .assign(Labels=df['v1'].map({'ham': 0, 'spam': 1}))
    .rename(columns={'v2': 'Message'})
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
)

# Series.map yields NaN for any value missing from the mapping; fail loudly
# here instead of feeding NaN targets into training later.
if df['Labels'].isna().any():
    raise ValueError(
        "Unexpected values in column 'v1': "
        f"{sorted(df.loc[df['Labels'].isna(), 'v1'].astype(str).unique())}"
    )

df

In [None]:
# Features are the raw message strings; targets are the 0/1 labels.
X, y = df['Message'], df['Labels']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Upper bound on the vocabulary the tokenizer will emit indices for.
MAX_VOC = 20000

# Learn the vocabulary from the training messages only, so nothing about the
# held-out messages leaks into the word index.
tokenizer = Tokenizer(num_words=MAX_VOC)
tokenizer.fit_on_texts(X_train)

# Convert both splits from strings to lists of integer word indices using the
# vocabulary fitted above (train first, then test).
seq_train, seq_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# word -> integer index mapping learned by the tokenizer.
# (Previously this name was bound to `tokenizer.index_word`, which is the
# inverse index -> word mapping; both dicts have the same length, so V is
# unchanged.)
word2idx = tokenizer.word_index

# Number of distinct words seen in the training texts. The tokenizer records
# every word it saw, so this count is not capped by MAX_VOC.
V = len(word2idx)
V

In [None]:
# Pad the training sequences into one rectangular 2-D integer array; with no
# maxlen given, the width is the length of the longest training sequence.
X_train = pad_sequences(sequences=seq_train)

# T = number of timesteps per sample (the padded width), reused for the test
# set and for the model's input shape.
T = X_train.shape[-1]

# Force the test sequences to the same width T so both splits fit the model.
X_test = pad_sequences(sequences=seq_test, maxlen=T)

In [None]:
# Seed the random number generators in a single call so that weight
# initialisation and batch shuffling are repeatable between runs.
# (`set_random_seed` is imported in the notebook's import cell.)
set_random_seed(42)

D = 20  # size of each word-embedding vector
M = 15  # number of LSTM units

# Architecture: padded word indices (length T) -> embedding -> LSTM emitting
# one state per timestep -> max over the time axis -> one sigmoid unit.
i = Input(shape=(T,))
# V + 1 rows: word indices start at 1, index 0 is the padding value.
x = Embedding(input_dim=V + 1, output_dim=D)(i)
# return_sequences=True keeps every timestep so the pooling layer below can
# reduce over time.
x = LSTM(units=M, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
# Single sigmoid output: score in (0, 1) for label 1 ('spam').
x = Dense(units=1, activation='sigmoid')(x)

model = Model(inputs=i, outputs=x)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs; the held-out split is evaluated after each epoch and
# the per-epoch metrics are kept in r.history for the plots that follow.
r = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=10,
)

In [None]:
# Training vs. validation accuracy per epoch. Each curve is labelled and the
# axes are titled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(r.history['accuracy'], label='train')
ax.plot(r.history['val_accuracy'], label='validation')
ax.set(title='Accuracy per epoch', xlabel='epoch', ylabel='accuracy')
ax.legend();  # trailing ';' hides the Legend repr in the cell output

In [None]:
# Training vs. validation loss per epoch. Each curve is labelled and the axes
# are titled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(r.history['loss'], label='train')
ax.plot(r.history['val_loss'], label='validation')
ax.set(title='Loss per epoch', xlabel='epoch', ylabel='binary cross-entropy')
ax.legend();  # trailing ';' hides the Legend repr in the cell output

In [None]:
# Persist the trained model to its own file. The previous target, 'data', is
# the same path the dataset is read from at the top of the notebook, so saving
# there would collide with the CSV. The '.keras' suffix selects the native
# Keras format, which newer Keras versions require an explicit extension for.
model.save('spam_lstm_model.keras')