# Ex 3.2 Classifying Text Data with an LSTM Network

In [None]:
# Ignore this -- it is just for timing how long the program runs.
# The start timestamp recorded here is subtracted from a second timestamp in
# the last cell to report the total run time.
import time
start = time.perf_counter()

In [None]:
import numpy as np
import pandas as pd
from keras.layers import LSTM, Dense,Embedding
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import set_random_seed
from nltk.corpus import stopwords

## Loading the data

The SMS Spam Collection is a set of tagged SMS messages that have been collected for SMS spam research. It contains 5,574 SMS messages. The messages are in English and are tagged according to being ham (legitimate) or spam.

In [None]:
# Read the raw CSV from the working directory, decoding it as Latin-1
# ("latin" is one of Python's aliases for that codec).
df = pd.read_csv("spam.csv", encoding="latin")

In [None]:
# Preview the first rows of the raw file.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Just keep the relevant columns: v1 holds the ham/spam label, v2 the message text.
# .copy() makes the result an independent frame, so adding a column to it
# later does not raise pandas' SettingWithCopyWarning.
df = df[["v1","v2"]].copy()

In [None]:
# Confirm that only the v1 and v2 columns remain.
df.head()

In [None]:
# Class balance: number of messages per label.
df["v1"].value_counts()

## Preprocessing

We shall create a column that holds the comments with stop words removed.

In [None]:
# NLTK's English stop-word list. The "stopwords" corpus must be available
# locally (e.g. after a one-off nltk.download("stopwords")).
stop = stopwords.words('english')
# Set copy of the list so each membership test is O(1) instead of a list scan.
stop_set = set(stop)

# The stop-word list is all lower-case, so each word is lower-cased for the
# comparison only; otherwise capitalised stop words ("I", "The", "You", ...)
# would survive the filter. The kept words retain their original casing.
# assign() returns a new frame rather than writing a column into the
# existing one, which also avoids SettingWithCopyWarning.
df = df.assign(
    v3=df['v2'].apply(
        lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_set)
    )
)

In [None]:
# Compare v2 (original text) with the new v3 column (stop words removed).
df.head()

Prepare the features and targets. The targets have to be numeric, so let "ham" become `0` and "spam" become `1`.

In [None]:
# Label encoding used for the binary target.
lab_map = {
    "ham": 0,   # legitimate message
    "spam": 1,  # spam message
}

In [None]:
# Encode the label column through lab_map and expose it as a NumPy array.
Y = df.loc[:, "v1"].map(lab_map).to_numpy()
Y

In [None]:
# The stop-word-filtered messages, as a NumPy array of strings.
X = df.loc[:, "v3"].to_numpy()
X

## Tokenize the comments so they become a sequence of numbers

1. Convert to sequences
2. Pad sequences so they are all the same length

 Tokenizer will vectorize a text corpus by turning each text into a sequence of integers. Each of the integers is the index of the corresponding token in a dictionary. Since there are many words we use the _max_words_ most frequent ones. The tokenizer filters out punctuation by default.

In [None]:
# Vocabulary cap: the tokenizer only emits indices below this value, and the
# embedding layer is sized with the same number.
max_words = 1000

mytokenizer = Tokenizer(
    num_words=max_words,
    lower=True,   # lower-case the text before splitting
    split=" ",    # split on single spaces
)

In [None]:
# Updates internal vocabulary based on a list of texts.
# X holds the stop-word-filtered messages, so the vocabulary is built from those.
mytokenizer.fit_on_texts(X)

In [None]:
# Number of texts the tokenizer has been fitted on.
mytokenizer.document_count

Let's take a look at the most frequent 100 words.

In [None]:
# word_index maps word -> integer index, with lower indices assigned to more
# frequent words; invert it so words can be looked up by index.
reversed_dictionary = {value : key for (key, value) in mytokenizer.word_index.items()}

# Indices start at 1, so the 100 most frequent words are indices 1..100
# inclusive. The upper bound is clamped in case the vocabulary is smaller.
for rank in range(1, min(100, len(reversed_dictionary)) + 1):
    print(rank, reversed_dictionary[rank])

We tokenize the features (comments) so that each comment becomes an array of numbers.

In [None]:
# Replace every message by the list of integer indices of its words; words
# outside the tokenizer's num_words limit are skipped.
text_tokenized = mytokenizer.texts_to_sequences(X)

In [None]:
# Show the integer sequences of the first ten messages.
text_tokenized[:10]

We pad the sequences of numbers with zeros so that each comment becomes an array of length 50 (sequences longer than 50 are truncated).

In [None]:
# Fixed sequence length fed to the network.
max_len = 50

# Zero-pad (at the front, the Keras default) or truncate every sequence to
# exactly max_len entries, giving a 2-D integer array.
sequences = pad_sequences(text_tokenized, maxlen=max_len)
sequences

## Building the LSTM model

In [None]:
# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling are repeatable from run to run. (Some GPU kernels can still
# introduce small non-determinism.)
set_random_seed(42)

model = Sequential()
# Map each of the max_words token indices to a trainable 20-dimensional vector.
model.add(Embedding(max_words, 20, input_length=max_len))
# 64-unit LSTM; with the default settings only its final output is passed on.
model.add(LSTM(64))
# Single sigmoid unit: the output is the estimated probability of label 1 ("spam").
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Print the layers of the model with their output shapes and parameter counts.
model.summary()

## Training the model

In [None]:
%%time
# Less than 1 min
model.fit(sequences,Y,batch_size = 128, epochs = 10, validation_split = 0.2)

## Predict on new test data

Create two test comments.

In [None]:
# Two hand-written messages: the first reads like a prize scam, the second
# like an ordinary factual sentence on the same topic.
test_sentences = np.array(
    [
        "WINNER! U win a 500 prize reward & free entry to FA cup final tickets! Text FA to 34212 to receive award",
        "FA Cup, is an annual knockout football competition in men's domestic English football",
    ]
)

Tokenize the sentences.

In [None]:
# The tokenizer and the model were fitted on stop-word-filtered text (column
# v3), so the new sentences are filtered the same way before tokenization to
# keep inference inputs in line with the training inputs.
stop_lookup = set(stop)
test_sentences_filtered = [
    ' '.join(word for word in sentence.split() if word.lower() not in stop_lookup)
    for sentence in test_sentences
]

# Same tokenizer and same fixed length as the training data.
test_sequences = mytokenizer.texts_to_sequences(test_sentences_filtered)
test_sequences_matrix = pad_sequences(test_sequences, maxlen=max_len)

In [None]:
# One sigmoid output per test sentence, in [0, 1]; values near 1 point to
# label 1 ("spam"), values near 0 to label 0 ("ham").
model.predict(test_sequences_matrix)

The prediction is that the first sentence is probably spam but the second one is probably ham.

In [None]:
# Stop the clock started in the first cell and report the elapsed wall time in minutes.
end = time.perf_counter()
elapsed_minutes = (end - start) / 60
print("Time taken: in min", elapsed_minutes)