In [1]:
import argparse
import gensim.downloader as api
import numpy as np
import os
import shutil
import tensorflow as tf
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
def download_and_read(url):
    """Download the SMS Spam Collection archive and parse it.

    The archive at ``url`` is fetched and extracted through
    ``tf.keras.utils.get_file`` with ``cache_dir="."``; the extracted data
    file is then read from ``datasets/SMSSpamCollection``.

    Each non-blank line is expected to look like ``<label>\\t<message>``.

    Args:
        url: Location of the zipped dataset.

    Returns:
        A ``(texts, labels)`` tuple of parallel lists: the message strings
        and integer labels (1 when the label column is "spam", 0 otherwise).

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    archive_name = url.split('/')[-1]
    # Only the download/extract side effect is needed; the returned path is
    # not used because the data file is opened by its known relative path.
    tf.keras.utils.get_file(archive_name, url,
        extract=True, cache_dir=".")
    labels, texts = [], []
    data_path = os.path.join("datasets", "SMSSpamCollection")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(data_path, "r", encoding="utf-8") as fin:
        for line in fin:
            record = line.strip()
            if not record:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # Split on the first tab only so a tab inside the message body
            # cannot break the two-value unpacking.
            label, text = record.split('\t', 1)
            labels.append(1 if label == "spam" else 0)
            texts.append(text)
    return texts, labels

In [3]:
# UCI-hosted zip archive of the SMS Spam Collection.
DATASET_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
# texts: list of message strings; labels: parallel list of ints
# (1 = spam, 0 = anything else).
texts, labels = download_and_read(DATASET_URL)

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
  98304/Unknown - 0s 4us/step

In [6]:
# Build the word -> integer id vocabulary from every message. The vocabulary
# is fitted on the full corpus because the train/val/test split only happens
# further down.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts)

In [7]:
# Convert each message into a list of the integer word ids assigned by the
# fitted tokenizer.
text_sequences = tokenizer.texts_to_sequences(texts)

In [9]:
# Pad the variable-length id lists into one rectangular integer array.
# With the defaults used here the zeros are placed at the front and the
# width equals the longest message (189 in the recorded output below).
text_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    text_sequences)


In [10]:
# Inspect the padded id matrix (NumPy abbreviates the repr).
text_sequences

array([[   0,    0,    0, ...,   58, 4437,  144],
       [   0,    0,    0, ...,  473,    6, 1941],
       [   0,    0,    0, ...,  661,  393, 2997],
       ...,
       [   0,    0,    0, ...,  106,  251, 9011],
       [   0,    0,    0, ...,  200,   12,   47],
       [   0,    0,    0, ...,    2,   61,  269]])

In [12]:
# Number of messages (rows of the padded id matrix).
num_records = len(text_sequences)

In [13]:
# Every row has the same length after padding, so the first row's length is
# the sequence length fed to the model.
max_seqlen = len(text_sequences[0])

In [16]:
# Report the dataset size and the padded sequence length.
print("{:d} sentences, max length: {:d}".format(
    num_records, max_seqlen))

5574 sentences, max length: 189


In [18]:
# One-hot encode the integer labels: column 0 = not spam, column 1 = spam
# (labels are 1 for "spam" and 0 otherwise).
NUM_CLASSES = 2
cat_labels = tf.keras.utils.to_categorical(
    labels, num_classes=NUM_CLASSES)

In [19]:
# Inspect the one-hot label matrix.
cat_labels

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [20]:
# Forward and reverse vocabulary lookups, with id 0 reserved for padding.
# Copy the tokenizer's mapping first: adding the "PAD" entry to the original
# dict would silently modify the tokenizer's own vocabulary.
word2idx = dict(tokenizer.word_index)
idx2word = {v:k for k, v in word2idx.items()}
word2idx["PAD"] = 0
idx2word[0] = "PAD"
# Counts the PAD entry as well, so it is (largest word id + 1).
vocab_size = len(word2idx)
print("vocab size: {:d}".format(vocab_size))

vocab size: 9013


In [21]:
# Pair each padded id sequence with its one-hot label; slicing along the
# first axis yields one dataset element per message.
dataset = tf.data.Dataset.from_tensor_slices(
    (text_sequences, cat_labels))

In [22]:
# Shuffle once and keep that order fixed. The train/val/test splits below are
# carved out by position with take()/skip(); with the default
# reshuffle_each_iteration=True every pass over the data would draw a new
# order, so the three splits would overlap across epochs and test examples
# would leak into training.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False)

In [23]:
# Positional split of the shuffled stream: the first quarter is the test set,
# the next tenth of what remains is validation, and everything after that is
# training. The splits are only disjoint if the upstream shuffle yields the
# same order on every iteration.
test_size = num_records // 4
val_size = (num_records - test_size) // 10

test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size).skip(val_size)

In [24]:
# Number of training examples left after carving off the test and validation
# splits (counted before batching).
len(train_dataset)

3763

In [26]:
BATCH_SIZE = 128
# Batch all three splits the same way; drop_remainder=True discards the final
# partial batch of each split, so every batch holds exactly BATCH_SIZE items.
test_dataset, val_dataset, train_dataset = (
    split.batch(BATCH_SIZE, drop_remainder=True)
    for split in (test_dataset, val_dataset, train_dataset)
)

In [28]:
def build_embedding_matrix(sequences, word2idx, embedding_dim,
        embedding_file, embedding_model=None):
    """Return the pre-trained embedding matrix for the vocabulary.

    Row ``idx`` holds the pre-trained vector of the word mapped to ``idx`` in
    ``word2idx``; words missing from the pre-trained model keep an all-zero
    row. The matrix is cached on disk at ``embedding_file`` and reused on
    later calls as long as its shape still matches the vocabulary.

    Args:
        sequences: Unused; kept so existing callers keep working.
        word2idx: Mapping from word to row index. Indices must be smaller
            than ``len(word2idx)``.
        embedding_dim: Dimensionality of the pre-trained vectors.
        embedding_file: Path of the ``.npy`` cache file. ``np.save`` appends
            ``.npy`` when the name lacks it, so pass a path that already ends
            in ``.npy`` or the cache will never be found again.
        embedding_model: Name of the gensim-downloader model to load. When
            omitted, the module-level ``EMBEDDING_MODEL`` is used.

    Returns:
        A ``(len(word2idx), embedding_dim)`` NumPy array.
    """
    expected_shape = (len(word2idx), embedding_dim)

    if os.path.exists(embedding_file):
        E = np.load(embedding_file)
        if E.shape == expected_shape:
            return E
        # A cache built for a different vocabulary or dimension is useless;
        # fall through and rebuild it instead of returning a wrong matrix.

    E = np.zeros(expected_shape)
    model_name = EMBEDDING_MODEL if embedding_model is None else embedding_model
    word_vectors = api.load(model_name)
    for word, idx in word2idx.items():
        try:
            # Index lookup replaces the deprecated KeyedVectors.word_vec()
            # and raises KeyError for unknown words just the same.
            E[idx] = word_vectors[word]
        except KeyError:
            # Word not in the pre-trained model: leave its row at zero.
            pass
    np.save(embedding_file, E)
    return E

In [30]:
# Pre-trained embedding configuration. EMBEDDING_MODEL is also read as a
# global inside build_embedding_matrix, so it must be set before the call.
EMBEDDING_MODEL = "glove-wiki-gigaword-300"
EMBEDDING_DIM = 300
DATA_DIR = ""
EMBEDDING_NUMPY_FILE = os.path.join(DATA_DIR, "E.npy")

# Loads the cached matrix if E.npy exists, otherwise builds and caches it.
E = build_embedding_matrix(
    sequences=text_sequences,
    word2idx=word2idx,
    embedding_dim=EMBEDDING_DIM,
    embedding_file=EMBEDDING_NUMPY_FILE,
)
print("Embedding matrix:", E.shape)

Embedding matrix: (9013, 300)


  E[idx] = word_vectors.word_vec(word)


In [33]:
# Selects how SpamClassifierModel sets up its embedding layer:
#   "scratch"    -> no pre-trained weights supplied, embedding is trainable
#   "vectorizer" -> pre-trained weights supplied, embedding is frozen
#   anything else -> pre-trained weights supplied, embedding is trainable
run_mode = "scratch"

In [34]:
class SpamClassifierModel(tf.keras.Model):
    """1-D convolutional text classifier.

    Pipeline: Embedding -> Conv1D (ReLU) -> SpatialDropout1D(0.2)
    -> GlobalMaxPooling1D -> Dense (softmax).

    Args:
        vocab_sz: Number of rows in the embedding table.
        embed_sz: Embedding dimensionality.
        input_length: Length of the padded input sequences.
        num_filters: Number of convolution filters.
        kernel_sz: Convolution kernel width.
        output_sz: Number of output classes.
        run_mode: "scratch" creates a trainable embedding with no supplied
            weights; "vectorizer" loads ``embedding_weights`` and freezes
            them; any other value loads ``embedding_weights`` and leaves
            them trainable.
        embedding_weights: Pre-trained embedding matrix; ignored when
            ``run_mode`` is "scratch".
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, vocab_sz, embed_sz, input_length,
                 num_filters, kernel_sz, output_sz,
                 run_mode, embedding_weights,
                 **kwargs):
        super().__init__(**kwargs)

        # The three run modes differ only in whether pre-trained weights are
        # supplied and whether the embedding may be updated during training.
        embedding_options = {
            "input_length": input_length,
            "trainable": run_mode != "vectorizer",
        }
        if run_mode != "scratch":
            embedding_options["weights"] = [embedding_weights]
        self.embedding = tf.keras.layers.Embedding(
            vocab_sz, embed_sz, **embedding_options)

        self.conv = tf.keras.layers.Conv1D(
            filters=num_filters,
            kernel_size=kernel_sz,
            activation="relu",
        )
        self.dropout = tf.keras.layers.SpatialDropout1D(0.2)
        self.pool = tf.keras.layers.GlobalMaxPooling1D()
        self.dense = tf.keras.layers.Dense(output_sz, activation="softmax")

    def call(self, x):
        """Run the input through each layer in order and return the result."""
        stages = (self.embedding, self.conv, self.dropout,
                  self.pool, self.dense)
        for stage in stages:
            x = stage(x)
        return x
# model definition
conv_num_filters = 256
conv_kernel_size = 3
model = SpamClassifierModel(
    vocab_sz=vocab_size,
    embed_sz=EMBEDDING_DIM,
    input_length=max_seqlen,
    num_filters=conv_num_filters,
    kernel_sz=conv_kernel_size,
    output_sz=NUM_CLASSES,
    run_mode=run_mode,
    embedding_weights=E,
)
# Build with an unspecified batch dimension and the padded sequence length.
model.build(input_shape=(None, max_seqlen))

In [35]:
# categorical_crossentropy pairs with the one-hot labels produced by
# to_categorical and with the model's softmax output layer.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])