In [3]:
import pandas as pd
data=pd.read_csv('spam.csv',encoding='latin-1')
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":'label', "v2":'text'})
#print(data.head())
tags = data["label"]
texts = data["text"]
print(texts[0:5])
print(tags[0:5])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: text, dtype: object
0     ham
1     ham
2    spam
3     ham
4     ham
Name: label, dtype: object


In [4]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

def process_sms(vocab):
    for t in texts:
        tokens = clean_txt(t)
        vocab.update(tokens)

def clean_txt(txt):
    # split into tokens 
    txt = str(txt)
    tokens = txt.split()
    # remove punctuation 
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove  not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter  stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

vocab = Counter()
process_sms(vocab)
print(len(vocab))
print(vocab.most_common(50))

10081
[('call', 392), ('get', 335), ('Im', 313), ('ur', 293), ('ltgt', 276), ('You', 267), ('know', 251), ('go', 247), ('like', 230), ('dont', 217), ('come', 207), ('got', 206), ('time', 191), ('day', 180), ('No', 173), ('want', 167), ('Ill', 165), ('lor', 160), ('Call', 158), ('home', 156), ('send', 153), ('going', 152), ('one', 152), ('need', 150), ('Ok', 147), ('good', 145), ('love', 143), ('How', 143), ('back', 141), ('still', 137), ('text', 135), ('But', 133), ('If', 133), ('im', 129), ('later', 127), ('see', 124), ('da', 121), ('ok', 119), ('So', 119), ('Just', 119), ('We', 119), ('think', 118), ('Its', 117), ('free', 116), ('FREE', 113), ('Do', 113), ('today', 112), ('Sorry', 112), ('week', 111), ('phone', 111)]


In [3]:
min_occurane = 2
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))
vocab = set(tokens)

4517


In [4]:

def load_embedding(filename):
    file = open(filename,'r',encoding="utf-8")
    lines = file.readlines()
    file.close()
    embedding = dict()
    for line in lines:
        parts = line.split()
        # key is string word, value is numpy array for vector
        embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

In [5]:
from numpy import asarray
from numpy import zeros
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = zeros((vocab_size, 100))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        vector = embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix

In [6]:
def process_sms():
    documents = list()
    for text in texts:
        tokens = clean_text(text, vocab)
        documents.append(tokens)
    return documents

def clean_text(text, vocab):
    # split into tokens 
    tokens = str(text).split()
    # remove punctuation 
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # filter out tokens not in vocab
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens

vocab = set(tokens)
train_texts = process_sms()


In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import merge, Dense, LSTM, Dropout
from keras.layers import Flatten, Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_texts)
# sequence encode
encoded_texts = tokenizer.texts_to_sequences(train_texts)
# pad sequences

max_length = max([len(s.split()) for s in train_texts])
All_sms = pad_sequences(encoded_texts, maxlen=max_length, padding='post')

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [8]:
All_labels = []
for t in tags:
    if t == 'ham':
        All_labels.append(0)
    elif t == 'spam':
        All_labels.append(1)
        

#Splitting test and train sets
Xtrain = All_sms[:4180] #75% of all smses
Xtest = All_sms[4180:]

ytrain = All_labels[:4180]
ytest = All_labels[4180:]

In [9]:
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1
# load embedding from file
raw_embedding = load_embedding('glove.6B.100d.txt')
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
# create the embedding layer
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors], input_length=max_length, trainable=False)


In [10]:
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(Xtrain, ytrain, epochs=10, verbose=2)

loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 77, 100)           357400    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 73, 128)           64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 36, 128)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 4608)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 4609      
Total params: 426,137
Trainable params: 68,737
Non-trainable params: 357,400
_________________________________________________________________
None
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/

In [11]:
model_conv_LSTM = Sequential()
model_conv_LSTM.add(embedding_layer)
model_conv_LSTM.add(Dropout(0.2))
model_conv_LSTM.add(Conv1D(64, 5, activation='relu'))
model_conv_LSTM.add(MaxPooling1D(pool_size=4))
model_conv_LSTM.add(LSTM(100))
model_conv_LSTM.add(Dense(1, activation='sigmoid'))
model_conv_LSTM.compile(loss='binary_crossentropy', optimizer='adam',    metrics=['accuracy'])
model_conv_LSTM.fit(Xtrain, ytrain, validation_split=0.3, epochs = 10)

Train on 2926 samples, validate on 1254 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a30258c908>

In [12]:
loss, acc = model_conv_LSTM.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))


Test Accuracy: 97.198278


In [13]:
# define 2 CNN Layer model
model_2CNN = Sequential()
model_2CNN.add(embedding_layer)
model_2CNN.add(Dropout(0.3))
model_2CNN.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model_2CNN.add(MaxPooling1D(pool_size=4))
model_2CNN.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
model_2CNN.add(MaxPooling1D(pool_size=2))
model_2CNN.add(Conv1D(filters=32, kernel_size=2, activation='relu'))
model_2CNN.add(MaxPooling1D(pool_size=2))
model_2CNN.add(Flatten())
model_2CNN.add(Dense(1, activation='sigmoid'))
print(model_2CNN.summary())
# compile network
model_2CNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model_2CNN.fit(Xtrain, ytrain, epochs=10, verbose=2)
# evaluate
loss, acc = model_2CNN.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 77, 100)           357400    
_________________________________________________________________
dropout_2 (Dropout)          (None, 77, 100)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 73, 128)           64128     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 18, 128)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 15, 64)            32832     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 7, 64)             0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 6, 32)            