In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten #, Dropout, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


## Load the dataset

In [2]:
# Load the SMS messages from a local JSON file and order the rows by index.
# The resulting frame has a numeric `label` column and a text `message` column.
# NOTE(review): label 1 presumably marks spam and 0 ham -- confirm against the data source.
df = pd.read_json('Sms_spam.json').sort_index()

In [3]:
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Tokenization

In [4]:
# Compiled once at definition time; both patterns are applied to every message.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_ALNUM_RE = re.compile('[^0-9a-zA-Z]')

# Lazily filled cache so the stop-word list and the stemmer are built only once
# instead of once per message.
_NLP_CACHE = {}


def _nlp_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use."""
    if not _NLP_CACHE:
        _NLP_CACHE['stops'] = set(stopwords.words("english"))
        _NLP_CACHE['stemmer'] = SnowballStemmer('english')
    return _NLP_CACHE['stops'], _NLP_CACHE['stemmer']


def clean_text(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. delete ASCII punctuation (so "don't" becomes "dont"),
      2. replace every remaining character outside [0-9a-zA-Z] with a space,
      3. lower-case and split on whitespace,
      4. drop English stop words,
      5. stem each remaining word with the English Snowball stemmer.

    Parameters
    ----------
    text : str or bytes
        Raw message. Bytes are decoded as UTF-8 first.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    # Accept bytes for backward compatibility, but work on text throughout:
    # mixing bytes with str patterns raises TypeError on Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # Remove punctuation (deleted outright, not replaced by a space)
    text = _PUNCTUATION_RE.sub('', text)

    # Clean the text: anything that is not an ASCII letter or digit becomes a space
    text = _NON_ALNUM_RE.sub(' ', text)

    # Split words and convert to lower case
    words = text.lower().split()

    # Remove stop words, then stem what is left
    stops, stemmer = _nlp_resources()
    return " ".join(stemmer.stem(word) for word in words if word not in stops)

In [5]:
# Clean every message; the result is a Series of cleaned strings, one per row of df.
x = df['message'].apply(clean_text)

In [6]:
x.head()

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri 2 wkli comp win fa cup final tkts 2...
3                  u dun say earli hor u c alreadi say
4            nah dont think goe usf live around though
Name: message, dtype: object

In [7]:
x.shape

(5574,)

In [8]:
# Passed to the Tokenizer as num_words and used as the number of rows of the Embedding layer.
vocabulary_size = 20000
# Passed to pad_sequences as maxlen and to the Embedding layer as input_length.
max_sequence_length = 100

In [9]:
# Keras tokenizer configured with num_words=vocabulary_size.
tokenizer = Tokenizer(num_words=vocabulary_size)

In [10]:
# Fit the tokenizer on the cleaned messages; this populates tokenizer.word_index.
tokenizer.fit_on_texts(x)

In [11]:
# Convert each cleaned message to a list of integer word ids, then pad to a fixed
# width of max_sequence_length; shorter messages are filled with leading zeros.
x_sequences = pad_sequences(tokenizer.texts_to_sequences(x), maxlen=max_sequence_length)

In [12]:
x_sequences

array([[   0,    0,    0, ...,   20, 3740,   71],
       [   0,    0,    0, ...,  367,    1, 1597],
       [   0,    0,    0, ..., 2494,  295, 2495],
       ...,
       [   0,    0,    0, ..., 1069, 8099, 1416],
       [   0,    0,    0, ...,  839,  141,   13],
       [   0,    0,    0, ..., 2219,  393,  177]], dtype=int32)

In [13]:
x_sequences.shape

(5574, 100)

## Word Embedding

An embedding maps each word to a fixed-size vector of real-valued elements, where the vector size is much smaller than the number of unique words (embedding_size << unique_words).

In [14]:
# Three-layer stack: a 50-dimensional embedding lookup per token position, the
# per-position vectors flattened into one feature vector, and a single sigmoid
# unit producing the binary prediction.
model = Sequential([
    Embedding(vocabulary_size, 50, input_length=max_sequence_length),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

In [15]:
# Binary cross-entropy loss for the single sigmoid output, optimised with Adadelta;
# accuracy is reported alongside the loss.
model.compile(loss='binary_crossentropy', optimizer='Adadelta', metrics=['accuracy'])

In [16]:
# Train on all padded sequences against df['label'] for 10 epochs in batches of 100.
# NOTE(review): no validation_split / validation_data is passed, so any accuracy
# reported here is measured on the training data only.
model.fit(x_sequences, df['label'], batch_size=100, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x123ccff50>

In [17]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           1000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5001      
Total params: 1,005,001
Trainable params: 1,005,001
Non-trainable params: 0
_________________________________________________________________


In [18]:
# First weight array of the first layer (the Embedding layer): the learned
# embedding matrix, one row per word id.
word_embeddings = model.layers[0].get_weights()[0]

In [19]:
word_embeddings.shape  # (vocabulary_size, embedding_size)

(20000, 50)

In [20]:
word_embeddings

array([[-0.03467964, -0.0764759 , -0.08461228, ...,  0.05950912,
        -0.05391125, -0.04655339],
       [-0.041122  , -0.10640411, -0.07045071, ..., -0.0394556 ,
        -0.12431665, -0.06359202],
       [ 0.17339815,  0.20029253,  0.08297867, ..., -0.03272231,
         0.1911557 ,  0.30110705],
       ...,
       [-0.01723021, -0.03966566,  0.0013684 , ..., -0.01163833,
         0.00375935,  0.02926156],
       [-0.01292429, -0.00625401, -0.01763848, ..., -0.01225555,
         0.0356218 ,  0.0471295 ],
       [ 0.01247825,  0.04260305, -0.04180735, ..., -0.01268365,
         0.02387741, -0.02819849]], dtype=float32)

## Find the vector corresponding to a word

In [21]:
# Mapping from each (cleaned, stemmed) word to the integer id the tokenizer assigned it.
vocabulary = tokenizer.word_index

In [22]:
vocabulary['yellow']

3440

In [23]:
# Row of the embedding matrix for the token 'yellow'. Keys must be in cleaned/stemmed
# form, because the tokenizer was fitted on the output of clean_text.
# NOTE(review): word_index may hold ids >= vocabulary_size when the corpus has more
# distinct words than num_words; such ids would be out of range here -- confirm
# before looking up arbitrary words.
word_vector = word_embeddings[vocabulary['yellow'],:]

In [24]:
word_vector.shape

(50,)

In [25]:
word_vector

array([-0.01237871, -0.0215675 , -0.00296395,  0.00951132,  0.00900213,
        0.02102075,  0.02590144, -0.02602286, -0.00037933, -0.01240916,
        0.04669224, -0.04731259, -0.00795651, -0.03207969, -0.00451689,
        0.02558075, -0.03716354,  0.02632653,  0.01892984, -0.03049967,
       -0.0424594 , -0.03754866,  0.01686222, -0.03153073,  0.02181299,
       -0.03240712,  0.03730588, -0.01545383,  0.01298006,  0.01718994,
        0.04779264, -0.02932018,  0.04253927,  0.029955  , -0.03882325,
        0.04938944, -0.02285092,  0.03938728,  0.04326139,  0.03764124,
       -0.00064969, -0.00927598,  0.04209021,  0.01135303, -0.00718706,
       -0.03928187, -0.01846687, -0.00130797,  0.03312313,  0.03497282],
      dtype=float32)