# Importing Libraries

In [120]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.models import Sequential
np.random.seed(1)
from keras import layers
from keras.preprocessing.sequence import pad_sequences
%matplotlib inline



# Reading CSV FILE

In [121]:
df=pd.read_csv("spam.csv",encoding='latin-1')

In [122]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [123]:
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

# DATA PRE-Processing

# Concatinating the columns

In [124]:
df['Unnamed: 2'][df['Unnamed: 2'].notna()].head()

95                                           PO Box 5249
281     the person is definitely special for u..... B...
444     HOWU DOIN? FOUNDURSELF A JOBYET SAUSAGE?LOVE ...
671     wanted to say hi. HI!!!\" Stop? Send STOP to ...
710      this wont even start........ Datz confidence.."
Name: Unnamed: 2, dtype: object

In [125]:
df.rename(columns={'v1':'labels','v2':'sms'},inplace=True)

In [126]:
df.head()

Unnamed: 0,labels,sms,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# Filling the null values with " "(Space)

In [127]:
df.fillna(" ",inplace=True)
df.sms=df.sms+" "+df["Unnamed: 2"]+" "+df["Unnamed: 3"]+" "+df["Unnamed: 4"]
df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True,axis=1)

In [128]:
df.head()

Unnamed: 0,labels,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [129]:
df.isna().sum()

labels    0
sms       0
dtype: int64

# Label Encoding the labels column

In [130]:
encoder = LabelEncoder()
df.labels = encoder.fit_transform(df.labels)
df.labels

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: labels, Length: 5572, dtype: int32

In [131]:
encoder.inverse_transform(df.labels)

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

ham-0 <br>
spam-1

In [132]:
df

Unnamed: 0,labels,sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [133]:
df.labels.values

array([0, 0, 1, ..., 0, 0, 0])

# Splitting Dataframe in to Train Data -75% Test data -25%

In [134]:
sentences_train, sentences_test, Y_train, Y_test = train_test_split(
        df.sms.values, df.labels.values, test_size=0.25, random_state=1000)

In [135]:
sentences_train

array(['Oh yah... We never cancel leh... Haha       ',
       'No..few hours before.went to hair cut .      ', 'Yup ok...      ',
       ...,
       'Mila, age23, blonde, new in UK. I look sex with UK guys. if u like fun with me. Text MTALK to 69866.18 . 30pp/txt 1st 5free. å£1.50 increments. Help08718728876      ',
       'A guy who gets used but is too dumb to realize it.      ',
       'Bought one ringtone and now getting texts costing 3 pound offering more tones etc      '],
      dtype=object)

# Converting Label in to One-hot Encoding form

In [136]:
def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

In [137]:
Y_oh_train = convert_to_one_hot(Y_train, C = 2)
Y_oh_test = convert_to_one_hot(Y_test, C = 2)

# Using Keras Tokenizer to tokenize the words

In [138]:
tokenizer1 = Tokenizer()
texts = [sentences_train[0],sentences_train[1],sentences_train[2],sentences_train[3]]
for i in texts:
    print(i)
tokenizer1.fit_on_texts(texts)
print(tokenizer1.word_index)
tokenizer1.texts_to_sequences(["i am not a fan of his movie it sucked"])


Oh yah... We never cancel leh... Haha       
No..few hours before.went to hair cut .      
Yup ok...      
What i told before i tell. Stupid hear after i wont tell anything to you. You dad called to my brother and spoken. Not with me.      
{'to': 1, 'i': 2, 'before': 3, 'tell': 4, 'you': 5, 'oh': 6, 'yah': 7, 'we': 8, 'never': 9, 'cancel': 10, 'leh': 11, 'haha': 12, 'no': 13, 'few': 14, 'hours': 15, 'went': 16, 'hair': 17, 'cut': 18, 'yup': 19, 'ok': 20, 'what': 21, 'told': 22, 'stupid': 23, 'hear': 24, 'after': 25, 'wont': 26, 'anything': 27, 'dad': 28, 'called': 29, 'my': 30, 'brother': 31, 'and': 32, 'spoken': 33, 'not': 34, 'with': 35, 'me': 36}


[[2, 34]]

In [139]:
tokenizer = Tokenizer(num_words=2500,split=' ')
tokenizer.fit_on_texts(sentences_train)
sentences_train = tokenizer.texts_to_sequences(sentences_train)
X_train = pad_sequences(sentences_train,maxlen=100)

In [140]:
X_test = tokenizer.texts_to_sequences(sentences_test)
X_test = pad_sequences(X_test,maxlen=100)

In [141]:
print(sentences_train[2])
print(X_train[2])

[275, 46]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0 275  46]


In [142]:
Y_oh_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

In [143]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath,encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

# Download GloVe file and replace the path

In [144]:
path=r'C:\Users\balaa\Desktop\balaaje\deep learning\coursera files\sequence models\glove.6B\glove.6B.50d.txt'
embedding_dim = 50
embedding_matrix = create_embedding_matrix(path,tokenizer.word_index, embedding_dim)
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.11891   ,  0.15255   , -0.082073  , ..., -0.57511997,
        -0.26671001,  0.92120999],
       [ 0.68046999, -0.039263  ,  0.30186   , ..., -0.073297  ,
        -0.064699  , -0.26043999],
       ...,
       [-0.21689001, -0.014534  ,  0.43816999, ...,  0.74862999,
        -0.60641998,  0.44712001],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.82858998,  0.77203   , -0.036755  , ...,  0.90109998,
        -0.14999001,  0.28321001]])

In [145]:
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
vocab_size

7682

In [146]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size

0.739911481385056

In [147]:
maxlen = 100

In [148]:
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=True))
model.add(LSTM(128,return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(128, return_sequences = False))
model.add(Dropout(0.5))
model.add(Dense(2))
model.add(Activation('sigmoid'))

In [149]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [150]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 50)           384100    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100, 128)          91648     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_2 (Activation)    (None, 2)                

In [116]:
model.fit(X_train, Y_oh_train, epochs = 5, batch_size = 32, shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1f33a6c5e48>

In [117]:
loss, acc = model.evaluate(X_test, Y_oh_test)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.980617344379425
