In [2]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
import keras
from keras import Sequential
from keras.layers import Dense
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.utils import plot_model
from sklearn.metrics import confusion_matrix

Using TensorFlow backend.


In [3]:
# Loading dataset
data = pd.read_csv("/Users/aaditkapoor/Documents/sms.tsv", delimiter='\t', header=None, names=["outcome", 'message'])

In [4]:
data.head()

Unnamed: 0,outcome,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Using label encoding 
data.outcome = data.outcome.map({'ham':1, 'spam':0})

In [7]:
data.head()

Unnamed: 0,outcome,message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
features = data.message.values
labels = data.outcome.values

In [7]:
len(features)

5572

In [9]:
# Changing
c = []
for i in features:
    c.append(text_to_word_sequence(i))


In [10]:
t = Tokenizer()

In [11]:
t.fit_on_texts(c)

In [12]:
features = t.texts_to_matrix(c, mode="tfidf")

In [13]:
# Building model
model = Sequential()
model.add(Dense(100, input_dim=features.shape[1], activation="relu"))
model.add(Dense(100, activation="relu"))
model.add(Dense(100, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer="adam", loss=keras.losses.binary_crossentropy, metrics=["accuracy"])

In [14]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, random_state=3)

In [27]:
len(features_train)

4179

In [29]:
len(features_test)

1393

In [15]:
# Using adam, we get good accuracy
model.fit(features_train, labels_train, batch_size=1000, epochs=20, validation_data=(features_test, labels_test))

Train on 4179 samples, validate on 1393 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1505e1710>

In [15]:
plot_model(model, "spam-ham.png")

In [16]:
pred = np.round(model.predict(features_test))

In [17]:
# Printing confusion matrix
cm = confusion_matrix(labels_test, pred)

In [18]:
print (cm)

[[ 178   18]
 [   0 1197]]


In [20]:
pred = pred.astype(int)

In [22]:
a = model.evaluate(features_test, labels_test)



In [23]:
a[1]

0.98707824842756975

In [27]:
np.round(model.predict(features_test[0].reshape(1,-1))).astype(int)

array([[0]])

In [28]:
labels_test[0]

0

In [32]:
labels_test[0:50]

array([0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1])

In [16]:
model.save("keras-spam-ham.h5")

In [17]:
features[0]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [18]:
features[1]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [20]:
model.predict(features[1].reshape(1,-1))

array([[ 0.99999976]], dtype=float32)

In [22]:
features_train.shape[1]

9010

In [26]:
features_train[0]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [27]:
labels[1]

1

In [31]:
len(features[1])

9010

In [32]:
data

Unnamed: 0,outcome,message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
5,0,FreeMsg Hey there darling it's been 3 week's n...
6,1,Even my brother is not like to speak with me. ...
7,1,As per your request 'Melle Melle (Oru Minnamin...
8,0,WINNER!! As a valued network customer you have...
9,0,Had your mobile 11 months or more? U R entitle...


In [34]:
data.message.values

array([ 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       ..., 'Pity, * was in mood for that. So...any other suggestions?',
       "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
       'Rofl. Its true to its name'], dtype=object)

In [36]:
data.message.values[0].shape

AttributeError: 'str' object has no attribute 'shape'

In [37]:
t.fit_on_texts(data.message.values[0])

In [45]:
c = t.texts_to_matrix(data.message.values[0]).reshape(-1,1)

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
vect = CountVectorizer()

In [50]:
vect.fit([data.message.values[0]])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [52]:
c = vect.transform([data.message.values[0]])

In [53]:
c.shape

(1, 18)

In [54]:
c

<1x18 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [57]:
c.argmax(axis=0)

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [58]:
model.predict(c)

ValueError: Error when checking : expected dense_1_input to have shape (9010,) but got array with shape (18,)

In [59]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               901100    
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 921,401
Trainable params: 921,401
Non-trainable params: 0
_________________________________________________________________


In [75]:
model.evaluate(features_test, labels_test)



[0.12154602778176941, 0.98564249824810102]

In [76]:
from sklearn.naive_bayes import MultinomialNB
mb = MultinomialNB()
mb.fit(features_train, labels_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [77]:
mb.score(features_test, labels_test)

0.9712849964106246

In [133]:
to_check = "hi how are you i am fine"

In [134]:
t.fit_on_texts([to_check])

In [135]:
c = t.texts_to_matrix([to_check], mode="tfidf")

In [91]:
mb.predict(c)

array([1])

In [92]:
model.predict(c)

array([[ 0.99975389]], dtype=float32)

In [93]:
len(to_check)

24

In [171]:
model = Sequential()
model.add(Dense(100, activation="relu", input_dim=features.shape[1]))
model.add(Dense(75, activation="relu"))
model.add(Dense(2, activation=new_act))
model.compile(optimizer="adam", loss=keras.losses.categorical_crossentropy, metrics=["acc"])

In [172]:
model.fit(features_train, labels_train)

Epoch 1/1


<keras.callbacks.History at 0x16047dac8>

In [179]:
np.argmax(model.predict(features_train[0].reshape(1,-1)), axis=1)

array([1])

In [97]:
from keras.utils import to_categorical
from keras.preprocessing.text import one_hot

In [116]:
labels_train = to_categorical(labels_train)


In [117]:
len(labels_train)

4179

In [158]:
model.fit(features_train, labels_train,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x159cbcc18>

In [159]:
pred = model.predict(features_test)

In [122]:
pred

array([[  1.00000000e+00,   4.00809803e-08],
       [  8.17754393e-13,   1.00000000e+00],
       [  9.99991775e-01,   8.17603177e-06],
       ..., 
       [  1.78483255e-19,   1.00000000e+00],
       [  7.51471703e-07,   9.99999285e-01],
       [  5.87718416e-12,   1.00000000e+00]], dtype=float32)

In [125]:
pred =np.argmax(pred, axis=1)


AxisError: axis 1 is out of bounds for array of dimension 1

In [107]:
labels_train = np.argmax(labels_train, axis=1)

In [108]:
from sklearn.metrics import accuracy_score

In [109]:
accuracy_score(labels_train, pred)

ValueError: Found input variables with inconsistent numbers of samples: [4179, 1393]

In [113]:
len(labels_train)

4179

In [115]:
len(pred)

1393

In [126]:
pred

array([0, 1, 0, ..., 1, 1, 1])

In [127]:
accuracy_score(labels_test, pred)

0.98061737257717152

In [130]:
model.evaluate(features_test, to_categorical(labels_test))



[0.14275856994900435, 0.98061737261996029]

In [131]:
features_test

array([[ 0.        ,  0.        ,  1.45883543, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.46477442,  1.45883543, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  1.46477442,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.46477442,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [132]:
labels

array([1, 1, 0, ..., 1, 1, 1])

In [138]:
np.argmax(model.predict(c), axis=1)

array([1])

In [139]:
mb.predict(c)

array([1])

In [143]:
a = "Let's meet for dinner!"
t.fit_on_texts([a])
c = t.texts_to_matrix([a], mode="tfidf")

In [144]:
c

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [145]:
mb.predict(c)

array([1])

In [147]:
np.argmax(model.predict(c), axis=1)

array([1])

In [155]:
len(np.argmax(model.predict(features_test), axis=1))

1393

In [160]:
from keras.callbacks import EarlyStopping

In [167]:
callback = EarlyStopping(monitor="loss", min_delta=0.2, patience=5)

In [168]:
model.fit(features_train, labels_train, epochs=5, callbacks=[callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x15c1aa278>

In [169]:
import keras.backend as K

In [170]:
def new_act(x):
    return K.sigmoid(K.relu(K.tanh(x*100)))

In [180]:
new_act(12.)

TypeError: Value passed to parameter 'x' has DataType int32 not in list of allowed values: float16, bfloat16, float32, float64, complex64, complex128