In [11]:
import pandas as pd #Basic packages for creating dataframes and loading dataset
import numpy as np

import matplotlib.pyplot as plt #Package for visualization

import re #importing package for Regular expression operations

from sklearn.model_selection import train_test_split #Package for splitting the data

from sklearn.preprocessing import LabelEncoder #Package for conversion of categorical to Numerical

from keras.preprocessing.text import Tokenizer #Tokenization
from keras.preprocessing.sequence import pad_sequences #Add zeros or crop based on the length
from keras.models import Sequential #Sequential Neural Network
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D #For layers in Neural Network
from keras.utils.np_utils import to_categorical

## Apply the code on spam data set available in the source code (text classification on the spam.csv data set)

In [12]:
data = pd.read_csv('spam.csv',encoding="ISO-8859-1")

# required columns
data = data[['v1','v2']]
data['v2'] = data['v2'].apply(lambda x: x.lower())
data['v2'] = data['v2'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

In [13]:
data

Unnamed: 0,v1,v2
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will _ b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggest...
5570,ham,the guy did some bitching but i acted like id ...


In [14]:
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

In [15]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['v2'].values)
X = tokenizer.texts_to_sequences(data['v2'].values)

X = pad_sequences(X)

In [16]:
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['v1'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [17]:
X.shape

(5572, 152)

In [18]:
def createmodel1():
  embed_dim=128
  lstm_out=196
  model = Sequential()
  model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
  model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
  model.add(Dense(2,activation='sigmoid'))
  model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
  return model

In [20]:
batch_size = 32
model1 = createmodel1()
history=model1.fit(X_train, Y_train, epochs = 5, batch_size=batch_size, verbose = 2)
score,acc = model1.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size)
print(score)
print(acc)
print(model1.metrics_names)

Epoch 1/5
117/117 - 85s - loss: 0.1979 - accuracy: 0.9328
Epoch 2/5
117/117 - 85s - loss: 0.0414 - accuracy: 0.9882
Epoch 3/5
117/117 - 104s - loss: 0.0196 - accuracy: 0.9941
Epoch 4/5
117/117 - 107s - loss: 0.0186 - accuracy: 0.9957
Epoch 5/5
117/117 - 97s - loss: 0.0107 - accuracy: 0.9973
58/58 - 8s - loss: 0.0759 - accuracy: 0.9793
0.07593722641468048
0.9793366193771362
['loss', 'accuracy']


In [22]:
print(history.history.keys())

dict_keys(['loss', 'accuracy'])
