In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Classify Text Sms using LSTM**
This notebook takes the spam.csv file as its input dataset. The file contains one message per line. Each line is composed of two columns: v1 contains the label (ham or spam) and v2 contains the raw text. The dataset can be downloaded from: https://www.kaggle.com/uciml/sms-spam-collection-dataset

The objective of this notebook is to classify SMS text messages into two classes, ham and spam, using an LSTM.

**Import the necessary libraries**

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import tensorflow as tf
sns.set_style('darkgrid')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder as le
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping

**Load the data into Pandas dataframe**

In [3]:
# Read the comma-separated SMS dataset, decoding it as latin-1
# (presumably because the file is not valid UTF-8 -- confirm), then preview it.
df = pd.read_csv(
    r"../input/spam-and-ham/spam.csv",
    sep=',',
    encoding='latin-1',
)
df.head()

In [4]:
# Display the (rows, columns) dimensions of the loaded dataframe.
df.shape

**Keep only the columns required for the neural network: the message text (`v2`) as the feature and the label (`v1`) as the target.**

In [5]:
# X holds the raw message text (column v2); y holds the ham/spam label (column v1).
X, y = df['v2'], df['v1']

**Understand the distribution better.**

In [6]:
# Show the class balance both numerically and as a bar chart.
count = y.value_counts()
print(count)
# Pass the labels by keyword: seaborn >= 0.12 interprets the first positional
# argument as `data`, and older versions emit a FutureWarning for positional use.
sns.countplot(x=y)

**Label encoding of target**

In [7]:
# `le` is the LabelEncoder class (aliased at import time). It maps the string
# labels to integer codes in sorted order, so 'ham' -> 0 and 'spam' -> 1.
encoder = le()
target = encoder.fit_transform(y)
target


**Split into training and test data.**

In [8]:
# Hold out 20% of the messages for testing.
# random_state makes the split reproducible across "Restart & Run All";
# stratify keeps the ham/spam ratio the same in both partitions, which matters
# because the classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

**Process the data**

Tokenize the data and convert the text to sequences.
Individual words are called tokens, and splitting text into words is called tokenization.

Add padding to ensure that all the sequences have the same shape.
There are many ways of choosing max_len; here an arbitrary length of 200 is used.

In [9]:
# max_words: vocabulary cap given to the tokenizer (num_words).
# max_len:   fixed length every sequence is padded/truncated to later on.
max_words, max_len = 1000, 200

# Build the vocabulary from the training messages only, so nothing from the
# test split leaks into the word index.
token = Tokenizer(num_words=max_words)
token.fit_on_texts(x_train)

In [10]:
def _vectorize(texts):
    """Return (integer sequences, padded matrix of width max_len) for `texts`."""
    seqs = token.texts_to_sequences(texts)
    return seqs, sequence.pad_sequences(seqs, maxlen=max_len)


# Apply the identical tokenizer + padding pipeline to both partitions.
sequence_train, sequence_matrix = _vectorize(x_train)
sequence_test, matrix_test = _vectorize(x_test)

**Defining RNN model with tensorflow2.0**

In [11]:
# Binary text classifier: embedding -> LSTM -> dense head -> sigmoid probability.
RNN_model = tf.keras.Sequential([
    # Each sample is a sequence of max_len integer token ids.
    tf.keras.layers.Input(shape=[max_len]),
    # Map each of the max_words token ids to a 50-dimensional vector.
    # `input_length` is omitted: the Input layer above already fixes the
    # sequence length, and the argument is deprecated in Keras 3.
    tf.keras.layers.Embedding(max_words, 50),
    tf.keras.layers.LSTM(32),
    # NOTE(review): this layer has no activation, so it is purely linear
    # before the ReLU layer below -- confirm this is intended.
    tf.keras.layers.Dense(256),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # Single sigmoid unit: output is a probability for the positive class.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
RNN_model.summary()

**Compile the model with the Adam optimizer**

In [12]:
# Configure training: Adam optimizer, binary cross-entropy loss (matches the
# single sigmoid output), and accuracy as the reported metric.
RNN_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

**Fit the model on the training dataset**

In [13]:
# Train for 10 epochs in batches of 80; 20% of the training data is held out
# for validation. The returned History object is kept for plotting.
fitting = RNN_model.fit(
    x=sequence_matrix,
    y=y_train,
    batch_size=80,
    epochs=10,
    validation_split=0.2,
)

**Evaluate the model on the test dataset**

In [14]:
# Score the trained model on the held-out test split (reports loss and accuracy).
RNN_model.evaluate(x=matrix_test, y=y_test)

**Plot training and validation accuracy**

In [15]:
# Plot per-epoch training vs. validation accuracy recorded by fit().
# Use explicit 1-based epoch numbers on the x-axis: plotting the history lists
# alone places epoch 1 at x=0, which does not line up with epoch tick labels.
epoch_numbers = np.arange(1, len(fitting.history['accuracy']) + 1)
plt.plot(epoch_numbers, fitting.history['accuracy'], label = 'Training_accuracy')
plt.plot(epoch_numbers, fitting.history['val_accuracy'], label = 'Validation_accuracy')
# One tick per epoch; the y-axis is left to matplotlib so the curves are
# labelled sensibly even when accuracy falls outside a hard-coded range.
plt.xticks(epoch_numbers)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()

**Test the model on a few hand-written messages**

In [16]:
# Two hand-picked messages: one ordinary greeting and one prize-draw message.
self_test = [
    'hello! How are you?',
    "+123 Congratulations - in this week's competition draw u have won the å£1450 prize to claim just call 09050002311 b4280703. T&Cs/stop SMS 08718727868. Over 18 only 150ppm",
]

# Run them through the same tokenizer and padding used for the training data.
sequence_self_test = token.texts_to_sequences(self_test)
sequence_matrix_self_test = sequence.pad_sequences(sequence_self_test, maxlen=max_len)
pred = RNN_model.predict(sequence_matrix_self_test)

# Scores below 0.5 are reported as ham, everything else as spam.
for message, score in zip(self_test, pred):
    p = "ham" if (score < 0.5) else "spam"
    print(p, "\t", message)

**Testing on test dataset**

In [17]:
# Predict every message in the test split and print the label next to its text.
test_pred = RNN_model.predict(matrix_test)
# Convert to a plain array so messages can be paired with predictions by position.
X_test = np.array(x_test)
for score, message in zip(test_pred, X_test):
    # Scores below 0.5 are reported as ham, everything else as spam.
    p = "ham" if (score < 0.5) else "spam"
    print(p, "\t", message)