## Imports

In [180]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense, Lambda
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers import Embedding
import re
import string



In [181]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading Data

In [183]:
#Reading input data
file = open('/content/SMSSpamCollection', 'r')
data = file.readlines()

data_dict = {'text': [], 'label': []}

#Convert target class to numeric values
for line in data:
  line_split = line.rstrip().split('\t')
  data_dict['text'].append(' '.join(line_split[1:]))
  data_dict['label'].append(0 if line_split[0] == 'ham' else 1)


data_df = pd.DataFrame.from_dict(data_dict)
data_df

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5569,This is the 2nd time we have tried 2 contact u...,1
5570,Will ü b going to esplanade fr home?,0
5571,"Pity, * was in mood for that. So...any other s...",0
5572,The guy did some bitching but I acted like i'd...,0


## Preprocessing of data

In [221]:
total_stopwords = set([word.replace("'",'') for word in stopwords.words('english')])
# Call the lemmatizer class
lemma = WordNetLemmatizer()
def preprocess_text(text):
    text = text.lower() # Convert to lowercase
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '',text)
    text = re.sub('[^a-zA-Z]',' ',text)
    words = text.split() # Split the text into words by looking at the space between the texts
    # Stem the words and also remove stopwords
    words = [lemma.lemmatize(word) for word in words if (word not in total_stopwords) and (len(word)>1)] # Remove stop words
    # Put the clean words back together as a text message
    text = " ".join(words)
    return text


# data_collection.Text = data_collection.Text.apply(preprocess_text)
data_df.text=data_df.text.apply(preprocess_text)
data_df

Unnamed: 0,text,label
0,go jurong point crazy available bugis great wo...,0
1,ok lar joking wif oni,0
2,free entry wkly comp win fa cup final tkts st ...,1
3,dun say early hor already say,0
4,nah think go usf life around though,0
...,...,...
5569,nd time tried contact pound prize claim easy c...,1
5570,going esplanade fr home,0
5571,pity mood suggestion,0
5572,guy bitching acted like id interested buying s...,0


## Tokenization and train/test split

In [222]:
X_train, X_test, y_train, y_test = train_test_split(data_df.text, data_df.label, test_size=0.20)

max_features = 4000
max_len = 100

t = Tokenizer(num_words=max_features)

t.fit_on_texts(X_train)
vocab_size = len(t.word_index) + 1
tokenized_train = t.texts_to_sequences(X_train)
X_train = pad_sequences(tokenized_train, maxlen=max_len)
tokenized_test = t.texts_to_sequences(X_test)
X_test = pad_sequences(tokenized_test, maxlen=max_len)


## Glove embedding

In [223]:
EMBEDDING_FILE = '/content/drive/MyDrive/glove.6B.100d.txt'
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE, encoding="utf8"))



In [224]:
all_embeddings = np.stack(list(embeddings_index.values()))

In [225]:
all_embeddings.shape

(400001, 100)

In [226]:
embedding_mean, embedding_std = all_embeddings.mean(), all_embeddings.std()
embedding_size = all_embeddings.shape[1]

In [231]:
word_index = t.word_index
max_features = min(max_features, len(word_index))

embedding_matrix = np.random.normal(embedding_mean, embedding_std, (max_features, max_len))
for word, i in word_index.items():
    if i >= max_features:
      continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector

## Mean pooling

In [232]:
batch_size = 64
embed_size = 100

In [233]:
model = Sequential()

model.add(Embedding(max_features, output_dim=embedding_size, weights=[embedding_matrix], input_length = max_len, trainable=True))
model.add(Lambda(lambda x: tensorflow.reduce_mean(x, axis=1)))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [234]:
history = model.fit(X_train, y_train, batch_size = batch_size , validation_data = (X_test,y_test), epochs =10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [217]:
print("Accuracy of the model on Testing Data is - " , model.evaluate(X_test,y_test)[1]*100 , "%")

Accuracy of the model on Testing Data is -  88.16143274307251 %


## Max Pooling


In [235]:
model2 = Sequential()

model2.add(Embedding(max_features, output_dim=embedding_size, weights=[embedding_matrix], input_length = max_len, trainable=True))
model2.add(Lambda(lambda x: tensorflow.reduce_max(x, axis=1)))
model2.add(Flatten())
model2.add(Dense(1, activation='sigmoid'))
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [236]:
history = model.fit(X_train, y_train, batch_size = batch_size , validation_data = (X_test,y_test), epochs =10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [237]:
print("Accuracy of the model on Testing Data is - " , model2.evaluate(X_test,y_test)[1]*100 , "%")

Accuracy of the model on Testing Data is -  69.14798021316528 %
