# Install dependencies

In [None]:
! pip install nltk tensorflow pandas numpy matplotlib scikit-learn

In [None]:
# Standard library
import re

# Third-party
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Download the NLTK stop-word corpus that `stopwords.words(...)` reads from
nltk.download('stopwords')

# Mounting data from drive

In [None]:
from google.colab import drive

# Mount Google Drive so files under MyDrive are readable from this runtime
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

# Read and preprocess the data

In [None]:
# Load the spam CSV from Drive, decoding it as ISO-8859-1
data = pd.read_csv('/content/drive/MyDrive/Text Classification/spam.csv' , encoding = 'ISO-8859-1')
# Keep only the label and message columns under descriptive names.
# .copy() makes `data` an independent frame, so later column assignments
# (data['Text'] = ...) do not raise SettingWithCopyWarning, and avoiding
# inplace=True keeps the cell safe to re-run.
data = data.rename(columns = {'v1' : 'Target' , 'v2' : 'Text'})[['Target' , 'Text']].copy()

In [None]:
# Clean the message text: lowercase, strip punctuation, remove stop words
stopwords_list = stopwords.words('english')
stopwords_set = set(stopwords_list)  # set gives constant-time membership tests


def clean_text(text):
    """Return ``text`` lowercased, without punctuation and English stop words.

    Steps, in order:
      1. lowercase -- NLTK's English stop-word list is lowercase, so the text
         must be lowercased *before* filtering or capitalised stop words survive;
      2. delete the punctuation characters ``!@#$:).;,?&``;
      3. split on whitespace, drop stop words and re-join with single spaces
         (this also collapses any runs of whitespace).

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message; may be empty if every token was a stop word.
    """
    text = re.sub('[!@#$:).;,?&]', '', text.lower())
    # Join the individual kept *words* (not the whole message once per word)
    return " ".join(word for word in text.split() if word not in stopwords_set)


data['Text'] = data['Text'].apply(clean_text)
data.head(2)

In [None]:
# Count the missing values in each column
data.isnull().sum()

In [None]:
# Train and test split with 80:20 ratio.
# A fixed seed makes the split (and every metric computed from it) reproducible;
# stratifying on the label keeps the class ratio identical in both parts.
SPLIT_SEED = 42
xTrain , xTest , yTrain , yTest = train_test_split(
    data['Text'] ,
    data['Target'] ,
    test_size = 0.2 ,
    random_state = SPLIT_SEED ,
    stratify = data['Target'] ,
)

# Tokenization and padding

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Settings shared by the tokenizer, the padding step and the models
MAX_SEQUENCE_LENGTH = 300   # length every sequence is padded to
MAX_NB_WORDS = 20000        # max number of words passed to the tokenizer
EMBEDDING_DIM = 100         # embedding dimensions

# Fit the vocabulary on the training texts only, then convert both splits
# to sequences of integer word indices
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(xTrain)
train_sequences , test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (xTrain , xTest)
)

In [None]:
# Import from tensorflow.keras, like every other Keras import in this notebook,
# so the whole pipeline uses one Keras implementation
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Dictionary mapping each word to its integer index
word_index = tokenizer.word_index
# Bring every sequence to MAX_SEQUENCE_LENGTH entries
# (pad_sequences pads and truncates at the front by default)
train_data = pad_sequences(train_sequences , maxlen = MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences , maxlen = MAX_SEQUENCE_LENGTH)

# Encoding the labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels, then map both splits to integer class ids
le = LabelEncoder().fit(yTrain)
train_labels , test_labels = le.transform(yTrain) , le.transform(yTest)
test_labels.shape

In [None]:
# Turn the integer class ids into one-hot rows for categorical cross-entropy
import numpy as np
from tensorflow.keras.utils import to_categorical

encoded_train_labels , encoded_test_labels = (
    to_categorical(np.asarray(labels)) for labels in (train_labels , test_labels)
)
print(encoded_train_labels.shape)

# Building the models

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, Conv1D, SimpleRNN
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras import initializers, regularizers, constraints,optimizers, layers
from tensorflow.keras.layers import Dense, Input, Flatten, Dropout,BatchNormalization , SimpleRNN
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Sequential

# Building the first model (1-D CNN)

In [None]:
# 1-D CNN: embedding -> two conv/pool/dropout/batch-norm stages -> dense softmax head
cnn_layers = [
  Embedding(MAX_NB_WORDS , EMBEDDING_DIM , input_length = MAX_SEQUENCE_LENGTH),
  Dropout(0.5),
]
for _ in range(2):
  cnn_layers += [
    Conv1D(128 , 5, activation= 'relu'),
    MaxPooling1D(5),
    Dropout(0.5),
    BatchNormalization(),
  ]
cnn_layers += [
  Flatten(),
  Dense(units= 128 , activation= 'relu'),
  Dense(units= 2 , activation= 'softmax'),
]
model = Sequential(cnn_layers)

# Compile and train under the first GPU device scope; the test split is used as validation data
with tf.device('/device:GPU:0'):
  model.compile(
    loss = 'categorical_crossentropy' ,
    optimizer = 'adam' ,
    metrics = ['acc'] ,
  )
  model.fit(
    train_data ,
    encoded_train_labels ,
    batch_size = 8 ,
    epochs = 5 ,
    validation_data = (test_data , encoded_test_labels) ,
  )

In [None]:
# Softmax scores of the CNN for every test message (one row of 2 values per sample)
predictaion = model.predict(x = test_data)

In [None]:
from sklearn.metrics import classification_report

# Score class ids, not rounded one-hot rows: passing rounded softmax rows makes
# scikit-learn treat the task as multilabel, which drops the accuracy row and
# reports unnamed labels. argmax always yields exactly one class per sample.
true_classes = encoded_test_labels.argmax(axis = 1)
predicted_classes = predictaion.argmax(axis = 1)
print(classification_report(
    true_classes ,
    predicted_classes ,
    labels = list(range(len(le.classes_))) ,
    target_names = [str(name) for name in le.classes_] ,
))

# Building the second model (Simple RNN)

In [None]:
# Simple RNN: embedding -> SimpleRNN -> small dense head with softmax output
model = Sequential()
model.add(Embedding(MAX_NB_WORDS , EMBEDDING_DIM , input_length= MAX_SEQUENCE_LENGTH))
# The recurrent layer takes its input shape from the embedding output
# (MAX_SEQUENCE_LENGTH steps of EMBEDDING_DIM features), so no input_shape
# argument is given here.
model.add(SimpleRNN(2))
model.add(Dense(50 , activation = 'relu'))
model.add(Dropout(0.1))
model.add(Dense(2 , activation = 'softmax'))
# Compile and train under the first GPU device scope; the test split is used as validation data
with tf.device('/device:GPU:0'):
  model.compile(loss = 'categorical_crossentropy' , optimizer = 'adam' , metrics=['acc'])
  model.fit(train_data , encoded_train_labels , batch_size =16 , epochs = 5, validation_data=(test_data , encoded_test_labels))

In [None]:
# Softmax scores of the Simple RNN for every test message (one row of 2 values per sample)
predictaion = model.predict(x = test_data)

In [None]:
from sklearn.metrics import classification_report

# Score class ids, not rounded one-hot rows: passing rounded softmax rows makes
# scikit-learn treat the task as multilabel, which drops the accuracy row and
# reports unnamed labels. argmax always yields exactly one class per sample.
true_classes = encoded_test_labels.argmax(axis = 1)
predicted_classes = predictaion.argmax(axis = 1)
print(classification_report(
    true_classes ,
    predicted_classes ,
    labels = list(range(len(le.classes_))) ,
    target_names = [str(name) for name in le.classes_] ,
))

# Building the Third model (LSTM)

In [None]:
# LSTM: embedding -> LSTM emitting the full sequence -> flatten -> softmax output
model = Sequential([
  Embedding(MAX_NB_WORDS , EMBEDDING_DIM ,input_length = MAX_SEQUENCE_LENGTH),
  LSTM(units = 2,activation='relu' , return_sequences= True),
  Dropout(0.2),
  BatchNormalization(),
  Flatten(),
  Dense(2 , activation='softmax'),
])

# Compile and train under the first GPU device scope; the test split is used as validation data
with tf.device('/device:GPU:0'):
  model.compile(
    loss = 'categorical_crossentropy' ,
    optimizer = 'adam' ,
    metrics = ['acc'] ,
  )
  model.fit(
    train_data ,
    encoded_train_labels ,
    batch_size = 8 ,
    epochs = 5 ,
    validation_data = (test_data , encoded_test_labels) ,
  )

In [None]:
# Softmax scores of the LSTM for every test message (one row of 2 values per sample)
predictaion = model.predict(x = test_data)

In [None]:
from sklearn.metrics import classification_report

# Score class ids, not rounded one-hot rows: passing rounded softmax rows makes
# scikit-learn treat the task as multilabel, which drops the accuracy row and
# reports unnamed labels. argmax always yields exactly one class per sample.
true_classes = encoded_test_labels.argmax(axis = 1)
predicted_classes = predictaion.argmax(axis = 1)
print(classification_report(
    true_classes ,
    predicted_classes ,
    labels = list(range(len(le.classes_))) ,
    target_names = [str(name) for name in le.classes_] ,
))

# Building the Fourth model (Bidirectional LSTM)

In [None]:
# Bidirectional LSTM: embedding -> BiLSTM (full sequence) -> Conv1D -> global max pool -> dense softmax head
model = Sequential([
  Embedding(MAX_NB_WORDS , EMBEDDING_DIM ,input_length = MAX_SEQUENCE_LENGTH),
  Bidirectional(LSTM(units = 2,activation='relu' , return_sequences= True , dropout = 0.1)),
  Conv1D (16 , kernel_size =3),
  GlobalMaxPool1D(),
  Dense(50 , activation = 'relu'),
  Dropout(0.1),
  Dense(2 ,activation = 'softmax'),
])

# Compile and train under the first GPU device scope; the test split is used as validation data
with tf.device('/device:GPU:0'):
  model.compile(
    loss = 'categorical_crossentropy' ,
    optimizer = 'adam' ,
    metrics = ['acc'] ,
  )
  model.fit(
    train_data ,
    encoded_train_labels ,
    batch_size = 8 ,
    epochs = 5 ,
    validation_data = (test_data , encoded_test_labels) ,
  )

In [None]:
# Softmax scores of the bidirectional LSTM for every test message (one row of 2 values per sample)
predictaion = model.predict(x = test_data)

In [None]:
from sklearn.metrics import classification_report

# Score class ids, not rounded one-hot rows: passing rounded softmax rows makes
# scikit-learn treat the task as multilabel, which drops the accuracy row and
# reports unnamed labels. argmax always yields exactly one class per sample.
true_classes = encoded_test_labels.argmax(axis = 1)
predicted_classes = predictaion.argmax(axis = 1)
print(classification_report(
    true_classes ,
    predicted_classes ,
    labels = list(range(len(le.classes_))) ,
    target_names = [str(name) for name in le.classes_] ,
))