In [None]:
# Install the pinned TensorFlow build used by this notebook, then import it
# and print the version so the reader can confirm which build the kernel loaded.
!pip install -q tensorflow==2.0.0-beta1
import tensorflow as tf
print(tf.__version__)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv1D, Dense, Embedding, GlobalMaxPooling1D, Input
from tensorflow.keras.layers import MaxPooling1D, MaxPooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Fetch the spam dataset CSV with wget; the next cell reads it as 'spam.csv'.
!wget https://lazyprogrammer.me/course_files/spam.csv

In [None]:
# Load the downloaded CSV into a DataFrame, decoding the file as ISO-8859-1.
df = pd.read_csv('spam.csv', encoding ='ISO-8859-1')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Remove the three "Unnamed" columns so only the label and text columns remain.
# DataFrame.drop returns a NEW frame, so the result must be assigned back;
# otherwise df keeps all its columns and the later `df.columns = [...]`
# assignment fails with a length mismatch.
# errors="ignore" makes the cell safe to re-run after the columns are gone.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")

In [None]:
# Preview the frame again after the column-drop cell.
df.head()

In [None]:
# Give the columns descriptive names. This assignment requires df to have
# exactly two columns at this point, otherwise pandas raises a length mismatch.
df.columns = ['labels', 'data']

In [None]:
# Preview the frame with the 'labels' / 'data' column names.
df.head()

In [None]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
label_to_int = {'ham': 0, 'spam': 1}
df['b_labels'] = df['labels'].map(label_to_int)

# Target vector as a plain NumPy array.
Y = df['b_labels'].values

In [None]:
# Split the raw texts and labels into train (67%) and test (33%) sets.
# A fixed random_state makes the split, and therefore every metric computed
# below, reproducible across "Restart & Run All".
SEED = 42
df_train, df_test, Ytrain, Ytest = train_test_split(
    df['data'], Y, test_size=0.33, random_state=SEED
)

In [None]:
# Convert sentences to sequences of integer word indices.
# MAX_VOCAB_SIZE is passed to the Tokenizer as num_words, i.e. an upper bound
# on how many of the most frequent words are used when encoding texts.
# (Author's rationale: a few thousand common English words are said to cover
# the vast majority of everyday text, so 20000 is a generous cap.)
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)

# Analogous to sklearn's fit / transform: the vocabulary is fitted on the
# training texts only, then applied to both the train and test texts.
tokenizer.fit_on_texts(df_train)
sequences_train = tokenizer.texts_to_sequences(df_train)
sequences_test = tokenizer.texts_to_sequences(df_test)

In [None]:
# Get the word -> integer index mapping built by the tokenizer.

word2idx = tokenizer.word_index
V = len(word2idx)  # number of entries in the mapping; used later to size the embedding
print('Found %s unique Tokens.' % V)

In [None]:
# Pad the training sequences to a common length so they stack into a single
# N x T matrix (N samples, T time steps).
data_train = pad_sequences(sequences_train)
print('Shape of data train tensor:', data_train.shape)

# T is the padded sequence length; the test set is padded to the same length
# and the model's input layer is sized with it.
T = data_train.shape[1]

In [None]:
# Pad the test sequences to the training length T so both matrices share the
# same number of columns. The Keras function is `pad_sequences` (plural);
# the previous `pad_sequence` name does not exist.
data_test = pad_sequences(sequences_test, maxlen=T)
print('Shape of data tet tensor:', data_test.shape)

In [None]:
# Create the model: a 1-D convolutional network over word embeddings.

# Embedding dimensionality -- a free hyperparameter we get to choose.
D = 20

# Note: the embedding matrix must have V + 1 rows, not V. Word indices start
# at 1 rather than 0, so the largest index is V and the table therefore needs
# V + 1 entries to be able to look it up.
i = Input(shape=(T,))        # T time steps per sample
x = Embedding(V + 1, D)(i)   # Conv1D expects T x D: time steps x features

# Convolution / pooling stack followed by the sigmoid output unit, applied to
# the embedded sequence in the order listed.
for layer in (
    Conv1D(32, 3, activation='relu'),
    MaxPooling1D(3),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(3),
    Conv1D(128, 3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(1, activation='sigmoid'),
):
    x = layer(x)

model = Model(i, x)

In [None]:
# Compile and fit the model.
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# tracked so it can be plotted from the returned History object.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

print('Training Model....')
# Validation labels are `Ytest`, the name produced by train_test_split;
# the previous `Y_test` was never defined and raised NameError.
r = model.fit(
    data_train,
    Ytrain,
    epochs=10,
    validation_data=(data_test, Ytest),
)

In [None]:
# Training vs. validation loss, one point per epoch (taken from the History
# object returned by model.fit). matplotlib is already imported in the
# notebook's import cell, so it is not re-imported here.
fig, ax = plt.subplots()
ax.plot(r.history['loss'], label='loss')
ax.plot(r.history['val_loss'], label='val_loss')
ax.set(title='Loss per epoch', xlabel='epoch', ylabel='binary cross-entropy')
ax.legend();

In [None]:
# Training vs. validation accuracy, one point per epoch (taken from the
# History object returned by model.fit).
fig, ax = plt.subplots()
ax.plot(r.history['accuracy'], label='acc')
ax.plot(r.history['val_accuracy'], label='val_acc')
ax.set(title='Accuracy per epoch', xlabel='epoch', ylabel='accuracy')
ax.legend();