In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Conv1D, Dense, Dropout, Embedding, Flatten, GlobalAveragePooling1D, MaxPooling1D

nltk.download('stopwords')
nltk.download('wordnet')

# reading data
df=pd.read_csv('SMSSpamCollection' ,sep='\t',names=["labels","message"])
y=df.iloc[:,0]
x=df.iloc[:,-1]


label_encoder = preprocessing.LabelEncoder()
df['labels']= label_encoder.fit_transform(df['labels'])
y=df['labels']
df['labels'].unique()

wl = WordNetLemmatizer()
processed_data = []
for i in range(0, len(df)):
    data = re.sub('[^a-zA-Z]', ' ', df['message'][i])
    data = data.lower()
    data = data.split()
    
    data = [wl.lemmatize(word) for word in data if not word in stopwords.words('english')]
    data = ' '.join(data)
    processed_data.append(data)

df['message']=processed_data
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Unnamed: 0,labels,message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts st ...
3,0,u dun say early hor u c already say
4,0,nah think go usf life around though


In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
# defining hyperparameters
max_len = 50
# vocab_size = 500
oov_tok = "<OOV>"
# for out of vocabulary words
tk = Tokenizer(filters='!@"#$%^&*()-/+:;.?=<>`~[]{|}\\t\n', lower=True, split=" ",oov_token=oov_tok)

tk.fit_on_texts(X_train)
trained_seq = tk.texts_to_sequences(X_train)
pad_train = pad_sequences(trained_seq, maxlen=max_len, padding='post', truncating='post')
vocab_size_train = len(tk.word_index) + 1

test_sequence = tk.texts_to_sequences(X_test)
pad_test = pad_sequences(test_sequence, maxlen=max_len, padding='post', truncating='post')
vocab_size_test = len(tk.word_index) + 1

In [None]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size_train, output_dim=16, input_length=max_len, trainable=True))
model.add(GlobalAveragePooling1D())
model.add(Dense(24, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 16)            114368    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dropout (Dropout)            (None, 24)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 114,801
Trainable params: 114,801
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

num_epochs = 10

model.fit(pad_train, y_train, epochs=num_epochs,validation_split=0.2)
y_train

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


4380    0
3887    0
4755    0
2707    0
4747    0
       ..
4931    1
3264    0
1653    1
2607    0
2732    0
Name: labels, Length: 3900, dtype: int64

In [None]:
loss, accuracy = model.evaluate(pad_test, y_test, verbose=0)
print("Loss on test data: ", loss)
print("Accuracy of test data: ", accuracy)

Loss on test data:  0.04906924441456795
Accuracy of test data:  0.9826555252075195


In [None]:
y_pred=model.predict(pad_test)
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score,roc_curve
# print("Precision = ",precision_score(y_pred,y_test))
pred=[]
for d in y_pred:
  if d>0.5:
    pred.append(1)
  else:
    pred.append(0)
p=pd.Series(pred)    
print("Precision = ",precision_score(p,y_test),"\n")
print("Recall = ",recall_score(p,y_test),"\n")
print("F1 score = ",f1_score(p,y_test),"\n")

t = Tokenizer()
t.fit_on_texts(X_train)

seq_tr = t.texts_to_sequences(X_train)
pad_seq_tr = pad_sequences(seq_tr, maxlen=max_len, padding='post', truncating='post')
vocab_size_tr = len(t.word_index) + 1

seq_tt = t.texts_to_sequences(X_test)
pad_seq_tt = pad_sequences(seq_tt, maxlen=max_len, padding='post', truncating='post')
vocab_size_tt = len(t.word_index) + 1

Precision =  0.8914027149321267 

Recall =  0.9752475247524752 

F1 score =  0.9314420803782507 



In [None]:
lr_probs = model.predict_proba(pad_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, ]
# calculate scores
ns_auc = roc_auc_score(p, y_test)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, p)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

No Skill: ROC AUC=0.976
Logistic: ROC AUC=0.995




In [None]:

!ls "/content/drive/My Drive/glove"
!unzip -q "/content/drive/My Drive/glove/glove.6B.zip"

glove.6B.zip


In [None]:
from google.colab import drive
drive.mount('/content/drive')


import zipfile
zip_ref = zipfile.ZipFile("/content/drive/My Drive/glove/glove.6B.zip", 'r')
zip_ref.extractall("/tmp")
zip_ref.close()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
embedding_vectors = {}
import numpy as np
f = open("glove.6B.200d.txt",encoding="utf8" )
for line in f:
    value = line.split(" ")
    word = value[0]
    coef = np.array(value[1:], dtype='float32')
    embedding_vectors[word] = coef
f.close()

print("Total word vectors: ", len(embedding_vectors))



Total word vectors:  400000


In [None]:
m = 50

t = Tokenizer()
t.fit_on_texts(X_train)

seq_tr = t.texts_to_sequences(X_train)
pad_seq_tr = pad_sequences(seq_tr, maxlen=max_len, padding='post', truncating='post')
vocab_size_tr = len(t.word_index) + 1

seq_tt = t.texts_to_sequences(X_test)
pad_seq_tt = pad_sequences(seq_tt, maxlen=max_len, padding='post', truncating='post')
vocab_size_tt = len(t.word_index) + 1          # add 1 so that if any word comes into the model which is not seen by the model before, it is assigned that place

In [None]:
# creating a matrix only of the words present in our corpus and their vectors
embedding_matrix = np.zeros((vocab_size_tr, 200))
for word, i in t.word_index.items():
    embedding_value = embedding_vectors.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value

In [None]:
drop_value = 0.2
n_dense = 24

# building model
model = Sequential()
model.add(Embedding(input_dim=vocab_size_tr, output_dim=200, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(Conv1D(filters = 128 , kernel_size =3 ,activation = 'relu'))#converts output to 128 value which is mutiple of 128 
model.add(MaxPooling1D(5))#here max pooling window size is 5 while strides default is 1 ex : {1,2,3,4}->array([[1.],[2.],[3.]) but max size of a element is 5
model.add(Flatten())# it converts the input matrix into 2 X n version, ex : (none,1,2,40)-> (none,80)
model.add(Dense(256, activation='relu'))#256 gives best output so 256, and 2 with softmax while 1 with sigmoid in dense final layer
model.add(Dropout(drop_value))
model.add(Dense(1, activation='sigmoid'))
# model compilation
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 200)           1511600   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 48, 128)           76928     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 9, 128)            0         
_________________________________________________________________
flatten (Flatten)            (None, 1152)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               295168    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                

In [None]:
model.fit(pad_seq_tr, y_train, epochs=10, validation_split=0.2)
y_pred=model.predict(pad_seq_tt)
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score,roc_curve,accuracy_score
# print("Precision = ",precision_score(y_pred,y_test))
pred=[]
for d in y_pred:
  if d>0.5:
    pred.append(1)
  else:
    pred.append(0)
p=pd.Series(pred)    
print("Precision = ",precision_score(p,y_test),"\n")
print("Recall = ",recall_score(p,y_test),"\n")
print("F1 score = ",f1_score(p,y_test),"\n")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Precision =  0.8914027149321267 

Recall =  0.9752475247524752 

F1 score =  0.9314420803782507 



In [None]:
!pip install scikit-plot



In [None]:
import scikitplot as skplt
import matplotlib.pyplot as plt
# Y_test_bin=np.argmax(y_test)
# pred_bin=np.argmax(pred)
# skplt.metrics.plot_roc_curve(y_test, p)
# plt.show()


KeyError: ignored