In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
"""!python -m spacy download en_core_web_lg -q"""

'!python -m spacy download en_core_web_lg -q'

In [3]:
import pandas as pd
import string
import re
import numpy as np 
import sklearn
from sklearn.model_selection import train_test_split


# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')

#import en_core_web_lg
from spacy.lang.en.stop_words import STOP_WORDS

import tensorflow as tf

from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForSequenceClassification





In [4]:
# Remote location of the SMS spam CSV; it is read with the ISO-8859-1 codec.
url = 'https://attspamdetector.s3.eu-west-3.amazonaws.com/spam.csv'
df = pd.read_csv(
    url,
    encoding="ISO-8859-1",
)


In [5]:
len(df)

5572

In [6]:
df.head()
# "ham" messages are those that are not spam

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
# Show the raw column names before touching them
print(df.columns)

# Drop the three empty "Unnamed" columns, then give the two remaining
# columns explicit names (v1 -> type, v2 -> text)
df = (
    df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
      .rename(columns={"v1": "type", "v2": "text"})
)

# Encode the "type" column: 0 for non-spam ("ham") messages, 1 for anything else
df["type"] = df["type"].map(lambda label: 0 if label == "ham" else 1)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [8]:
(df["type"]).value_counts()

0    4825
1     747
Name: type, dtype: int64

In [9]:
df['type'].nunique()

2

In [10]:
# checking if there are missing values in the dataset
df.isnull().sum() 

type    0
text    0
dtype: int64

# Text Preprocessing



In [11]:
"""nlp = en_core_web_lg.load()"""

'nlp = en_core_web_lg.load()'

In [12]:
def clean_text(text):
  """Return a normalized copy of one SMS message.

  Steps, in order:
    1. keep only alphanumeric characters and plain spaces (punctuation and
       every other character are removed, not replaced by a space);
    2. lower-case the result;
    3. collapse every run of spaces into a single space and strip both ends.

  Args:
    text: raw message string.

  Returns:
    The cleaned string; empty if the message had no alphanumeric characters.
  """
  kept = ''.join(ch for ch in text if ch.isalnum() or ch == " ")
  # split()/join collapses runs of ANY length; a single replace("  ", " ")
  # pass would leave doubled spaces behind for runs of three or more.
  return " ".join(kept.lower().split())


# Removing punctuation, capitalization and redundant spaces
df["text_clean"] = df["text"].apply(clean_text)

"""# Removing stop words
df["text_clean"] = df["text_clean"].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if (token.lemma_ not in STOP_WORDS) & (token.text not in STOP_WORDS)]))"""

'# Removing stop words\ndf["text_clean"] = df["text_clean"].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if (token.lemma_ not in STOP_WORDS) & (token.text not in STOP_WORDS)]))'

In [13]:
df.head()

Unnamed: 0,type,text,text_clean
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,0,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [14]:
# Print label and raw text of the first 15 messages (row labels 0..14)
for row in range(15):
  print(df.at[row, "type"], df.at[row, "text"])

0 Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
0 Ok lar... Joking wif u oni...
1 Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
0 U dun say so early hor... U c already then say...
0 Nah I don't think he goes to usf, he lives around here though
1 FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv
0 Even my brother is not like to speak with me. They treat me like aids patent.
0 As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
1 WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
1 Had your mobile 11 mont

In [15]:
# Spam-only subset (type == 1) as an independent frame, re-indexed from 0
spam_df = df.loc[df["type"].eq(1)].copy().reset_index(drop=True)

In [16]:
# Ham-only subset (type == 0) as an independent frame, re-indexed from 0
ham_df = df.loc[df["type"].eq(0)].copy().reset_index(drop=True)

In [17]:
spam_df.head()

Unnamed: 0,type,text,text_clean
0,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
1,1,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey there darling its been 3 weeks now...
2,1,WINNER!! As a valued network customer you have...,winner as a valued network customer you have b...
3,1,Had your mobile 11 months or more? U R entitle...,had your mobile 11 months or more u r entitled...
4,1,"SIX chances to win CASH! From 100 to 20,000 po...",six chances to win cash from 100 to 20000 poun...


In [18]:
ham_df.head()

Unnamed: 0,type,text,text_clean
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,0,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
3,0,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...
4,0,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me t...


In [19]:
# Checking if some rows of the dataframe became empty after text cleaning:
# a row counts as empty when its cleaned text splits into zero words.
words_per_row = df["text_clean"].str.split().str.len()
empty_rows_indices = df.index[words_per_row == 0].tolist()

print(empty_rows_indices)

[3374, 4822]


In [20]:
len(empty_rows_indices)

2

In [21]:
df["text_clean"].loc[43]

'who are you seeing'

In [22]:
df["text_clean"].loc[42]

'07732584351 rodger burns msg we tried to call you re your reply to our sms for a free nokia mobile free camcorder please call now 08000930705 for delivery tomorrow'

In [23]:
# Keep only the messages that still contain at least one word after cleaning.
# Filtering on the content (instead of dropping previously computed row labels)
# makes this cell safe to re-run, and drop=True stops the old index from being
# inserted into the frame as a spurious "index" column.
df = df[df["text_clean"].str.split().str.len() > 0].reset_index(drop=True)

In [24]:
len(df)

5570

In [25]:
df["text_clean"].loc[43]

'who are you seeing'

In [26]:
"""max_len = 100"""

'max_len = 100'

In [27]:
"""# After cleaning, how many words does the longest message contain?
max_len = 0

for i in range(0, len(df)):
  max_len = max(max_len, len(df["text_clean"].loc[i].split()))

print("Maximum message length in words:", max_len)"""

'# After cleaning, how many words does the longest message contain?\nmax_len = 0\n\nfor i in range(0, len(df)):\n  max_len = max(max_len, len(df["text_clean"].loc[i].split()))\n\nprint("Maximum message length in words:", max_len)'

In [28]:
# Show the cleaned text of the first 15 spam messages
for row in range(15):
  print(spam_df.at[row, "text_clean"])

free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s
freemsg hey there darling its been 3 weeks now and no word back id like some fun you up for it still tb ok xxx std chgs to send å150 to rcv
winner as a valued network customer you have been selected to receivea å900 prize reward to claim call 09061701461 claim code kl341 valid 12 hours only
had your mobile 11 months or more u r entitled to update to the latest colour mobiles with camera for free call the mobile update co free on 08002986030
six chances to win cash from 100 to 20000 pounds txt csh11 and send to 87575 cost 150pday 6days 16 tsandcs apply reply hl 4 info
urgent you have won a 1 week free membership in our å100000 prize jackpot txt the word claim to no 81010 tc wwwdbuknet lccltd pobox 4403ldnw1a7rw18
xxxmobilemovieclub to use your credit click the wap link in the next txt message or click here httpwap xxxmobilemovieclubcomnqjk

In [29]:
# Show the cleaned text of the first 15 ham messages
for row in range(15):
  print(ham_df.at[row, "text_clean"])

go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
ok lar joking wif u oni
u dun say so early hor u c already then say
nah i dont think he goes to usf he lives around here though
even my brother is not like to speak with me they treat me like aids patent
as per your request melle melle oru minnaminunginte nurungu vettam has been set as your callertune for all callers press 9 to copy your friends callertune
im gonna be home soon and i dont want to talk about this stuff anymore tonight k ive cried enough today
ive been searching for the right words to thank you for this breather i promise i wont take your help for granted and will fulfil my promise you have been wonderful and a blessing at all times
i have a date on sunday with will
oh kim watching here
eh u remember how 2 spell his name yes i did he v naughty make until i v wet
fine if thatåõs the way u feel thatåõs the way its gota b
is that seriously how you spell his name
iûm going 

In [30]:
# Cleaned SMS texts as a numpy array
texts = df["text_clean"].to_numpy()
print(texts[:5])

# Matching 0/1 labels as a numpy array (same row order as `texts`)
labels = df["type"].to_numpy()
print(labels[:5])

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'
 'ok lar joking wif u oni'
 'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s'
 'u dun say so early hor u c already then say'
 'nah i dont think he goes to usf he lives around here though']
[0 0 1 0 0]


In [31]:
texts.shape

(5570,)

In [32]:
labels.shape

(5570,)

In [33]:
# Splitting the arrays into training and validation datasets.
# stratify=labels keeps the ham/spam ratio identical in both splits; the fixed
# random_state makes the split (and every metric computed on it) reproducible
# across kernel restarts.
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts, labels, stratify=labels, random_state=42)

In [34]:
# Tokenizer for the 'bert-base-uncased' checkpoint, lower-casing its input
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [35]:
# Length, in tokenizer tokens, of the longest cleaned message (0 if there are none)
max_len = max((len(tokenizer.tokenize(message)) for message in texts), default=0)

print(max_len)

# Two extra positions: room for the special tokens that encode_plus adds when
# add_special_tokens=True (presumably [CLS] and [SEP] for BERT)
max_len += 2

print(max_len)

201
203


In [36]:
print('Actual text:' , texts[2])

Actual text: free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s


In [37]:
print('Tokens:' , tokenizer.tokenize(texts[2]))

Tokens: ['free', 'entry', 'in', '2', 'a', 'w', '##k', '##ly', 'com', '##p', 'to', 'win', 'fa', 'cup', 'final', 't', '##kt', '##s', '21st', 'may', '2005', 'text', 'fa', 'to', '87', '##12', '##1', 'to', 'receive', 'entry', 'questions', '##t', '##d', 'tx', '##t', 'rate', '##tc', '##s', 'apply', '08', '##45', '##28', '##100', '##75', '##over', '##18', '##s']


In [38]:
print('Token to ids:', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(texts[2])))

Token to ids: [2489, 4443, 1999, 1016, 1037, 1059, 2243, 2135, 4012, 2361, 2000, 2663, 6904, 2452, 2345, 1056, 25509, 2015, 7398, 2089, 2384, 3793, 6904, 2000, 6584, 12521, 2487, 2000, 4374, 4443, 3980, 2102, 2094, 19067, 2102, 3446, 13535, 2015, 6611, 5511, 19961, 22407, 18613, 23352, 7840, 15136, 2015]


In [39]:
print(texts[:5])

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'
 'ok lar joking wif u oni'
 'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s'
 'u dun say so early hor u c already then say'
 'nah i dont think he goes to usf he lives around here though']


In [40]:
type(texts)

numpy.ndarray

In [41]:
"""encoded_dict = tokenizer.encode_plus(X[0],
                                    add_special_tokens = True,
                                    max_length = max_len,
                                    padding = 'max_length',
                                    return_attention_mask = True)"""

"encoded_dict = tokenizer.encode_plus(X[0],\n                                    add_special_tokens = True,\n                                    max_length = max_len,\n                                    padding = 'max_length',\n                                    return_attention_mask = True)"

In [42]:
"""type(encoded_dict)"""

'type(encoded_dict)'

In [43]:
"""encoded_dict.keys"""

'encoded_dict.keys'

In [44]:
"""print("dict", encoded_dict['input_ids'])"""

'print("dict", encoded_dict[\'input_ids\'])'

# Creating inputs for BERT model

In [45]:
# Creating inputs for BERT model

def inputs_for_bert(texts, max_len):
  """Encode messages into fixed-length input tensors for the BERT model.

  Every message is tokenized with the module-level `tokenizer`, wrapped in the
  model's special tokens, then padded or truncated to exactly `max_len` ids.

  Args:
    texts: iterable of message strings.
    max_len: length of every encoded sequence, special tokens included.

  Returns:
    A tuple (input_ids, attention_masks) of tensors, one row per message and
    `max_len` columns each. The attention mask is the one returned by the
    tokenizer alongside the padded ids.
  """
  input_ids = []
  attention_masks = []

  for text in texts:
    encoded_dict = tokenizer.encode_plus(text,
                                      add_special_tokens = True,
                                      max_length = max_len,
                                      padding = 'max_length',
                                      # Without truncation a message longer than
                                      # max_len keeps its full length, the rows
                                      # become ragged and the tensor conversion
                                      # below fails.
                                      truncation = True,
                                      return_attention_mask = True)

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

  input_ids = tf.convert_to_tensor(input_ids)
  attention_masks = tf.convert_to_tensor(attention_masks)

  return input_ids, attention_masks

In [46]:
# Encode the training split, then the validation split, with the same max_len
# so both tensors share the same sequence dimension
texts_train_input_ids, texts_train_attention_masks = inputs_for_bert(
    texts=texts_train, max_len=max_len)
print(texts_train_input_ids.shape)

texts_val_input_ids, texts_val_attention_masks = inputs_for_bert(
    texts=texts_val, max_len=max_len)
print(texts_val_input_ids.shape)

(4177, 203)
(1393, 203)


In [47]:
# Convert the training labels to a TensorFlow tensor and report its shape
labels_train = tf.convert_to_tensor(value=labels_train)
print(labels_train.shape)

# Same conversion for the validation labels
labels_val = tf.convert_to_tensor(value=labels_val)
print(labels_val.shape)

(4177,)
(1393,)


# Transfer Learning

In [48]:
# Load the pretrained 'bert-base-uncased' weights under a sequence-classification
# head with two output labels
model_bert = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
"""# Freeze all the layers in BERT
model_bert.trainable = False"""

'# Freeze all the layers in BERT\nmodel_bert.trainable = False'

In [50]:
# Defining the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# Compiling the model.
# The classification head was created with num_labels=2, so it emits two raw
# logits per message, while the labels are integer class ids (0 = ham, 1 = spam).
# That pairing requires a sparse categorical loss applied to logits; a binary
# cross-entropy would treat the two logits as probabilities of a single class
# and optimize (and report accuracy on) the wrong quantity.
model_bert.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")])

model_bert.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training the model: each input is the pair [token ids, attention masks]
train_inputs = [texts_train_input_ids, texts_train_attention_masks]
val_inputs = [texts_val_input_ids, texts_val_attention_masks]

model_bert.fit(train_inputs,
               labels_train,
               batch_size=32,
               epochs=5,
               validation_data=(val_inputs, labels_val))

Epoch 1/5
Epoch 2/5

In [None]:
# Saving the model.
# Written to the same absolute location the reload cell reads from, so the
# save/load round trip does not depend on the current working directory.
model_bert.save("/content/model_bert_keeping_stopwords.tf")

In [None]:
import json

# Persist the per-epoch metrics held in model_bert.history as JSON.
# The context manager guarantees the file is flushed and closed before any
# later cell reads it back.
with open("/content/bert_keeping_stopwords_history.json", 'w') as history_file:
  json.dump(model_bert.history.history, history_file)

In [None]:
# Loading the saved training history and model to plot performance over epochs.
# The history file is opened in a context manager so its handle is closed
# as soon as the JSON has been parsed.
with open("/content/bert_keeping_stopwords_history.json", 'r') as history_file:
  bert_history = json.load(history_file)
model_bert = tf.keras.models.load_model("/content/model_bert_keeping_stopwords.tf")

In [None]:
import plotly.graph_objects as go

# One line trace per recorded series: training loss, then validation loss
fig = go.Figure()
for series_name in ("loss", "val_loss"):
  fig.add_trace(go.Scatter(y=bert_history[series_name],
                      mode='lines',
                      name=series_name))
fig.show()


In [None]:
#!zip -r /content/file.zip /content/model_bert.tf

In [None]:
#from google.colab import files
#files.download("/content/file.zip")