In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Download the spaCy pipeline package that the import cell loads as `en_core_web_lg`.
!python -m spacy download en_core_web_lg -q

2023-02-18 13:13:57.341325: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-18 13:13:59.326251: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-18 13:13:59.326371: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-18 13:14:02.269058: E tensorfl

In [3]:
import pandas as pd
import string
import re
import numpy as np 
import sklearn
from sklearn.model_selection import train_test_split


# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')

import en_core_web_lg
from spacy.lang.en.stop_words import STOP_WORDS

import tensorflow as tf

from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForSequenceClassification





In [4]:
# Read the SMS spam CSV from the Colab working directory.
# URL noted by the author for this file:
# https://attspamdetector.s3.eu-west-3.amazonaws.com/spam.csv
df = pd.read_csv(
    "/content/spam.csv",
    encoding="ISO-8859-1",
)

In [5]:
# Number of rows read from the CSV
df.shape[0]

5572

In [6]:
# Preview the first five rows; "ham" marks messages that are not spam
df.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
# Show the raw column names before tidying the frame
print(df.columns)

# Discard the three empty "Unnamed" columns
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])

# Give the two remaining columns explicit names in a single rename
df = df.rename(columns={"v1": "type", "v2": "text"})

# Encode the label column: "ham" -> 0, every other value (spam) -> 1
df["type"] = df["type"].map(lambda label: 1 if label != "ham" else 0)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [8]:
# Class balance: how many messages carry each encoded label
df["type"].value_counts()

0    4825
1     747
Name: type, dtype: int64

In [9]:
# Sanity check: number of distinct values in the encoded label column
df["type"].nunique()

2

In [10]:
# Count missing values per column (isna is the same check as isnull)
df.isna().sum()

type    0
text    0
dtype: int64

# Text Preprocessing



In [11]:
# Instantiate the spaCy pipeline from the imported en_core_web_lg package;
# `nlp` is called on every message in the cleaning cell below.
nlp = en_core_web_lg.load()

In [12]:
# Step 1 - remove punctuation: keep only alphanumeric characters and spaces.
df["text_clean"] = df["text"].apply(
    lambda raw: "".join(ch for ch in raw if ch.isalnum() or ch == " ")
)

# Step 2 - lowercase and collapse EVERY run of spaces into a single space.
# A single replace("  ", " ") pass only halves the runs, so three or more consecutive
# spaces survived; spaCy then emits them as whitespace tokens that pass the
# stop-word filter and end up in the cleaned text.
df["text_clean"] = df["text_clean"].apply(
    lambda cleaned: " ".join(cleaned.lower().split())
)

# Step 3 - lemmatize and remove stop words: a token is dropped when either its
# surface form or its lemma is in spaCy's STOP_WORDS; kept tokens are replaced by
# their lemma. Uses boolean `and` instead of the bitwise `&` operator.
df["text_clean"] = df["text_clean"].apply(
    lambda cleaned: " ".join(
        token.lemma_
        for token in nlp(cleaned)
        if token.lemma_ not in STOP_WORDS and token.text not in STOP_WORDS
    )
)

In [13]:
# Compare the original text with the cleaned version on the first five rows
df.head(5)

Unnamed: 0,type,text,text_clean
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun early hor u c
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think usf live


In [14]:
# Print the encoded label next to the original text for rows labelled 0..14
first_rows = df.loc[0:14, ["type", "text"]]
for label, message in zip(first_rows["type"], first_rows["text"]):
  print(label, message)

0 Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
0 Ok lar... Joking wif u oni...
1 Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
0 U dun say so early hor... U c already then say...
0 Nah I don't think he goes to usf, he lives around here though
1 FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv
0 Even my brother is not like to speak with me. They treat me like aids patent.
0 As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
1 WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
1 Had your mobile 11 mont

In [15]:
# Independent copy of the spam rows (label 1), renumbered from 0.
# Note: this subset is taken before empty cleaned messages are dropped from `df`.
spam_df = df.loc[df["type"] == 1].copy().reset_index(drop=True)

In [16]:
# Independent copy of the ham rows (label 0), renumbered from 0.
# Note: this subset is taken before empty cleaned messages are dropped from `df`.
ham_df = df.loc[df["type"] == 0].copy().reset_index(drop=True)

In [17]:
# First five spam messages
spam_df.head(5)

Unnamed: 0,type,text,text_clean
0,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
1,1,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey darle 3 week word d like fun tb ok...
2,1,WINNER!! As a valued network customer you have...,winner value network customer select receivea ...
3,1,Had your mobile 11 months or more? U R entitle...,mobile 11 month u r entitle update late colour...
4,1,"SIX chances to win CASH! From 100 to 20,000 po...",chance win cash 100 20000 pound txt csh11 send...


In [18]:
# First five ham messages
ham_df.head(5)

Unnamed: 0,type,text,text_clean
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,0,U dun say so early hor... U c already then say...,u dun early hor u c
3,0,"Nah I don't think he goes to usf, he lives aro...",nah think usf live
4,0,Even my brother is not like to speak with me. ...,brother like speak treat like aids patent


In [19]:
# Check whether some rows of the dataframe became empty after text cleaning:
# collect the index label of every message with no words left.
empty_rows_indices = [
  i for i in range(len(df)) if not df["text_clean"].loc[i].split()
]

print(empty_rows_indices)

[43, 959, 1087, 1190, 1236, 1407, 2740, 2805, 2871, 2927, 3374, 3767, 4271, 4573, 4640, 4822, 5529]


In [20]:
# How many messages have no words left after cleaning
len(empty_rows_indices)

17

In [21]:
# Row 43 is listed in empty_rows_indices: its cleaned text is the empty string
df.loc[43, "text_clean"]

''

In [22]:
# Neighbouring row 42 for comparison (not in empty_rows_indices)
df.loc[42, "text_clean"]

'07732584351 rodger burn msg try reply sm free nokia mobile free camcorder 08000930705 delivery tomorrow'

In [23]:
# Remove the messages that became empty after cleaning and renumber the rows from 0.
# drop=True prevents reset_index from inserting the old labels as a spurious "index"
# column; reassigning (instead of inplace=True) keeps the step explicit.
df = df.drop(index=empty_rows_indices).reset_index(drop=True)

In [24]:
# Rows remaining once the empty messages are removed
df.shape[0]

5555

In [25]:
# After the index reset, label 43 now points at a non-empty message
df.loc[43, "text_clean"]

'great hope like man endowed ltgt inch'

In [26]:
# Fixed sequence length handed to inputs_for_bert, which forwards it to the
# tokenizer as max_length (encoded rows are padded to this many token ids).
max_len = 100

In [27]:
# Disabled exploration, kept as real comments so the cell no longer echoes a string
# literal as its output.
# After cleaning, how many words does the longest message contain?
# max_len = 0
#
# for i in range(0, len(df)):
#   max_len = max(max_len, len(df["text_clean"].loc[i].split()))
#
# print("Maximum message length in words:", max_len)

'# After cleaning, how many words does the longest message contain?\nmax_len = 0\n\nfor i in range(0, len(df)):\n  max_len = max(max_len, len(df["text_clean"].loc[i].split()))\n\nprint("Maximum message length in words:", max_len)'

In [28]:
# Cleaned text of the spam messages labelled 0..14 in spam_df
for cleaned in spam_df["text_clean"].loc[0:14]:
  print(cleaned)

free entry 2 wkly comp win fa cup final tkts 21st 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s
freemsg hey darle 3 week word d like fun tb ok xxx std chgs send å150 rcv
winner value network customer select receivea å900 prize reward claim 09061701461 claim code kl341 valid 12 hour
mobile 11 month u r entitle update late colour mobile camera free mobile update co free 08002986030
chance win cash 100 20000 pound txt csh11 send 87575 cost 150pday 6days 16 tsandcs apply reply hl 4 info
urgent win 1 week free membership å100000 prize jackpot txt word claim 81010 tc wwwdbuknet lccltd pobox 4403ldnw1a7rw18
xxxmobilemovieclub use credit click wap link txt message click httpwap xxxmobilemovieclubcomnqjkgighjjgcbl
england v macedonia miss goalsteam news txt ur national team 87077 eg england 87077 trywale scotland 4txtì¼120 poboxox36504w45wq 16
thank subscription ringtone uk mobile charge å5month confirm reply yes reply charge
07732584351 rodger burn msg try r

In [29]:
# Cleaned text of the ham messages labelled 0..14 in ham_df
for cleaned in ham_df["text_clean"].loc[0:14]:
  print(cleaned)

jurong point crazy available bugis n great world la e buffet cine amore wat
ok lar joke wif u oni
u dun early hor u c
nah think usf live
brother like speak treat like aids patent
request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune
m home soon want talk stuff anymore tonight k ve cry today
ve search right word thank breather promise help grant fulfil promise wonderful blessing time
date sunday
oh kim watch
eh u remember 2 spell yes v naughty v wet
fine thatåõs way u feel thatåõs way gota b
seriously spell
iûm try 2 month ha ha joke
ì pay lar da stock comin


In [30]:
# Cleaned SMS texts as a NumPy array (one string per message)
texts = df["text_clean"].to_numpy()
print(texts[0:5])

# Matching 0/1 labels as a NumPy array
labels = df["type"].to_numpy()
print(labels[0:5])

['jurong point crazy available bugis n great world la e buffet cine amore wat'
 'ok lar joke wif u oni'
 'free entry 2 wkly comp win fa cup final tkts 21st 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s'
 'u dun early hor u c' 'nah think usf live']
[0 0 1 0 0]


In [31]:
# One entry per message in the cleaned dataframe
texts.shape

(5555,)

In [32]:
# Must match texts.shape: one label per message
labels.shape

(5555,)

In [33]:
# Splitting the arrays into training and validation datasets (default 75% / 25%).
# - stratify=labels keeps the ham/spam ratio identical in both splits; the classes
#   are strongly imbalanced (4825 ham vs 747 spam before cleaning).
# - random_state fixes the shuffle so a Restart & Run All reproduces the same split.
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts,
    labels,
    stratify=labels,
    random_state=42,
)

In [34]:
# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint used for the model.
# inputs_for_bert reads this notebook-level `tokenizer` name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [35]:
# Sample message used to illustrate tokenization in the next cells
print("Actual text:", texts[2])

Actual text: free entry 2 wkly comp win fa cup final tkts 21st 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s


In [36]:
# Word pieces produced by the BERT tokenizer for the sample message
print("Tokens:", tokenizer.tokenize(texts[2]))

Tokens: ['free', 'entry', '2', 'w', '##k', '##ly', 'com', '##p', 'win', 'fa', 'cup', 'final', 't', '##kt', '##s', '21st', '2005', 'text', 'fa', '87', '##12', '##1', 'receive', 'entry', 'questions', '##t', '##d', 'tx', '##t', 'rate', '##tc', '##s', 'apply', '08', '##45', '##28', '##100', '##75', '##over', '##18', '##s']


In [37]:
# Vocabulary ids of those word pieces
print(
    "Token to ids:",
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(texts[2])),
)

Token to ids: [2489, 4443, 1016, 1059, 2243, 2135, 4012, 2361, 2663, 6904, 2452, 2345, 1056, 25509, 2015, 7398, 2384, 3793, 6904, 6584, 12521, 2487, 4374, 4443, 3980, 2102, 2094, 19067, 2102, 3446, 13535, 2015, 6611, 5511, 19961, 22407, 18613, 23352, 7840, 15136, 2015]


In [38]:
# First five cleaned messages, as they will be fed to the tokenizer
print(texts[:5])

['jurong point crazy available bugis n great world la e buffet cine amore wat'
 'ok lar joke wif u oni'
 'free entry 2 wkly comp win fa cup final tkts 21st 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s'
 'u dun early hor u c' 'nah think usf live']


In [39]:
# Confirm the container type of the message array
type(texts)

numpy.ndarray

In [40]:
# Disabled one-off encoding experiment, kept as real comments so the cell no longer
# echoes a string literal. Note: `X` is not defined in the visible notebook.
# encoded_dict = tokenizer.encode_plus(X[0],
#                                     add_special_tokens = True,
#                                     max_length = max_len,
#                                     padding = 'max_length',
#                                     return_attention_mask = True)

"encoded_dict = tokenizer.encode_plus(X[0],\n                                    add_special_tokens = True,\n                                    max_length = max_len,\n                                    padding = 'max_length',\n                                    return_attention_mask = True)"

In [41]:
# Disabled scratch check (real comment instead of an echoed string literal):
# type(encoded_dict)

'type(encoded_dict)'

In [42]:
# Disabled scratch check (real comment instead of an echoed string literal):
# encoded_dict.keys

'encoded_dict.keys'

In [43]:
# Disabled scratch check (real comment instead of an echoed string literal):
# print("dict", encoded_dict['input_ids'])

'print("dict", encoded_dict[\'input_ids\'])'

# Creating inputs for BERT model

In [45]:
# Creating inputs for BERT model

def inputs_for_bert(texts, max_len):
  """Encode texts into fixed-length BERT input tensors.

  Relies on the notebook-level ``tokenizer``. Each text gets the special tokens
  added, is truncated to at most ``max_len`` token ids and padded up to
  ``max_len``, so every encoded row has the same length and the rows can be
  stacked into a rectangular tensor.

  Args:
    texts: iterable of strings to encode.
    max_len: number of token ids per encoded text (special tokens included).

  Returns:
    A tuple ``(input_ids, attention_masks)`` of tensors created with
    ``tf.convert_to_tensor``, holding one row per text.
  """
  input_ids = []
  attention_masks = []

  for text in texts:
    # str() turns numpy string scalars into plain Python strings for the tokenizer.
    encoded_dict = tokenizer.encode_plus(str(text),
                                      add_special_tokens = True,
                                      max_length = max_len,
                                      padding = 'max_length',
                                      # Without truncation a text longer than max_len keeps
                                      # all its tokens, the rows become ragged and
                                      # tf.convert_to_tensor below fails.
                                      truncation = True,
                                      return_attention_mask = True)

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

  input_ids = tf.convert_to_tensor(input_ids)
  attention_masks = tf.convert_to_tensor(attention_masks)

  return input_ids, attention_masks

In [47]:
# Encode each split from its OWN texts. The validation inputs were previously built
# from texts_train, so they duplicated the training set and did not line up with
# labels_val.
texts_train_input_ids, texts_train_attention_masks = inputs_for_bert(texts_train, max_len)
print(texts_train_input_ids.shape)
texts_val_input_ids, texts_val_attention_masks = inputs_for_bert(texts_val, max_len)
print(texts_val_input_ids.shape)

(4166, 100)


In [48]:
# Convert both label arrays to tensors, training split first, then validation
labels_train, labels_val = (
    tf.convert_to_tensor(split_labels)
    for split_labels in (labels_train, labels_val)
)

# Transfer Learning

In [49]:
# Instantiate BERT with a sequence-classification head for two classes (ham / spam).
# The encoder weights come from the 'bert-base-uncased' checkpoint; the 'classifier'
# layer is newly initialized (see the load message below) and must be trained.
model_bert = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Downloading (…)"tf_model.h5";:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# Defining the optimizer.
# All 109M BERT parameters are trainable here, so use a small fine-tuning learning
# rate; Adam's default of 1e-3 is large enough to wreck the pretrained weights.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# Compiling the model.
# The model was created with num_labels=2, so it outputs two raw logits per message,
# while the labels are integer class ids (0 = ham, 1 = spam). That pairing requires a
# sparse categorical loss on logits; BinaryCrossentropy expects a single probability
# per example with the same shape as the labels.
model_bert.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

model_bert.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fine-tune BERT for 5 epochs; each input is the pair [token ids, attention mask]
model_bert.fit(
    x=[texts_train_input_ids, texts_train_attention_masks],
    y=labels_train,
    batch_size=32,
    epochs=5,
    validation_data=(
        [texts_val_input_ids, texts_val_attention_masks],
        labels_val,
    ),
)

In [None]:
# Saving the model
model_bert.save("model_bert.h5")

In [None]:
import json
json.dump(model_bert.history.history, open("/content/bert_history.json", 'w'))

In [None]:
# Loading model to plot performance over epochs
bert_history = json.load(open("/content/bert_history.json", 'r'))
model_bert = tf.keras.models.load_model("/content/model_bert.h5")

In [None]:
import plotly.graph_objects as go

# Training vs validation loss per epoch for the BERT model.
fig = go.Figure()
for curve_name in ("loss", "val_loss"):
    fig.add_trace(go.Scatter(y=bert_history[curve_name],
                             mode='lines',
                             name=curve_name))
# A figure should stand alone: give it a title and axis labels.
fig.update_layout(title="BERT fine-tuning: loss per epoch",
                  xaxis_title="Epoch (0-based)",
                  yaxis_title="Loss")
fig.show()


In [None]:
# Disabled scratch check (real comment instead of a bare string literal).
# `input_ids` is only assigned inside inputs_for_bert in the visible notebook.
# print(input_ids)

In [None]:
# Disabled scratch check (real comment instead of a bare string literal):
# len(input_ids)

In [None]:
# Disabled scratch check (real comment instead of a bare string literal):
# all(len(i) == len(input_ids[0]) for i in input_ids)

In [None]:
# Disabled scratch check (real comments instead of a bare string literal).
# Lists every encoded row whose length differs from max_len:
# for i in range(0, len(input_ids)):
#   if len(input_ids[i]) != max_len:
#     print(i, ":", len(input_ids[i]), input_ids[i])
#   else:
#     pass

In [None]:
# Disabled scratch check (real comment instead of a bare string literal):
# len(input_ids[-1])

In [None]:
# Disabled TensorFlow Hub configuration, kept for reference as real comments.
# The original wrapped these lines in a triple-quoted string that ended with four
# quote characters, leaving a stray '"' and making the cell a SyntaxError.
#
# Configure the model
# BERT_MODEL = "https://tfhub.dev/google/experts/bert/wiki_books/sst2/2"
# Importing the preprocessing that matches the model
# PREPROCESS_MODEL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

In [None]:
# Original (uncleaned) message texts as a pandas Series
sentences = df["text"]

In [None]:
# Display the Series of original messages
sentences

In [None]:
# Cleaned text of the messages labelled 0..14
for cleaned in df["text_clean"].loc[0:14]:
  print(cleaned)
  

In [None]:
# Word-level Keras tokenizer for the LSTM model: learn the vocabulary from the cleaned
# texts, then map every message to a list of integer word ids.
# NOTE(review): this rebinds the notebook-level name `tokenizer`, which previously held
# the BertTokenizer that inputs_for_bert reads as a global. Calling inputs_for_bert
# after this cell would use the Keras tokenizer instead. The name is kept because the
# LSTM model cell reads `tokenizer.word_index`.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df["text_clean"])
df["text_encoded"] = tokenizer.texts_to_sequences(df["text_clean"])

In [None]:
# Inspect the new text_encoded column on the first five rows
df.head(5)

In [None]:
#df["text_padded_encoded"] = df["text_encoded"].apply(lambda x: tf.keras.preprocessing.sequence.pad_sequences(x, padding="post"))

In [None]:
# Frame is unchanged here: the padding experiment in the cell above is commented out
df.head(5)

In [None]:
# Pad the integer sequences at the end (padding="post") so they form one rectangular array
text_pad = tf.keras.preprocessing.sequence.pad_sequences(df["text_encoded"], padding="post")

In [None]:
# Dataset of (padded sequence, label) pairs, in dataframe row order
full_ds = tf.data.Dataset.from_tensor_slices((text_pad, df["type"]))

In [None]:
# (rows, columns) of the dataframe; the row count drives TAKE_SIZE in the next cell
df.shape

In [None]:
# First 70% of the rows form the training set, the remainder the test set
TAKE_SIZE = int(df.shape[0] * 0.7)

# Training examples are shuffled over the whole training slice, then batched by 64
train_data = full_ds.take(TAKE_SIZE).shuffle(TAKE_SIZE).batch(64)

# Test examples keep their original order
test_data = full_ds.skip(TAKE_SIZE).batch(64)

In [None]:
# Type of the object returned by taking a single batch from the training dataset
type(train_data.take(1))

In [None]:
# Pull one training batch; `text` and `text_type` stay bound for the cells below
text, text_type = next(iter(train_data.take(1)))
print(text, text_type)

In [None]:
# Type of the batch of padded sequences pulled in the previous cell
type(text)

In [None]:
# Shape of that batch: (batch size, padded sequence length)
text.shape

In [None]:
# Padded sequence length; reused as the Embedding input length of the LSTM model
text.shape[1]

In [None]:
# Display the batch of padded word-id sequences
text

In [None]:
# Number of columns in the dataframe
df.shape[1]

# Transfer Learning

In [None]:
# TODO: choose the pretrained base model for this section. The assignment was left
# without a right-hand side, which is a SyntaxError; this placeholder keeps the
# notebook runnable from top to bottom.
base_model = None

In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM

# Size of the vocabulary learned by the Keras tokenizer; +1 below because id 0 is the
# value used for padding.
vocab_size = len(tokenizer.word_index)

# Sequence length taken from the padded array itself, rather than from the `text`
# variable that only exists if the batch-inspection cell has been run.
sequence_length = text_pad.shape[1]

model_lstm = tf.keras.Sequential([
                  Embedding(vocab_size+1, 64, input_shape=[sequence_length,], name="embedding"),
                  # First LSTM returns the full sequence so the second one can consume it
                  # (layer name fixed: it was misspelled "ltsm_1").
                  LSTM(units=64, return_sequences=True, name="lstm_1"),
                  # Second LSTM returns only its last output
                  LSTM(units=64, return_sequences=False, name="lstm_2"),
                  Dense(16, activation='relu', name="dense_1"),
                  Dense(8, activation='relu'),
                  # Single sigmoid unit: probability that the message is spam (label 1)
                  Dense(1, activation="sigmoid", name="last")
                  ])

In [None]:
# Layer-by-layer overview of the LSTM model and its parameter counts
model_lstm.summary()

In [None]:
# Adam with default settings
optimizer = tf.keras.optimizers.Adam()

# Binary cross-entropy and binary accuracy match the single sigmoid output unit
model_lstm.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

In [None]:
# `weights` was never defined in the notebook, so this cell raised a NameError.
# Compute balanced class weights from the training slice (the first TAKE_SIZE rows):
# weight = n_samples / (n_classes * class_count), so the rarer spam class
# contributes as much to the loss as the ham class.
train_labels = df["type"].values[:TAKE_SIZE]
class_counts = np.bincount(train_labels, minlength=2)
weights = {
    label: float(len(train_labels) / (len(class_counts) * count))
    for label, count in enumerate(class_counts)
}

model_lstm.fit(train_data,
               epochs=100,
               validation_data=test_data,
               class_weight=weights)

In [None]:
# Save the trained LSTM model to an HDF5 file in the current working directory.
# NOTE(review): the reload cell reads "/content/model_lstm.h5"; the two paths only
# agree when the working directory is /content - confirm.
model_lstm.save("model_lstm.h5")

In [None]:
import json

# Persist the per-epoch metrics; the context manager closes (and flushes) the file,
# which the bare open() call never did.
with open("/content/LSTM_history.json", 'w') as history_file:
    json.dump(model_lstm.history.history, history_file)

In [None]:
# Reload the training history; the context manager closes the file handle.
with open("/content/LSTM_history.json", 'r') as history_file:
    LSTM_history = json.load(history_file)

# Reload the saved LSTM model
model_lstm = tf.keras.models.load_model("/content/model_lstm.h5")

In [None]:
import plotly.graph_objects as go

# Training vs validation loss per epoch for the LSTM model.
fig = go.Figure()
for curve_name in ("loss", "val_loss"):
    fig.add_trace(go.Scatter(y=LSTM_history[curve_name],
                             mode='lines',
                             name=curve_name))
# A figure should stand alone: give it a title and axis labels.
fig.update_layout(title="LSTM: loss per epoch",
                  xaxis_title="Epoch (0-based)",
                  yaxis_title="Loss")
fig.show()
