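## Text Classification using BERT

Binary spam/ham classification of text messages using a pretrained BERT encoder from TensorFlow Hub followed by a single sigmoid output layer.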

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd

In [2]:
# Load the labelled messages; the frame has two columns, Category (ham/spam) and Message.
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Per-class summary of the Message column: row count, distinct messages,
# the most frequent message and how often it occurs.
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [4]:
# Class balance check: ham heavily outnumbers spam in the raw data.
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [12]:
# Keep only the rows labelled spam (the minority class).
df_spam = df.loc[df['Category'] == 'spam']
df_spam.shape

(747, 2)

In [6]:
# Keep only the rows labelled ham (the majority class).
df_ham = df.loc[df['Category'] == 'ham']
df_ham.shape

(4825, 2)

In [14]:
# Downsample the majority class (ham) to the number of spam rows so both classes
# end up the same size. The fixed random_state makes the draw repeatable, so a
# fresh "Restart & Run All" produces the same balanced set and comparable metrics.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)
df_ham_downsampled.shape

(747, 2)

In [8]:
# Stack the spam rows on top of the downsampled ham rows to get a class-balanced frame.
# The original row labels are kept (the index is not reset).
df_balanced = pd.concat([df_spam, df_ham_downsampled], axis=0)
df_balanced.shape

(1494, 2)

In [9]:
# Confirm that both classes now have the same number of rows.
df_balanced.Category.value_counts()

spam    747
ham     747
Name: Category, dtype: int64

In [15]:
# Eyeball five randomly chosen rows of the balanced frame.
df_balanced.sample(n=5)

Unnamed: 0,Category,Message
4538,ham,Normally i use to drink more water daily:)
1072,spam,URGENT! We are trying to contact U. Todays dra...
1328,spam,Ur balance is now £500. Ur next question is: W...
1518,spam,Our brand new mobile music service is now live...
10,ham,I'm gonna be home soon and i don't want to tal...


In [18]:
# Encode the label as a numeric target: 1 for spam, 0 for anything else (ham).
# Vectorized comparison, cast to int64 so the column holds plain integer labels.
df_balanced['spam'] = (df_balanced['Category'] == 'spam').astype('int64')
df_balanced.sample(n=10)

Unnamed: 0,Category,Message,spam
2684,ham,I'm okay. Chasing the dream. What's good. What...,0
4797,spam,URGENT This is our 2nd attempt to contact U. Y...,1
1747,ham,I don know account details..i will ask my mom ...,0
1856,ham,K.:)you are the only girl waiting in reception...,0
3340,ham,Babe !!!! I LOVE YOU !!!! *covers your face in...,0
3817,ham,How long does it take to get it.,0
1986,ham,The length is e same but e top shorter n i got...,0
5269,spam,"If you don't, your prize will go to another cu...",1
164,spam,-PLS STOP bootydelious (32/F) is inviting you ...,1
2947,ham,make that 3! 4 fucks sake?! x,0


In [19]:
from sklearn.model_selection import train_test_split

# Split messages/labels into train and test sets.
# - stratify keeps the spam/ham ratio identical in both splits.
# - random_state pins the shuffle so the split (and the metrics reported further
#   down) can be reproduced on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)
X_train.head(4)

808                     Boooo you always work. Just quit.
463     UpgrdCentre Orange customer, you may now claim...
1120    Bored of speed dating? Try SPEEDCHAT, txt SPEE...
4637                           K k pa Had your lunch aha.
Name: Message, dtype: object

In [20]:
# Wrap two TF-Hub models as Keras layers:
#   bert_encoder    - uncased English BERT (L-12, H-768, A-12), version 4
#   bert_preprocess - the uncased English BERT preprocessing model, version 3
# NOTE(review): neither layer is created with trainable=True, so presumably the
# encoder weights stay frozen during model.fit() — confirm against the TF-Hub docs.
bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')
bert_preprocess = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')



In [21]:
def get_sentence_embeding(sentences):
    """Return the BERT pooled-output embedding for each sentence in a batch.

    Args:
        sentences: Batch of raw text strings; passed unchanged to `bert_preprocess`.

    Returns:
        The 'pooled_output' entry of the encoder's output dict: one embedding
        row per input sentence (768 values per row with the encoder loaded in
        this notebook).
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']

# Smoke-test the helper on two short messages; the cell displays the returned
# tensor (one embedding row per message).
get_sentence_embeding([
    '500$ discount, hurry up',
    'Bhavin, are you up for a volleybal game tomorrow?'
])

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.7734203 , -0.501377  , -0.80737436, ..., -0.60576564,
        -0.74096286,  0.8849831 ],
       [-0.8720835 , -0.50543964, -0.94446677, ..., -0.8584752 ,
        -0.7174535 ,  0.8808299 ]], dtype=float32)>

In [22]:
# Embed six short phrases: indices 0-2 are fruit names, indices 3-5 are names of people.
# `e` holds one embedding row per phrase, in the order listed here.
e = get_sentence_embeding([
    'banana',
    'grapes',
    'mango',
    'jeff bezos',
    'elon musk',
    'bill gates'
])

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
# Compare the embeddings at index 3 ('jeff bezos') and index 4 ('elon musk').
# Each row is wrapped in a list to form a one-row batch; the result is a 1x1 matrix.
cosine_similarity([e[3]], [e[4]])

array([[0.98720354]], dtype=float32)

In [26]:
# BERT layers.
# The input is a batch of raw strings (one scalar string per example, hence shape=()).
# bert_preprocess turns the text into input_word_ids / input_mask / input_type_ids
# tensors (length 128 each, see model.summary()), which bert_encoder consumes.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head: dropout (rate 0.1) on the pooled sentence embedding, then a
# single sigmoid unit producing one score in (0, 1) per message.
# NOTE(review): the name `l` is easy to misread as `1`; it is kept because it is a
# notebook-level name that other cells could reference.
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Tie the string input to the sigmoid output to form the end-to-end model
# (raw text in, score out).
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [29]:
# Print the architecture table: each layer, its output shape, parameter count and inputs.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                    

In [30]:
# Metrics tracked during training; model.evaluate() reports them in this order
# after the loss value.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# Single sigmoid output trained against 0/1 labels -> binary cross-entropy.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)

# Train for 10 epochs on the training split.
model.fit(x=X_train, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x162b14f1690>

In [31]:
# Score the held-out split. The returned list is [loss, accuracy, precision, recall],
# i.e. the loss followed by the METRICS passed to model.compile().
model.evaluate(X_test, y_test)



[0.27102574706077576,
 0.9090909361839294,
 0.9005235433578491,
 0.9197860956192017]

### Inference

In [32]:
# Hand-written sample messages for a quick sanity check of the trained model:
# the first three read like promotional/spam texts, the last two like ordinary chat.
# NOTE(review): these are text messages rather than reviews; the variable name is
# left unchanged because it is a notebook-level name.
reviews = [
    'Reply to win 1100$ weekly! Where will the 2006 FIFA World Cup be held? Send STOP to 87239 to end',
    'You are awared a SiPix Digital Camera! call 086528557886 from landline. Delivery within 28days.',
    'it to 80488. Your 500 free text messages are valid until 31 December 2005.',
    'Hey Sam, Are you coming for a cricket game tomorrow',
    'Why dont you wait til at least wednesday to see if you get your.']

# Each output row is the sigmoid score for one message; the model was trained with
# spam encoded as 1, so higher scores mean "more likely spam".
model.predict(reviews)



array([[0.7712761 ],
       [0.60812867],
       [0.7085078 ],
       [0.17091317],
       [0.06942579]], dtype=float32)