# Text Classification Using BERT and Transformer

In [1]:
# !pip install --upgrade pip

In [2]:
!pip install h5py
!pip install typing-extensions
!pip install wheel



In [3]:
!pip install --quiet "tensorflow-text==2.15.*"
!pip install tensorflow_hub



## BERT: Classify spam vs. non-spam (ham) emails

In [4]:
import tensorflow as tf
import tensorflow_hub as tfhub
import tensorflow_text as tftext

## Import dataset

In [5]:
import pandas as pd

# Read the labelled messages from the CSV next to the notebook,
# then preview the first five rows.
df = pd.read_csv('spam.csv')
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Summarise the ham and spam messages by category

In [6]:
# Per-category summary of the messages (count, unique, top, freq).
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [7]:
# Number of messages in each class.
df.loc[:, 'Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [8]:
# Spam-to-ham ratio, computed from the data instead of hard-coded counts so it
# stays correct if the dataset changes. Note this is a ratio between the two
# classes, not spam's share of all messages.
float(df['Category'].eq('spam').sum() / df['Category'].eq('ham').sum())

0.15481865284974095

#### About 13% of the messages are spam and 87% are ham (747 vs. 4,825, i.e. roughly 0.15 spam messages per ham message): this indicates class imbalance

In [9]:
# Keep only the rows labelled 'spam' and report the subset's dimensions.
df_spam = df.loc[df['Category'].eq('spam')]
df_spam.shape

(747, 2)

In [10]:
# Keep only the rows labelled 'ham' and report the subset's dimensions.
df_ham = df.loc[df['Category'].eq('ham')]
df_ham.shape

(4825, 2)

#### Balance the dataset by downsampling the ham class to the size of the spam class

In [16]:
# Randomly downsample ham to the same number of rows as spam. The fixed
# random_state makes the sample (and every result derived from it) reproducible
# across kernel restarts; without it each run trains on a different subset.
df_ham_downgrade = df_ham.sample(n=len(df_spam), random_state=42)
df_ham_downgrade.shape

(747, 2)

#### Concatenate the spam rows and the downsampled ham rows into one balanced pandas DataFrame

In [19]:
# Stack the spam rows on top of the downsampled ham rows; the original row
# labels are kept (no index reset).
df_balanced = pd.concat(objs=[df_spam, df_ham_downgrade], axis=0)
df_balanced

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
2855,ham,Haha... Hope ü can hear the receipt sound... G...
2769,ham,I am on the way to ur home
3113,ham,Just haven't decided where yet eh ?
3884,ham,"Gumby's has a special where a &lt;#&gt; "" che..."


In [20]:
# Row and column count of the combined (balanced) frame.
df_balanced.shape

(1494, 2)

In [22]:
# Confirm both classes now have the same number of rows.
df_balanced.loc[:, 'Category'].value_counts()

Category
spam    747
ham     747
Name: count, dtype: int64

#### Add a numeric 'spam' label column: 1 for spam, 0 for ham

In [26]:
# Binary target column: 1 where Category is 'spam', 0 otherwise (i.e. ham).
# A vectorized comparison replaces the row-wise apply(lambda ...); the explicit
# int64 cast keeps the same integer dtype the apply-based version produced.
df_balanced['spam'] = df_balanced['Category'].eq('spam').astype('int64')
# Seeded so the preview shows the same five rows on every run.
df_balanced.sample(5, random_state=42)

Unnamed: 0,Category,Message,spam
3642,spam,"You can stop further club tones by replying ""S...",1
4553,ham,"SYMPTOMS when U are in love: ""1.U like listeni...",0
2442,ham,Back in brum! Thanks for putting us up and kee...,0
3298,spam,Todays Voda numbers ending 5226 are selected t...,1
3397,spam,URGENT! Your Mobile number has been awarded wi...,1
