In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df["Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
torch.cuda.device_count()

1

In [6]:
torch.cuda.get_device_name(0)

'NVIDIA RTX A1000 6GB Laptop GPU'

In [7]:
torch.cuda.get_device_capability()

(8, 6)

In [8]:
torch.cuda.get_device_properties(0).total_memory / 1024**3


5.99951171875

In [9]:
df["Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [10]:
df["Category"] = df["Category"].map({"spam" : 1 , "ham" : 0 })

df.info()

df["Category"].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   int64 
 1   Message   5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


array([0, 1])

In [11]:
df.Category.nunique()

2

In [12]:
df_spam = df[df["Category"] == 1]
df_spam.shape

(747, 2)

In [16]:
dir(df)

['Category',
 'Message',
 'T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__arrow_c_stream__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__dataframe__',
 '__dataframe_consortium_standard__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pandas_priority__',
 '__

In [19]:
df_ham_small = df[df["Category"] == 0].sample(1000)
df_ham_small.shape

(1000, 2)

In [20]:
df_ham_small.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 4217 to 3531
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  1000 non-null   int64 
 1   Message   1000 non-null   object
dtypes: int64(1), object(1)
memory usage: 23.4+ KB


In [21]:
df_small = pd.concat([df_spam , df_ham_small])
df_small.Category.value_counts()

Category
0    1000
1     747
Name: count, dtype: int64

In [22]:
df_small.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1747 entries, 2 to 3531
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  1747 non-null   int64 
 1   Message   1747 non-null   object
dtypes: int64(1), object(1)
memory usage: 40.9+ KB


In [24]:
df_small.columns

Index(['Category', 'Message'], dtype='object')

In [28]:
X_train , X_test , y_train , y_test = train_test_split(df_small.Message , df_small.Category , test_size = 0.2 , random_state = 42)

In [29]:
X_train.shape

(1397,)

In [30]:
y_test.shape

(350,)

In [32]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [42]:
def tokenize_function(texts , labels):
    encoding = tokenizer(texts , padding = 'max_length' ,  max_length = 128 , truncation = True , return_tensors ='pt')
    return encoding["input_ids"] , encoding["attention_mask"] , torch.tensor(labels , dtype = torch.float)
    

In [43]:
tokenize_function(["Hurry up, click here", "I will see you tomorrow"], [1,0])

(tensor([[  101,  9241,  2039,  1010, 11562,  2182,   102,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

In [44]:
train_input_ids , train_attention_mask , train_labels  = tokenize_function(X_train.values.tolist() , y_train.values.tolist())
test_input_ids , test_attention_mask , test_labels = tokenize_function(X_test.values.tolist() , y_test.values.tolist())

In [46]:
train_dataset = torch.utils.data.TensorDataset(train_input_ids  , train_attention_mask , train_labels)
test_dataset = torch.utils.data.TensorDataset(test_input_ids , test_attention_mask , test_labels)

In [50]:
batch_size = 64

from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset , batch_size = batch_size , shuffle = True)
test_loader = DataLoader(test_dataset , batch_size = batch_size , shuffle = True)

In [51]:
bert = BertModel.from_pretrained('bert-base-uncased')
bert
bert.config.hidden_size

768

### Model

In [60]:
class SentimentalClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        for params in self.bert.parameters():
            params.requires_grad = False

        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size , 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256 , 1),
            nn.Sigmoid()
        )

    def forward(self , input_ids , attention_mask):
        bert_output = self.bert(input_ids  = input_ids ,attention_mask  = attention_mask)
        sentence_embedding = bert_output.last_hidden_state[:,0,:]
        return self.classifier(sentence_embedding)
        
        

In [63]:
model  = SentimentalClassifier()
criterion  = nn.BCELoss()
optimizer = optim.Adam(model.parameters() , lr = 0.0001)

model = model.to(device)
criterion = criterion.to(device)


In [67]:
epochs = 5


for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch, (input_ids , attention_mask , labels ) in enumerate(train_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device, dtype=torch.float)

        optimizer.zero_grad()
        output = model(input_ids , attention_mask).squeeze()
        loss = criterion(output , labels)

        loss.backward()

        optimizer.step()


        print(f"Batch: {batch}, Epoch: {epoch}, Loss:  {loss.item():0.2f}")
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss}')

Batch: 0, Epoch: 0, Loss:  0.21
Batch: 1, Epoch: 0, Loss:  0.18
Batch: 2, Epoch: 0, Loss:  0.14
Batch: 3, Epoch: 0, Loss:  0.17
Batch: 4, Epoch: 0, Loss:  0.18
Batch: 5, Epoch: 0, Loss:  0.19
Batch: 6, Epoch: 0, Loss:  0.14
Batch: 7, Epoch: 0, Loss:  0.17
Batch: 8, Epoch: 0, Loss:  0.15
Batch: 9, Epoch: 0, Loss:  0.20
Batch: 10, Epoch: 0, Loss:  0.14
Batch: 11, Epoch: 0, Loss:  0.15
Batch: 12, Epoch: 0, Loss:  0.20
Batch: 13, Epoch: 0, Loss:  0.21
Batch: 14, Epoch: 0, Loss:  0.08
Batch: 15, Epoch: 0, Loss:  0.16
Batch: 16, Epoch: 0, Loss:  0.21
Batch: 17, Epoch: 0, Loss:  0.11
Batch: 18, Epoch: 0, Loss:  0.19
Batch: 19, Epoch: 0, Loss:  0.14
Batch: 20, Epoch: 0, Loss:  0.15
Batch: 21, Epoch: 0, Loss:  0.15
Epoch 1/5, Training Loss: 0.1658430959690701
Batch: 0, Epoch: 1, Loss:  0.13
Batch: 1, Epoch: 1, Loss:  0.16
Batch: 2, Epoch: 1, Loss:  0.19
Batch: 3, Epoch: 1, Loss:  0.17
Batch: 4, Epoch: 1, Loss:  0.15
Batch: 5, Epoch: 1, Loss:  0.10
Batch: 6, Epoch: 1, Loss:  0.17
Batch: 7, Epoch

### Evaluation

In [74]:
# Evaluation
model.eval()
total_val_loss = 0
correct_predictions = 0

with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device, dtype=torch.float)

        outputs = model(input_ids, attention_mask).squeeze()
        loss = criterion(outputs, labels.view_as(outputs))
        total_val_loss += loss.item()

        preds = (outputs > 0.5).float()
        correct_predictions += torch.sum(preds == labels)
        
avg_val_loss = total_val_loss / len(test_loader)
val_accuracy = correct_predictions.double() / len(test_dataset)
print(f'Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy:.4f}')        

Validation Loss: 0.09651684761047363, Validation Accuracy: 0.9714
