In [16]:
import torch 
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader

In [18]:
!pip3 install datasets

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
Downloading multiprocess-0.70.16-py312-none-any.whl (146 kB)
Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl (30 kB)
Installing collected packages: xxhash, multiprocess, datasets
Successfully installed datasets-4.0.0 multiprocess-0.70.16 xxhash-3.5.0


In [19]:
from datasets import load_dataset

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [21]:
df=pd.read_csv("spam.csv")
df.head(4)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...


### Category is text we'll convert it into a number

In [22]:
df['Category']=df['Category'].map({'ham':0,'spam':1})

In [23]:
df.head(4)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...


In [24]:
df['Category'].value_counts()

Category
0    4825
1     747
Name: count, dtype: int64

### Reducing the dataset size to reduce training time

In [26]:
df_spam=df[df['Category']==1]
df_spam.shape

(747, 2)

In [27]:
df_ham_small=df[df['Category']==0].sample(n=1000, random_state=42)
df_ham_small.shape

(1000, 2)

In [30]:
df_small=pd.concat([df_spam,df_ham_small])
df_small.shape

(1747, 2)

In [31]:
df_small['Category'].value_counts()

Category
0    1000
1     747
Name: count, dtype: int64

### Train,Test Split

In [32]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df_small['Message'],df_small['Category'],test_size=0.2,random_state=42)

In [33]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1397,), (350,), (1397,), (350,))

In [34]:
y_train.value_counts()

Category
0    812
1    585
Name: count, dtype: int64

In [35]:
y_test.value_counts()

Category
0    188
1    162
Name: count, dtype: int64

### Tokenizing the Messages we have

In [None]:
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(texts,labels):
    encodings=tokenizer(texts,padding='max_length',max_length=128,truncation=True,return_tensors='pt')
    return encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels,dtype=torch.float)

tokenize_function(["Hello, how are you?", "This is a spam message."], [0, 1])##each token dimension is 128 

(tensor([[  101,  7592,  1010,  2129,  2024,  2017,  1029,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

In [39]:
type(X_train)

pandas.core.series.Series

### Different ways of converting X_train to list so that we can pass it into the tokenize_function 

In [46]:
checking=X_train.tolist()
checking[3]

'Yes:)sura in sun tv.:)lol.'

In [53]:
X_train.values[3] ##converts the pandas series to numpy array

'Yes:)sura in sun tv.:)lol.'

In [54]:
type(X_train.values)

numpy.ndarray

In [52]:
type(X_train.values.tolist()) ## converts the pandas series to list via numpy array

list

In [55]:
train_input_ids, train_attention_masks, train_labels=tokenize_function(X_train.values.tolist(), y_train.values.tolist())

In [66]:
type(train_input_ids), type(train_attention_masks), type(train_labels)

(torch.Tensor, torch.Tensor, torch.Tensor)

In [67]:
train_input_ids[0]

tensor([  101, 22950,  1011,  4471,  2415,  1009,  4008,  2581,  2581,  2683,
         2581, 19841, 16086,  2692,  2683,  2339,  3524,  1029,  6611,  2005,
         2115,  2925,  8299,  1024,  1013,  1013, 10922,  1012, 22950,  1012,
         4012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [68]:
train_input_ids[0].shape

torch.Size([128])

In [56]:
val_input_ids, val_attention_masks, val_labels=tokenize_function(X_test.values.tolist(), y_test.values.tolist())

In [58]:
train_dataset=torch.utils.data.TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset=torch.utils.data.TensorDataset(val_input_ids, val_attention_masks, val_labels)

In [80]:
print(f"The first part represents the input_ids of the sentence: {train_dataset[0][0].shape}")
print(f"The second part represents the attention mask: {train_dataset[0][1].shape}")
print(f"The third part represents the label: {train_dataset[0][2]}")

The first part represents the input_ids of the sentence: torch.Size([128])
The second part represents the attention mask: torch.Size([128])
The third part represents the label: 1.0


#### ok,now all these tokens are passed to bert model which gives out contextual embeddings right

In [65]:
train_dataset[0][0].shape ##each word is represented by 128 tokens with the first token being the [CLS] token

torch.Size([128])

In [60]:
batch_size=64
train_loader=DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader=DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
bert= BertModel.from_pretrained('bert-base-uncased')
bert.to(device)
bert.config.hidden_size ## hidden size of the BERT model, which is 768 

768

In [62]:
bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
class SentimentClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        for param in self.bert.parameters():
            param.requires_grad = False ##Freezing all BERT layers

        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 256),##768 is the hidden size of BERT
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask):
        bert_output=self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sentence_embedding=bert_output.last_hidden_state[:,0,:]## Means that for each sentence,take the CLS token output and take all the 768 dimensions of the CLS token output
        return self.classifier(sentence_embedding)##passing the CLS token output through the classifier

#### No, you don't have "64 new 768 neurons" for each batch. Let's clarify what's happening with the dimensions and the neural network.

Your understanding of sentence_embedding is correct: for each sentence in a batch, you extract its 768-dimensional [CLS] token embedding.

Here's how it plays out with your classifier:

Output Dimensions and Neurons
sentence_embedding (Input to self.classifier)

For a batch size of 64, sentence_embedding will have a shape of (64, 768).

This means you have 64 independent 768-dimensional vectors, one for each sentence in the batch. These are inputs to your classifier.

self.classifier Layers

nn.Linear(self.bert.config.hidden_size, 256):

self.bert.config.hidden_size is 768.

This layer takes the (64, 768) sentence_embedding as input.

It maps each 768-dimensional input vector to a 256-dimensional output vector.

The output shape of this layer will be (64, 256).

This layer has 768 input "neurons" (or input features) and 256 output "neurons" (or output features). The connections between these are learned weights.

nn.ReLU(): This is an activation function applied element-wise. It doesn't change the tensor's shape. The shape remains (64, 256).

nn.Dropout(0.3): This randomly sets a fraction of input units to zero during training to prevent overfitting. It also doesn't change the tensor's shape. The shape remains (64, 256).

nn.Linear(256, 1):

This layer takes the (64, 256) tensor from the previous layer.

It maps each 256-dimensional input vector to a 1-dimensional output (a single scalar).

The output shape of this layer will be (64, 1).

This layer has 256 input "neurons" and 1 output "neuron".

nn.Sigmoid(): This activation function squashes the output of the final linear layer to a probability between 0 and 1. It doesn't change the tensor's shape. The final outputs shape is (64, 1).

Conclusion

In [84]:
model=SentimentClassifier()
criterion=nn.BCELoss()
optimizer=AdamW(model.parameters(), lr=0.001)
criterion=criterion.to(device)
model.to(device)

SentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme

### For a batch size of 64:

outputs (before squeeze()): The shape will be (64, 1). Visually, it's more like [[0.7], [0.1], ..., [0.9]]. Each inner list has one probability.

labels: The shape will be (64,). Visually, it's like [1, 0, ..., 1]. Each element is a single binary label.

After outputs.squeeze(), outputs will become (64,), matching the labels shape.

In [86]:
epochs=2

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch,(input_ids, attention_mask, labels) in enumerate(train_loader):
        input_ids=input_ids.to(device)
        attention_mask=attention_mask.to(device)
        labels=labels.to(device)

        optimizer.zero_grad()
        outputs=model(input_ids=input_ids, attention_mask=attention_mask)##sending the input_ids and attention_mask to the model to get the outputs of the classifier 0 or 1
        loss=criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
        
        print(f"Epoch: {epoch+1}, Batch: {batch+1}, Loss: {loss.item()}")
        total_train_loss += loss.item()
    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch: {epoch+1}, Average Training Loss: {avg_train_loss}")

Epoch: 1, Batch: 1, Loss: 0.0763472467660904
Epoch: 1, Batch: 2, Loss: 0.02298722416162491
Epoch: 1, Batch: 3, Loss: 0.1071750819683075
Epoch: 1, Batch: 4, Loss: 0.0867910087108612
Epoch: 1, Batch: 5, Loss: 0.06929303705692291
Epoch: 1, Batch: 6, Loss: 0.07982479780912399
Epoch: 1, Batch: 7, Loss: 0.07815876603126526
Epoch: 1, Batch: 8, Loss: 0.19440428912639618
Epoch: 1, Batch: 9, Loss: 0.08208862692117691
Epoch: 1, Batch: 10, Loss: 0.033225275576114655
Epoch: 1, Batch: 11, Loss: 0.0723995566368103
Epoch: 1, Batch: 12, Loss: 0.04688543826341629
Epoch: 1, Batch: 13, Loss: 0.14591826498508453
Epoch: 1, Batch: 14, Loss: 0.1261478066444397
Epoch: 1, Batch: 15, Loss: 0.20270179212093353
Epoch: 1, Batch: 16, Loss: 0.03362751379609108
Epoch: 1, Batch: 17, Loss: 0.11942648142576218
Epoch: 1, Batch: 18, Loss: 0.09854297339916229
Epoch: 1, Batch: 19, Loss: 0.08300063014030457
Epoch: 1, Batch: 20, Loss: 0.14304859936237335
Epoch: 1, Batch: 21, Loss: 0.05111919716000557
Epoch: 1, Batch: 22, Loss:

In [87]:
## Evaluation
model.eval()
total_eval_loss = 0
correct_predictions = 0

with torch.no_grad():
    for batch, (input_ids, attention_mask, labels) in enumerate(test_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.squeeze(), labels)
        
        total_eval_loss += loss.item()
        
        predictions = (outputs.squeeze() > 0.5).float()  # Convert probabilities to binary predictions
        correct_predictions += (predictions == labels).sum().item()

    avg_eval_loss = total_eval_loss / len(test_loader)
    accuracy = correct_predictions / len(test_dataset)

    print(f"Average Evaluation Loss: {avg_eval_loss}")
    print(f"Accuracy: {accuracy * 100:.2f}%")

Average Evaluation Loss: 0.06580272937814395
Accuracy: 97.71%


In [88]:
def predict(model,text,max_length=128):
    tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')

    #Tokenize the input text
    encoding=tokenizer(
        text,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt'
    )

    input_ids=encoding['input_ids'].to(device)##basic token ids of the sentence
    attention_mask=encoding['attention_mask'].to(device)##attention mask to ignore the padding tokens

    model=model.to(device)
    model.eval()

    with torch.no_grad():
        output=model(input_ids=input_ids, attention_mask=attention_mask)
        prediction=(output.squeeze() > 0.5).float()
        return 'spam' if prediction.item() == 1 else 'ham'

In [None]:
predict(model, "Congratulations! You've won a lottery worth $1,000,000! Claim your prize now!")

'spam'

In [92]:
predict(model,"Hello,you are eligible for a discount on your next purchase.Click here to claim your offer.")

'spam'

In [93]:
predict(model, "Hello, how are you?")

'ham'