in this notebook we are going to classify some documents into 5 categs (sport, tech, entertainement , politics and Busness )
for this NLP task which is docs classification we are going to use BERT

In [1]:
import pandas as pd
import torch 
from transformers import BertTokenizer
import numpy as np
from transformers import BertModel
from tqdm import tqdm 


## Intro
in this notebook we are going to classify some documents into 5 categs (sport, tech, entertainement , politics and Busness )
for this NLP task which is docs classification we are going to use BERT

In [2]:
import pandas as pd
import torch 
from transformers import BertTokenizer
import numpy as np
from transformers import BertModel
from tqdm import tqdm 

## Loading  data

In [3]:
source_url = "/kaggle/input/bbc-dataset/bbc-text.csv"
df = pd.read_csv(source_url)
df.groupby(["category"]).size()

category
business         510
entertainment    386
politics         417
sport            511
tech             401
dtype: int64

## Data Preprocessing 

In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

we are going to tesst the bert tokenizer on an exemple to have a good idea on how it works

In [5]:
sample = 'Hey my name is BERT'
# truncation : if it is True then we allow bert to truncated every sequence it's length is higher then max_length
# return_tensors : the type of tensors that will be returned (as we are using pytorch then we set "pt")
bert_input  = tokenizer(sample,padding="max_length",max_length=15,truncation=True,return_tensors="pt")


In [6]:
print(bert_input["input_ids"])
# input_ids are the id representation of each token 
# we can decode these inputs to get the original sequence 
print(tokenizer.decode(bert_input["input_ids"][0] ))
# the code 102 is for the [SEP] token and the 0 is for [PAD] token 

tensor([[ 101, 4403, 1139, 1271, 1110,  139, 9637, 1942,  102,    0,    0,    0,
            0,    0,    0]])
[CLS] Hey my name is BERT [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [7]:
print(bert_input["token_type_ids"])
# the token_type_ids identified to which sequence a token belongs, when having just one sequence so it's always 0 

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [8]:
print(bert_input["attention_mask"])
# the attention_mask identified whether the token is a real word or just a token padding
# it's 1 for the real words, the CLS and the SEP tokens, and for the pad token is 0


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])


in this example, we are using the **bert-base-cased** pre-trained model and that's because our dataset is in english, if we are dealing with a dataset which is multilingual the we need to use **bert-base-multilingual-cased**

### Dataset Class 

Now that we know which are the ouputs of our bert tokenizer we are going to build a Dataset Class for our news Dataset 

In [9]:
tokenizer= BertTokenizer.from_pretrained("bert-base-cased")
labels = {
    'business':0,
    'entertainment':1,
    'sport':2,
    'tech':3,
    'politics':4
}
class Dataset(torch.utils.data.Dataset): 
    def __init__(self,df): 
        #extract our labels from the df 
        self.labels = [labels[label] for label in df["category"]]
        #tokenize our texts to the format that BERT expects to get as input 
        self.texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,return_tensors="pt") for text in df["text"]] 
    def classes(self):
        return self.labels
    
    def __len__(self): 
        return len(self.labels)
    
    #fetch a batch of labels
    def get_batch_labels(self,indx): 
        return np.array(self.labels[indx])
    # fetch a batch of texts 
    def get_batch_texts(self,indx): 
        return self.texts[indx]

    #get an item with the texts and the label
    def __getitem__(self,indx): 
        batch_texts = self.get_batch_texts(indx)
        batch_y = self.get_batch_labels(indx)
        
        
        return batch_texts, batch_y
        

Now after creating the Dataset Class let's split our dataset into train,validation and test sets

* training set contains : 80% 
* test and validation contains : 10% each

In [10]:
df_train, df_valid,df_test = np.split(df.sample(frac=1,random_state=42),[int(.8*len(df)), int(.9*len(df))])



## Building the model
Now after preparing our data to the Learning process, let's create our model using the pre-trained BERT base model which contains 12 layers of Transformers encoder.

In [11]:
class BertClassifier(torch.nn.Module): 
    def __init__(self,dropout=0.5): 
        super(BertClassifier,self).__init__()
        
        self.bert=BertModel.from_pretrained("bert-base-cased")
        self.dropout = torch.nn.Dropout(dropout)
        # bert output a vector of size 768
        self.lin = torch.nn.Linear(768,5)
        self.relu = torch.nn.ReLU()
    def forward(self,input_id,mask): 
        # as output, the bert model give us first the embedding vector of all the tokens of the sequence 
        # second we get the embedding vector of the CLS token.
        # fot a classification task it's enough to use this embedding for our classifier
        _,pooled_output = self.bert(input_ids= input_id,attention_mask = mask,return_dict = False)
        dropout_output = self.dropout(pooled_output)
        linear_output  = self.lin(dropout_output)
        final_layer = self.relu(linear_output)
        
        return final_layer
        

## Training  

In [12]:
# we are creating a standard pytorch training loop 

def train(model, train_data, val_data, learning_rate, epochs=5):
    #creating a custom Dataset objects using the training and validation data
    train, val = Dataset(train_data), Dataset(val_data)
    #creating dataloaders
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr= learning_rate)

    if use_cuda:
            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):
                #print(f"the train input : {train_input}")
                #print(f"train label : {train_label}")

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)
    #             print(input_id.shape)

                # get the predictions 
                output = model(input_id, mask)

                #the output is a vector of 5 values (categs)
    #             print(output)
    #             print("the output shape is" ,  output.shape)
    #             print(train_label)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc
                # updating the Gradient Descent and Backpropagation operation
                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            # now we evaluate on the validation data
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')



In [13]:
          
EPOCHS = 5
model = BertClassifier()
learning_rate = 1e-6
train(model, df_train, df_valid, learning_rate, EPOCHS)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 890/890 [02:02<00:00,  7.29it/s]


Epochs: 1 | Train Loss:  0.726                 | Train Accuracy:  0.401                 | Val Loss:  0.623                 | Val Accuracy:  0.568


100%|██████████| 890/890 [02:01<00:00,  7.34it/s]


Epochs: 2 | Train Loss:  0.496                 | Train Accuracy:  0.678                 | Val Loss:  0.397                 | Val Accuracy:  0.734


100%|██████████| 890/890 [02:01<00:00,  7.34it/s]


Epochs: 3 | Train Loss:  0.247                 | Train Accuracy:  0.898                 | Val Loss:  0.120                 | Val Accuracy:  0.991


100%|██████████| 890/890 [02:01<00:00,  7.34it/s]


Epochs: 4 | Train Loss:  0.102                 | Train Accuracy:  0.978                 | Val Loss:  0.065                 | Val Accuracy:  0.991


100%|██████████| 890/890 [02:01<00:00,  7.34it/s]


Epochs: 5 | Train Loss:  0.056                 | Train Accuracy:  0.988                 | Val Loss:  0.058                 | Val Accuracy:  0.977


## EValuate the model on test data
Now that we trained the model on the training set, we are going to use the test data to evaluate the performance of the model on unseen data 

In [14]:
def evaluate(model,test_df):
    test = Dataset(test_df)
    test_dl = torch.utils.data.DataLoader(test,batch_size=2)
    
    cuda_available = torch.cuda.is_available()
    
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    if cuda_available:
        model = model.cuda()
    
    total_acc = 0
    for test_input , test_label in tqdm(test_dl):
        test_label = test_label.to(device)
        mask = test_input["attention_mask"].to(device)
        input_id = test_input["input_ids"].squeeze(1).to(device)
        output = model(input_id,mask)
        
        acc = (output.argmax(dim=1) == test_label).sum().item()
        total_acc +=acc 
        
    print(f"Test Accuracy : {total_acc / len(test_df): .3f}")
    

In [15]:
evaluate(model,test_df=df_test)

100%|██████████| 112/112 [00:04<00:00, 24.37it/s]

Test Accuracy :  0.996





After running the code above, we got the accuracy of 0.991 from the test data.

## Custom Tests

In [16]:
device= torch.device("cuda" if torch.cuda.is_available() else "cpu")
inverse_labels = {v:k for k,v in labels.items()}

In [17]:
def predict(device,model,sentence):
    sentence = sentence
    sentence_input = tokenizer(sentence, padding='max_length', max_length=512, truncation=True,return_tensors="pt").to(device)
    input_id = sentence_input["input_ids"]
    mask = sentence_input["attention_mask"]
    output = model(input_id,mask)
    predicted_class_label = output.argmax(dim=1)
    predicted_class = inverse_labels[predicted_class_label.item()]
    print(f"The predicted class is : {predicted_class}")

In [19]:
predict(device,model,"a period of political and economic stability")

The predicted class is : entertainment


In [21]:
predict(device,model,"Manchester is the great football team in all the history")

The predicted class is : sport


In [None]:
predict(device,model,"An individual or group can initiate, or obstruct, public policy in many political arenas")