In this notebook we are going to classify documents into 5 categories (sport, tech, entertainment, politics and business).
For this NLP document-classification task we are going to use BERT.

In [1]:
import pandas as pd
import torch 
from transformers import BertTokenizer
import numpy as np
from transformers import BertModel
from tqdm import tqdm 


In [2]:
# Load the dataset from bbc-text.csv into a DataFrame.
df = pd.read_csv("bbc-text.csv")
# Show how many rows fall into each value of the "category" column.
df.groupby(["category"]).size()

category
business         510
entertainment    386
politics         417
sport            511
tech             401
dtype: int64

## Data Preprocessing 

In [58]:
# Load the pre-trained tokenizer for the bert-base-cased checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [61]:
# Try the BERT tokenizer on a short example to get a feel for what it returns.

sample = 'Hey my name is BERT'
# padding="max_length": pad the sequence with [PAD] tokens up to max_length.
# truncation=True: allow the tokenizer to truncate any sequence longer than max_length.
# return_tensors="pt": return PyTorch tensors (we are using PyTorch).
bert_input  = tokenizer(sample,padding="max_length",max_length=15,truncation=True,return_tensors="pt")


In [67]:
print(bert_input["input_ids"])
# input_ids holds the integer id of each token.
# Decoding the ids gives back the sequence, including the special tokens.
print(tokenizer.decode(bert_input["input_ids"][0] ))
# In the output below, 101 is the [CLS] token, 102 is the [SEP] token and 0 is the [PAD] token.

tensor([[ 101, 4403, 1139, 1271, 1110,  139, 9637, 1942,  102,    0,    0,    0,
            0,    0,    0]])
[CLS] Hey my name is BERT [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [69]:
print(bert_input["token_type_ids"])
# token_type_ids identifies which sequence each token belongs to; with a single input sequence it is always 0.

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [71]:
print(bert_input["attention_mask"])
# attention_mask tells whether each position holds a real token or just padding:
# it is 1 for the real words and for the [CLS] and [SEP] tokens, and 0 for the [PAD] tokens.


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])


In this example we use the **bert-base-cased** pre-trained model because our dataset is in English. If we were dealing with a multilingual dataset, we would need to use **bert-base-multilingual-cased** instead.

### Dataset Class 

Now that we know what the outputs of the BERT tokenizer are, we are going to build a Dataset class for our news dataset.

In [3]:
# Tokenizer for the bert-base-cased checkpoint; the Dataset class reads this module-level name.
tokenizer= BertTokenizer.from_pretrained("bert-base-cased")
# Map every category name to the integer class id used as the training target.
# The position of a name in the tuple is its class id.
labels = {
    name: class_id
    for class_id, name in enumerate(
        ('business', 'entertainment', 'sport', 'tech', 'politics')
    )
}
class Dataset(torch.utils.data.Dataset):
    """Pre-tokenized documents paired with their integer class ids.

    Every entry of ``df["text"]`` is tokenized once, at construction time,
    with the module-level ``tokenizer``; every entry of ``df["category"]`` is
    translated to an integer through the module-level ``labels`` mapping.
    """

    def __init__(self, df):
        # Category names -> integer targets.
        self.labels = list(map(labels.__getitem__, df["category"]))

        # Tokenize each text into the format BERT expects as input:
        # padded/truncated to 512 tokens and returned as PyTorch tensors.
        def encode(document):
            return tokenizer(document, padding='max_length', max_length=512, truncation=True, return_tensors="pt")

        self.texts = [encode(document) for document in df["text"]]

    def classes(self):
        """Return the list of integer labels."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, indx):
        """Return the label(s) selected by ``indx`` as a NumPy array."""
        selected = self.labels[indx]
        return np.array(selected)

    def get_batch_texts(self, indx):
        """Return the tokenizer output(s) selected by ``indx``."""
        return self.texts[indx]

    def __getitem__(self, indx):
        """Return the ``(tokenized_text, label)`` pair selected by ``indx``."""
        return self.get_batch_texts(indx), self.get_batch_labels(indx)
        

Now that the Dataset class is created, let's split our dataset into training, validation and test sets:

* training set contains : 80% 
* test and validation contains : 10% each

In [4]:
# Shuffle once with a fixed seed, then cut at the 80% and 90% marks to obtain
# the train / validation / test splits (80% / 10% / 10%).
# Positional slicing is used instead of np.split: calling np.split on a
# DataFrame goes through DataFrame.swapaxes, which recent pandas releases
# deprecate. The resulting splits are the same.
df_shuffled = df.sample(frac=1, random_state=42)
train_end, valid_end = int(.8 * len(df)), int(.9 * len(df))
df_train = df_shuffled.iloc[:train_end]
df_valid = df_shuffled.iloc[train_end:valid_end]
df_test = df_shuffled.iloc[valid_end:]

## Building the model
Now that our data is prepared for the learning process, let's create our model using the pre-trained BERT base model, which contains 12 Transformer encoder layers.

In [6]:
class BertClassifier(torch.nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear classification head.

    Args:
        dropout: Dropout probability applied to the pooled BERT output.
        num_classes: Number of target classes, i.e. the size of the returned
            logit vector (defaults to the 5 news categories).
    """

    def __init__(self, dropout=0.5, num_classes=5):
        super().__init__()

        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.dropout = torch.nn.Dropout(dropout)
        # Take the encoder width from the checkpoint config (768 for BERT base)
        # instead of hard-coding it, so the head always matches the encoder.
        self.lin = torch.nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_id, mask):
        """Return the raw class logits for a batch.

        Args:
            input_id: Token ids, passed to BERT as ``input_ids``.
            mask: Attention mask, passed to BERT as ``attention_mask``.

        Returns:
            The output of the linear head: one unnormalized score per class.
        """
        # With return_dict=False BERT returns a tuple: first the embedding
        # vectors of all tokens of the sequence, second the pooled vector for
        # the [CLS] token. For classification the pooled vector is enough.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        # Return the logits as-is. No ReLU here: the model is trained with
        # CrossEntropyLoss, which applies its own log-softmax, and clamping
        # negative logits to zero would discard information and gradient.
        return self.lin(dropout_output)
        

## Training  

In [32]:
def _run_batch(model, criterion, batch_input, batch_label, device):
    """Run one forward pass; return ``(loss, number_of_correct_predictions)``."""
    # .long() keeps the labels on `device`; .type(torch.LongTensor) would
    # move them back to the CPU and break the loss computation on CUDA.
    batch_label = batch_label.to(device).long()
    # Dataset tokenizes each text separately with return_tensors="pt", so each
    # tensor carries an extra axis at position 1 once batched; drop it from
    # both the ids and the mask.
    mask = batch_input["attention_mask"].squeeze(1).to(device)
    input_id = batch_input["input_ids"].squeeze(1).to(device)

    # The output holds one score per category for every sample in the batch.
    output = model(input_id, mask)
    loss = criterion(output, batch_label)
    correct = (output.argmax(dim=1) == batch_label).sum().item()
    return loss, correct


def train(model,train_data, validation_data,optim,criterion,epochs=10, batch_size=128):
    """Train ``model`` with a standard PyTorch loop, validating after each epoch.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns class scores.
        train_data: DataFrame with the "text" and "category" columns read by ``Dataset``.
        validation_data: DataFrame with the same columns, used for validation.
        optim: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(output, labels)``.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per batch for both data loaders.

    Prints one summary line per epoch and returns nothing. The model is moved
    to the GPU when one is available; its train/eval mode is restored to what
    it was on entry.

    NOTE(review): the reported losses are the sum of the per-batch loss values
    divided by the number of samples (kept as in the original code); with a
    mean-reducing criterion this is not a per-sample average.
    """
    # Wrap both DataFrames in Dataset objects and build the data loaders.
    train_ds, valid_ds = Dataset(train_data), Dataset(validation_data)
    train_dl = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=batch_size)

    # Select the device to run on.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    if isinstance(criterion, torch.nn.Module):
        criterion.to(device)

    was_training = model.training

    # range(1, epochs + 1) so that exactly `epochs` epochs are run.
    for epoch in range(1, epochs + 1):
        # Training pass: dropout enabled.
        model.train()
        train_accuracy = 0
        train_loss = 0

        for train_input, train_label in tqdm(train_dl):
            loss, correct = _run_batch(model, criterion, train_input, train_label, device)
            train_loss += loss.item()
            train_accuracy += correct

            # Reset the gradients, backpropagate and update the weights.
            model.zero_grad()
            loss.backward()
            optim.step()

        # Validation pass: dropout disabled, no gradients tracked.
        model.eval()
        valid_accuracy = 0
        valid_loss = 0
        with torch.no_grad():
            for val_input, val_label in valid_dl:
                loss, correct = _run_batch(model, criterion, val_input, val_label, device)
                # .item() so a plain float is accumulated, not a tensor.
                valid_loss += loss.item()
                valid_accuracy += correct

        print(
            f"Epoch =[{epoch}]/[{epochs}] train loss = {train_loss / len(train_data): .3f} train accuracy = {train_accuracy / len(train_data): .3f} validation loss = {valid_loss / len(validation_data): .3f} validation accurac = {valid_accuracy / len(validation_data): .3f}"
        )

    # Leave the model in the mode the caller had it in.
    model.train(was_training)
            

In [33]:
# Training hyper-parameters: number of epochs passed to train() and the Adam step size.
EPOCHS = 5
learning_rate = 1e-06
# Classifier with its default dropout.
model = BertClassifier()
# Cross-entropy loss over the class scores; Adam updates every parameter of
# the model (the BERT encoder as well as the linear head).
criterion = torch.nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(),lr=learning_rate)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Train on the training split and evaluate on the validation split after each epoch.
train(model,df_train,df_valid,optim,criterion,EPOCHS)

  0%|          | 0/14 [00:00<?, ?it/s]