In [1]:
import torch
import pandas as pd
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
df= pd.read_csv('../finetuning dataset/newsCorpora.csv')
# df.head()

##Removing unwanted column and only leaving title of news and the category which will be target
df= df[["Title", "Category"]]

# #Converting the codes to appropriate categories using a dictionary
my_dict= {
    'e': 'Entertainment', 'b':'Business', 't': 'Science', 'm': 'Health'
}

def update_cate(x):
    return my_dict

df['Category']= df['Category'].apply(lambda x: update_cate(x))

encode_dict= {}

def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]= len(encode_dict)
    return encode_dict[x]

df['Encode_cat']= df['Category'].apply(lambda x: encode_cat)

In [4]:
df['Category'].value_counts()

{'e': 'Entertainment', 'b': 'Business', 't': 'Science', 'm': 'Health'}    65534
Name: Category, dtype: int64

In [5]:
df.head()

Unnamed: 0,Title,Category,Encode_cat
0,Fed's Charles Plosser sees high bar for change...,"{'e': 'Entertainment', 'b': 'Business', 't': '...",<function encode_cat at 0x0000029495A6FEB0>
1,US open: Stocks fall after Fed official hints ...,"{'e': 'Entertainment', 'b': 'Business', 't': '...",<function encode_cat at 0x0000029495A6FEB0>
2,"Fed risks falling 'behind the curve', Charles ...","{'e': 'Entertainment', 'b': 'Business', 't': '...",<function encode_cat at 0x0000029495A6FEB0>
3,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,"{'e': 'Entertainment', 'b': 'Business', 't': '...",<function encode_cat at 0x0000029495A6FEB0>
4,Plosser: Fed May Have to Accelerate Tapering Pace,"{'e': 'Entertainment', 'b': 'Business', 't': '...",<function encode_cat at 0x0000029495A6FEB0>


## Triage Dataset Class
This class is defined to accept the Dataframe as input and generate tokenized output that is used by the DistilBERT model for training.

We are using the DistilBERT tokenizer to tokenize the data in the TITLE column of the dataframe.

The tokenizer uses the encode_plus method to perform tokenization and generate the necessary outputs, namely: ids, attention_mask

target is the encoded category on the news headline.
The Triage class is used to create 2 datasets, for training and for validation.

Reference: https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb#scrollTo=GGpBugzZ0tzr

In [29]:
#Defining some key variavles that will be use later in the training
MAX_LEN= 512
TRAIN_BATCH_SIZE= 4
VALID_BATCH_SIZE= 2
EPOCHS= 1
LEARNING_RATE= 1e-05
tokenizer= DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [49]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len= len(dataframe)
        self.data= dataframe
        self.tokenizer= tokenizer
        self.max_len= max_len
        
    def __getitem__(self, index):
        title= str(self.data.Title[index])
        title= " ".join(title.split())
        inputs= self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens= True,
            max_length= self.max_len,
            # pad_to_max_lenth= True,
            truncation= True
        )
        ids= inputs['input_ids']
        mask= inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype= torch.long),
            'targets': torch.tensor(self.data.Encode_cat[index], dtype=torch.long)
        }
    def __len__(self):
        return self.len


#### Creating dataset and dataloader for the neural network

In [50]:
train_size= 0.8
train_dataset= df.sample(frac= train_size, random_state= 200)
test_dataset= df.drop(train_dataset.index).reset_index(drop=True)
train_dataset= train_dataset.reset_index(drop= True)

print("FULL Dataset: {}".format(df.shape))
print("Train dataset: {}".format(train_dataset.shape))
print("Test dataset: {}".format(test_dataset.shape))

training_set= Triage(train_dataset, tokenizer, MAX_LEN)
testing_set= Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (65534, 3)
Train dataset: (52427, 3)
Test dataset: (13107, 3)


In [51]:
train_dataset.head()

Unnamed: 0,Title,Category,Encode_cat
0,UK Treasury Breaks Glass Ceiling with IMF's Ne...,"{'e': 'Entertainment', 'b': 'Business', 't': '...",<function encode_cat at 0x0000029495A6FEB0>
1,Diddy becomes Puff Daddy again,"{'e': 'Entertainment', 'b': 'Business', 't': '...",<function encode_cat at 0x0000029495A6FEB0>
2,Windows Phone 8.1 Could Let Developers Reply T...,"{'e': 'Entertainment', 'b': 'Business', 't': '...",<function encode_cat at 0x0000029495A6FEB0>
3,Lena Dunham hosts 'Saturday Night Live' this w...,"{'e': 'Entertainment', 'b': 'Business', 't': '...",<function encode_cat at 0x0000029495A6FEB0>
4,Anita Baker wanted by police in Detroit,"{'e': 'Entertainment', 'b': 'Business', 't': '...",<function encode_cat at 0x0000029495A6FEB0>


In [52]:
train_params= {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers':0
}
test_params= {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
}

training_loader= DataLoader(training_set, **train_params)
testing_loader= DataLoader(testing_set, **test_params)

#### Creating the neural network for finetuning

We will be creating a neural network with the DistillBERTClass.

This network will have the DistilBERT Language model followed by a dropout and finally a Linear layer to obtain the final outputs.

The data will be fed to the DistilBERT Language model as defined in the dataset.

Final layer outputs is what will be compared to the encoded category to determine the accuracy of models prediction.

We will initiate an instance of the network called model. This instance will be used for training and then to save the final trained model for future inference.

In [53]:
#creating the customized model, by adding a dropout and a dense layer on top of distilbert to get hte final output for the model

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1= DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier= torch.nn.Linear(768,768)
        self.dropout= torch.nn.Dropout(0.3)
        self.classifier= torch.nn.Linear(768,4)

    def forward(self, input_ids, attention_mask):
        output_1= self.l1(input_ids= input_ids, attention_mask= attention_mask)
        hidden_state= output_1[0]
        pooler= hidden_state[:, 0]
        pooler= self.pre_classifier(pooler)
        pooler= torch.nn.ReLU()(pooler)
        pooler= self.dropout(pooler)
        output= self.classifier(pooler)
        return output

In [54]:
model= DistillBERTClass()
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [55]:
model.parameters()

<generator object Module.parameters at 0x00000294B2691A10>

In [56]:
## creating the loss function and optimizer
loss_function= torch.nn.CrossEntropyLoss()
optimizer= torch.optim.Adam(params= model.parameters(), lr= LEARNING_RATE)

#### Finetuning the Model

a. First , the dataloader pases data to the model based on the batch size

b. Subsequent output from the model and the actual category are compared and calculate the loss

c. Loss value is used to optimize the weights of the neurons in the network

d. After every 5000 steps th eloss value is printed in the console

In [57]:
#function to calculate accuracy of the model
def calculate_accuracy(big_idx, targets):
    n_correct= (big_idx==targets).sum().item()
    return n_correct

In [58]:
# tr_loss = 0
# n_correct = 0
# nb_tr_steps = 0
# nb_tr_examples = 0
# model.train()

In [59]:
type(training_loader)

torch.utils.data.dataloader.DataLoader

In [61]:
for i, data in enumerate(training_loader, 0):
    print(data)

In [40]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for i, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calculate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if i % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct * 100) / nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    # epoch_accu = (n_correct*100)/nb_tr_examples
    epoch_accu = calculate_accuracy(big_idx, targets)
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [39]:
for epoch in range(3):
    train(epoch)

Keyword arguments {'pad_to_max_lenth': True} not recognized.
