dataset used: https://www.kaggle.com/sainijagjit/bbc-dataset

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 8.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 43.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 2.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: p

In [2]:
import torch
from torch import nn
from torch import optim
import pandas as pd
from transformers import BertModel
from transformers import BertTokenizer
import numpy as np
from tqdm import tqdm

In [3]:
data_path = '/content/BBC News Train.csv'

In [4]:
#adapted from https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
class ArticleClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(ArticleClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, cls = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(cls)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [5]:
labels = {'business':0,
          'entertainment':1,
          'sport':2,
          'tech':3,
          'politics':4
          }
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class BBCDataset(torch.utils.data.Dataset):
  def __init__(self, df):
    #labels_arr = [labels[label] for label in df['Category']]
    self.labels = [labels[label] for label in df['Category']]
    self.texts = [tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for text in df['Text']]
  
  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    return self.texts[idx], np.array(self.labels[idx])
            

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
def train(model, train_dataset, test_dataset, batch_size, epochs, optimizer):
  train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)
  test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4)

  device = torch.device("cuda" if torch.torch.cuda.is_available() else "cpu")

  model.to(device)

  criterion = nn.CrossEntropyLoss()

  for epoch in range(epochs):
    train_total_loss = 0
    train_total_correct = 0
    print('Epoch: ', epoch + 1)
    for input, label in tqdm(train_loader):
      label = label.to(device)

      output = model(input['input_ids'].squeeze(1).to(device), input['attention_mask'].to(device))

      loss = criterion(output, label)

      model.zero_grad()
      loss.backward()
      optimizer.step()

      train_total_loss += loss.item()
      
      test_total_correct = torch.sum(torch.argmax(output, dim=1) == label).item()
    print(' Training Loss: ', train_total_loss, 'Traing Accuracy:', train_total_correct / len(test_dataset))
    
    test_total_loss = 0
    test_total_correct = 0
    with torch.no_grad():
      for input, label in tqdm(test_loader):
        label = label.to(device)

        output = model(input['input_ids'].squeeze(1).to(device), input['attention_mask'].to(device))

        loss = criterion(output, label)
        test_total_loss += loss.item()
        
        test_total_correct += torch.sum(torch.argmax(output, dim=1) == label).item()
    
    
    print(' Test Loss: ', test_total_loss, 'Test Accuracy:', test_total_correct / len(test_dataset))

In [8]:
raw_data = pd.read_csv(data_path)


train_data = raw_data.sample(frac=0.8, random_state=1216)
test_data = raw_data.drop(train_data.index)

train_dataset = BBCDataset(train_data)
test_dataset = BBCDataset(test_data)

In [9]:
model = ArticleClassifier()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
train(model, train_dataset, test_dataset, 4, 8, optimizer)

Epoch:  1


100%|██████████| 298/298 [01:10<00:00,  4.25it/s]


 Training Loss:  472.7101184129715 Traing Accuracy: 0.0


100%|██████████| 75/75 [00:05<00:00, 12.78it/s]


 Test Loss:  111.57240331172943 Test Accuracy: 0.4261744966442953
Epoch:  2


 23%|██▎       | 70/298 [00:16<00:53,  4.26it/s]