In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# !pip install pytorch_pretrained_bert
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
df = pd.read_csv("bbc-text.csv")
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
df['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [5]:
df['category'] = df.category.map({'sport':0,'business':1,'tech':2,'entertainment':3,'politics':4})

Creating random seed and suffleing the data

In [6]:
#creating seed
seed = 42
np.random.seed(seed)
#suffleing data
def shuffle(df, n = 3, axis = 0):
  df = df.copy()
  random_states = [2, 42, 4]
  for i in range(n):
    df = df.sample(frac = 1, random_state = random_states[i])
  return df
new_df = shuffle(df)
new_df

Unnamed: 0,category,text
1634,2,sony psp handheld console hits us the latest h...
673,4,msps hear renewed climate warning climate chan...
2212,1,christmas shoppers flock to tills shops all ov...
1383,4,brown comes out shooting labour may have aboli...
1014,4,howard dismisses tory tax fears michael howard...
...,...,...
2024,4,blair blasts tory spending plans tony blair ha...
603,3,sir paul rocks super bowl crowds sir paul mcca...
1148,4,council tax rise reasonable welsh councils s...
228,2,tv s future down the phone line internet tv ha...


Splitting the text in the ratio 8:2

In [7]:
split_idx = int(len(df)*0.8)
print(split_idx)
train_df = new_df.loc[:split_idx,:]
test_df = new_df.loc[split_idx:,:]
print(train_df.groupby(['category'])['text'].count())
print(test_df.groupby(['category'])['text'].count())

1780
category
0    329
1    328
2    271
3    257
4    283
Name: text, dtype: int64
category
0    182
1    182
2    130
3    130
4    134
Name: text, dtype: int64


Tokenizing the train set

In [8]:
sentences = train_df.text
labels = list(train_df.category)

In [9]:
# !pip install transformers
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Tokenizing all the sentence and map the tokens to thier word IDs

In [10]:
MAX_LEN = 128
input_ids = []
attention_marks = []
for sent in sentences:
    #   encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(sent, #Passing sentence to encode
                                          add_special_tokens = True, #Add '[CLS]' and '[SEP]'
                                          max_length = MAX_LEN,
                                          pad_to_max_length = True, #Pad and truncade all sentance
                                          return_attention_mask = True, #Construct attention mask
                                          return_tensors = 'pt', #Return Pytorch tesnor
                                         )
    input_ids.append(encoded_dict['input_ids']) #Add encoded text to the list
    attention_marks.append(encoded_dict['attention_mask'])
#converting list into tensors
input_ids = torch.cat(input_ids, dim=0)
attention_marks = torch.cat(attention_marks, dim = 0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [11]:
from torch.utils.data import TensorDataset, random_split

#Combine the trainging inputs into the TensorDataset.
dataset = TensorDataset(input_ids, attention_marks, labels)

#Creating 9:1 training-validation split
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

#Dividing the dataset by randomly selecting samples
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5} Training samples'.format(train_size))
print('{:>5} Validation samples'.format(val_size))

 1174 Training samples
  294 Validation samples


In [12]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

#the dataloader need to know the batch size(the numebr of samples used in one forward and backward pass through the network) for the training, which is defined here
#here we take the batch size as 16 for better performence of the BERT model
batch_size = 16

#Creating DataLoader for training and validation dataset, taking training sampler in random oreder
train_dataloader = DataLoader(train_dataset, #the training sample
                              sampler = RandomSampler(train_dataset), #selecting batches randomly
                              batch_size = batch_size, #defining the batchsize for the training
                              )
validation_dataloader = DataLoader(val_dataset,
                                   sampler = SequentialSampler(val_dataset), #for validation the order does not effect much so reading the data sequentially
                                   batch_size = batch_size,
                                   )

In [13]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

#loading the BertForSequenceClassification, a pretrained BERT model with a singel liner classification layer on top
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', #uses the 12 layer BERT model which is not case sensitive(or uncased vocab)
                                                      num_labels = 5, #defining the numerb of output layers
                                                      output_attentions = False, #weather the model will return attention weights.
                                                      output_hidden_states = False, #weather the model returns all the hidden models
                                                      return_dict = False,
                                                      )
#running the model on CPU as I dont have a dedicated GPU
model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [14]:
#collecting all the model's parameters as a list of tuples
params = list(model.named_parameters())

In [15]:
optimizer = AdamW(model.parameters(),
                  lr = 1e-5, #args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8, #args.adam_epsilon  - default is 1e-8.
                  )



In [16]:
from transformers import get_linear_schedule_with_warmup
epochs = 4
total_steps = len(train_dataloader) * epochs

#creating the learning rate schedular
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps,
                                            )

In [17]:
#function to calculate the accuracy of our prediction vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [18]:
import time
import datetime

#takes time in seconds and return a string hh:mm:ss
def format_time(elapsed):
  elapsed_rounded = int(round((elapsed)))
  return str(datetime.timedelta(seconds = elapsed_rounded))

In [19]:
import random

#setting the seed value same as before so that the random number generated is same everywhere
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []

#measuring total training time for the whole run
total_t0 = time.time()
for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================

  print('')
  print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
  print('Training...')
  #measuring time for
  t0 = time.time()
  #reset total loss for this epoch.
  total_train_loss = 0
  #training the model
  model.train()
  for step, batch in enumerate(train_dataloader):
  #setting process update after every 40 batches
    if step % 40 == 0 and not step == 0:
      #calculate elapsed time in minute
      elapsed = format_time(time.time() - t0)
      print('Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        # Unpack this training batch from our dataloader.
        # As we unpack the batch, we'll also copy each tensor to the GPU using the `to` method.
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
         #   [2]: labels
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    # Always clear any previously calculated gradients before performing a
    # backward pass. PyTorch doesn't do this automatically because
    # accumulating the gradients is "convenient while training RNNs".
    model.zero_grad()
    # Perform a forward pass (evaluate the model on this training batch).
    # It returns different numbers of parameters depending on what arguments
    # arge given and what flags are set. For our useage here, it returns
    # the loss (because we provided labels) and the "logits"--the model
    (loss, logits) = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
    # Accumulate the training loss over all of the batches so that we can
    # calculate the average loss at the end. `loss` is a Tensor containing a
    # single value; the `.item()` function just returns the Python value
    # from the tensor.

    # vectorizer = TextVectorization(max_tokens=1000, output_mode="int")
    # vectorizer.adapt([loss])
    # loss = vectorizer(loss)
    total_train_loss += loss.item()
    # Perform a backward pass to calculate the gradients.
    loss.backward()
    # Clip the norm of the gradients to 1.0.
    # This is to help prevent the "exploding gradients" problem.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    # Update parameters and take a step using the computed gradient.
    optimizer.step()
    # Update the learning rate.\
    scheduler.step()
    # Calculate the average loss over all of the batches.
  avg_train_loss = total_train_loss / len(train_dataloader)
  # Measure how long this epoch took.
  training_time = format_time(time.time() - t0)
  print("")
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  print("  Training epcoh took: {:}".format(training_time))
  # ========================================
  #               Validation
  # ========================================
  print("")
  print("Running Validation...")

  t0 = time.time()
  # Put the model in evaluation mode--the dropout layers behave differently
  model.eval()
  #tracking variables
  total_eval_accuracy = 0
  total_eval_loss = 0
  nb_eval_steps = 0
  #evaluating data for one epoch
  for batch in validation_dataloader:
      # Unpack this validation batch from our dataloader.
      # As we unpack the batch, we'll also copy each tensor to the GPU using
      # the `to` method.
      # `batch` contains three pytorch tensors:
      #   [0]: input ids
      #   [1]: attention masks
      #   [2]: labels
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)
      with torch.no_grad():
        # Forward pass, calculate logit predictions.
        # token_type_ids is the same as the "segment ids", which
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        (loss, logits) = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        # Accumulate the validation loss.
      total_eval_loss += loss.item()
      # Move logits and labels to CPU
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()
      # Calculate the accuracy for this batch of test sentences, and
      # accumulate it over all batches.
      total_eval_accuracy += flat_accuracy(logits, label_ids)
  #final accuracy for this validation run.
  avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
  print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
  # Calculate the average loss over all of the batches.
  avg_val_loss = total_eval_loss / len(validation_dataloader)
  # Measure how long the validation run took.
  validation_time = format_time(time.time() - t0)

  print("  Validation Loss: {0:.2f}".format(avg_val_loss))
  print("  Validation took: {:}".format(validation_time))

  # Record all statistics from this epoch.
  training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
Batch    40 of    74. Elapsed: 0:00:14.

  Average training loss: 1.21
  Training epcoh took: 0:00:25

Running Validation...
  Accuracy: 0.96
  Validation Loss: 0.67
  Validation took: 0:00:02

Training...
Batch    40 of    74. Elapsed: 0:00:14.

  Average training loss: 0.42
  Training epcoh took: 0:00:26

Running Validation...
  Accuracy: 0.98
  Validation Loss: 0.20
  Validation took: 0:00:02

Training...
Batch    40 of    74. Elapsed: 0:00:15.

  Average training loss: 0.16
  Training epcoh took: 0:00:27

Running Validation...
  Accuracy: 0.99
  Validation Loss: 0.12
  Validation took: 0:00:02

Training...
Batch    40 of    74. Elapsed: 0:00:14.

  Average training loss: 0.10
  Training epcoh took: 0:00:26

Running Validation...
  Accuracy: 0.99
  Validation Loss: 0.11
  Validation took: 0:00:02

Training complete!
Total training took 0:01:53 (h:mm:ss)


In [20]:
import pickle
filename = 'BERT_text_classification_final.sav'
pickle.dump(model,open(filename,'wb'))

In [24]:
# pd.set_option('precision', 2)
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('epoch')
df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.214999,0.667995,0.961623,0:00:25,0:00:02
2,0.418424,0.202665,0.980263,0:00:26,0:00:02
3,0.1648,0.120859,0.986842,0:00:27,0:00:02
4,0.096556,0.10502,0.986842,0:00:26,0:00:02
