# Install Transformers Library

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

# specify GPU
device = torch.device("cuda")

  from .autonotebook import tqdm as notebook_tqdm


# Load Dataset

In [2]:
import jsonlines
intent_dict = {}
intent_counter = 0
with jsonlines.open('data/train.jsonl') as reader: 
    for obj in reader: 
        output_raw = obj['output'] 
        intent = output_raw.split(' ')[0]
        if intent not in intent_dict:
            intent_dict[intent] = intent_counter
            intent_counter += 1
intent_dict

{'Send_digital_object': 0,
 'Get_health_stats': 1,
 'Get_message_content': 2,
 'Add_contact': 3,
 'Initiate_call': 4,
 'Create_note': 5,
 'Add_item_to_list': 6,
 'Create_list': 7,
 'Get_list': 8,
 'Order_menu_item': 9,
 'Find_parking': 10,
 'Get_note': 11,
 'Start_exercise': 12,
 'Stop_exercise': 13,
 'Resume_exercise': 14,
 'Pause_exercise': 15,
 'Log_exercise': 16,
 'Log_nutrition': 17,
 'Check_order_status': 18,
 'Get_bill': 19,
 'Get_security_price': 20,
 'Open_app': 21,
 'Pay_bill': 22,
 'Get_product': 23,
 'Other': 24,
 'Post_message': 25,
 'Record_video': 26,
 'Take_photo': 27,
 'Cancel_ride': 28,
 'Order_ride': 29,
 'BuyEventTickets': 30,
 'Play_game': 31,
 'GetGenericBusinessType': 32}

In [3]:
len(intent_dict)

33

In [4]:
def create_dataset(file_path):
    texts = []
    labels = []
    with jsonlines.open(file_path) as reader: 
        for obj in reader: 
            input_raw = obj['input'] 
            output_raw = obj['output'] 
            intent = output_raw.split(' ')[0]
            texts.append(input_raw)
            labels.append(intent_dict[intent])
    # make labels as class int
    labels = np.array(labels, dtype=np.int64)
    return pd.DataFrame({'text': texts, 'label': labels})


In [5]:
df_train = create_dataset('data/train.jsonl')

In [6]:
df_train.to_csv('data/train.csv', index=False)

In [7]:
df_train.shape

(30993, 2)

In [8]:
df_train['label'].value_counts()

label
24    7729
4     1245
0     1106
6      977
15     914
3      909
18     886
13     884
12     879
31     846
26     819
16     814
11     807
25     792
32     788
1      776
30     773
27     771
21     738
19     722
17     715
20     703
22     695
23     689
28     682
29     674
2      611
5      585
14     458
7      287
9      269
10     262
8      188
Name: count, dtype: int64

In [9]:
# df = pd.read_csv("spamdata_v2.csv")
# df.head()

In [10]:
# df.shape

In [11]:
# # check class distribution
# df['label'].value_counts(normalize = True)

# Split train dataset into train, validation and test sets

In [12]:
# train_text, temp_text, train_labels, temp_labels = train_test_split(df['text'], df['label'], 
#                                                                     random_state=2018, 
#                                                                     test_size=0.3, 
#                                                                     stratify=df['label'])

# # we will use temp_text and temp_labels to create validation and test set
# val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
#                                                                 random_state=2018, 
#                                                                 test_size=0.5, 
#                                                                 stratify=temp_labels)

In [13]:
train_text = df_train['text']
train_labels = df_train['label']

In [14]:
df_val = create_dataset('data/dev.jsonl')
val_text = df_val['text']
val_labels = df_val['label']

In [15]:
# train_labels = [int(i) for i in train_labels]
# val_labels = [int(i) for i in val_labels]

# Import BERT Model and BERT Tokenizer

In [16]:
# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# sample data
text = ["this is a bert model tutorial", "we will fine-tune a bert model"]

# encode text
sent_id = tokenizer.batch_encode_plus(text, padding=True, return_token_type_ids=False)

In [18]:
# output
print(sent_id)

{'input_ids': [[101, 2023, 2003, 1037, 14324, 2944, 14924, 4818, 102, 0], [101, 2057, 2097, 2986, 1011, 8694, 1037, 14324, 2944, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


# Tokenization

In [19]:
# get length of all the messages in the train set
seq_len = [len(i.split()) for i in train_text]
# print(seq_len)
# plot distribution of sequence length
max(seq_len)
# pd.Series(seq_len).hist()

29

In [20]:
max_seq_len = 60

In [21]:
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False,
    return_tensors='pt'
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False,
    return_tensors='pt'
)

# # tokenize and encode sequences in the test set
# tokens_test = tokenizer.batch_encode_plus(
#     test_text.tolist(),
#     max_length = max_seq_len,
#     pad_to_max_length=True,
#     truncation=True,
#     return_token_type_ids=False
# )



# Convert Integer Sequences to Tensors

In [22]:
# for train set
train_seq = torch.tensor(tokens_train['input_ids'], dtype=torch.long)
train_mask = torch.tensor(tokens_train['attention_mask'], dtype=torch.long)
train_y = torch.tensor(train_labels.tolist(), dtype=torch.long)

# for validation set
val_seq = torch.tensor(tokens_val['input_ids'], dtype=torch.long)
val_mask = torch.tensor(tokens_val['attention_mask'], dtype=torch.long)
val_y = torch.tensor(val_labels.tolist(), dtype=torch.long)

# # for test set
# test_seq = torch.tensor(tokens_test['input_ids'])
# test_mask = torch.tensor(tokens_test['attention_mask'])
# test_y = torch.tensor(test_labels.tolist())

  train_seq = torch.tensor(tokens_train['input_ids'], dtype=torch.long)
  train_mask = torch.tensor(tokens_train['attention_mask'], dtype=torch.long)
  val_seq = torch.tensor(tokens_val['input_ids'], dtype=torch.long)
  val_mask = torch.tensor(tokens_val['attention_mask'], dtype=torch.long)


In [23]:
# create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop,
# with an iterator the entire dataset does not need to be loaded into memory


# Create DataLoaders

In [24]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#define a batch size
batch_size = 32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

# Freeze BERT Parameters

In [25]:
# freeze all the parameters
for param in bert.parameters():
    param.requires_grad = False

# Define Model Architecture

In [26]:
class BERT_Arch(nn.Module):

    def __init__(self, bert):
      
      super(BERT_Arch, self).__init__()

      self.bert = bert 
      
      # dropout layer
      self.dropout = nn.Dropout(0.1)
      
      # relu activation function
      self.relu =  nn.ReLU()

      # dense layer 1
      self.fc1 = nn.Linear(768,512)
      
      # dense layer 2 (Output layer)
      self.fc2 = nn.Linear(512,33)

      #softmax activation function
      self.softmax = nn.LogSoftmax(dim=1)

    #define the forward pass
    def forward(self, sent_id, mask):

      #pass the inputs to the model  
      _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
      
      x = self.fc1(cls_hs)

      x = self.relu(x)

      x = self.dropout(x)

      # output layer
      x = self.fc2(x)
      
      # apply softmax activation
      x = self.softmax(x)

      return x

In [27]:
# pass the pre-trained BERT to our define architecture
model = BERT_Arch(bert)

# push the model to GPU
model = model.to(device)

In [28]:
# optimizer from hugging face transformers
from transformers import AdamW

# define the optimizer
optimizer = AdamW(model.parameters(), lr = 1e-3)

2023-04-15 00:17:22.735488: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-15 00:17:23.604024: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/pratyush/.local/lib/python3.10/site-packages/
2023-04-15 00:17:23.604135: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/pratyush/.local/lib/python3.10/site-packages/


# Find Class Weights

In [29]:
from sklearn.utils.class_weight import compute_class_weight

#compute the class weights
class_wts = compute_class_weight(class_weight='balanced', classes = np.unique(train_labels),y= train_labels)

print(class_wts)

[0.84916982 1.21028585 1.53712245 1.03320332 0.75436291 1.60543901
 0.96129152 3.27241052 4.99564797 3.49138222 3.58466343 1.16379407
 1.06846623 1.06242287 2.05061532 1.02755122 1.15378602 1.313541
 1.06002463 1.30080584 1.33596276 1.27260409 1.35134075 1.36310859
 0.12151401 1.18583563 1.14674215 1.21813465 1.37709944 1.39344483
 1.21498295 1.11014399 1.1918551 ]


In [30]:
# convert class weights to tensor
weights= torch.tensor(class_wts,dtype=torch.float)
weights = weights.to(device)

# loss function
cross_entropy  = nn.NLLLoss(weight=weights) 

# number of training epochs
epochs = 10

# Fine-Tune BERT

In [31]:
# function to train the model
def train():
  
  model.train()

  total_loss, total_accuracy = 0, 0
  
  # empty list to save model predictions
  total_preds=[]
  
  # iterate over batches
  for step,batch in enumerate(train_dataloader):
    
    # progress update after every 50 batches.
    if step % 50 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

    # push the batch to gpu
    batch = [r.to(device) for r in batch]
 
    sent_id, mask, labels = batch
    # print(sent_id[0][0], mask[0][0], labels[0])
    # print(type(sent_id), type(mask), type(labels))
    # print(sent_id.shape, mask.shape, labels.shape)

    # clear previously calculated gradients 
    model.zero_grad()        

    # get model predictions for the current batch
    preds = model(sent_id, mask)

    # compute the loss between actual and predicted values
    loss = cross_entropy(preds, labels)

    # add on to the total loss
    total_loss = total_loss + loss.item()

    # backward pass to calculate the gradients
    loss.backward()

    # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # update parameters
    optimizer.step()

    # model predictions are stored on GPU. So, push it to CPU
    preds=preds.detach().cpu().numpy()

    # append the model predictions
    total_preds.append(preds)

  # compute the training loss of the epoch
  avg_loss = total_loss / len(train_dataloader)
  
  # predictions are in the form of (no. of batches, size of batch, no. of classes).
  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)

  #returns the loss and predictions
  return avg_loss, total_preds

In [34]:
# function for evaluating the model
def evaluate():
  
  print("\nEvaluating...")
  
  # deactivate dropout layers
  model.eval()

  total_loss, total_accuracy = 0, 0
  
  # empty list to save the model predictions
  total_preds = []

  # iterate over batches
  for step,batch in enumerate(val_dataloader):
    
    # Progress update every 50 batches.
    if step % 50 == 0 and not step == 0:
      
      # Calculate elapsed time in minutes.
      # elapsed = format_time(time.time() - t0)
            
      # Report progress.
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

    # push the batch to gpu
    batch = [t.to(device) for t in batch]

    sent_id, mask, labels = batch

    # deactivate autograd
    with torch.no_grad():
      
      # model predictions
      preds = model(sent_id, mask)

      # compute the validation loss between actual and predicted values
      loss = cross_entropy(preds,labels)

      total_loss = total_loss + loss.item()

      preds = preds.detach().cpu().numpy()

      total_preds.append(preds)

  # compute the validation loss of the epoch
  avg_loss = total_loss / len(val_dataloader) 

  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)

  return avg_loss, total_preds

# Start Model Training

In [35]:
# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    #train model
    train_loss, _ = train()
    
    #evaluate model
    valid_loss, _ = evaluate()
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10
  Batch    50  of    969.
  Batch   100  of    969.
  Batch   150  of    969.
  Batch   200  of    969.
  Batch   250  of    969.
  Batch   300  of    969.
  Batch   350  of    969.
  Batch   400  of    969.
  Batch   450  of    969.
  Batch   500  of    969.
  Batch   550  of    969.
  Batch   600  of    969.
  Batch   650  of    969.
  Batch   700  of    969.
  Batch   750  of    969.
  Batch   800  of    969.
  Batch   850  of    969.
  Batch   900  of    969.
  Batch   950  of    969.

Evaluating...
  Batch    50  of    290.
  Batch   100  of    290.
  Batch   150  of    290.
  Batch   200  of    290.
  Batch   250  of    290.

Training Loss: 2.573
Validation Loss: 2.204

 Epoch 2 / 10
  Batch    50  of    969.
  Batch   100  of    969.
  Batch   150  of    969.
  Batch   200  of    969.
  Batch   250  of    969.
  Batch   300  of    969.
  Batch   350  of    969.
  Batch   400  of    969.
  Batch   450  of    969.
  Batch   500  of    969.
  Batch   550  of    969.


KeyboardInterrupt: 

# Load Saved Model

In [36]:
#load weights of best model
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [43]:
# clear cuda cache
torch.cuda.empty_cache()

# Get Predictions for Test Data

In [45]:
# get predictions for test data
predictions = []
for step,batch in enumerate(val_dataloader):
  
  # Progress update every 50 batches.
  if step % 50 == 0 and not step == 0:
    
    # Calculate elapsed time in minutes.
    # elapsed = format_time(time.time() - t0)
          
    # Report progress.
    print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

  # push the batch to gpu
  batch = [t.to(device) for t in batch]

  sent_id, mask, labels = batch

  # deactivate autograd
  with torch.no_grad():
    
    # model predictions
    preds = model(sent_id, mask)
    preds = preds.detach().cpu().numpy()
  predictions.append(preds)

  Batch    50  of    290.
  Batch   100  of    290.
  Batch   150  of    290.
  Batch   200  of    290.
  Batch   250  of    290.


In [48]:
predictions  = np.concatenate(predictions, axis=0)

In [49]:
# model's performance
preds = np.argmax(predictions, axis = 1)
print(classification_report(val_y, preds))

              precision    recall  f1-score   support

           0       0.37      0.40      0.39       331
           1       0.61      0.66      0.64       232
           2       0.45      0.57      0.50       182
           3       0.41      0.79      0.54       272
           4       0.45      0.45      0.45       373
           5       0.23      0.44      0.30       174
           6       0.29      0.58      0.38       292
           7       0.12      0.29      0.17        86
           8       0.22      0.25      0.23        56
           9       0.39      0.51      0.44        79
          10       0.32      0.69      0.43        78
          11       0.40      0.50      0.44       242
          12       0.34      0.39      0.36       263
          13       0.52      0.50      0.51       264
          14       0.20      0.74      0.31       137
          15       0.56      0.38      0.45       273
          16       0.34      0.62      0.44       243
          17       0.43    

In [None]:
# confusion matrix
pd.crosstab(test_y, preds)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,710,14
1,9,103
