In [22]:
# Intent Classification

In [23]:
import pandas as pd
import numpy as np
import json
import torch

In [24]:
## Preprocessing 

In [25]:
# Load annotations
DATA_PATH = "./oos-eval/data/data_full.json"
NUM_LABELS = 20  # how many intent labels to keep for this experiment

# Read the file as UTF-8 explicitly; without `encoding`, open() falls back to
# the platform's locale encoding, which is not guaranteed to be UTF-8.
with open(DATA_PATH, encoding="utf-8") as f:
  data = json.load(f)

# Each example is indexed as d[0] (sentence) and d[1] (label); collect the train labels
labels = [d[1] for d in data['train']]

# Fixed seed so the label subset (and the later shuffles) are repeatable
np.random.seed(0)

uniqueLabels = np.unique(np.array(labels))

# Labels of interest - NUM_LABELS selected at random, without repetition
finalLabels = list(np.random.choice(uniqueLabels, NUM_LABELS, replace=False))

In [26]:
# Partition the examples into train / val / test, keeping only the selected labels

splits = ["train","val","test"]

train_x, train_y = [], []
val_x, val_y = [], []
test_x, test_y = [], []

# Each entry is [sentences, labels]; the inner lists are the very same
# objects as the train_*/val_*/test_* names above.
dataset = {
    "train": [train_x, train_y],
    "val": [val_x, val_y],
    "test" : [test_x, test_y]
}

for split in splits:
  # Shuffles this split's examples in place (uses the global numpy RNG)
  np.random.shuffle(data[split])
  kept = [example for example in data[split] if example[1] in finalLabels]
  sentences, names = dataset[split]
  sentences.extend(example[0] for example in kept)
  names.extend(example[1] for example in kept)

In [27]:
# Label encoding - from labels to classes

labelToClsIdx = {}
for idx, label in enumerate(finalLabels):
  labelToClsIdx[label] = idx
#print(labelToClsIdx)

for split in splits:
  classIds = [ labelToClsIdx[label] for label in dataset[split][1]]
  dataset[split].append(classIds)

In [28]:
## Classifier

In [29]:
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
from sklearn.metrics import accuracy_score

# Seed torch's RNG so the randomly initialised layers are repeatable
torch.manual_seed(42)

# Use the GPU when torch reports one, otherwise stay on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

class BERTClassifier():
    """Intent classifier: a pretrained encoder followed by a trainable linear head.

    Only the linear head (`self.ffn`) is handed to the optimizer, and the
    encoder is always run under `torch.no_grad()`, so the encoder weights are
    never updated by `train`.
    """

    def __init__(self, dataset, model_name="distilbert-base-uncased", n_classes=20):
        """Load the tokenizer/encoder and set up the head and its optimizer.

        Args:
            dataset: dict with "train", "val" and "test" keys; for each split
                index 0 holds the sentences and index 2 the integer class ids.
            model_name: identifier passed to `AutoTokenizer` / `AutoModel`.
            n_classes: number of output classes of the linear head.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device)

        # get the sentences and class ids
        self.train_x = dataset["train"][0]
        self.train_y = dataset["train"][2]
        self.val_x = dataset["val"][0]
        self.val_y = dataset["val"][2]
        self.test_x = dataset["test"][0]
        self.test_y = dataset["test"][2]

        # Size the head from the encoder's config when it exposes a hidden
        # size, so other `model_name` values work; 768 is the previous
        # hard-coded width and stays as the fallback.
        hidden_size = getattr(self.model.config, "hidden_size", 768)

        # Feed Forward Network
        self.ffn = nn.Linear(hidden_size, n_classes).to(device)

        # Snapshot of the head with the best dev accuracy; set during `train`
        self.best_ffn = None

        self.optimizer = torch.optim.Adam(self.ffn.parameters(), lr=1e-5)

    def tokenize(self, sequences):
        """Tokenize a batch of sentences.

        Sequences are padded to the longest one in the batch and truncated.

        Returns:
            (input_ids, attention_mask) tensors moved to `device`.
        """
        bert_encoding = self.tokenizer(sequences, return_tensors="pt", padding="longest", truncation=True)
        input_ids = bert_encoding["input_ids"].to(device)
        attention_mask = bert_encoding["attention_mask"].to(device)

        return input_ids, attention_mask

    def get_embeddings(self, sequences):
        """Encode sentences and return one vector per sentence.

        The vector is the last hidden layer's output at token position 0
        (presumably the [CLS] position for BERT-style tokenizers).
        """
        inp_ids, a_masks = self.tokenize(sequences)
        outputs = self.model(inp_ids, attention_mask=a_masks, output_hidden_states=True)
        seq_embedding = outputs.hidden_states[-1][:, 0]

        return seq_embedding

    def _snapshot_best_ffn(self):
        """Store an independent copy of the current head in `self.best_ffn`."""
        import copy

        # A deep copy is required: plain assignment would alias the live layer,
        # and the "best" head would keep changing with every optimizer step.
        self.best_ffn = copy.deepcopy(self.ffn)

    def _get_split(self, split):
        """Return `(sentences, class_ids)` for "train", "val" or "test".

        Raises:
            ValueError: if `split` is not one of the three known names.
        """
        if split == "train":
            return self.train_x, self.train_y
        if split == "val":
            return self.val_x, self.val_y
        if split == "test":
            return self.test_x, self.test_y
        raise ValueError(
            "Unknown split {!r}; expected 'train', 'val' or 'test'".format(split))

    def train(self, epochs=25, lr=1e-4, batch_size=32):
        """Train the linear head with cross-entropy on the training split.

        After every epoch the head is scored on the validation split; whenever
        the accuracy improves, the head is snapshotted into `self.best_ffn`
        and a checkpoint is written to `./model_<epoch>_<accuracy>`.

        Args:
            epochs: number of passes over the training split.
            lr: learning rate applied to the optimizer before training.
            batch_size: number of sentences per step.
        """
        loss_fn = nn.CrossEntropyLoss()
        best_acc = 0
        for group in self.optimizer.param_groups:
            group['lr'] = lr

        for ep in range(epochs):
            print(f"\nEpoch {ep+1}")
            tr_loss = []

            for start in range(0, len(self.train_x), batch_size):
                self.optimizer.zero_grad()
                x = list(self.train_x[start:start + batch_size])
                y = list(self.train_y[start:start + batch_size])

                # The encoder is used as a fixed feature extractor
                with torch.no_grad():
                    emb_x = self.get_embeddings(x)

                logits = self.ffn(emb_x)
                loss = loss_fn(logits, torch.LongTensor(y).to(device))

                # .item() works on any device, no cuda/cpu branching needed
                tr_loss.append(loss.item())

                loss.backward()
                self.optimizer.step()

            print("Training Loss: {}".format(np.array(tr_loss).mean()))

            dev_acc = self.evaluate()

            print("Dev Accuracy: {}".format(dev_acc))

            if dev_acc > best_acc:
                best_acc = dev_acc
                self._snapshot_best_ffn()

                # "epoch" is the 0-based index; the file name uses the 1-based
                # number. Accuracy is stored as a plain float so the checkpoint
                # holds only builtin values and tensors.
                torch.save({"epoch": ep,
                            "ffn_dict": self.ffn.state_dict(),
                            'optimizer_state_dict': self.optimizer.state_dict(),
                            'accuracy': float(dev_acc)}, f"./model_{ep+1}_{dev_acc}")

    def evaluate(self, split="val", batch_size=32, use_best_ffn=False):
        """Return the accuracy of the classifier on one split.

        Args:
            split: "val" (default), "test" or "train".
            batch_size: number of sentences per forward pass.
            use_best_ffn: score with the best-on-dev snapshot instead of the
                current head. Falls back to the current head when no snapshot
                exists yet (e.g. before `train` has run).

        Raises:
            ValueError: if `split` is not a known split name.
        """
        data_x, data_y = self._get_split(split)

        head = self.ffn
        if use_best_ffn:
            best = getattr(self, "best_ffn", None)
            if best is not None:
                head = best

        preds = []
        target = []

        with torch.no_grad():
            for start in range(0, len(data_x), batch_size):
                x = list(data_x[start:start + batch_size])
                y = list(data_y[start:start + batch_size])

                logits = head(self.get_embeddings(x))
                pred = torch.argmax(logits, dim=1)

                target.extend(y)
                preds.extend(pred.cpu().tolist())

        acc = accuracy_score(np.array(target), np.array(preds))

        return acc

    def load_model(self, path):
        """Restore the head and optimizer state from a checkpoint written by `train`.

        Args:
            path: checkpoint file path.
        """
        # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
        # NOTE(review): torch.load unpickles the file - only load checkpoints
        # from a trusted source.
        checkpoint = torch.load(path, map_location=device)
        self.ffn.load_state_dict(checkpoint['ffn_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])


"\n  def load_model(self, path):\n    checkpoint = torch.load(path)\n    self.ffn.load_state_dict(checkpoint['ffn_dict'])\n    self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])\n"

In [30]:
#Run the model

bert_model = BERTClassifier(dataset)
bert_model.train(epochs =10, lr = 5e-4)

# Report both numbers with the head that training recorded as best on the
# validation split (use_best_ffn=True), so that the "Best Dev Accuracy" label
# matches what is actually evaluated and the test score belongs to that same head.
test_acc = bert_model.evaluate(split="test", use_best_ffn=True)
best_dev_acc = bert_model.evaluate(split="val", use_best_ffn=True)

print("\nBest Dev Accuracy: {}".format(best_dev_acc))
print("Test Accuracy: {}".format(test_acc))


Epoch 1
Training Loss: 2.766836404800415
Dev Accuracy: 0.7

Epoch 2
Training Loss: 2.308286190032959
Dev Accuracy: 0.86

Epoch 3
Training Loss: 1.9314401149749756
Dev Accuracy: 0.885

Epoch 4
Training Loss: 1.6273757219314575
Dev Accuracy: 0.8875

Epoch 5
Training Loss: 1.3855042457580566
Dev Accuracy: 0.905

Epoch 6
Training Loss: 1.1936445236206055
Dev Accuracy: 0.91

Epoch 7
Training Loss: 1.0406631231307983
Dev Accuracy: 0.9225

Epoch 8
Training Loss: 0.9174762964248657
Dev Accuracy: 0.9275

Epoch 9
Training Loss: 0.8170788884162903
Dev Accuracy: 0.93

Epoch 10
Training Loss: 0.7342090010643005
Dev Accuracy: 0.9325

Best Dev Accuracy: 0.9325
Test Accuracy: 0.955
