# Notebook Setup

In [None]:
# Reload the autoreload extension and have edited modules re-imported before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
!pip install -q torch transformers[sentencepiece] fastbook fastai ohmeow-blurr nbdev

In [None]:
!pip install onnxruntime onnx

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoConfig
from fastai.text.all import *
from blurr.text.data.all import *
from blurr.text.modeling.all import *

In [None]:
from tqdm.notebook import tqdm
import numpy as np

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data and model paths below live there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the project folder so the relative 'models/...' paths used later resolve.
%cd /content/drive/MyDrive/Manuscript-Matcher

# Data

In [None]:
# Labelled dataset; later cells read its `revised_categories`, `text` and `label` columns.
df = pd.read_csv('/content/drive/MyDrive/Manuscript-Matcher/labelled_data.csv')

In [None]:
# ASJC lookup table, reduced to the code column and its category-name column.
asjc_df = (
    pd.read_csv('/content/drive/MyDrive/Manuscript-Matcher/asjc_codes.csv')
    [['Code', 'ASJC category']]
)

## Label Encoding

In [None]:
# Two-way lookup between ASJC codes and category names, built from the same row pairs.
_asjc_pairs = list(zip(asjc_df['Code'], asjc_df['ASJC category']))
code2cat = {code: category for code, category in _asjc_pairs}
cat2code = {category: code for code, category in _asjc_pairs}

In [None]:
def count_values(lst):
    counts = {}
    for sublist in lst:
        sublist = eval(sublist)
        for item in sublist:
            if item in counts:
                counts[item] += 1
            else:
                counts[item] = 1
    return counts

In [None]:
# Tally every code in `revised_categories`, then give each category name a column
# index following the order in which its code was first seen.
revised_categories_count = count_values(df.revised_categories.to_list())
categories_encoding = {
    code2cat[code]: position
    for position, code in enumerate(revised_categories_count)
}

## Train Test Split

In [None]:
# Hold out 10% of the rows for validation; the seed is fixed so the split is repeatable.
splitter = RandomSplitter(valid_pct=0.1, seed=42)
train_ids, valid_ids = splitter(df)
# Display the sizes of the two index sets.
len(train_ids), len(valid_ids)

(28275, 3141)

In [None]:
# RandomSplitter yields row positions, so select positionally with .iloc; .loc would
# look the values up as index labels and only agrees while df has a default RangeIndex.
valid_df = df.iloc[list(valid_ids)]

# Inference

In [None]:
# Load the exported learner used for inference (path is relative to the project folder).
# NOTE(review): load_learner unpickles the file — only load exports from a trusted source.
model_path = "models/manuscript-matcher-stage-1.pkl"
learner_inf = load_learner(model_path)

In [None]:
# Smoke-test the learner on a single sentence and display the full prediction output.
learner_inf.blurr_predict("In vitro blood cell viability profiling of polymers used in molecular assembly.")

In [None]:
# Same sentence, keeping only the predicted label names from the first result.
learner_inf.blurr_predict("In vitro blood cell viability profiling of polymers used in molecular assembly.")[0]['labels']

['Organic Chemistry', 'Polymers and Plastics']

## F1 Evaluation

In [None]:
from sklearn import metrics

def metric_measures(test_df, preds):
  """Print micro- and macro-averaged F1 scores for multi-label predictions.

  Args:
    test_df: DataFrame whose `label` column holds each row's target vector
      serialized as a Python list literal.
    preds: Sequence of predicted label vectors, one per row of `test_df`
      and in the same row order.

  Returns:
    None; the two scores are printed.
  """
  # Local import keeps this cell runnable on its own.
  import ast

  # literal_eval parses the stored list without executing arbitrary expressions,
  # unlike eval() on text that comes straight from a CSV file.
  targets = [np.asarray(ast.literal_eval(target)) for target in test_df.label.to_list()]
  outputs = [np.asarray(pred) for pred in preds]

  f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
  f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

  print(f"F1 Score (Micro) = {f1_score_micro}")
  print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:
# Run the learner over every validation text and multi-hot encode its predicted labels.
preds = []
n_categories = len(categories_encoding)
for desc in tqdm(valid_df['text'], total=len(valid_df)):
  predicted_labels = learner_inf.blurr_predict(desc)[0]['labels']
  # Column indices of the labels predicted for this text.
  predicted_columns = {categories_encoding[name] for name in predicted_labels}
  preds.append([1 if column in predicted_columns else 0 for column in range(n_categories)])

# Peek at the first 20 columns of the first prediction vector.
preds[0][:20]

In [None]:
# Score the learner's predictions against the validation labels.
metric_measures(valid_df, preds)

F1 Score (Micro) = 0.7067603922589603
F1 Score (Macro) = 0.5970682523875122


## ONNX Export and Quantization

In [None]:
# Load the same exported learner again before converting its model to ONNX.
# NOTE(review): load_learner unpickles the file — only load exports from a trusted source.
model_path = "models/manuscript-matcher-stage-1.pkl"
learner_inf = load_learner(model_path)

In [None]:
# Take the learner's wrapped `hf_model` and switch it to eval mode for export.
classifier = learner_inf.model.hf_model.eval()

# Example input for the export: one sequence of 512 zero-valued token ids.
dummy_input_ids = torch.LongTensor([[0] * 512])

# Batch size and sequence length stay dynamic on the input; only batch size on the output.
onnx_dynamic_axes = {
    'input_ids': {0: 'batch_size', 1: 'sequence_len'},
    'output': {0: 'batch_size'},
}

torch.onnx.export(
    classifier,
    dummy_input_ids,
    'models/manuscript-matcher.onnx',
    input_names=['input_ids'],
    output_names=['output'],
    opset_version=13,
    dynamic_axes=onnx_dynamic_axes,
)

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Source ONNX file and the destination for its quantized copy.
onnx_model_path = 'models/manuscript-matcher.onnx'
quantized_onnx_model_path = 'models/manuscript-matcher-quantized.onnx'

# Dynamic quantization with the weight type set to unsigned 8-bit integers.
quantize_dynamic(
    onnx_model_path,
    quantized_onnx_model_path,
    weight_type=QuantType.QUInt8,
)

## ONNX Inference

In [None]:
import onnxruntime as rt
from transformers import AutoTokenizer
import torch

# Tokenizer that turns raw text into input ids for the ONNX model.
# NOTE(review): assumes the classifier was fine-tuned from distilroberta-base — confirm.
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

# Label names in the insertion order of categories_encoding.
class_labels = list(categories_encoding)

# Open the quantized model and look up the names of its first input and output once.
inf_session = rt.InferenceSession('models/manuscript-matcher-quantized.onnx')
onnx_inputs = inf_session.get_inputs()
onnx_outputs = inf_session.get_outputs()
input_name = onnx_inputs[0].name
output_name = onnx_outputs[0].name

In [None]:
# Run the quantized ONNX model over every validation text and multi-hot encode the result.
preds = []
for idx, row in tqdm(valid_df.iterrows(), total=valid_df.shape[0]):
  text = row['text']
  # Let the tokenizer truncate so the closing special token is kept; slicing the id
  # list at 512 would cut it off for long texts.
  input_ids = tokenizer(text, truncation=True, max_length=512)['input_ids']

  # The model was exported with an int64 input, so pass an explicit (1, sequence_len) int64 array.
  onnx_input = np.asarray([input_ids], dtype=np.int64)
  logits = inf_session.run([output_name], {input_name: onnx_input})[0]
  logits = torch.FloatTensor(logits)

  # The session returns raw scores; a label is predicted when its sigmoid reaches 0.5.
  masks = torch.sigmoid(logits) >= 0.5
  labels = [class_labels[pos] for pos, mask in enumerate(masks[0]) if mask]

  pred_cats = [0] * len(categories_encoding)
  for label in labels:
    pred_cats[categories_encoding[label]] = 1
  preds.append(pred_cats)

In [None]:
# Score the quantized ONNX model's predictions against the validation labels.
metric_measures(valid_df, preds)

F1 Score (Micro) = 0.6891020052310375
F1 Score (Macro) = 0.5803489754056876
