## Dependencies

In [1]:
# install dependencies
!pip install tensorflow transformers -q
!pip install datasets -q
!pip install transformers[torch] -q
!pip install evaluate -q
!pip install seqeval -q
!pip install -U sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

## Load Dataset

In [2]:
from datasets import load_dataset, DatasetDict

# Fixed seed so the random train/test split -- and therefore every metric
# reported on it -- is the same after each kernel restart.
SPLIT_SEED = 42

ds = load_dataset("medical_questions_pairs")

# 90% train, 10% test (no separate validation split is created here)
train_testvalid = ds['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)
# Collect both splits in a single DatasetDict
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': train_testvalid['test']
    })

dataset

Downloading builder script:   0%|          | 0.00/2.83k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.98k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/174k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3048 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dr_id', 'question_1', 'question_2', 'label'],
        num_rows: 2743
    })
    test: Dataset({
        features: ['dr_id', 'question_1', 'question_2', 'label'],
        num_rows: 305
    })
})

## Libraries

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification, pipeline, Trainer, TrainingArguments
import numpy as np
import torch
from sklearn.metrics import precision_score, accuracy_score
import evaluate
import tensorflow as tf

## Fine-tuned BERT

In [18]:
# Tokenizer matching the bert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
  """Encode each (question_1, question_2) pair as a single tokenizer input.

  `examples` is a batch from the dataset (it is mapped with batched=True),
  so both columns are passed to the tokenizer together. Inputs are padded
  with padding='max_length' and truncated when too long.
  """
  first_questions = examples['question_1']
  second_questions = examples['question_2']
  return tokenizer(
    first_questions,
    second_questions,
    truncation=True,
    padding='max_length',
  )


# Tokenize every split of the loaded dataset in batches
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/305 [00:00<?, ? examples/s]

In [5]:
# Pretrained BERT encoder with a two-class sequence-classification head
bert_model = AutoModelForSequenceClassification.from_pretrained(
  'bert-base-uncased', problem_type='single_label_classification', num_labels=2
)

# Accuracy metric loaded through the `evaluate` library
metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
  """Return the accuracy of the arg-max class predictions.

  `eval_pred` unpacks into (logits, labels); the predicted class for each
  example is the index of its largest logit along the last axis.
  """
  logits, labels = eval_pred
  predicted_classes = np.argmax(logits, axis=-1)
  return metric.compute(references=labels, predictions=predicted_classes)

# Trainer configuration: outputs go to 'test_trainer', evaluation runs every epoch
training_args = TrainingArguments(evaluation_strategy='epoch', output_dir='test_trainer')

# Wire the model, both tokenized splits and the metric callback together.
# The test split is what the Trainer evaluates on.
trainer = Trainer(
  args=training_args,
  model=bert_model,
  compute_metrics=compute_metrics,
  eval_dataset=tokenized_dataset['test'],
  train_dataset=tokenized_dataset['train'],
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [6]:
# Fine-tune the model held by `trainer` on its train_dataset.
# Kept as the last expression so the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.423538,0.82623
2,0.541800,0.479341,0.836066
3,0.233000,0.757296,0.839344


TrainOutput(global_step=1029, training_loss=0.381476883753859, metrics={'train_runtime': 854.6808, 'train_samples_per_second': 9.628, 'train_steps_per_second': 1.204, 'total_flos': 2165140874557440.0, 'train_loss': 0.381476883753859, 'epoch': 3.0})

In [7]:
# Evaluate on the trainer's eval_dataset (the tokenized test split)
# and display the resulting metrics dict.
result = trainer.evaluate()
result

{'eval_loss': 0.7572963833808899,
 'eval_accuracy': 0.839344262295082,
 'eval_runtime': 10.8713,
 'eval_samples_per_second': 28.056,
 'eval_steps_per_second': 3.587,
 'epoch': 3.0}

In [8]:
# Run inference on the test split; the returned PredictionOutput exposes the
# raw logits as `.predictions`.
result = trainer.predict(tokenized_dataset['test'])
# One arg-max over the whole logits array instead of one call per row
predicted_labels = np.argmax(result.predictions, axis=-1)

# Slice the first rows once. Indexing a whole column inside the loop would
# re-materialise that column on every iteration. `min` keeps the preview
# valid when the split holds fewer than 10 rows.
n_preview = min(10, len(predicted_labels))
preview = tokenized_dataset['test'][:n_preview]
for q1, q2, label, predict in zip(preview['question_1'], preview['question_2'],
                                  preview['label'], predicted_labels):
  print(f'Question 1: {q1}')
  print(f'Question 2: {q2}')
  print(f'Groundtruth: {label}')
  print(f'Prediction: {predict}\n')

Question 1: I've had body aches, blocked stuffy nose, headaches, pressure in my face and throat tightness and it feels dry for 6 months is it a bad cold?
Question 2: I have been having recurrent attacks of sinus infections. I do not have a fever but notice headache, heaviness, pressure sensation, congestion, body pain and dry cough. I'm currently on course of antibotics but do not seem to help. Should I continue or stop them?
Groundtruth: 0
Prediction: 0

Question 1: Does chiropractic medicine help people with back problems?
Question 2: Would seeing a chirpractor help me with my back problems?
Groundtruth: 1
Prediction: 1

Question 1: Tinnitus 35 yrs. Quiet till breakdown May.Squealing. Up and down.ENT dr thinks stress is cause.Good sign noises vary so.Treatstress and it will recede?
Question 2: Which conditions of the ear cause Tinnitus in 35 year olds?
Groundtruth: 0
Prediction: 0

Question 1: Does endocervical component mean endometriosis?
Question 2: I am a known case of endometros

In [9]:
# Persist the model currently held by `trainer` to ./question_similarity_checking
trainer.save_model('./question_similarity_checking')

## Fine-tuned DistilBERT

In [13]:
# Tokenizer matching the distilbert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples):
  """Encode each (question_1, question_2) pair as a single tokenizer input.

  `examples` is a batch from the dataset (it is mapped with batched=True).
  Inputs are padded with padding='max_length' and truncated when too long.
  """
  question_pairs = (examples['question_1'], examples['question_2'])
  return tokenizer(*question_pairs, truncation=True, padding='max_length')


# Re-tokenize every split of the dataset with the DistilBERT tokenizer
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2743 [00:00<?, ? examples/s]

Map:   0%|          | 0/305 [00:00<?, ? examples/s]

In [14]:
# Pretrained DistilBERT encoder with a two-class sequence-classification head.
# The variable keeps the name `bert_model` used by the rest of this cell.
bert_model = AutoModelForSequenceClassification.from_pretrained(
  'distilbert-base-uncased', problem_type='single_label_classification', num_labels=2
)

# Accuracy metric loaded through the `evaluate` library
metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
  """Score arg-max class predictions against the reference labels.

  `eval_pred` unpacks into (logits, labels); the class with the largest
  logit on the last axis is taken as the prediction for each example.
  """
  logits, labels = eval_pred
  predicted_classes = np.argmax(logits, axis=-1)
  return metric.compute(references=labels, predictions=predicted_classes)

# Training arguments for the DistilBERT run.
# Use a dedicated output directory: the BERT run above writes to
# 'test_trainer', and sharing it would let this run overwrite or mix with
# the BERT checkpoints.
training_args = TrainingArguments(
  output_dir='test_trainer_distilbert',
  evaluation_strategy='epoch'
)

# Trainer for the DistilBERT model; the test split is used for evaluation
trainer = Trainer(
  model=bert_model,
  args=training_args,
  train_dataset=tokenized_dataset['train'],
  eval_dataset=tokenized_dataset['test'],
  compute_metrics=compute_metrics,
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Fine-tune the model held by `trainer` on its train_dataset.
# Kept as the last expression so the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.505387,0.770492
2,0.559100,0.541659,0.780328
3,0.343300,0.815234,0.77377


TrainOutput(global_step=1029, training_loss=0.4454762168713747, metrics={'train_runtime': 447.4632, 'train_samples_per_second': 18.39, 'train_steps_per_second': 2.3, 'total_flos': 1090074223540224.0, 'train_loss': 0.4454762168713747, 'epoch': 3.0})

In [16]:
# Evaluate on the trainer's eval_dataset (the tokenized test split)
# and display the resulting metrics dict.
result = trainer.evaluate()
result

{'eval_loss': 0.8152340650558472,
 'eval_accuracy': 0.7737704918032787,
 'eval_runtime': 5.5108,
 'eval_samples_per_second': 55.346,
 'eval_steps_per_second': 7.077,
 'epoch': 3.0}

In [17]:
# Run inference on the test split; the returned PredictionOutput exposes the
# raw logits as `.predictions`.
result = trainer.predict(tokenized_dataset['test'])
# One arg-max over the whole logits array instead of one call per row
predicted_labels = np.argmax(result.predictions, axis=-1)

# Slice the first rows once. Indexing a whole column inside the loop would
# re-materialise that column on every iteration. `min` keeps the preview
# valid when the split holds fewer than 10 rows.
n_preview = min(10, len(predicted_labels))
preview = tokenized_dataset['test'][:n_preview]
for q1, q2, label, predict in zip(preview['question_1'], preview['question_2'],
                                  preview['label'], predicted_labels):
  print(f'Question 1: {q1}')
  print(f'Question 2: {q2}')
  print(f'Groundtruth: {label}')
  print(f'Prediction: {predict}\n')

Question 1: I've had body aches, blocked stuffy nose, headaches, pressure in my face and throat tightness and it feels dry for 6 months is it a bad cold?
Question 2: I have been having recurrent attacks of sinus infections. I do not have a fever but notice headache, heaviness, pressure sensation, congestion, body pain and dry cough. I'm currently on course of antibotics but do not seem to help. Should I continue or stop them?
Groundtruth: 0
Prediction: 0

Question 1: Does chiropractic medicine help people with back problems?
Question 2: Would seeing a chirpractor help me with my back problems?
Groundtruth: 1
Prediction: 1

Question 1: Tinnitus 35 yrs. Quiet till breakdown May.Squealing. Up and down.ENT dr thinks stress is cause.Good sign noises vary so.Treatstress and it will recede?
Question 2: Which conditions of the ear cause Tinnitus in 35 year olds?
Groundtruth: 0
Prediction: 1

Question 1: Does endocervical component mean endometriosis?
Question 2: I am a known case of endometros