# Medical Question Classification

## Install and Import

In [1]:
# Install required dependencies
# The commands below are commented out; uncomment and run them once on a fresh
# environment (e.g. a new Colab runtime) before executing the import cell.
# None of them pins a version, so results may differ as the libraries evolve.
# !pip install tensorflow transformers -q
# !pip install datasets -q
# !pip install transformers[torch]
# !pip install evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3

In [2]:
# Import necessary libraries and modules
import tensorflow as tf
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import datasets
from datasets import load_dataset
import evaluate

## Slice and Merge Dataset

In [3]:
# Load the medical-questions dataset from the Hugging Face Hub
raw_dataset = load_dataset("fhirfly/medicalquestions")

# Fixed seed so the random subsample and the train/test split below are
# identical on every "Restart & Run All"; without it every run trains and
# evaluates on different rows and the reported accuracy is not reproducible.
SPLIT_SEED = 42

# Keep only a small portion (10%) of the original 'train' split: the 'test'
# side of a 90/10 split is the 10% sample we continue with.
subsample = raw_dataset['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)['test']

# Split that sample again, 90% for training and 10% held out for testing
subsample_split = subsample.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_set, test_set = subsample_split['train'], subsample_split['test']

# Merge training and test subsets into the DatasetDict used by the following cells
dataset = datasets.DatasetDict({"train": train_set, "test": test_set})
dataset

Downloading readme:   0%|          | 0.00/3.05k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 2251
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 251
    })
})

## Tokenize Dataset

In [4]:
# Tokenize the text column with the tokenizer of the pre-trained checkpoint

# Tokenizer matching the pre-trained "bert-base-uncased" model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize(examples):
  """Tokenize the "text" field of a batch of examples.

  Args:
    examples: a batch as passed by `map(..., batched=True)`; only its
      "text" entry is read.

  Returns:
    The tokenizer output for the batch. `truncation=True` cuts over-long
    inputs, and `padding="max_length"` without an explicit `max_length`
    pads every example to the maximum input length the model accepts, so
    all rows end up with the same length.
  """
  texts = examples["text"]
  return tokenizer(texts, padding="max_length", truncation=True)


# Apply the tokenizer to every split, handing it whole batches of examples at a time
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2251 [00:00<?, ? examples/s]

Map:   0%|          | 0/251 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2251
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 251
    })
})

## Initialize Model and Metric

In [5]:
# Number of classes = number of distinct label values seen in the training split.
# NOTE(review): this assumes every class occurs at least once in the (sub-sampled)
# training split — confirm if the sample size is reduced further.
train_labels = tokenized_dataset['train']['label']
num_labels = len(set(train_labels))

# Pre-trained "bert-base-uncased" encoder with a sequence-classification head
# sized for `num_labels` classes
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Accuracy metric, loaded once and reused by compute_metrics on every evaluation
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
  """Compute accuracy for one evaluation pass.

  Args:
    eval_pred: a pair that unpacks into (logits, labels).

  Returns:
    Whatever `metric.compute` returns for the predicted class ids
    (arg-max over the last axis of the logits) against the reference labels.
  """
  logits, labels = eval_pred
  predicted_classes = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predicted_classes, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

## Initialize TrainingArgs and Trainer

In [7]:
# Training configuration: outputs are written under "test_trainer" and the
# model is evaluated once per epoch; every other setting keeps its default.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
)

# Data used by the Trainer: the 'train' split for fitting, the 'test' split
# for the per-epoch evaluation.
train_split = tokenized_dataset['train']
eval_split = tokenized_dataset['test']

# Trainer tying together the model, its configuration, the data and the metric function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

## Train and Evaluate

In [8]:
# Use 'train' method to start training process.
# The call fine-tunes `model` with the arguments and datasets the Trainer was
# constructed with; its return value is the cell's last expression, so the
# notebook displays the resulting TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.040708,0.992032
2,0.054700,0.061465,0.98008
3,0.054700,0.063861,0.984064


TrainOutput(global_step=846, training_loss=0.03514898978790211, metrics={'train_runtime': 647.5078, 'train_samples_per_second': 10.429, 'train_steps_per_second': 1.307, 'total_flos': 1776788956846080.0, 'train_loss': 0.03514898978790211, 'epoch': 3.0})

In [9]:
# Use 'evaluate' method to evaluate trained model on the evaluation dataset
# (the 'test' split that was passed to the Trainer as eval_dataset).
# The returned metrics are kept in `result` and shown as the cell output.
result = trainer.evaluate()
result

{'eval_loss': 0.06386122852563858,
 'eval_accuracy': 0.9840637450199203,
 'eval_runtime': 8.2015,
 'eval_samples_per_second': 30.604,
 'eval_steps_per_second': 3.902,
 'epoch': 3.0}

## Predict Test Data

In [10]:
# Use 'predict' method to make prediction on the test dataset
test_split = tokenized_dataset['test']
prediction_output = trainer.predict(test_split)

# Turn the logits into class ids once for the whole split
predicted_labels = np.argmax(prediction_output.predictions, axis=-1)

# Window of test examples to display. Slicing (instead of indexing inside the
# loop) reads the 'text' and 'label' columns only once and simply yields fewer
# rows if the test split is shorter than SAMPLE_STOP.
SAMPLE_START, SAMPLE_STOP = 10, 20
sample = test_split[SAMPLE_START:SAMPLE_STOP]

# Loop through a subset of examples in the test dataset
for text, predicted, label in zip(
    sample['text'], predicted_labels[SAMPLE_START:SAMPLE_STOP], sample['label']
):
  print(f"Text: {text}")
  print(f"Prediction: {predicted}")
  print(f"Groundtruth: {label}")

Text: What is (are) alanine?
Prediction: 1
Groundtruth: 1
Text: which languages are part of the family of austronesian languages
Prediction: 0
Groundtruth: 0
Text: What are the symptoms of Cataract microcornea syndrome ?
Prediction: 1
Groundtruth: 1
Text: How should a care plan for Acute Myocardial Infarction be structured?
Prediction: 1
Groundtruth: 1
Text: What are the symptoms of Maturity-onset diabetes of the young, type 8 ?
Prediction: 1
Groundtruth: 1
Text: In what format was the collaborations released
Prediction: 0
Groundtruth: 0
Text: What country is tran buu ngoc from
Prediction: 0
Groundtruth: 0
Text: What genre of movie is frozen
Prediction: 0
Groundtruth: 0
Text: what kind of music does wccc play 
Prediction: 0
Groundtruth: 0
Text: what film was kenya featured in
Prediction: 0
Groundtruth: 0


In [11]:
# Save the fine-tuned model weights and config
trainer.save_model("medical-bert-classifier")

# The Trainer above was created without the tokenizer, so save_model() does not
# write any tokenizer files. Save them into the same directory so the classifier
# can later be reloaded from "medical-bert-classifier" alone.
# The trailing ';' keeps the returned list of file paths out of the cell output.
tokenizer.save_pretrained("medical-bert-classifier");