In [1]:
%%capture
!pip install datasets
!pip install transformers
!pip install evaluate

In [2]:
import pandas as pd
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, pipeline
import numpy as np
import evaluate
import torch

## Data Preparation

In [None]:
# Load the raw table, drop the 'Unnamed: 0' column (presumably a saved index) and
# keep only rows without missing values.
df = (
  pd.read_csv('para_data.csv')
  .drop(columns='Unnamed: 0')
  .dropna()
)
df.head()

In [None]:
# Give every distinct disease name an integer id and keep both directions of the
# mapping; they are passed to the model config later on.
targets = list(df['Disease'].unique())

id2label = dict(enumerate(targets))
label2id = {name: idx for idx, name in id2label.items()}

# Replace the disease names in the frame with their integer ids.
df['Disease'] = df['Disease'].map(label2id)

In [None]:
# Per-disease (stratified) train/val/test split. The held-out row counts are fixed
# per group size:
#   36 rows -> 4 val, 5 test, 27 train
#   24 rows -> 3 val, 3 test, 18 train
#   other   -> 2 val, 1 test, remainder train
def _stack(parts):
  """Concatenate the collected pieces of one split and renumber rows from 0."""
  frame = pd.concat(parts) if parts else pd.DataFrame()
  return frame.reset_index(drop=True)

val_parts, test_parts, train_parts = [], [], []
targets = list(df['Disease'].unique())
for target in targets:
  # Shuffle this disease's rows reproducibly before slicing.
  temp = df[df['Disease'] == target].sample(frac=1, random_state=42)
  if len(temp) == 36:
    n_val, n_test = 4, 5
  elif len(temp) == 24:
    n_val, n_test = 3, 3
  else:
    n_val, n_test = 2, 1
  held_out = n_val + n_test
  val_parts.append(temp[:n_val])
  test_parts.append(temp[n_val:held_out])
  train_parts.append(temp[held_out:])

# Concatenate once per split instead of growing a DataFrame inside the loop.
train = _stack(train_parts)
test = _stack(test_parts)
val = _stack(val_parts)

# The row index is written too; it is read back as the 'Unnamed: 0' column that the
# tokenization step removes by name.
train.to_csv('train_data.csv')
test.to_csv('test_data.csv')
val.to_csv('val_data.csv')

In [None]:
# Pretrained checkpoint to fine-tune. Alternative checkpoints kept for reference:
#   'emilyalsentzer/Bio_ClinicalBERT'
#   'bioformers/bioformer-8L'
#   'monologg/biobert_v1.1_pubmed'
#   'dmis-lab/biobert-v1.1'
checkpoint = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract'

# Load the train and validation CSV files, keyed by split name.
dataset = load_dataset(
  'csv',
  data_files={'train': 'train_data.csv', 'val': 'val_data.csv'},
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(examples):
  """Tokenize a batch of symptom descriptions.

  The texts in ``examples['Symptoms']`` are truncated and padded to a fixed
  length of 128 tokens.
  """
  return tokenizer(
    examples['Symptoms'],
    truncation=True,
    padding='max_length',
    max_length=128,
  )


# batched=True passes many rows to the function per call.
tok_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Batch collator built on the tokenizer; handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Drop the saved index and the raw text, and rename the target column to 'labels'.
tok_dataset = (
  tok_dataset
  .remove_columns(['Unnamed: 0', 'Symptoms'])
  .rename_column('Disease', 'labels')
)
# Return torch tensors when the dataset is indexed.
tok_dataset.set_format('torch')
tok_dataset['train'].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

## Model Training

In [None]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# Size the classification head from the label mapping instead of a hard-coded
# count, so it always matches the classes present in the data.
model = AutoModelForSequenceClassification.from_pretrained(
  checkpoint,
  num_labels=len(id2label),
  id2label=id2label,
  label2id=label2id,
)
# The trailing ';' keeps the long module repr out of the cell output.
model.to(device);

In [None]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
  """Compute accuracy for one evaluation pass.

  ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of each
  row is the index of its largest logit.
  """
  logits, labels = eval_pred
  predicted_ids = np.argmax(logits, axis=1)
  return metric.compute(references=labels, predictions=predicted_ids)

In [None]:
# Fine-tuning hyperparameters; checkpoints are written under 'model/'.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
  output_dir='model',
  learning_rate=2e-5,
  per_device_train_batch_size=16,
  per_device_eval_batch_size=16,
  num_train_epochs=5,
  weight_decay=0.01,
  evaluation_strategy='epoch',
)

# Train on the 'train' split and score the 'val' split with compute_metrics.
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tok_dataset['train'],
  eval_dataset=tok_dataset['val'],
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Archive the checkpoint-2500 directory. Entries are stored with their full path
# (content/model/checkpoint-2500/...), so extraction recreates that folder tree.
!zip -r /content/model.zip /content/model/checkpoint-2500

## Model Testing

In [None]:
# Extract the archived checkpoint; this creates content/model/checkpoint-2500/
# relative to the current directory.
!unzip model.zip

Archive:  model.zip
   creating: content/model/checkpoint-2500/
  inflating: content/model/checkpoint-2500/tokenizer_config.json  
  inflating: content/model/checkpoint-2500/rng_state.pth  
  inflating: content/model/checkpoint-2500/scheduler.pt  
  inflating: content/model/checkpoint-2500/training_args.bin  
  inflating: content/model/checkpoint-2500/vocab.txt  
  inflating: content/model/checkpoint-2500/pytorch_model.bin  
  inflating: content/model/checkpoint-2500/tokenizer.json  
  inflating: content/model/checkpoint-2500/optimizer.pt  
  inflating: content/model/checkpoint-2500/config.json  
  inflating: content/model/checkpoint-2500/special_tokens_map.json  
  inflating: content/model/checkpoint-2500/trainer_state.json  


In [None]:
test = pd.read_csv('test_data.csv')
# Both names point at the extracted checkpoint directory, which holds the model
# weights as well as the tokenizer files.
model = '/content/content/model/checkpoint-2500'
tokenizer = '/content/content/model/checkpoint-2500'
# Run on the first GPU when available; -1 selects the CPU, so the cell also works
# on a CPU-only runtime instead of failing on a hard-coded device index.
classifier = pipeline(
  'text-classification',
  model=model,
  tokenizer=tokenizer,
  device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Count how often the true disease is ranked first, within the top 3, and within
# the top 5 predictions. Row order does not affect the counts, so the test frame
# is iterated as-is.
correct = 0
top3_correct = 0
top5_correct = 0
for _, row in test.iterrows():
  target = id2label[row['Disease']]
  ranked_labels = [pred['label'] for pred in classifier(row['Symptoms'], top_k=5)]
  if target not in ranked_labels:
    continue  # true label is not among the returned predictions

  rank = ranked_labels.index(target)  # 0-based position of the true label
  if rank == 0:
    correct += 1
  if rank < 3:
    top3_correct += 1
  top5_correct += 1

print(f"Test Accuracy: {correct/len(test)}")
print(f"Test Top 3 Accuracy: {top3_correct/len(test)}")
print(f"Test Top 5 Accuracy: {top5_correct/len(test)}")

Test Accuracy: 0.9701636188642926
Test Top 3 Accuracy: 0.9894128970163619
Test Top 5 Accuracy: 0.9913378248315688


## Model Demo

In [None]:
# Extract the fine-tuned checkpoint archive so the demo can load it from disk.
!unzip model.zip

In [5]:
# Rebuild the id <-> disease-name mapping from the raw file, using the same
# cleaning steps (drop 'Unnamed: 0', drop rows with missing values).
df = (
  pd.read_csv('para_data.csv')
  .drop(columns='Unnamed: 0')
  .dropna()
)

targets = list(df['Disease'].unique())

id2label = dict(enumerate(targets))
label2id = {name: idx for idx, name in id2label.items()}

In [6]:
test = pd.read_csv('test_data.csv')

# The extracted checkpoint directory is the source for both model and tokenizer.
model = tokenizer = '/content/content/model/checkpoint-2500'
classifier = pipeline('text-classification', tokenizer=tokenizer, model=model)

In [7]:
# Show one example (row 100 of the test split) together with its true disease name.
sample = test.iloc[100]
text = sample['Symptoms']
print(f"Symptoms: {text}")
print(f"Disease: {id2label[sample['Disease']]}")

Symptoms: I have diminished vision, pain in the eye, eyes or clumps, eye redness, lacrimation, itchiness of the eye, blindness, eye burns or stings, foreign sensation in eye, itchy eyelid, feeling cold
Disease: glaucoma


In [8]:
# Five highest-scoring disease labels for the current `text`.
classifier(text, top_k=5)

[{'label': 'glaucoma', 'score': 0.1309451162815094},
 {'label': 'floaters', 'score': 0.014649933204054832},
 {'label': 'retinal detachment', 'score': 0.008319108746945858},
 {'label': 'aphakia', 'score': 0.008087607100605965},
 {'label': 'astigmatism', 'score': 0.00558258593082428}]

In [9]:
# Interactive demo: read a free-text symptom description from the user and show
# the five highest-scoring disease labels for it.
text = input("What are your symptoms? ")
classifier(text, top_k=5)

What are your symptoms? I am experiencing chest pain, shortness of breath, nauseousness, light-headedness, and unusual fatigue


[{'label': 'peripheral vascular disease', 'score': 0.010615586303174496},
 {'label': 'heart attack', 'score': 0.010305095463991165},
 {'label': 'panic attack', 'score': 0.009891792200505733},
 {'label': 'hypercholesterolemia', 'score': 0.008675008080899715},
 {'label': 'failure heart congestive', 'score': 0.007882450707256794}]

In [None]:
# Example input for the symptom prompt: I am experiencing chest pain, shortness of breath, nauseousness, light-headedness, and unusual fatigue