# Introduction

This notebook is the building block for the URGENT LINK model that finetuned BERT to refer the patients to their needed department for treatment


from google.colab import drive
drive.mount('/content/drive')

## Setup

In [1]:
!pip install -U transformers evaluate accelerate
!pip install tensorboard

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/20/0a/739426a81f7635b422fbe6cb8d1d99d1235579a6ac8024c13d743efa6847/transformers-4.36.2-py3-none-any.whl.metadata
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/70/63/7644a1eb7b0297e585a6adec98ed9e575309bb973c33b394dae66bc35c69/evaluate-0.4.1-py3-none-any.whl.metadata
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━

## Imports

In [2]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)

from datasets import load_dataset

import evaluate
import glob
import numpy as np



## Hyperparameters

In [7]:
BATCH_SIZE =8
NUM_PROCS = 8
LR = 0.00005
EPOCHS = 5
MODEL = 'bert-base-uncased'
OUT_DIR = '/kaggle/working/'

## Download the Dataset

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load your CSV dataset
csv_file_path = '/kaggle/input/dtasetd/seniorotle/finaldata.csv'
df = pd.read_csv(csv_file_path)

# Specify the column containing your features (X) and the column containing your labels (y)
X = df.drop('department', axis=1)
y = df['department']


# Split the dataset
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Create DataFrames with both features and encoded labels, ensuring correct alignment
train_data = pd.concat([X_train, pd.Series(y_train, name='encoded_label'), df.loc[X_train.index, 'cases']], axis=1)
test_data = pd.concat([X_test, pd.Series(y_test, name='encoded_label'), df.loc[X_test.index, 'cases']], axis=1)
val_data = pd.concat([X_val, pd.Series(y_val, name='encoded_label'), df.loc[X_val.index, 'cases']], axis=1)

# Save the split datasets to new CSV files
train_data.to_csv('/kaggle/input/dtasetd/seniorotle/train.csv', index=False)
test_data.to_csv('/kaggle/input/dtasetd/seniorotle/test.csv', index=False)
val_data.to_csv('/kaggle/input/dtasetd/seniorotle/val.csv', index=False)




In [12]:
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Replace with the actual paths to your CSV files
train_csv_path = '/kaggle/input/dtasetd/seniorotle/train.csv'
valid_csv_path = '/kaggle/input/dtasetd/seniorotle/val.csv'
test_csv_path = '/kaggle/input/dtasetd/seniorotle/test.csv'

# Load datasets from CSV using pandas
train_df = pd.read_csv(train_csv_path)
valid_df = pd.read_csv(valid_csv_path)
test_df = pd.read_csv(test_csv_path)

# Convert the text column to strings
train_df['text_column'] = train_df['cases'].astype(str)
valid_df['text_column'] = valid_df['cases'].astype(str)
test_df['text_column'] = test_df['cases'].astype(str)

# Drop rows with missing or empty values in the 'text_column'
train_df = train_df.dropna(subset=['cases']).reset_index(drop=True)
valid_df = valid_df.dropna(subset=['cases']).reset_index(drop=True)
test_df = test_df.dropna(subset=['cases']).reset_index(drop=True)

# Specify the column containing your labels
labels_column = 'encoded_label'  # Replace with the actual column name containing your labels
train_labels = train_df[labels_column]
valid_labels = valid_df[labels_column]
test_labels = test_df[labels_column]



# Create dataset dictionaries compatible with datasets library
train_dataset_dict = {
    'text': train_df['cases'].tolist(),
    'label': train_labels.tolist(),
}

valid_dataset_dict = {
    'text': valid_df['cases'].tolist(),
    'label': valid_labels.tolist(),
}

test_dataset_dict = {
    'text': test_df['text_column'].tolist(),
    'label': test_labels.tolist(),
}

# Convert the dataset dictionaries to datasets.Dataset
train_dataset = Dataset.from_dict(train_dataset_dict)
valid_dataset = Dataset.from_dict(valid_dataset_dict)
test_dataset = Dataset.from_dict(test_dataset_dict)

# Display the original labels and encoded labels for the training split
print("Original labels (training):", train_labels.tolist())

# You can also inverse transform to get the original labels back


Original labels (training): [11, 5, 6, 12, 3, 14, 16, 11, 14, 12, 1, 2, 1, 8, 2, 13, 14, 5, 5, 14, 3, 1, 14, 8, 1, 3, 15, 14, 13, 1, 7, 3, 1, 1, 10, 2, 1, 6, 0, 7, 8, 2, 11, 3, 11, 4, 14, 9, 12, 3, 9, 14, 4, 10, 10, 4, 4, 14, 11, 2, 13, 7, 4, 2, 8, 3, 11, 6, 0, 15, 13, 13, 15, 9, 14, 1, 10, 4, 9, 10, 4, 4, 5, 9, 2, 9, 10, 6, 6, 5, 0, 1, 3, 15, 4, 3, 9, 4, 2, 14, 1, 11, 0, 2, 6, 3, 11, 3, 3, 3, 9, 2, 4, 9, 5, 9, 1, 13, 1, 14, 5, 11, 1, 3, 1, 11, 9, 5, 2, 10, 2, 8, 2, 3, 3, 14, 3, 9, 13, 9, 15, 2, 15, 1, 16, 2, 1, 6, 3, 8, 16, 12, 5, 1, 3, 0, 10, 13, 10, 5, 13, 8, 13, 13, 1, 9, 9, 16, 3, 1, 3, 10, 7, 4, 3, 2, 4, 4, 0, 14, 1, 8, 14, 13, 13, 1, 14, 1, 2, 9, 4, 6, 10, 10, 5, 8, 0, 15, 9, 13, 3, 12, 9, 14, 15, 2, 14, 4, 1, 9, 1, 10, 0, 15, 12, 8, 8, 7, 16, 13, 13, 2, 12, 8, 3, 0, 4, 13, 9, 2, 6, 2, 10, 15, 7, 4, 4, 9, 8, 3, 1, 10, 11, 9, 0, 15, 4, 10, 2, 1, 1, 6, 12, 14, 12, 3, 14, 14, 6, 9, 3, 3, 1, 0, 1, 4, 11, 7, 1, 8, 1, 4, 11, 10, 14, 4, 7, 4, 9, 4, 15, 6, 13, 1, 11, 5, 4, 4, 6, 0, 15, 

In [13]:
print(train_dataset)
print(valid_dataset)
print(test_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 3235
})
Dataset({
    features: ['text', 'label'],
    num_rows: 405
})
Dataset({
    features: ['text', 'label'],
    num_rows: 405
})


In [14]:
# Visualize a sample.
train_dataset[0]

{'text': 'hello  i had hydrosalpingography in february and was diagnosed with hydrops . i am currently undergoing in vitro fertilization at jiai . the egg harvesting procedure has been completed . the doctor said that during the ovulation induction process  b -ultrasound showed that there was a lot of water accumulation  and embolization may not be possible .',
 'label': 11}

## Dataset Information

In [15]:
id2label = {
    0: "Allergy / Immunology",
    1: "Cardiovascular / Pulmonary",
    2: "Dermatology",
    3: "Endocrinology",
    4: "ENT - Otolaryngology",
    5: "Gastroenterology",
    6: "Hematology - Oncology",
    7: "Nephrology",
    8: "Neurology",
    9: "Obstetrics / Gynecology",
    10: "Ophthalmology",
    11: "Radiology",
    12: "Surgery",
    13: "Urology",
    14: "Orthopedics",
    15: "Pediatrics",
    16: "Psychiatry / Psychology"
}
label2id= {
    "Allergy / Immunology": 0,
    "Cardiovascular / Pulmonary": 1,
    "Dermatology": 2,
    "Endocrinology": 3,
    "ENT - Otolaryngology": 4,
    "Gastroenterology": 5,
    "Hematology - Oncology": 6,
    "Nephrology": 7,
    "Neurology": 8,
    "Obstetrics / Gynecology": 9,
    "Ophthalmology": 10,
    "Radiology": 11,
    "Surgery": 12,
    "Urology": 13,
    "Orthopedics": 14,
    "Pediatrics": 15,
    "Psychiatry / Psychology": 16
}

## Tokenize the Dataset

In [16]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
# Helper function for preprocessing.
def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
    )

In [18]:
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=BATCH_SIZE,
    num_proc=NUM_PROCS
)

         

#0:   0%|          | 0/51 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/51 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/51 [00:00<?, ?ba/s]

#3:   0%|          | 0/51 [00:00<?, ?ba/s]

   

#4:   0%|          | 0/51 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/51 [00:00<?, ?ba/s]

#5:   0%|          | 0/51 [00:00<?, ?ba/s]

#7:   0%|          | 0/51 [00:00<?, ?ba/s]

In [19]:
tokenized_valid = valid_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=BATCH_SIZE,
    num_proc=NUM_PROCS
)

            

#0:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/7 [00:00<?, ?ba/s]

#2:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/7 [00:00<?, ?ba/s]

#6:   0%|          | 0/7 [00:00<?, ?ba/s]

#7:   0%|          | 0/7 [00:00<?, ?ba/s]

In [20]:
tokenized_test = test_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=BATCH_SIZE,
    num_proc=NUM_PROCS
)

           

#0:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/7 [00:00<?, ?ba/s]

    

#2:   0%|          | 0/7 [00:00<?, ?ba/s]

#3:   0%|          | 0/7 [00:00<?, ?ba/s]

#5:   0%|          | 0/7 [00:00<?, ?ba/s]

#4:   0%|          | 0/7 [00:00<?, ?ba/s]

#6:   0%|          | 0/7 [00:00<?, ?ba/s]

#7:   0%|          | 0/7 [00:00<?, ?ba/s]

In [21]:
# Initialize data collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Sample Tokenization Example

In [22]:
tokenized_sample = preprocess_function(train_dataset[0])

In [23]:
print(tokenized_sample)
print(f"Length of tokenized IDs: {len(tokenized_sample.input_ids)}")
print(f"Length of attention mask: {len(tokenized_sample.attention_mask)}")

{'input_ids': [101, 7592, 1045, 2018, 18479, 12002, 4691, 9888, 1999, 2337, 1998, 2001, 11441, 2007, 18479, 4523, 1012, 1045, 2572, 2747, 14996, 1999, 25714, 10768, 28228, 22731, 2012, 25871, 2072, 1012, 1996, 8288, 21534, 7709, 2038, 2042, 2949, 1012, 1996, 3460, 2056, 2008, 2076, 1996, 1051, 19722, 13490, 15946, 2832, 1038, 1011, 27312, 3662, 2008, 2045, 2001, 1037, 2843, 1997, 2300, 20299, 1998, 7861, 14956, 3989, 2089, 2025, 2022, 2825, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Length of tokenized IDs: 71
Length of attention mask: 71


## Evaluation Metrics

In [24]:
accuracy = evaluate.load('accuracy')

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [25]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

## Model

In [26]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=17,
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

109,495,313 total parameters.
109,495,313 training parameters.


## Training Arguments

from itertools import product
from transformers import EarlyStoppingCallback

# Define the grid search parameters
learning_rates = [0.001, 0.01, 0.1]
batch_sizes = [8, 16, 32]

# Initialize variables to keep track of the best model and its metrics
best_model = None
best_metrics = {"eval_accuracy": 0.0}

# Perform grid search
for lr, batch_size in product(learning_rates, batch_sizes):
    OUT_DIR = f"output_lr_{lr}_batch_{batch_size}"
    
    # Update TrainingArguments with new hyperparameters
    training_args = TrainingArguments(
        output_dir=OUT_DIR,
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=3,
        report_to='tensorboard',
        fp16=True
    )
    
    # Create Trainer with updated hyperparameters and early stopping callback
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)]
    )
    
    # Train the model and store results
    history = trainer.train()

    # Check if the current model is the best so far based on eval_accuracy
    if history.metrics["eval_accuracy"] > best_metrics["eval_accuracy"]:
        best_metrics = history.metrics
        best_model = trainer.model

# Save the best model
if best_model is not None:
    best_model.save_pretrained("best_model")


## Training

In [46]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=3,
    report_to='tensorboard'
    ,fp16=True
)

In [47]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [62]:
history = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.492903,0.641975
2,No log,1.55152,0.659259
3,0.245300,1.701543,0.674074
4,0.245300,1.785906,0.65679
5,0.135600,1.880779,0.681481


Checkpoint destination directory /kaggle/working/checkpoint-203 already exists and is non-empty.Saving will proceed but saved results may be invalid.


In [49]:
model.save_pretrained("/kaggle/working/model3/")


In [55]:
import shutil

# Create a zip archive of the output directory
shutil.make_archive('/kaggle/working/model_output', 'zip', '/kaggle/working/model2')


'/kaggle/working/model_output.zip'

In [58]:
from IPython.display import FileLink

# Provide a link to download the zip file
FileLink(r'model_output.zip')


## Evaluate

In [61]:
trainer.evaluate(tokenized_test)



{'eval_loss': 1.3368359804153442,
 'eval_accuracy': 0.6395061728395062,
 'eval_runtime': 7.1068,
 'eval_samples_per_second': 56.988,
 'eval_steps_per_second': 3.658,
 'epoch': 5.0}

In [2]:
evaluation_results = trainer.evaluate(tokenized_test)

# Assuming precision, recall, and f1_score are keys in the evaluation results dictionary
precision = evaluation_results['precision']
recall = evaluation_results['recall']
f1_score = evaluation_results['f1_score']

# Print the results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")


Precision: 0.631
Recall: 0.666
F1 Score: 0.648


## Inference

In [51]:
print(history.global_step)

1015


In [52]:
# model = AutoModelForSequenceClassification.from_pretrained(f"{OUT_DIR}/checkpoint-{history.global_step}")
model = AutoModelForSequenceClassification.from_pretrained(f"/kaggle/working/model2")

In [63]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
classify = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

In [65]:
    content='I have a coug'
    print(content)
    result = classify(content)
    print('PRED: ', result)
    print('\n')

I have a cough
PRED:  [{'label': 'Hematology - Oncology', 'score': 0.4579942524433136}]


