In [1]:

import os
import re
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


# --- Data Loading & Cleaning ---

In [2]:
# Load the dataset CSV; the path expects Google Drive to be mounted at /content/drive.
dataset_csv = '/content/drive/MyDrive/datasets/final/medical_final_withId.csv'
df = pd.read_csv(dataset_csv)

#### Extra cleaning (not tried yet)

```
def clean_text(text): ...
df['tokenized_text'] = df['description'].apply(clean_text)
```



## Filtering out small classes (<90 entries):
A first pass at treating class imbalance: specialties with fewer than 90 samples are dropped.


### TODO:

  [ ] explore better strategies


In [3]:
min_samples = 90  # smallest number of rows a specialty needs in order to be kept


def _has_enough_samples(group):
    """Return True when a specialty group holds at least `min_samples` rows."""
    return len(group) >= min_samples


# Drop every row that belongs to an under-represented specialty.
df = df.groupby('Specialty').filter(_has_enough_samples)
print(f"Filtered dataset shape: {df.shape}")

Filtered dataset shape: (18587, 6)


# --- Embedding Generation (Multi-layer CLS Pooling) ---


In [4]:
# Run the encoder on the GPU when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

batch_size = 16  # number of texts encoded per forward pass
token_texts = df['tokenized_text'].tolist()  # texts to embed, in dataframe row order

## Load BioBERT with hidden states


In [5]:
model_name = 'dmis-lab/biobert-base-cased-v1.1'

# Tokenizer and encoder are loaded from the same checkpoint.
# output_hidden_states=True makes the forward pass return per-layer hidden
# states, which the embedding step reads via `outputs.hidden_states`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, output_hidden_states=True).to(device)

# Last expression of the cell: displays the model architecture.
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

## Stack the CLS vectors from the last 4 layers, then average them across layers


In [None]:
# Encode the texts batch by batch and collect one embedding row per text.
emb_list = []
for start in range(0, len(token_texts), batch_size):
    text_batch = token_texts[start:start + batch_size]

    # Pad to the longest text in the batch; cut anything beyond 512 tokens.
    encoded = tokenizer(text_batch, padding=True, truncation=True, max_length=512, return_tensors='pt')
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**encoded)

    # Position 0 of each sequence (the CLS slot) taken from the last four
    # entries of hidden_states, last layer first.
    last_four_cls = [outputs.hidden_states[-depth][:, 0, :] for depth in range(1, 5)]

    # Stack the four vectors on a new trailing axis and average over it,
    # giving one vector per text.
    cls_mean = torch.stack(last_four_cls, dim=2).mean(dim=2)
    emb_list.append(cls_mean.cpu().numpy())

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

## Saving the generated embeddings:

In [None]:
# Join the per-batch arrays row-wise into a single matrix and persist it to disk.
embeddings_ml = np.concatenate(emb_list, axis=0)
print(f"Generated embeddings shape: {embeddings_ml.shape}")
np.save('embeddings_ml.npy', embeddings_ml)

# Train/Validation/Test split

In [None]:
# Map each specialty name to an integer class id, stored in a new 'y' column.
label_encoder = LabelEncoder()
df['y'] = label_encoder.fit_transform(df['Specialty'])

# Two stratified splits with a fixed seed: 20% is held out for testing, then
# 25% of the remaining 80% becomes validation, i.e. roughly 60/20/20 overall.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    embeddings_ml,
    df['y'],
    test_size=0.2,
    stratify=df['y'],
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    stratify=y_train_val,
    random_state=42,
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# The classifier

## Construction

In [None]:
# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created. The data splits already use a fixed random_state; without this the
# weight initialisation, dropout masks and batch shuffling would still differ
# from run to run. (Some GPU kernels may remain non-deterministic.)
tf.keras.utils.set_random_seed(42)

input_dim = X_train.shape[1]               # width of one embedding vector
num_classes = len(label_encoder.classes_)  # one output unit per specialty

# Feed-forward classifier on top of the precomputed embeddings: three
# Dense -> BatchNorm -> Dropout stages of decreasing width (512, 256, 128)
# followed by a softmax over the classes.
model_tf = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(512, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.4),
    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.2),
    layers.Dense(num_classes, activation='softmax')
])

# Labels are integer class ids, hence the sparse categorical cross-entropy.
model_tf.compile(
    optimizer=optimizers.Adam(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
model_tf.summary()

## Training

In [None]:
# All three callbacks watch the validation loss:
# - EarlyStopping ends training after 10 epochs without improvement and
#   restores the best weights into the in-memory model.
# - ModelCheckpoint writes the best model to disk. It tracks val_loss too, so
#   the saved file is selected by the same criterion as the weights that
#   EarlyStopping restores and that are evaluated afterwards (previously it
#   tracked val_accuracy and could pick a different epoch).
# - ReduceLROnPlateau halves the learning rate after 5 stagnant epochs, never
#   going below 1e-6.
callbacks_list = [
    callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    callbacks.ModelCheckpoint('best_medical_classifier.keras', monitor='val_loss', save_best_only=True),
    callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
]

# Train for at most 50 epochs; the validation set drives the callbacks above.
history = model_tf.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    callbacks=callbacks_list
)

### EarlyStopping halts training automatically once the validation loss stops improving, which limits overfitting

# --- Evaluation ---


Method(s): we compute the test accuracy and loss, a per-class classification report, and a confusion matrix.

In [None]:
# Score the held-out test split; evaluate() returns the loss followed by the
# compiled metric (accuracy).
evaluation = model_tf.evaluate(X_test, y_test, verbose=0)
test_loss, test_acc = evaluation
print(f"Test accuracy: {test_acc:.4f}, loss: {test_loss:.4f}")

In [None]:
# Predicted class = index of the highest softmax output for each test row.
class_probabilities = model_tf.predict(X_test)
y_pred = class_probabilities.argmax(axis=1)

# Report per-class metrics using the original specialty names as labels.
target_names = label_encoder.classes_
print(classification_report(y_test, y_pred, target_names=target_names))

### *Confusion matrix fuss*

In [None]:
# Rows are the true classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap with integer cell counts.
fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=target_names, yticklabels=target_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
fig.tight_layout()
plt.show()

In [None]:
!mkdir drive/MyDrive/clspool

In [None]:
!cp best_medical_classifier.keras drive/MyDrive/clspool

In [None]:
!cp embeddings_ml.npy drive/MyDrive/clspool