In [1]:
# Import kagglehub first so that a missing install fails with an actionable message
# instead of a bare import error.
try:
    import kagglehub
except ModuleNotFoundError:
    raise ModuleNotFoundError(
        "kagglehub is not installed. Install with `pip install kagglehub` or use the Kaggle API instead."
    )

# Download the latest version of the dataset; `path` is reused by later cells
# to locate the CSV file.
path = kagglehub.dataset_download("tboyle10/medicaltranscriptions")

print("Path to dataset files:", path)


  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/anokhimehta/.cache/kagglehub/datasets/tboyle10/medicaltranscriptions/versions/1


In [2]:
# Load the raw dataset and preview it.
from pathlib import Path

import pandas as pd

# Join path components with pathlib rather than concatenating with a hard-coded "/".
orig_df = pd.read_csv(Path(path) / "mtsamples.csv")

# A bare `orig_df.head()` in the middle of a cell is evaluated and thrown away;
# only the last expression of a cell is rendered, so display the preview explicitly.
display(orig_df.head())

print("Number of rows in dataset:", len(orig_df))
orig_df.shape

Number of rows in dataset: 4999


(4999, 6)

In [3]:
# Class distribution of the raw data (`orig_df`), i.e. BEFORE any cleaning.
# Printed so it can be compared with the post-cleaning distribution further down.
print(orig_df['medical_specialty'].value_counts())


medical_specialty
Surgery                          1103
Consult - History and Phy.        516
Cardiovascular / Pulmonary        372
Orthopedic                        355
Radiology                         273
General Medicine                  259
Gastroenterology                  230
Neurology                         223
SOAP / Chart / Progress Notes     166
Obstetrics / Gynecology           160
Urology                           158
Discharge Summary                 108
ENT - Otolaryngology               98
Neurosurgery                       94
Hematology - Oncology              90
Ophthalmology                      83
Nephrology                         81
Emergency Room Reports             75
Pediatrics - Neonatal              70
Pain Management                    62
Psychiatry / Psychology            53
Office Notes                       51
Podiatry                           47
Dermatology                        29
Cosmetic / Plastic Surgery         27
Dentistry                       

In [4]:
# Clean the dataset: build `df` as a cleaned subset of `orig_df`.
#   * drop 'Unnamed: 0' (index column), 'sample_name' and 'description'
#     (description did not look informative enough to keep)
#   * keep only rows that have both a transcription and keywords
#   * reorder columns so the target, medical_specialty, is rightmost
df = (
    orig_df
    .drop(columns=['Unnamed: 0', 'sample_name', 'description'])
    .dropna(subset=['transcription', 'keywords'])
    .loc[:, ['transcription', 'keywords', 'medical_specialty']]
    # Explicit copy: later cells add columns to `df`, and assigning into a
    # frame that is itself a selection of another frame is what triggers
    # pandas' SettingWithCopyWarning.
    .copy()
)

# Show a preview explicitly; a bare `df.head()` mid-cell is silently discarded.
display(df.head())

# Class distribution after cleaning
print(df['medical_specialty'].value_counts())

# Row count after cleaning (the raw frame had 4999 rows, so cleaning removes
# roughly a thousand of them)
print("Number of rows in dataset:", len(df))

medical_specialty
Surgery                          1021
Orthopedic                        303
Cardiovascular / Pulmonary        280
Radiology                         251
Consult - History and Phy.        234
Gastroenterology                  195
Neurology                         168
General Medicine                  146
SOAP / Chart / Progress Notes     142
Urology                           140
Obstetrics / Gynecology           130
ENT - Otolaryngology               84
Neurosurgery                       81
Ophthalmology                      79
Discharge Summary                  77
Nephrology                         63
Hematology - Oncology              62
Pain Management                    58
Office Notes                       44
Pediatrics - Neonatal              42
Podiatry                           42
Emergency Room Reports             31
Dermatology                        25
Dentistry                          25
Cosmetic / Plastic Surgery         25
Letters                         

In [5]:
# Collapse the fine-grained specialties into broader groups.
# The table below is written group -> member specialties for readability and is
# flattened into the specialty -> group lookup that `Series.map` expects.
mapping = {
    specialty: group
    for group, specialties in {
        'Surgery': [
            'Cosmetic / Plastic Surgery',
            'Neurosurgery',
            'Surgery',
            'ENT - Otolaryngology',
        ],
        'Orthopedics': [
            'Orthopedic',
            'Podiatry',
            'Physical Medicine - Rehab',
            'Chiropractic',
            'Rheumatology',
        ],
        'Cardiovascular/Pulmonary': [
            'Cardiovascular / Pulmonary',
        ],
        'Gastroenterology': [
            'Gastroenterology',
            'Bariatrics',
        ],
        'Neurology': [
            'Neurology',
            'Psychiatry / Psychology',
            'Pain Management',
            'Sleep Medicine',
        ],
        "Women/Men's Reproductive Health": [
            'Obstetrics / Gynecology',
            'Urology',
        ],
        'Kidney & Blood/Oncology': [
            'Hematology - Oncology',
            'Nephrology',
        ],
        'Radiology & Diagnostics': [
            'Radiology',
            'Lab Medicine - Pathology',
        ],
        'General Medicine': [
            'General Medicine',
            'Consult - History and Phy.',
            'SOAP / Chart / Progress Notes',
            'Discharge Summary',
            'Office Notes',
            'Letters',
            'Hospice - Palliative Care',
            'IME-QME-Work Comp etc.',
            'Emergency Room Reports',
        ],
        'Other Specialties': [
            'Ophthalmology',
            'Dermatology',
            'Pediatrics - Neonatal',
            'Dentistry',
            'Speech - Language',
            'Endocrinology',
            'Diets and Nutritions',
            'Allergy / Immunology',
        ],
    }.items()
    for specialty in specialties
}

In [6]:
# Strip surrounding whitespace so the labels can match the `mapping` keys exactly,
# then translate each fine-grained specialty into its broader group.
df['medical_specialty'] = df['medical_specialty'].str.strip()
df['specialty_grouped'] = df['medical_specialty'].map(mapping)

# Series.map yields NaN for any label that is not a key of `mapping`, and
# value_counts() drops NaN by default, so a gap in the mapping would otherwise
# go unnoticed here and only surface later in the split or label encoding.
unmapped = df.loc[df['specialty_grouped'].isna(), 'medical_specialty'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Specialties missing from `mapping`: {sorted(map(str, unmapped))}"
    )

# Check results
print(df['specialty_grouped'].value_counts())

specialty_grouped
Surgery                            1211
General Medicine                    703
Orthopedics                         367
Cardiovascular/Pulmonary            280
Women/Men's Reproductive Health     270
Neurology                           263
Radiology & Diagnostics             259
Gastroenterology                    213
Other Specialties                   207
Kidney & Blood/Oncology             125
Name: count, dtype: int64


In [7]:
# Build a single text field per note: the transcription, a space, then the keywords.
# Missing values are replaced with empty strings before concatenating.
df['combined_text'] = (
    df['transcription'].fillna('')
    + ' '
    + df['keywords'].fillna('')
)

# Target (grouped specialty) and model input (combined text).
y = df['specialty_grouped']
X = df['combined_text']

In [8]:
# Split into train and test sets
from sklearn.model_selection import train_test_split

y = df['specialty_grouped']
X = df['combined_text']

# 80/20 split with a fixed seed; stratifying on y keeps the class proportions
# the same in both splits. train_test_split returns the pieces in the order
# X_train, X_test, y_train, y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Quick sanity check of the resulting shapes
print('Shapes ->', *(split.shape for split in (X_train, X_test, y_train, y_test)))


Shapes -> (3118,) (780,) (3118,) (780,)


In [9]:
# TF-IDF features (consumed by the logistic-regression cell further down)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words='english',   # use the built-in English stop-word list
    ngram_range=(1,3),      # unigrams through trigrams
    max_features=10000,     # cap on the vocabulary size
)

# Fit on the training split only, then apply the fitted vectorizer to the
# test split so no test-set statistics are learned.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
# BERT model training: label encoding and tokenization
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Encode labels because BERT cannot handle string labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['specialty_grouped'])

num_labels = len(label_encoder.classes_)
print(num_labels, label_encoder.classes_)

# Tokenize both splits with the same settings: truncate to at most 512 tokens
# and pad the sequences within each split.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

train_encodings, test_encodings = (
    tokenizer(list(texts), truncation=True, padding=True, max_length=512)
    for texts in (X_train, X_test)
)

# Map the string labels of each split to their encoded integer ids
y_train_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/anokhimehta/miniconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/anokhimehta/miniconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/anokhimehta/miniconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 758, in start
    self.io_loop.start(

10 ['Cardiovascular/Pulmonary' 'Gastroenterology' 'General Medicine'
 'Kidney & Blood/Oncology' 'Neurology' 'Orthopedics' 'Other Specialties'
 'Radiology & Diagnostics' 'Surgery' "Women/Men's Reproductive Health"]


In [None]:
# PyTorch dataset wrapping tokenizer output together with the encoded labels
class MedicalDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenized encodings with per-example labels.

    Args:
        encodings: Mapping of field name to an indexable collection holding
            one entry per example (accessed via ``.items()`` and ``[idx]``).
        labels: Indexable collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a ``labels`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Convert the label to a built-in int before handing it to torch.
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample
    

# Wrap each split's encodings and encoded labels in a MedicalDataset.
train_dataset, test_dataset = (
    MedicalDataset(train_encodings, y_train_encoded),
    MedicalDataset(test_encodings, y_test_encoded),
)

In [12]:
# Load pretrained bert-base-uncased with a sequence-classification head
# sized to the number of encoded label classes.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Metrics reported at every evaluation
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    The training arguments in this notebook select the best checkpoint with
    ``metric_for_best_model='macro_f1'``, so the returned dict must contain a
    ``'macro_f1'`` entry; reporting accuracy alone leaves that metric undefined.

    Args:
        pred: Pair ``(logits, labels)`` as passed by the Trainer, where the
            last axis of ``logits`` indexes the classes.

    Returns:
        dict with keys ``'accuracy'``, ``'macro_f1'``, ``'macro_precision'``
        and ``'macro_recall'``.
    """
    logits, labels = pred
    # Predicted class = index of the largest logit for each example.
    predictions = np.argmax(logits, axis=-1)

    # Macro averaging weights every class equally regardless of its support;
    # zero_division=0 scores a class with no predictions as 0 instead of warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0
    )

    return {
        'accuracy': accuracy_score(labels, predictions),
        'macro_f1': f1,
        'macro_precision': precision,
        'macro_recall': recall,
    }

In [14]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',

    # Optimisation (a 500-step warmup was tried and is currently disabled)
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=8,   # Reduce if out of memory
    per_device_eval_batch_size=8,

    # Evaluate and checkpoint once per epoch
    eval_strategy='epoch',
    save_strategy='epoch',

    # Reload the best checkpoint when training finishes.
    # NOTE: compute_metrics has to report a 'macro_f1' entry for this to work.
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
)

In [15]:
# Trainer: ties together the model, the training arguments, both datasets and
# the metric function. The test split doubles as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [16]:
# Train the BERT model with the Trainer configured above. This call is not the
# last expression of the cell, so its return value is not displayed.
trainer.train()

# Evaluate on the Trainer's eval dataset (the test split). As the last
# expression of the cell, the result is shown as the cell output.
trainer.evaluate()

RuntimeError: Could not infer dtype of numpy.int64

In [None]:
# Logistic Regression model on the TF-IDF features
from sklearn.linear_model import LogisticRegression

# NOTE: this rebinds `model`, which an earlier cell used for the BERT classifier.
# class_weight='balanced' reweights classes to counter the class imbalance.
model = LogisticRegression(
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
).fit(X_train_tfidf, y_train)

print("Model training complete.")


Model training complete.


In [None]:
# Evaluate the fitted model on the held-out TF-IDF test features
from sklearn.metrics import classification_report, accuracy_score

y_pred = model.predict(X_test_tfidf)

# Per-class precision / recall / F1 first ...
print(classification_report(y_test, y_pred))

# ... then overall accuracy, formatted to four decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

                                 precision    recall  f1-score   support

       Cardiovascular/Pulmonary       0.49      0.70      0.58        56
               Gastroenterology       0.40      0.72      0.52        43
               General Medicine       0.71      0.55      0.62       141
        Kidney & Blood/Oncology       0.25      0.44      0.32        25
                      Neurology       0.49      0.64      0.55        53
                    Orthopedics       0.45      0.64      0.53        73
              Other Specialties       0.40      0.61      0.48        41
        Radiology & Diagnostics       0.37      0.40      0.39        52
                        Surgery       0.72      0.26      0.38       242
Women/Men's Reproductive Health       0.46      0.78      0.58        54

                       accuracy                           0.50       780
                      macro avg       0.47      0.57      0.49       780
                   weighted avg       0.57      0