In [None]:
# Ensure kagglehub is available and fail with an actionable message if not.
try:
    import kagglehub
except ModuleNotFoundError as exc:
    # Chain explicitly so the traceback reports the original import failure
    # as the direct cause of this friendlier error.
    raise ModuleNotFoundError(
        "kagglehub is not installed. Install with `pip install kagglehub` or use the Kaggle API instead."
    ) from exc

# Download the latest version of the dataset; `path` is the value returned by kagglehub
path = kagglehub.dataset_download("tboyle10/medicaltranscriptions")

print("Path to dataset files:", path)


In [None]:
# Load the transcription CSV from the downloaded dataset directory and preview it.
import os

import pandas as pd

# os.path.join copes with trailing separators and path-like objects,
# unlike plain string concatenation.
orig_df = pd.read_csv(os.path.join(path, "mtsamples.csv"))

print("Number of rows in dataset:", len(orig_df))
print("Shape of dataset:", orig_df.shape)

# Keep the preview as the cell's final expression so the notebook renders it;
# a bare `.head()` in the middle of a cell is evaluated and then discarded.
orig_df.head()

In [None]:
# Class distribution of the raw labels, before any cleaning or relabeling
specialty_counts = orig_df['medical_specialty'].value_counts()
print(specialty_counts)


In [None]:
# Clean the dataset.
# Build a new frame `df` from `orig_df`:
#   - drop 'Unnamed: 0' (index column), 'sample_name' and 'description'
#     (the description does not look informative enough to keep);
#   - keep only rows that have both a transcription and keywords.
df = orig_df.drop(['Unnamed: 0', 'sample_name', 'description'], axis=1)
df = df[df['transcription'].notna() & df['keywords'].notna()]

# Reorder so the target column (medical_specialty) is on the right.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments below cannot trigger SettingWithCopyWarning.
df = df[['transcription', 'keywords', 'medical_specialty']].copy()

# Class distribution after cleaning
print(df['medical_specialty'].value_counts())

# Number of rows left after cleaning
print("Number of rows in dataset:", len(df))

# Combine the text fields into a single model input
df['combined_text'] = df['transcription'].fillna('') + ' ' + df['keywords'].fillna('')
X = df['combined_text']

In [None]:
# Keyword-driven relabeling of specialties.
import re

# Target specialty -> phrases that cause a note to be relabeled to that specialty.
# Rules are tried in this order and the first specialty with a hit wins.
RELABEL_RULES = {
    "Cardiovascular/Pulmonary": [
        "troponin", "acute coronary", "ecg", "ekg",
        "cardiac catheterization", "stent", "angiogram"
    ],
    "Orthopedics": [
        "fracture", "tibia", "femur", "cast",
        "weight bearing", "ligament tear"
    ],
    "Neurology": [
        "seizure", "stroke", "cva", "tia",
        "parkinson", "brain mri"
    ],
    "Gastroenterology": [
        "colonoscopy", "gi bleed", "melena",
        "pancreatitis", "cirrhosis"
    ],
    "Surgery": [
        "post operative", "incision",
        "laparoscopic", "surgical repair"
    ]
}

# One case-insensitive pattern per specialty, rebuilt whenever this cell runs.
# The leading \b anchors each keyword to the start of a word: plain substring
# matching fired on "stent" inside "consistent", "tia" inside "initial" and
# "cast" inside "forecast". Suffixes are still allowed, so "stents",
# "fractures" or "tibial" keep matching.
_RELABEL_PATTERNS = {
    specialty: re.compile(
        r"\b(?:" + "|".join(re.escape(keyword) for keyword in keywords) + ")",
        re.IGNORECASE,
    )
    for specialty, keywords in RELABEL_RULES.items()
    if keywords  # an empty alternation would match every text
}


def relabel_specialty(text, current_label):
    """Return the specialty implied by keywords in ``text``, else ``current_label``.

    Args:
        text: Free text of the note. Matching is case-insensitive and each
            keyword must begin at a word boundary.
        current_label: Label to fall back to when no rule matches.

    Returns:
        The first specialty in ``RELABEL_RULES`` (in dict order) that has a
        keyword in ``text``; ``current_label`` if none match or if ``text``
        is not a string (e.g. a missing value).
    """
    if not isinstance(text, str):
        return current_label
    for specialty, pattern in _RELABEL_PATTERNS.items():
        if pattern.search(text):
            return specialty
    return current_label

# Run the relabeling rules over every note, pairing each combined text with
# the label it currently carries.
df['specialty_refined'] = [
    relabel_specialty(note_text, original_label)
    for note_text, original_label in zip(df['combined_text'], df['medical_specialty'])
]

# Distribution of the refined labels
df["specialty_refined"].value_counts()


In [None]:
# Combine fine-grained specialties into broader categories.
# Each key is a consolidated category; its tuple lists the labels folded into it.
_SPECIALTY_GROUPS = {
    'Surgery': (
        'Cosmetic / Plastic Surgery',
        'Neurosurgery',
        'Surgery',
        'ENT - Otolaryngology',
    ),
    'Orthopedics': (
        'Orthopedic',
        'Podiatry',
        'Physical Medicine - Rehab',
        'Chiropractic',
        'Rheumatology',
    ),
    'Cardiovascular/Pulmonary': (
        'Cardiovascular / Pulmonary',
    ),
    'Gastroenterology': (
        'Gastroenterology',
        'Bariatrics',
    ),
    'Neurology': (
        'Neurology',
        'Psychiatry / Psychology',
        'Pain Management',
        'Sleep Medicine',
    ),
    "Women/Men's Reproductive Health": (
        'Obstetrics / Gynecology',
        'Urology',
    ),
    'Kidney & Blood/Oncology': (
        'Hematology - Oncology',
        'Nephrology',
    ),
    'Radiology & Diagnostics': (
        'Radiology',
        'Lab Medicine - Pathology',
    ),
    'General Medicine': (
        'General Medicine',
        'Consult - History and Phy.',
        'SOAP / Chart / Progress Notes',
        'Discharge Summary',
        'Office Notes',
        'Letters',
        'Hospice - Palliative Care',
        'IME-QME-Work Comp etc.',
        'Emergency Room Reports',
    ),
    'Other Specialties': (
        'Ophthalmology',
        'Dermatology',
        'Pediatrics - Neonatal',
        'Dentistry',
        'Speech - Language',
        'Endocrinology',
        'Diets and Nutritions',
        'Allergy / Immunology',
    ),
}

# Flat lookup: original label -> consolidated category
mapping = {
    original: category
    for category, originals in _SPECIALTY_GROUPS.items()
    for original in originals
}

In [None]:
# Normalise whitespace on the labels that are actually looked up.
# Rows the keyword rules did not relabel still carry the raw
# `medical_specialty` text in `specialty_refined`; stripping only
# `medical_specialty` at this point would leave padded labels that miss
# every key in `mapping` and end up as separate classes.
df['medical_specialty'] = df['medical_specialty'].str.strip()
refined = df['specialty_refined'].str.strip()

df['specialty_final'] = refined.map(mapping)

# Keep the (stripped) refined label when it has no entry in `mapping`
df['specialty_final'] = df['specialty_final'].fillna(refined)

# Check results
print(df['specialty_final'].value_counts())

In [None]:
# Relabeling can leave categories with fewer than two samples; fold those
# into 'General Medicine' (presumably so the stratified split further down
# has enough members per class).
counts = df['specialty_final'].value_counts()
rare_labels = counts.loc[counts.lt(2)].index

is_rare = df['specialty_final'].isin(rare_labels)
df['specialty_final'] = df['specialty_final'].mask(is_rare, 'General Medicine')

# Final distribution, smallest classes first
df['specialty_final'].value_counts().sort_values()

In [None]:
#split into train and test sets
from sklearn.model_selection import train_test_split

# Features are the combined note text; targets are the consolidated specialties
X, y = df['combined_text'], df['specialty_final']

# 80/20 split, stratified on the label, with a fixed seed.
# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Quick sanity check of the resulting shapes
print('Shapes ->', *(part.shape for part in split_parts))


In [None]:
#TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: at most 10,000 features, 1- to 3-grams, English stop words removed
tfidf_params = {
    'max_features': 10000,
    'ngram_range': (1, 3),
    'stop_words': 'english',
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on the training texts only, then apply the fitted vectorizer to the test texts
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
#BERT model training
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Encode labels as integers, because BERT cannot handle string labels
label_encoder = LabelEncoder().fit(df['specialty_final'])
df['label'] = label_encoder.transform(df['specialty_final'])

num_labels = len(label_encoder.classes_)
print(num_labels, label_encoder.classes_)

# Tokenize both splits with identical settings (truncate/pad, 512-token cap)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenizer_kwargs = {'truncation': True, 'padding': True, 'max_length': 512}

train_encodings = tokenizer(list(X_train), **tokenizer_kwargs)
test_encodings = tokenizer(list(X_test), **tokenizer_kwargs)

# Map the string labels of each split to their encoded ids
y_train_encoded, y_test_encoded = (
    label_encoder.transform(split_labels) for split_labels in (y_train, y_test)
)

In [None]:
#create pytorch dataset
class MedicalDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with integer labels, one sample per index.

    ``encodings`` is any mapping of field name -> indexable of per-sample
    values; ``labels`` is an indexable of the same length holding values
    convertible with ``int()``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # Dataset size is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the 'labels' key
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample


# Wrap each split's encodings and encoded labels in a MedicalDataset
train_dataset, test_dataset = (
    MedicalDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train_encoded),
        (test_encodings, y_test_encoded),
    )
)

In [None]:
# Load bert-base-uncased with a classification head sized to the number of label classes
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

#define metrics
def compute_metrics(pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    The predicted class of each row is the argmax of its logits along the
    last axis. Returns a dict with the single key ``'accuracy'``.
    """
    logits, labels = pred
    predicted_classes = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predicted_classes)}

In [None]:
# Training arguments for the BERT fine-tuning run
training_config = {
    'output_dir': './results',
    'num_train_epochs': 3,
    'per_device_train_batch_size': 8,   # Reduce if out of memory
    'per_device_eval_batch_size': 8,
    # 'warmup_steps': 500,  (left disabled, as before)
    'weight_decay': 0.01,
    # Evaluate and checkpoint once per epoch
    'eval_strategy': 'epoch',
    'save_strategy': 'epoch',
    'learning_rate': 2e-5,
    # After training, reload the checkpoint with the best 'eval_accuracy'
    'load_best_model_at_end': True,
    'metric_for_best_model': 'eval_accuracy',
}
training_args = TrainingArguments(**training_config)

In [None]:
# Assemble the Trainer.
# NOTE(review): the test split is used as eval_dataset, and training_args
# selects the best checkpoint on eval accuracy, so checkpoint selection sees
# the test data. Consider a separate validation split.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Fine-tune the BERT model
train_output = trainer.train()

# Evaluate on the Trainer's eval_dataset; the metrics are left as the final
# expression so the notebook displays them.
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
# Save the fine-tuned BERT model and its tokenizer side by side in one
# directory.
bert_save_path = "./bert_model"

# Writes the Trainer's model to bert_save_path
trainer.save_model(bert_save_path)
# Writes the tokenizer to the same directory; as the cell's last expression,
# its return value is what the notebook displays.
tokenizer.save_pretrained(bert_save_path)

In [None]:
#Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# NOTE: this rebinds `model`, which until now named the BERT network; the
# Trainer was built from that object earlier and keeps its own reference.
logreg_params = {
    'max_iter': 1000,
    'class_weight': 'balanced',
    'random_state': 42,
}
model = LogisticRegression(**logreg_params)

# Train on the TF-IDF features using the string labels
model.fit(X_train_tfidf, y_train)
print("Model training complete.")


In [None]:
#save logreg results
import pickle
# Everything needed to reuse the classifier: the fitted model, the fitted
# vectorizer and the label encoder.
logreg_artifacts = {
    "model": model,
    "vectorizer": vectorizer,
    "label_encoder": label_encoder,
}

with open("logreg.pkl", "wb") as f:
    pickle.dump(logreg_artifacts, f)


In [None]:
#ensemble predictions
from scipy.special import softmax

# BERT predictions: turn the raw logits into per-class probabilities
bert_preds = trainer.predict(test_dataset)
bert_probs = softmax(bert_preds.predictions, axis=1)

# LogReg predictions, in the column order of `model.classes_`
logreg_raw_probs = model.predict_proba(X_test_tfidf)

# Align the class order between the two models before averaging.
# BERT was trained on ids produced by `label_encoder`, so its columns follow
# `label_encoder.classes_`; LogisticRegression's columns follow
# `model.classes_` (the labels present in y_train). Scatter the LogReg columns
# into the encoder's ordering; a class absent from y_train keeps probability 0
# instead of causing a shape mismatch.
logreg_probs = np.zeros((logreg_raw_probs.shape[0], len(label_encoder.classes_)))
logreg_probs[:, label_encoder.transform(model.classes_)] = logreg_raw_probs

# Weighted average of the two probability matrices
alpha = 0.6  # trust BERT slightly more
ensemble_probs = alpha * bert_probs + (1 - alpha) * logreg_probs
ensemble_preds = label_encoder.inverse_transform(np.argmax(ensemble_probs, axis=1))


In [None]:
# Evaluate ensemble
from sklearn.metrics import classification_report, accuracy_score
# Score the ensemble's string predictions against the true test labels
accuracy_ensemble = accuracy_score(y_test, ensemble_preds)

banner = "="*50
print("\n" + banner)
print("ENSEMBLE MODEL RESULTS")
print(banner)
print(classification_report(y_test, ensemble_preds))
print(f"Accuracy: {accuracy_ensemble:.4f}")

In [None]:
import pickle

# Bundle the fitted model, vectorizer and label encoder into a single pickle
classifier_bundle = {
    "model": model,
    "vectorizer": vectorizer,
    "label_encoder": label_encoder,
}

with open("specialty_classifier.pkl", "wb") as f:
    pickle.dump(classifier_bundle, f)
    print("Model saved as specialty_classifier.pkl")

In [None]:
# Run app.py as a shell command from the notebook.
# NOTE(review): `python` resolves via PATH and may not be the kernel's
# interpreter; presumably app.py loads the pickle saved earlier in this
# notebook — confirm.
!python app.py