# Medical Specialty Classification

In [None]:
# TODO: Problem statement

# Edit all the Markdown cells below with the appropriate information 
# Run all cells, containing your code 
# Save this Jupyter with the outputs of your executed cells
#
# PS: Save again the notebook with this outcome.
# PSPS: Don't forget to include the dataset in your submission

**Team 3:**
* Balachander Srinivasan
* Christopher Umbel
* Mahfuzur Rahman

**Course:** AI 574 – Natural Language Processing (Summer, 2025)

### Problem Statement
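* This project classifies anonymized medical transcription reports by medical specialty. Three transformer models (Distil-BioBERT, DistilBERT, and RoBERTa) are fine-tuned on the transcription text and compared.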
    
    
    
* **Keywords:** medical specialty classification, medical transcriptions, text classification, transformers, natural language processing
	

### Data Collection
* **Source(url):** https://www.kaggle.com/datasets/tboyle10/medicaltranscriptions/data
* **Short Description:** The data set of anonymized medical transcription reports from Boyle (2018)

* **Keywords:** description, medical_specialty, sample_name, transcription, and keywords

### Required packages

Run the following command to install the required packages:
```
pip install -r ./requirements.txt

```

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud

from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
import torch

from data_utils import DataUtils

In [None]:
import warnings
# Silence every warning for the rest of the session so library notices do not
# clutter the cell outputs.
# NOTE(review): this also hides pandas / transformers warnings that may point
# at real problems; consider narrowing the filter while developing.
warnings.filterwarnings("ignore")

### Data Exploration

In [None]:
# Maximum number of tokens per example passed to the tokenizers (longer texts are truncated).
MAX_SEQ_LEN = 256
# Per-device batch size used for both training and evaluation.
BATCH_SIZE = 16
# Fraction of the data held out as the test split.
DS_SPLIT = 0.2
# Minimum number of samples a medical specialty needs in order to be kept as its own class.
MIN_SPECIALITY_THRESHOLD = 100
# Location of the transcription CSV, relative to the notebook.
DATASET_PATH = 'data/mtsamples.csv'

In [None]:
# Load the raw dataset, using the first CSV column as the index.
df = pd.read_csv(DATASET_PATH, index_col=0)
# Print column dtypes / non-null counts, then display 10 random rows.
df.info()
df.sample(10)

In [None]:
# Project helper object (imported from data_utils) reused by the cells below.
utils = DataUtils()
# Show the distribution of the raw specialty labels (show_plot=True); the
# returned value is not needed here.
_ = utils.class_distribution(df.medical_specialty, 'Medical Specialty', show_plot=True)

In [None]:
# Clean every transcription; missing ones are treated as empty strings.
texts = df['transcription'].fillna('').map(utils.clean_text)

# Whitespace tokenisation: str.split() with no arguments never yields empty tokens.
tok_freq = Counter()
for row in texts:
    tok_freq.update(row.split())

print("\nMost common tokens:")
for tok, freq in tok_freq.most_common(10):
    print(f"Token: '{tok}', Frequency: {freq}")

# Word cloud over the whole cleaned corpus.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(texts))
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("Word Cloud of Transcriptions")
plt.show()

In [None]:
# Size of each cleaned transcription: character count and whitespace-delimited word count.
stats = pd.DataFrame({
    'length': texts.map(len),
    'word_cnt': texts.map(lambda text: len(text.split())),
})
print(f"\n{stats.describe()}")
stats.hist(bins=50)

### Methodology

1. Explain your Deep Learning process / methodology



2. Introduce the Deep Neural Networks you used in your project
 * Model 1
    * Description 
 
 * Model 2
    * Description
 
 * Ensemble method
     * Description 
 
 
3. Add keywords  
**Keywords:** natural language processing, sentiment analysis, clustering, binary classification, multi-label classification, prediction
	___
 **Example**
* ConvNet
    * A convolutional neural network (CNN, or ConvNet) is a class of deep neural networks, most commonly applied to analyzing visual imagery(source Wikipedia). 
 
* **Keywords:** supervised learning, classification, ...

### Model 1: Distil-BioBERT

#### Data Preprocessing

* Enumerate and present the main steps you performed in the data preprocessing
* Add your code and interpret the outcome of main steps/functions


In [None]:
MODEL_NAME = 'nlpie/distil-biobert'

# Reload only the label and text columns, then let the project helpers deal
# with missing values and duplicate rows.
df = utils.handle_duplicates(
    utils.handle_nulls(
        pd.read_csv(DATASET_PATH, usecols=['medical_specialty', 'transcription'])
    )
)

# Add the cleaned text column and strip surrounding whitespace from the labels.
df = df.assign(
    text=df['transcription'].map(utils.clean_text),
    medical_specialty=df['medical_specialty'].str.strip(),
)

In [None]:
# Specialties dropped entirely for Model 1.
excluded_specialities = ['Surgery', 'Consult - History and Phy.', 'Radiology']

# Specialties folded into a combined class: original label -> merged label.
merged_specialities = {
    'Neurology': 'Neurology / Neurosurgery',
    'Neurosurgery': 'Neurology / Neurosurgery',
    'General Medicine': 'General Medicine / SOAP / Chart / Progress Notes',
    'SOAP / Chart / Progress Notes': 'General Medicine / SOAP / Chart / Progress Notes',
}

# .copy() gives an independent frame, so the column assignment below does not
# write into a filtered slice of `df` (chained assignment).
modified_df = df[~df['medical_specialty'].isin(excluded_specialities)].copy()
modified_df['medical_specialty'] = modified_df['medical_specialty'].map(
    lambda name: merged_specialities.get(name, name)
)

# Keep only the classes with at least MIN_SPECIALITY_THRESHOLD samples.
speciality_count = utils.class_distribution(modified_df.medical_specialty, verbose=False)
frequent_specialities = speciality_count[speciality_count >= MIN_SPECIALITY_THRESHOLD].index
modified_df = modified_df[modified_df.medical_specialty.isin(frequent_specialities)].copy()
_ = utils.class_distribution(modified_df.medical_specialty, 'Medical Specialty', show_plot=True)

In [None]:
def encode_label(df):
    """Add an integer ``label`` column that encodes ``medical_specialty``.

    The input frame is left untouched; a copy carrying the new column is
    returned. The mapping is also printed, one ``id: specialty`` line per class.

    Args:
        df: DataFrame with a ``medical_specialty`` column.

    Returns:
        tuple: ``(encoded_df, uniques)`` where ``encoded_df`` is a copy of
        ``df`` with the extra ``label`` column and ``uniques[i]`` is the
        specialty encoded as label ``i`` (order of first appearance).
    """
    # Work on a copy so a filtered slice passed in by the caller is never written to.
    encoded = df.copy()
    encoded['label'], uniques = pd.factorize(encoded['medical_specialty'])
    print("Unique medical specialties:")
    for i, label in enumerate(uniques):
        print(f"{i}: {label}")
    return encoded, uniques

# Encode the specialties as integer labels; `uniques` maps label id -> specialty name.
modified_df, uniques = encode_label(modified_df)
# Show 5 rows (fixed seed) to check the encoding.
modified_df.sample(5, random_state=42)

In [None]:
def resample_data(df):
    """Balance the classes of ``df`` by random oversampling.

    Args:
        df: DataFrame with ``text`` and ``label`` columns.

    Returns:
        DataFrame with ``text`` and ``label`` columns produced by
        ``RandomOverSampler`` (seed 42). Its shape is printed.
    """
    sampler = RandomOverSampler(random_state=42)
    features, targets = sampler.fit_resample(df[['text']], df['label'])

    balanced = features.assign(label=targets)
    print(f"Resampled Shape: {balanced.shape}")
    return balanced

# Split BEFORE oversampling. Oversampling first duplicates rows and the split
# then places copies of the same transcription in both train and test, which
# leaks test data into training and inflates the reported scores.
train_df, test_df = train_test_split(
    modified_df[['text', 'label']],
    test_size=DS_SPLIT,
    stratify=modified_df['label'],
    random_state=42
)
test_df = test_df.reset_index(drop=True)

# Balance only the training portion; the test set keeps its original class distribution.
df_resampled = resample_data(train_df)
train_df = df_resampled
print(f"Train Shape: {train_df.shape}, Test Shape: {test_df.shape}")

In [None]:
def tokenize_data(tokenizer, train_df, test_df):
    """Turn the train/test frames into tokenized datasets in torch format.

    Each frame is converted to a ``Dataset``; its ``text`` column is tokenized
    in batches (padded to ``max_length`` and truncated at ``MAX_SEQ_LEN``), and
    only ``input_ids``, ``attention_mask`` and ``label`` are exposed as tensors.

    Args:
        tokenizer: tokenizer applied to the ``text`` column.
        train_df: training DataFrame with ``text`` and ``label`` columns.
        test_df: test DataFrame with ``text`` and ``label`` columns.

    Returns:
        tuple: ``(train_ds, test_ds)``.
    """
    tensor_columns = ["input_ids", "attention_mask", "label"]

    def encode_batch(batch):
        return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=MAX_SEQ_LEN)

    prepared = []
    for frame in (train_df, test_df):
        ds = Dataset.from_pandas(frame).map(encode_batch, batched=True)
        ds.set_format("torch", columns=tensor_columns)
        prepared.append(ds)

    train_ds, test_ds = prepared
    return train_ds, test_ds

# Load the tokenizer that belongs to MODEL_NAME and tokenize both splits.
distilbiobert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_ds, test_ds = tokenize_data(distilbiobert_tokenizer, train_df, test_df)

#### Model Fitting and Validation

1. model 1 
    - description 
2. model 2
    - description 

In [None]:
# Two-way mapping between integer class ids and specialty names, stored in the model config.
id2label = dict(enumerate(uniques))
label2id = {name: class_id for class_id, name in id2label.items()}

data_collator = DataCollatorWithPadding(tokenizer=distilbiobert_tokenizer)

# Pretrained encoder with a classification head sized to the number of specialties.
distilbiobert_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(uniques),
    id2label=id2label,
    label2id=label2id
)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass for the ``Trainer``.

    Args:
        eval_pred: pair ``(logits, labels)``. The predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        dict with ``accuracy`` and ``f1_weighted`` (weighted-average F1).
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1_weighted"] = f1_score(references, predicted, average="weighted")
    return metrics

In [None]:
# Fine-tuning configuration for Distil-BioBERT: evaluate and checkpoint once per
# epoch and, when training ends, reload the checkpoint with the best f1_weighted.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the version pinned in requirements.txt.
# NOTE(review): intent of `logging_steps=-1` is unclear from here — confirm.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=10,
    weight_decay=0.01,
    report_to="none",
    logging_dir=None,
    logging_steps=-1
)

# NOTE(review): test_ds serves both for early stopping / best-checkpoint selection
# and for the final evaluation below, so the reported scores are not from fully
# unseen data; a separate validation split would avoid this.
trainer = Trainer(
    model=distilbiobert_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    # Stop when the monitored metric has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

#### Model Evaluation 

* Examine your models (coefficients, parameters, errors, etc...)

* Compute and interpret your results in terms of accuracy, precision, recall, ROC etc. 

In [None]:
# Evaluate on the Trainer's eval dataset and print every metric with 4 decimals.
eval_results = trainer.evaluate()
print("Evaluation results:")
for metric_name in eval_results:
    print(f"{metric_name}: {eval_results[metric_name]:.4f}")

In [None]:
predictions = trainer.predict(test_ds)
y_true = predictions.label_ids
# Predicted class = index of the largest logit in each row.
y_pred = np.argmax(predictions.predictions, axis=1)

# Class names in id2label order, used as axis labels for the confusion matrix.
classes = [id2label[class_id] for class_id in id2label]
cm = confusion_matrix(y_true, y_pred)
utils.plot_confusion_matrix(cm, classes)

In [None]:
def make_predictions(tokenizer, model, test_df):
    """Classify one randomly drawn test sample per class.

    For every class id in ``range(len(uniques))`` one row of ``test_df`` with
    that label is drawn (fixed seed 21) and classified by ``model``.

    Args:
        tokenizer: tokenizer matching ``model``.
        model: fine-tuned sequence-classification model.
        test_df: DataFrame with ``text`` and ``label`` columns; it must contain
            at least one row for every class id.

    Returns:
        DataFrame with the sampled rows, ``label`` renamed to ``true_label``
        and a new ``predicted_label`` column holding the predicted class id.
    """
    def classify(text):
        tokens = tokenizer.encode(text, return_tensors='pt', truncation=True, padding=True, max_length=MAX_SEQ_LEN)
        tokens = tokens.to(model.device)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            result = model(tokens)
        return int(torch.argmax(result.logits))

    # Make sure dropout is disabled so the predictions are deterministic.
    model.eval()

    # Draw one sample per class and concatenate once, instead of growing a
    # frame inside the loop starting from an empty DataFrame.
    samples = pd.concat(
        [test_df[test_df.label == i].sample(1, random_state=21) for i in range(len(uniques))]
    )

    samples = samples.reset_index(drop=True)
    samples = samples.rename(columns={'label': 'true_label'})
    samples['predicted_label'] = samples['text'].apply(classify)
    return samples

# Spot-check the fine-tuned Distil-BioBERT model on one test sample per class.
make_predictions(distilbiobert_tokenizer, distilbiobert_model, test_df)

### Model 2: DistilBERT

#### Data Preprocessing

* Enumerate and present the main steps you performed in the data preprocessing
* Add your code and interpret the outcome of main steps/functions


In [None]:
MODEL_NAME = 'distilbert-base-uncased'

# Reuses `df` as prepared in the Model 1 preprocessing cell (cleaned `text`
# column, stripped labels). Unlike Model 1, no specialties are excluded or
# merged here; only the minimum-sample threshold is applied.
speciality_count = utils.class_distribution(df.medical_specialty, verbose=False)
frequent_specialities = speciality_count[speciality_count >= MIN_SPECIALITY_THRESHOLD].index
# .copy() gives an independent frame, so adding the label column later does not
# write into a filtered slice.
df = df[df.medical_specialty.isin(frequent_specialities)].copy()
_ = utils.class_distribution(df.medical_specialty, 'Medical Specialty', show_plot=True)

In [None]:
# Encode the specialties as integer labels; `uniques` maps label id -> specialty name.
df, uniques = encode_label(df)
# Show 5 rows (fixed seed) to check the encoding.
df.sample(5, random_state=42)

In [None]:
# Split BEFORE oversampling. Oversampling first duplicates rows and the split
# then places copies of the same transcription in both train and test, which
# leaks test data into training and inflates the reported scores.
train_df, test_df = train_test_split(
    df[['text', 'label']],
    test_size=DS_SPLIT,
    stratify=df['label'],
    random_state=42
)
test_df = test_df.reset_index(drop=True)

# Balance only the training portion; the test set keeps its original class distribution.
df_resampled = resample_data(train_df)
train_df = df_resampled
print(f"Train Shape: {train_df.shape}, Test Shape: {test_df.shape}")

distilbert_tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
train_ds, test_ds = tokenize_data(distilbert_tokenizer, train_df, test_df)

#### Model Fitting and Validation

1. model 1 
    - description 
2. model 2
    - description 

In [None]:
# Two-way mapping between integer class ids and specialty names, stored in the model config.
id2label = dict(enumerate(uniques))
label2id = {name: class_id for class_id, name in id2label.items()}

# Pretrained DistilBERT with a classification head sized to the number of specialties.
distilbert_model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(uniques),
    id2label=id2label,
    label2id=label2id,
)

data_collator = DataCollatorWithPadding(tokenizer=distilbert_tokenizer)

In [None]:
# Fine-tuning configuration for DistilBERT: evaluate and checkpoint once per
# epoch and, when training ends, reload the checkpoint with the best f1_weighted.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the version pinned in requirements.txt.
# NOTE(review): output_dir is the same "./results" used for Model 1, so both runs
# write their checkpoints to one directory.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=10,
    weight_decay=0.01,
    report_to="none",
    logging_dir=None,
    logging_steps=-1
)

# NOTE(review): test_ds serves both for early stopping / best-checkpoint selection
# and for the final evaluation below; a separate validation split would avoid this.
trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    # Stop when the monitored metric has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

#### Model Evaluation 

* Examine your models (coefficients, parameters, errors, etc...)

* Compute and interpret your results in terms of accuracy, precision, recall, ROC etc. 

In [None]:
# Evaluate on the Trainer's eval dataset and print every metric with 4 decimals.
eval_results = trainer.evaluate()
print("Evaluation results:")
for metric_name in eval_results:
    print(f"{metric_name}: {eval_results[metric_name]:.4f}")

In [None]:
predictions = trainer.predict(test_ds)
y_true = predictions.label_ids
# Predicted class = index of the largest logit in each row.
y_pred = np.argmax(predictions.predictions, axis=1)

# Class names in id2label order, used as axis labels for the confusion matrix.
classes = [id2label[class_id] for class_id in id2label]
cm = confusion_matrix(y_true, y_pred)
utils.plot_confusion_matrix(cm, classes)

# Spot-check the fine-tuned DistilBERT model on one test sample per class.
make_predictions(distilbert_tokenizer, distilbert_model, test_df)

### Model 3: RoBERTa

#### Data Preprocessing

* Enumerate and present the main steps you preformed in the data preprocessing
* Add your code and interpret the outcome of main steps/functions


In [None]:
# Reload the raw CSV, this time including the `keywords` column. No clean_text
# step is applied in the Model 3 cells, so RoBERTa sees the text as loaded.
df = pd.read_csv(DATASET_PATH, usecols=['keywords', 'transcription', 'medical_specialty'])    

# Fresh helper instance, then the same null / duplicate handling as for the other models.
utils = DataUtils()
df = utils.handle_nulls(df)
df = utils.handle_duplicates(df)

# Strip surrounding whitespace from the specialty labels.
df['medical_specialty'] = df['medical_specialty'].str.strip()

In [None]:
# Specialties with fewer than MIN_SPECIALITY_THRESHOLD samples are pooled into
# a single 'other' class instead of being dropped.
speciality_count = utils.class_distribution(df.medical_specialty, verbose=False)
frequent_specialities = speciality_count[speciality_count >= MIN_SPECIALITY_THRESHOLD].index
is_frequent = df['medical_specialty'].isin(frequent_specialities)
df['medical_specialty'] = df['medical_specialty'].where(is_frequent, 'other')
_ = utils.class_distribution(df.medical_specialty, 'Medical Specialty', show_plot=True)

In [None]:
# Encode the specialties (including 'other') as integer labels; `uniques` maps
# label id -> specialty name.
df, uniques = encode_label(df)
# Show 5 rows (fixed seed) to check the encoding.
df.sample(5, random_state=42)

In [None]:
# Model input = keywords followed by the transcription. The two are joined with
# a space so the last keyword is not fused with the first transcription word.
# NOTE(review): confirm that `keywords` does not contain the specialty name
# itself — if it does, the label leaks into the input text.
dataset = pd.DataFrame({
    'text': df['keywords'] + ' ' + df['transcription'],
    'label': df['label']
})

# Stratified hold-out split (no oversampling is applied for this model).
train_df, test_df = train_test_split(dataset, test_size=DS_SPLIT, stratify=dataset['label'], random_state=42)
print(f"Train Shape: {train_df.shape}, Test Shape: {test_df.shape}")

#### Model Fitting and Validation

1. model 1 
    - description 
2. model 2
    - description 

In [None]:
MODEL_TYPE = 'roberta'
MODEL_NAME = 'roberta-base'

def compute_metrics(pred, actual):
    """Score predicted class ids against the true class ids.

    Note: this rebinds the notebook-level name ``compute_metrics`` that the
    earlier Trainer-based cells defined with a different signature.

    Args:
        pred: predicted class ids.
        actual: true class ids.

    Returns:
        dict with ``accuracy`` (computed as *balanced* accuracy) and
        ``f1_weighted`` (weighted-average F1).
    """
    scores = {}
    scores["accuracy"] = balanced_accuracy_score(actual, pred)
    scores["f1_weighted"] = f1_score(actual, pred, average="weighted")
    return scores

In [None]:
cuda_available = torch.cuda.is_available()
num_classes = len(uniques)

model_args = ClassificationArgs(
    num_train_epochs=10,
    learning_rate=1e-5,
    reprocess_input_data=True,
    save_model_every_epoch=False,
    # Do not write a checkpoint at every evaluation either.
    save_eval_checkpoints=False,
    overwrite_output_dir=True,
    # Early stopping is driven by evaluation results, so evaluation during
    # training has to be switched on; otherwise eval_df is never used and
    # training always runs for the full number of epochs.
    evaluate_during_training=True,
    use_early_stopping=True,
    early_stopping_patience=2,
    early_stopping_metric="mcc",
    # MCC is higher-is-better; the library default is to minimize the metric.
    early_stopping_metric_minimize=False,
    early_stopping_delta=0.005
)

roberta_model = ClassificationModel(
    MODEL_TYPE,
    MODEL_NAME,
    num_labels=num_classes,
    # Uniform class weights (every class weighted 1).
    weight=[1]*num_classes,
    use_cuda=cuda_available,
    args=model_args
)

# Extra metrics are invoked by the library as func(true_labels, predictions) and
# should return a scalar, so the sklearn scorers are passed directly in that
# argument order rather than the (pred, actual) helper defined above.
roberta_model.train_model(
    train_df,
    eval_df=test_df,
    balanced_accuracy=balanced_accuracy_score,
    f1_weighted=lambda labels, preds: f1_score(labels, preds, average="weighted")
)

#### Model Evaluation 

* Examine your models (coefficients, parameters, errors, etc...)

* Compute and interpret your results in terms of accuracy, precision, recall, ROC etc. 

In [None]:
# Library-side evaluation (provides MCC and eval loss) ...
eval_result, _, _ = roberta_model.eval_model(test_df)

# ... plus our own metrics computed from explicit predictions on the test texts.
test_texts = test_df['text'].values.tolist()
y_pred, _ = roberta_model.predict(test_texts)
pred_result = compute_metrics(y_pred, test_df['label'])

mcc, eval_loss = eval_result['mcc'], eval_result['eval_loss']
print(f"MCC: {mcc}, Eval_Loss: {eval_loss}")
accuracy, f1_weighted = pred_result['accuracy'], pred_result['f1_weighted']
print(f"Accuracy: {accuracy}, F1 Score: {f1_weighted}")

In [None]:
# Class names in label-id order, used as axis labels for the confusion matrix.
classes = list(uniques)
cm = confusion_matrix(test_df['label'], y_pred)
utils.plot_confusion_matrix(cm, classes)

In [None]:
# Draw one test sample per class (fixed seed) and concatenate once, instead of
# growing a frame inside a loop starting from an empty DataFrame.
samples = pd.concat(
    [test_df[test_df.label == i].sample(1, random_state=21) for i in range(len(uniques))]
)

samples = samples.reset_index(drop=True)
samples = samples.rename(columns={'label': 'true_label'})
# predict() returns (predictions, raw model outputs); only the predictions are kept.
samples['predicted_label'], _ = roberta_model.predict(samples['text'].tolist())
samples

### Issues / Improvements
1. Dataset is very small
2. Use regularization / initialization
3. Use cross-validation
4. ...

###  References
   * Rajapakse, T. C., Yates, A., & de Rijke, M. (2024). Simple Transformers: Open-source for all. In *Proceedings of the 2024 Annual International ACM SIGIR Conference on Research and Development in Information Retrieval in the Asia Pacific Region* (SIGIR-AP 2024, pp. 209–215). Association for Computing Machinery. [https://doi.org/10.1145/3673791.3698412](https://doi.org/10.1145/3673791.3698412)
	

### Credits

- If you use and/or adapt your code from existing projects, you must provide links and acknowledge the authors. Keep in mind that all documents in your projects and code will be checked against the official plagiarism detection tool used by Penn State ([Turnitin](https://turnitin.psu.edu))

> *This code is based on .... (if any)*

In [None]:
# End of Project