In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras import optimizers, layers, models

from data_processor import preprocess_data, make_dataset
from data_transformer import encode_labels, get_vectorizer

**Data Exploration and Preprocessing**

In [2]:
# Sequence length: passed to get_vectorizer and used as the model's input shape
# (presumably the padded/truncated token count per transcription - confirm in
# data_transformer.get_vectorizer).
MAX_SEQ_LEN = 200
# Default output dimension of the Embedding layer in build_rnn_model.
EMBEDDING_DIM = 32
# Batch size handed to make_dataset for both the train and test datasets.
BATCH_SIZE = 32
# Fraction of the filtered data held out by train_test_split (test_size).
DS_SPLIT = 0.15
# Specialties with fewer samples than this are filtered out before splitting.
MIN_SPECIALITY_THRESHOLD = 100
# CSV file loaded with pd.read_csv; relative to the notebook's working directory.
DATASET_PATH = 'data/mtsamples.csv'

In [3]:
# Load the raw samples from disk into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH)
# Preview the first five rows; the bare expression renders as the cell output.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [4]:
# Keep only the label column and the free-text column used for classification.
# .copy() detaches the selection from `dataset`, so the cleaning step below
# never operates on a view of the raw frame.
sub_ds = dataset[['medical_specialty', 'transcription']].copy()
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
sub_ds.info()
# preprocess_data is a project helper; judging by its printed log it drops rows
# with missing values and duplicate rows (confirm in data_processor).
sub_ds = preprocess_data(sub_ds)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   medical_specialty  4999 non-null   object
 1   transcription      4966 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB
None
===== Null Summary =====
medical_specialty     0
transcription        33
dtype: int64
Dropping rows with missing values...
===== Duplicate Summary =====
Count: 2
Dropping duplicate rows...


In [5]:
# Count samples per specialty, then keep only the rows whose specialty has at
# least MIN_SPECIALITY_THRESHOLD samples.
speciality_count = sub_ds['medical_specialty'].value_counts()
frequent_specialities = speciality_count[speciality_count >= MIN_SPECIALITY_THRESHOLD].index
is_frequent = sub_ds['medical_specialty'].isin(frequent_specialities)
filtered_ds = sub_ds[is_frequent]
# Show the class distribution that remains after filtering.
filtered_ds['medical_specialty'].value_counts()

medical_specialty
Surgery                          1088
Consult - History and Phy.        516
Cardiovascular / Pulmonary        371
Orthopedic                        355
Radiology                         273
General Medicine                  259
Gastroenterology                  224
Neurology                         223
SOAP / Chart / Progress Notes     166
Urology                           156
Obstetrics / Gynecology           155
Discharge Summary                 108
Name: count, dtype: int64

In [6]:
# Stratify on the label so every specialty keeps the same proportion in both
# splits. The classes are heavily imbalanced (Surgery has ~10x the samples of
# Discharge Summary), so a purely random split can under- or over-represent the
# small classes in the held-out set.
train_df, test_df = train_test_split(
    filtered_ds,
    test_size=DS_SPLIT,
    random_state=42,  # fixed seed so the split is reproducible across runs
    stratify=filtered_ds.medical_specialty,
)
print(f"Train set size: {len(train_df)}, Test set size: {len(test_df)}")

Train set size: 3309, Test set size: 585


In [7]:
# Encode the specialty column of both splits as class ids using the project
# helper, which also returns the class count and the class names.
_encoded = encode_labels(train_df, test_df, 'medical_specialty')
train_y, test_y, num_classes, le_classes = _encoded
print(
    f"Number of classes: {num_classes}\nClasses: {le_classes}"
)

Number of classes: 12
Classes: [' Cardiovascular / Pulmonary' ' Consult - History and Phy.'
 ' Discharge Summary' ' Gastroenterology' ' General Medicine' ' Neurology'
 ' Obstetrics / Gynecology' ' Orthopedic' ' Radiology'
 ' SOAP / Chart / Progress Notes' ' Surgery' ' Urology']


In [None]:
# Raw transcription strings for each split.
train_texts = train_df.transcription.values
test_texts = test_df.transcription.values

# The vectorizer is built from the training transcriptions only.
vectorizer, vocab_size = get_vectorizer(train_texts, MAX_SEQ_LEN)

# Wrap both splits into batched datasets; shuffling is requested for the
# training set only.
train_ds = make_dataset(
    vectorizer,
    train_texts,
    train_y,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_ds = make_dataset(
    vectorizer,
    test_texts,
    test_y,
    batch_size=BATCH_SIZE,
)

**Build RNN model**

In [9]:
def build_rnn_model(rnn_layer, opt, lr=0.001, em_dim=EMBEDDING_DIM):
    """Assemble and compile an embedding -> recurrent layer -> softmax classifier.

    Reads the notebook globals MAX_SEQ_LEN, vocab_size and num_classes.

    Args:
        rnn_layer: Layer instance placed between the embedding and the output
            layer. It is inserted as-is, not copied.
        opt: Optimizer class (or factory), called as ``opt(learning_rate=lr)``.
        lr: Learning rate handed to the optimizer.
        em_dim: Output dimension of the embedding layer.

    Returns:
        The compiled Sequential model.
    """
    token_input = layers.Input(shape=(MAX_SEQ_LEN,))
    # mask_zero=True makes the embedding emit a mask for index 0 so downstream
    # layers can skip those positions.
    embedding = layers.Embedding(input_dim=vocab_size, output_dim=em_dim, mask_zero=True)
    classifier = layers.Dense(num_classes, activation='softmax')

    network = models.Sequential([token_input, embedding, rnn_layer, classifier])

    # The sparse loss expects integer class ids rather than one-hot targets.
    network.compile(
        optimizer=opt(learning_rate=lr),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [10]:
# Stop training once val_loss has not improved for 5 consecutive epochs and roll
# the model back to the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='min', restore_best_weights=True)

def train_and_eval(model, epochs=20, callable=None, val_ds=None):
    """Fit ``model`` on ``train_ds`` and report its loss/accuracy on ``test_ds``.

    Args:
        model: Compiled Keras model to train.
        epochs: Maximum number of training epochs.
        callable: Callbacks forwarded to ``model.fit``. ``None`` (the default)
            uses ``[early_stopping]``; pass ``[]`` to train without callbacks.
            The name shadows the ``callable`` builtin inside this function and
            is kept only so existing keyword calls keep working.
        val_ds: Dataset monitored during training. ``None`` (the default)
            falls back to ``test_ds``, which is the previous behaviour.

    Returns:
        The ``History`` object returned by ``model.fit``.

    Note:
        When ``val_ds`` is left at its default, early stopping and best-weight
        restoration are driven by the test set, so the printed test metrics are
        optimistically biased. Pass a separate validation dataset for an
        unbiased test score.
    """
    # Resolve defaults at call time instead of using a mutable list as the
    # default argument, which would be shared between calls.
    callbacks = [early_stopping] if callable is None else callable
    validation_data = test_ds if val_ds is None else val_ds

    history = model.fit(
        train_ds,
        validation_data=validation_data,
        epochs=epochs,
        callbacks=callbacks
    )

    test_loss, test_accuracy = model.evaluate(test_ds)
    print(f"Test Accuracy: {test_accuracy}, Test Loss: {test_loss}")
    return history

In [11]:
# Bidirectional wrapper around a 32-unit LSTM, used as the recurrent layer.
bilstm_layer = layers.Bidirectional(layers.LSTM(units=32))
lstm_model = build_rnn_model(
    rnn_layer=bilstm_layer,
    opt=optimizers.Adam,
)
# Print the layer-by-layer architecture.
lstm_model.summary()

In [12]:
# Train for at most 40 epochs; train_and_eval's default callbacks include early
# stopping, so the run may finish sooner.
lstm_history = train_and_eval(
    lstm_model,
    epochs=40,
)

Epoch 1/40
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 99ms/step - accuracy: 0.2666 - loss: 2.2968 - val_accuracy: 0.4068 - val_loss: 1.9017
Epoch 2/40
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 114ms/step - accuracy: 0.3896 - loss: 1.9073 - val_accuracy: 0.4085 - val_loss: 1.8717
Epoch 3/40
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 125ms/step - accuracy: 0.3775 - loss: 1.8982 - val_accuracy: 0.4274 - val_loss: 1.8297
Epoch 4/40
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 118ms/step - accuracy: 0.4211 - loss: 1.8017 - val_accuracy: 0.4427 - val_loss: 1.7956
Epoch 5/40
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 115ms/step - accuracy: 0.4230 - loss: 1.7731 - val_accuracy: 0.4393 - val_loss: 1.7642
Epoch 6/40
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 114ms/step - accuracy: 0.4320 - loss: 1.7491 - val_accuracy: 0.4479 - val_loss: 1.7254
Epoch 7/40


In [13]:
# def plot_history(history, target_metric):
#     title = target_metric[0].upper() + target_metric[1:]
#     plt.figure(figsize=(10, 6))
#     plt.plot(history.history[target_metric], label=f'Train {title}')
#     plt.plot(history.history[f'val_{target_metric}'], label=f'Test {title}')
#     plt.xlabel('Epochs')
#     plt.ylabel(title)
#     plt.title(f'{title} Trajectory')
#     plt.legend()
#     plt.show()

In [14]:
# print("LSTM Accuracy and Loss Trajectory")
# plot_history(lstm_history, 'accuracy')
# plot_history(lstm_history, 'loss')  

In [15]:
# def classify(model, text, verbose=False):
#     seq = vectorizer(tf.constant([text]))
#     probs = model.predict(seq)[0]
#     class_id = tf.argmax(probs).numpy()
#     prediction = le_classes[class_id]
#     if verbose:
#         print(f"Input Text: {text}")
#         print(f"Prediction: {prediction}")
#     return prediction

# sample_text_1 = ""
# classify(lstm_model, sample_text_1, verbose=True)