In [1]:
%pip install seaborn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
%conda install -c conda-forge wordcloud

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - wordcloud


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2022.6.15  |       ha878542_0         149 KB  conda-forge
    certifi-2022.6.15          |   py38h578d9bd_0         155 KB  conda-forge
    conda-4.13.0               |   py38h578d9bd_1         994 KB  conda-forge
    cycler-0.11.0              |     pyhd8ed1ab_0          10 KB  conda-forge
    freetype-2.10.4            |       h0708190_1         890 KB  conda-forge
    jpeg-9e                    |       h166bdaf_1         268 KB  conda-forge
    kiwisolver-1.4.3           |   py38h43d8883_0          75 KB  conda-forge
    lcms2-2.12                 |       hddcbb42_0         443 KB  conda-forge
    lerc-3.0                   |       h9c3ff4c_

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
import warnings
from wordcloud import WordCloud
warnings.filterwarnings("ignore")

In [4]:
import wandb

In [5]:
# Authenticate this kernel with Weights & Biases before any run is created.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
# Load the pre-processed transcriptions. The CSV was saved with its index, which
# pandas would otherwise read back as a spurious "Unnamed: 0" column; restrict
# the load to the two columns this notebook actually uses.
df = pd.read_csv(
    '../dataset/processed/processed_transcriptions.csv',
    usecols=["medical_specialty", "transcription"],
)
df.head()

Unnamed: 0,medical_specialty,transcription
0,allergy / immunology,subjective 23 year old white female presents c...
1,bariatrics,past medical history difficulty climbing stair...
2,bariatrics,history present illness seen abc today pleasan...
3,cardiovascular / pulmonary,2 mode 1 left atrial enlargement left atrial d...
4,cardiovascular / pulmonary,1 left ventricular cavity size wall thickness ...


In [9]:
# List every distinct specialty label present before any class filtering.
print(df.medical_specialty.unique().tolist())

['allergy / immunology', 'bariatrics', 'cardiovascular / pulmonary', 'neurology', 'dentistry', 'urology', 'general medicine', 'surgery', 'speech language', 'soap / chart / progress notes', 'sleep medicine', 'rheumatology', 'radiology', 'psychiatry / psychology', 'podiatry', 'physical medicine rehab', 'pediatrics neonatal', 'pain management', 'orthopedic', 'ophthalmology', 'office notes', 'obstetrics / gynecology', 'neurosurgery', 'nephrology', 'letters', 'lab medicine pathology', 'ime qme work comp etc', 'hospice palliative care', 'hematology oncology', 'gastroenterology', 'ent otolaryngology', 'endocrinology', 'emergency room reports', 'discharge summary', 'diets nutritions', 'dermatology', 'cosmetic / plastic surgery', 'consult history phy', 'chiropractic', 'autopsy']


In [10]:
from collections import Counter

In [14]:
# Collect the under-represented classes: specialties with at most
# `max_rare_class_count` transcriptions. These are dropped from the dataset later.
max_rare_class_count = 80
class_counter = Counter(df.medical_specialty)
filtered_class_counter = {
    specialty: count
    for specialty, count in class_counter.items()
    if count <= max_rare_class_count
}
filtered_class_counter

{'allergy / immunology': 7,
 'bariatrics': 18,
 'dentistry': 27,
 'speech language': 9,
 'sleep medicine': 20,
 'rheumatology': 10,
 'psychiatry / psychology': 53,
 'podiatry': 47,
 'physical medicine rehab': 21,
 'pediatrics neonatal': 70,
 'pain management': 61,
 'office notes': 50,
 'letters': 23,
 'lab medicine pathology': 8,
 'ime qme work comp etc': 16,
 'hospice palliative care': 6,
 'endocrinology': 19,
 'emergency room reports': 75,
 'diets nutritions': 10,
 'dermatology': 29,
 'cosmetic / plastic surgery': 27,
 'chiropractic': 14,
 'autopsy': 8}

In [34]:
def merge_and_rebuild_classes(df, filtered_classes):
    """Return the rows of `df` whose specialty is not excluded.

    Despite the name, no classes are merged: rows are only filtered out.

    Args:
        df: DataFrame with a "medical_specialty" column.
        filtered_classes: iterable of specialty labels to drop. It is not
            modified by this call.

    Returns:
        A DataFrame without rows whose "medical_specialty" is in
        `filtered_classes` or is one of the three always-excluded classes
        listed below.
    """
    # Build a fresh list instead of extending the argument, so the caller's
    # list is left untouched and repeated calls do not keep growing it.
    excluded_classes = [
        *filtered_classes,
        'soap / chart / progress notes',
        'discharge summary',
        'consult history phy',
    ]
    return df[~df["medical_specialty"].isin(excluded_classes)]

In [35]:
# Drop every specialty collected in filtered_class_counter (plus the classes
# merge_and_rebuild_classes always excludes), then list the specialties that remain.
filtered_df = merge_and_rebuild_classes(df, list(filtered_class_counter.keys()))
filtered_df.medical_specialty.unique().tolist()

['cardiovascular / pulmonary',
 'neurology',
 'urology',
 'general medicine',
 'surgery',
 'radiology',
 'orthopedic',
 'ophthalmology',
 'obstetrics / gynecology',
 'neurosurgery',
 'nephrology',
 'hematology oncology',
 'gastroenterology',
 'ent otolaryngology']

In [36]:
# Feature and target
# X: transcription texts as a NumPy array.
# y: specialty labels, still a pandas Series carrying filtered_df's index.
X = filtered_df.transcription.to_numpy()
y = filtered_df.medical_specialty

In [37]:
import json

In [38]:
# Label encoder to encode class labels
class LabelEncoder(object):
    """Encode class labels into unique integer ids and decode them back.

    Attributes:
        class_to_index: dict mapping each class label to its integer id.
        index_to_class: inverse mapping, integer id -> class label.
        classes: list of known class labels, in `class_to_index` order.
    """
    def __init__(self, class_to_index=None):
        """Create an encoder, optionally from an existing label -> id mapping.

        Args:
            class_to_index: optional dict of label -> integer id. It is copied,
                so later calls to `fit` never modify the caller's dict.
        """
        self.class_to_index = dict(class_to_index) if class_to_index else {}
        self._refresh_lookups()

    def _refresh_lookups(self):
        """Rebuild the derived views (`index_to_class`, `classes`) from `class_to_index`."""
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        """Return the number of known classes."""
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        """Learn the label -> id mapping from `y` and return `self`.

        Ids are assigned in the sorted order produced by `np.unique`. Any
        previously learned mapping is replaced, so refitting on a different
        label set cannot leave stale classes or colliding ids behind.
        """
        # .tolist() converts numpy scalars to native Python values, which keeps
        # the keys serializable by json.dump in save().
        classes = np.unique(y).tolist()
        self.class_to_index = {class_: i for i, class_ in enumerate(classes)}
        self._refresh_lookups()
        return self

    def encode(self, y):
        """Map an iterable of labels to an integer NumPy array of ids.

        Raises:
            KeyError: if a label was not seen during `fit`.
        """
        encoded = np.zeros(len(y), dtype=int)
        for i, item in enumerate(y):
            encoded[i] = self.class_to_index[item]
        return encoded

    def decode(self, y):
        """Map an iterable of integer ids back to a list of class labels.

        Raises:
            KeyError: if an id is not part of the mapping.
        """
        return [self.index_to_class[item] for item in y]

    def save(self, fp):
        """Write the label -> id mapping to the JSON file at path `fp`."""
        contents = {"class_to_index": self.class_to_index}
        # Use a separate name for the handle rather than shadowing the path argument.
        with open(fp, "w") as handle:
            json.dump(contents, handle, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Create an encoder from a JSON file previously written by `save`."""
        with open(fp, "r") as handle:
            kwargs = json.load(fp=handle)
        return cls(**kwargs)

In [39]:
# Learn the label -> id mapping from the specialty labels kept after filtering.
label_encoder = LabelEncoder()
label_encoder.fit(y)

<__main__.LabelEncoder at 0x7fe786f9fb80>

In [40]:
# Number of target classes; recorded in data_args further down.
NUM_CLASSES = len(label_encoder)
label_encoder.class_to_index

{'cardiovascular / pulmonary': 0,
 'ent otolaryngology': 1,
 'gastroenterology': 2,
 'general medicine': 3,
 'hematology oncology': 4,
 'nephrology': 5,
 'neurology': 6,
 'neurosurgery': 7,
 'obstetrics / gynecology': 8,
 'ophthalmology': 9,
 'orthopedic': 10,
 'radiology': 11,
 'surgery': 12,
 'urology': 13}

In [48]:
# Check one example.
# `y` still carries filtered_df's original row labels, so `y[11]` is a label
# lookup that raises KeyError whenever row 11 was filtered out. Use positional
# access instead, which always works for a non-empty Series.
print(f"Target sample: {y.iloc[0]}")
print(f"Encoded target sample: {label_encoder.encode([y.iloc[0]])}")

Target sample: cardiovascular / pulmonary
Encoded target sample: [0]


In [49]:
# Encode all our labels
# NOTE: this rebinds `y` from a Series of label strings to an integer NumPy
# array, so re-running this cell (or the sample check before it) without
# re-creating `y` first will not work as intended.
y = label_encoder.encode(y)
print(y.shape)

(3548,)


In [51]:
from pathlib import Path

In [53]:
# Persist the label -> id mapping as JSON so it can be reloaded with LabelEncoder.load.
# NOTE(review): the ../data_artifacts directory is not created here — confirm it exists.
label_encoder.save(Path("../data_artifacts/new_label_encoder.json"))

In [54]:
from argparse import Namespace

In [55]:
# Dataset settings that get logged to the W&B run below.
# NOTE(review): `lower` / `stem` presumably describe how the transcriptions
# were pre-processed upstream — they are not applied in this notebook; confirm.
data_args = Namespace(
    lower=True,
    stem=False,
    num_classes=NUM_CLASSES
)

In [56]:
# initialize wandb project
# NOTE: the entity (account) name is hardcoded; change it when running under a
# different W&B account.
run = wandb.init(project="Medical-Transcription", entity="alokpadhi")

[34m[1mwandb[0m: Currently logged in as: [33malokpadhi[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [57]:
# Record the dataset settings on the run; vars() turns the Namespace into a plain dict.
run.log({"dataset_args": vars(data_args)})

In [58]:
# Upload everything under ../data_artifacts (where the label encoder JSON was
# saved) as one artifact attached to the current run.
data_artifacts = wandb.Artifact("Artifacts-for-data", type="Preprocessing_Artifacts")
data_artifacts.add_dir(Path("../data_artifacts"))
run.log_artifact(data_artifacts)

[34m[1mwandb[0m: Adding directory to artifact (./../data_artifacts)... Done. 0.0s


<wandb.sdk.wandb_artifacts.Artifact at 0x7fe7860fbb80>

In [59]:
from sklearn.model_selection import train_test_split

In [60]:
# split sizes
# Only train_size is passed to train_test_split; the validation/test split
# further down hardcodes train_size=0.5, which matches val_size == test_size.
train_size = 0.7
val_size = 0.15
test_size = 0.15

In [61]:
# Stratify so the target distribution remains the same across the splits.
# A fixed random_state makes the split reproducible across kernel restarts,
# so the saved dataset version can be regenerated exactly.
X_train, X_, y_train, y_ = train_test_split(
    X, y, train_size=train_size, stratify=y, random_state=42)

In [62]:
# Sanity check: absolute size and fraction of the full dataset for each part.
print(f"train: {len(X_train)} ({(len(X_train)/len(X)):.2f})\n"
     f"remaining: {len(X_)} ({(len(X_)/len(X)):.2f})")

train: 2483 (0.70)
remaining: 1065 (0.30)


In [63]:
# Split the remaining 30% evenly into validation and test sets, again
# stratified on the labels. A fixed random_state keeps the split reproducible.
X_val, X_test, y_val, y_test = train_test_split(
    X_, y_, train_size=0.5, stratify=y_, random_state=42)

In [64]:
# Sanity check: size and fraction of the full dataset for all three splits.
print(f"train: {len(X_train)} ({(len(X_train)/len(X)):.2f})\n"
     f"val: {len(X_val)} ({(len(X_val)/len(X)):.2f})\n"
     f"test: {len(X_test)} ({(len(X_test)/len(X)):.2f})")

train: 2483 (0.70)
val: 532 (0.15)
test: 533 (0.15)


In [65]:
# Rebuild one DataFrame per split, decoding the integer ids back to specialty
# names so the saved files are human-readable.
train_df, val_df, test_df = (
    pd.DataFrame({"transcription": texts, "specialty": label_encoder.decode(label_ids)})
    for texts, label_ids in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [76]:
# Directory that receives the train/val/test parquet files for dataset v2.0.
v2_dataset_path = Path("../dataset/v2.0/")

In [77]:
# Leftover interactive check of the path object's type; safe to delete.
type(v2_dataset_path)

pathlib.PosixPath

In [78]:
# Nothing else in this notebook creates the target directory, so create it up
# front; otherwise the writes fail on a fresh checkout.
v2_dataset_path.mkdir(parents=True, exist_ok=True)
train_df.to_parquet(v2_dataset_path / "train.parquet", index=False)
val_df.to_parquet(v2_dataset_path / "val.parquet", index=False)
test_df.to_parquet(v2_dataset_path / "test.parquet", index=False)

In [80]:
# Bundle the three split files into one dataset artifact, recording the row
# count of each split as metadata, and attach it to the current run.
datav2 = wandb.Artifact(
    "Dataset_V2_0",
    type="raw_data",
    description="Train, Val and test splits",
    metadata={
        "train_size": len(train_df),
        "val_size": len(val_df),
        "test_size": len(test_df),
    },
)
datav2.add_dir(v2_dataset_path)
run.log_artifact(datav2)

[34m[1mwandb[0m: Adding directory to artifact (./../dataset/v2.0)... Done. 0.1s


<wandb.sdk.wandb_artifacts.Artifact at 0x7fe749809460>