In [2]:
# Show the GPU model, driver version and current GPU memory usage of this machine.
!nvidia-smi

Sat Jun 18 19:25:14 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 4000     On   | 00000000:00:05.0 Off |                  N/A |
| 30%   37C    P8     6W / 125W |      1MiB /  7982MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Show the release of the installed CUDA compiler toolkit.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Fri_Dec_17_18:16:03_PST_2021
Cuda compilation tools, release 11.6, V11.6.55
Build cuda_11.6.r11.6/compiler.30794723_0


In [4]:
!pip install nltk==3.7

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [6]:
# Fetch the NLTK stopword corpus.
nltk.download("stopwords")
# English stopword list; it is the default `stopwords` argument of preprocess().
STOPWORDS = stopwords.words("english")
# Porter stemmer instance.
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Weights and Bias to version the dataset
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [11]:
%pip install seaborn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting seaborn
  Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)
[K     |████████████████████████████████| 292 kB 22.1 MB/s eta 0:00:01
Installing collected packages: seaborn
Successfully installed seaborn-0.11.2
Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's dark-grid theme for the plots in this notebook.
sns.set_theme(style="darkgrid")
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices -- consider narrowing the filter.
warnings.filterwarnings("ignore")

In [9]:
import wandb

In [10]:
# Authenticate with Weights & Biases (prompts for an API key if none is stored).
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [15]:
# read the raw medical-transcription samples and preview the first rows
transcriptions = pd.read_csv('dataset/mtsamples.csv')
transcriptions.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [16]:
# Drop the first column as it does not carry any relevant information.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it a second time no longer raises KeyError.
transcriptions = transcriptions.drop(columns='Unnamed: 0', errors='ignore')

In [17]:
# check the data again, now that the 'Unnamed: 0' column has been dropped
transcriptions.head()

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [18]:
# Keep only the target column (specialty) and the text column (transcription).
COLUMNS_OF_INTEREST = ['medical_specialty', 'transcription']
filtered_data = transcriptions[COLUMNS_OF_INTEREST]
filtered_data.head()

Unnamed: 0,medical_specialty,transcription
0,Allergy / Immunology,"SUBJECTIVE:, This 23-year-old white female pr..."
1,Bariatrics,"PAST MEDICAL HISTORY:, He has difficulty climb..."
2,Bariatrics,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ..."
3,Cardiovascular / Pulmonary,"2-D M-MODE: , ,1. Left atrial enlargement wit..."
4,Cardiovascular / Pulmonary,1. The left ventricular cavity size and wall ...


In [19]:
# Preprocess
def preprocess(text, lower=True, stem=False,
                filters=r"""[!"'#$%&()*\+,-.:;<=>?@\\[\]^_`{|}~]""",
                stopwords=STOPWORDS):
    """Clean a raw text string.

    Steps, in order: optional lower-casing, link removal, stopword removal,
    punctuation filtering, whitespace normalisation and optional stemming.

    Args:
        text (str): the text to clean.
        lower (bool): lower-case the text before anything else.
        stem (bool): apply the module-level Porter ``stemmer`` to every token.
        filters (str): regex character class of symbols to delete. The default
            is the same pattern as before, written as a raw string so that it
            no longer relies on invalid escape sequences.
        stopwords: iterable of words to remove; an empty value disables
            stopword removal.

    Returns:
        str: the cleaned text.
    """
    # lower the text
    if lower:
        text = text.lower()

    # Remove links first: the punctuation filter below strips ":" and ".",
    # which would otherwise leave URL fragments behind.
    text = re.sub(r"http\S+", "", text)

    # remove the stopwords; skipped for an empty list because an empty
    # alternation matches at every word boundary and would eat the spaces
    if stopwords:
        escaped = (re.escape(word) for word in stopwords)
        pattern = re.compile(r'\b(' + r"|".join(escaped) + r")\b\s*")
        text = pattern.sub('', text)

    # spacing and filters
    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)
    text = re.sub(filters, r"", text)
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()

    # Stemming (uses the module-level `stemmer`; the old `porter` name was never defined)
    if stem:
        text = " ".join([stemmer.stem(word) for word in text.split(" ")])

    return text

In [20]:
# ipywidgets provides the interactive checkboxes used in the preprocessing demo.
%pip install ipywidgets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [21]:
import ipywidgets as widgets

In [22]:
# Toggle preprocessing parameters
@widgets.interact(lower=True, stem=False)
def display_preprocessed_text(lower, stem):
    """Print a fixed sample transcription followed by its preprocessed form.

    `lower` and `stem` are forwarded unchanged to preprocess(); the
    interact decorator exposes them as two checkboxes.
    """
    text = "SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,Lungs:  Clear.,ASSESSMENT:,  Allergic rhinitis.,PLAN:,1.  She will try Zyrtec instead of Allegra again.  Another option will be to use loratadine.  She does not think she has prescription coverage so that might be cheaper.,2.  Samples of Nasonex two sprays in each nostril given for three weeks.  A prescription was written as well."
    preprocessed_text = preprocess(text=text, lower=lower, stem=stem)
    # raw text first, cleaned text second, for side-by-side comparison
    print(text)
    print (preprocessed_text)

interactive(children=(Checkbox(value=True, description='lower'), Checkbox(value=False, description='stem'), Ou…

In [23]:
# Work on a copy so that filtered_data keeps the raw text for later comparison.
df = filtered_data.copy()

In [24]:
# drop the null values: remove every row with a missing value in either
# column and report the row count before and after
print(f"Length of dataset before dropping null values: {len(df)}")
df = df.dropna()
print(f"Length of dataset after dropping null values: {len(df)}")

Length of dataset before dropping null values: 4999
Length of dataset after dropping null values: 4966


In [26]:
# Clean every transcription, then show the first record before and after cleaning.
df["transcription"] = df["transcription"].apply(preprocess, lower=True, stem=False)
raw_first = filtered_data["transcription"].values[0]
clean_first = df["transcription"].values[0]
print(raw_first, clean_first, sep="\n")

SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,L

In [29]:
# Apply the same cleaning to the specialty labels and compare the first one.
df["medical_specialty"] = df["medical_specialty"].apply(preprocess, lower=True, stem=False)
raw_label = filtered_data["medical_specialty"].values[0]
clean_label = df["medical_specialty"].values[0]
print(raw_label, clean_label, sep="\n")

 Allergy / Immunology
allergy / immunology


In [27]:
# initialize wandb project; `run` is the handle used below to log artifacts and settings
# NOTE(review): the entity (account name) is hard-coded -- other users must change it.
run = wandb.init(project="Medical-Transcription", entity="alokpadhi")

[34m[1mwandb[0m: Currently logged in as: [33malokpadhi[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [30]:
# Save the processed data inside dataset/processed/, the directory that is
# uploaded as the W&B dataset artifact, creating it first if needed.
from pathlib import Path

processed_dir = Path("dataset/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_dir / "processed_transcriptions.csv", index=False)

In [34]:
# List the working directory.
%ls

Preprocessing_and_Augmentation.ipynb  instance_type.png          [0m[01;34mwandb[0m/
[01;34mdataset[0m/                              quick_start_pytorch.ipynb


In [36]:
from pathlib import Path

In [38]:
# Version the processed dataset: register the whole dataset/processed/
# directory as a W&B artifact and attach it to the current run.
# NOTE(review): the artifact type is "raw_data" although the content is processed -- confirm.
preprocessed_data = wandb.Artifact("processed_dataset", type="raw_data")
dataset_path = Path("dataset/processed/")
preprocessed_data.add_dir(dataset_path)
run.log_artifact(preprocessed_data)

[34m[1mwandb[0m: Adding directory to artifact (./dataset/processed)... Done. 0.1s


<wandb.sdk.wandb_artifacts.Artifact at 0x7f5682202bb0>

In [39]:
# Feature and target
X = df.transcription.to_numpy()
y = df.medical_specialty

In [40]:
import json

In [43]:
# Label encoder to encode class labels
class LabelEncoder(object):
    """Encode labels into unique ids/integers.

    Instance attributes:
        class_to_index: dict mapping each class label to its integer id.
        index_to_class: inverse dict, integer id -> class label.
        classes: list of the known class labels, in mapping order.
    """
    def __init__(self, class_to_index=None):
        """Create an encoder, optionally from an existing label -> id mapping."""
        # Copy the mapping so the encoder never shares state with the caller
        # (or with a mutable default argument).
        self.class_to_index = dict(class_to_index) if class_to_index else {}
        self._sync_derived()

    def _sync_derived(self):
        """Rebuild index_to_class and classes from class_to_index."""
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        """Number of known classes."""
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        """Learn the label -> id mapping from the labels in ``y``.

        Any previously learned mapping is replaced, so refitting on new data
        cannot leave stale classes or colliding ids behind. Ids follow the
        sorted order of the unique labels.

        Returns:
            self, to allow chaining.
        """
        # tolist() turns NumPy scalars into plain Python objects, which keeps
        # the mapping JSON-serialisable in save().
        classes = np.unique(y).tolist()
        self.class_to_index = {class_: i for i, class_ in enumerate(classes)}
        self._sync_derived()
        return self

    def encode(self, y):
        """Return an integer NumPy array with the id of every label in ``y``.

        Raises:
            KeyError: if a label was not seen during fit.
        """
        return np.array([self.class_to_index[item] for item in y], dtype=int)

    def decode(self, y):
        """Return the list of class labels for the integer ids in ``y``.

        Raises:
            KeyError: if an id is unknown.
        """
        return [self.index_to_class[item] for item in y]

    def save(self, fp):
        """Write the label -> id mapping as JSON to the path ``fp``.

        Missing parent directories are created first.
        """
        from pathlib import Path

        path = Path(fp)
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w") as handle:
            contents = {"class_to_index": self.class_to_index}
            json.dump(contents, handle, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Build an encoder from a JSON file previously written by save()."""
        with open(fp, "r") as handle:
            kwargs = json.load(fp=handle)
        return cls(**kwargs)

In [44]:
# Build the encoder and learn the class -> id mapping from the target labels.
label_encoder = LabelEncoder()
label_encoder.fit(y)

<__main__.LabelEncoder at 0x7f56801d3c70>

In [45]:
# Number of distinct specialties; then display the learned mapping.
NUM_CLASSES = len(label_encoder)
label_encoder.class_to_index

{'allergy / immunology': 0,
 'autopsy': 1,
 'bariatrics': 2,
 'cardiovascular / pulmonary': 3,
 'chiropractic': 4,
 'consult history phy': 5,
 'cosmetic / plastic surgery': 6,
 'dentistry': 7,
 'dermatology': 8,
 'diets nutritions': 9,
 'discharge summary': 10,
 'emergency room reports': 11,
 'endocrinology': 12,
 'ent otolaryngology': 13,
 'gastroenterology': 14,
 'general medicine': 15,
 'hematology oncology': 16,
 'hospice palliative care': 17,
 'ime qme work comp etc': 18,
 'lab medicine pathology': 19,
 'letters': 20,
 'nephrology': 21,
 'neurology': 22,
 'neurosurgery': 23,
 'obstetrics / gynecology': 24,
 'office notes': 25,
 'ophthalmology': 26,
 'orthopedic': 27,
 'pain management': 28,
 'pediatrics neonatal': 29,
 'physical medicine rehab': 30,
 'podiatry': 31,
 'psychiatry / psychology': 32,
 'radiology': 33,
 'rheumatology': 34,
 'sleep medicine': 35,
 'soap / chart / progress notes': 36,
 'speech language': 37,
 'surgery': 38,
 'urology': 39}

In [49]:
# Peek at the target value stored under key 0.
y[0]

'allergy / immunology'

In [51]:
# Check one example: a label and the id the encoder assigns to it
print(f"Target sample: {y[15]}")
# encode() iterates over its argument, so the single label is wrapped in a list
print(f"Encoded target sample: {label_encoder.encode([y[15]])}")

Target sample: bariatrics
Encoded target sample: [2]


In [52]:
# Encode all our labels
# NOTE(review): y is overwritten with the integer ids, so this cell cannot be
# re-run without first recreating y from df.
y = label_encoder.encode(y)
print(y.shape)

(4966,)


In [53]:
# Persist the class -> id mapping as JSON under data_artifacts/.
label_encoder.save(Path("data_artifacts/label_encoder.json"))

In [54]:
from argparse import Namespace

In [55]:
# Preprocessing settings used above, bundled so they can be logged with the run.
data_args = Namespace(lower=True, stem=False, num_classes=NUM_CLASSES)

In [61]:
# Record the preprocessing settings on the W&B run (vars() turns the Namespace into a dict).
run.log({"dataset_args": vars(data_args)})

In [63]:
# Upload everything in data_artifacts/ (where the label-encoder JSON was
# saved) as a second W&B artifact attached to the current run.
data_artifacts = wandb.Artifact("Artifacts-for-data", type="Preprocessing_Artifacts")
data_artifacts.add_dir(Path("data_artifacts"))
run.log_artifact(data_artifacts)

[34m[1mwandb[0m: Adding directory to artifact (./data_artifacts)... Done. 0.0s


<wandb.sdk.wandb_artifacts.Artifact at 0x7f5666c1fc70>

VBox(children=(Label(value='10.513 MB of 10.513 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…