# Baseline

* First we will create a baseline and iteratively improve the performance with more complex models
* Along the way we will try to fix any data related issues like data imbalance, trying different vecotorization methods etc

## Parameters to validate the final model

* Performance per class
* Latency
* Size of the model
* Inference cost
* Bias check
* Maintenance Cost

## Baseline Model

* **Randomly generated targets**
* **Simple rule based model**

### Random Model

In [1]:
import random

In [2]:
%pip install seaborn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Weights and Bias to version the dataset
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
import warnings
warnings.filterwarnings("ignore")

In [6]:
import wandb

In [7]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33malokpadhi[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [8]:
import ipywidgets as widgets

In [9]:
import json

In [10]:
# Label encoder to encode class labels
class LabelEncoder(object):
    """Encode labels into unqiue ids/integers"""
    def __init__(self, class_to_index={}):
        self.class_to_index = class_to_index or {}
        self.index_to_class = {v:k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        
    def __len__(self):
        return len(self.class_to_index)
    
    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"
    
    def fit(self, y):
        classes = np.unique(y)
        for i, class_ in enumerate(classes):
            self.class_to_index[class_] = i
        self.index_to_class = {v:k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        return self
    
    def encode(self, y):
        encoded = np.zeros(len(y), dtype=int)
        for i, item in enumerate(y):
            encoded[i] = self.class_to_index[item]
        return encoded
    
    def decode(self, y):
        classes = []
        for i, item in enumerate(y):
            classes.append(self.index_to_class[item])
            
        return classes
    
    def save(self, fp):
        with open(fp, "w") as fp:
            contents = {"class_to_index": self.class_to_index}
            json.dump(contents, fp, indent=4, sort_keys=False)
    
    @classmethod
    def load(cls, fp):
        with open(fp, "r") as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)

In [11]:
label_encoder = LabelEncoder.load("../../data_artifacts/new_label_encoder.json")

In [12]:
# # use this code to download the split dataset files
# import wandb
# run = wandb.init()
# artifact = run.use_artifact('alokpadhi/Medical-Transcription/Dataset_V1_0:v0', type='raw_data')
# artifact_dir = artifact.download()

In [13]:
train_df = pd.read_parquet("../../dataset/v2.0/train.parquet")
val_df = pd.read_parquet("../../dataset/v2.0/val.parquet")
test_df = pd.read_parquet("../../dataset/v2.0/test.parquet")

In [14]:
# validate shapes
train_df.shape, val_df.shape, test_df.shape

((2483, 2), (532, 2), (533, 2))

In [15]:
# Feature and target
X_train = train_df.transcription.to_numpy()
y_train = train_df.specialty

X_val =  val_df.transcription.to_numpy()
y_val = val_df.specialty

X_test = test_df.transcription.to_numpy()
y_test = test_df.specialty

In [16]:
# Encode all our labels
y_train = label_encoder.encode(y_train)

y_val = label_encoder.encode(y_val)

y_test = label_encoder.encode(y_test)

In [17]:
def set_seeds(seed=42):
    """Set seeds for reproducibility"""
    np.random.seed(seed)
    random.seed(seed)

In [18]:
print(label_encoder)

<LabelEncoder(num_classes=14)>


In [19]:
print(label_encoder.classes)

['cardiovascular / pulmonary', 'ent otolaryngology', 'gastroenterology', 'general medicine', 'hematology oncology', 'nephrology', 'neurology', 'neurosurgery', 'obstetrics / gynecology', 'ophthalmology', 'orthopedic', 'radiology', 'surgery', 'urology']


In [20]:
# Let's generate  random prediction on the test set to see the performance
y_pred = np.random.randint(low=0, high=len(label_encoder), size=len(y_test))
print(y_pred.shape)
print(y_pred[:5])

(533,)
[ 7 12 11  3  1]


In [21]:
from sklearn.metrics import precision_recall_fscore_support

In [22]:
# Evaluate the random prediction
metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
performance = {"precision": metrics[0], "recal": metrics[1], "f1": metrics[2]}
print(json.dumps(performance, indent=2))

{
  "precision": 0.10794216098925503,
  "recal": 0.0525328330206379,
  "f1": 0.05840163283199127
}


In [23]:
random_model_run = wandb.init(project="Medical-Transcription", name="Random-Model-new")
random_model_run.log({"precision": performance["precision"], "recall": performance["recal"], "f1-score": performance["f1"]})

### Rule Based Model

In [24]:
set_seeds()

In [25]:
from collections import Counter

In [28]:
# Let's find out most frequent 10-20 words in each class
target_word_dist = {}
for y_ in label_encoder.classes:
    filter_target_sentences = train_df.loc[train_df.specialty == y_, 'transcription'].values
    combine_words = " ".join(filter_target_sentences).split(' ')
    word_counter = Counter(combine_words).most_common(25)
    target_word_dist[y_] = [wc[0] for wc in word_counter if len(wc[0]) > 1]

In [29]:
# Check the top 15 words in each class
target_word_dist

{'cardiovascular / pulmonary': ['patient',
  'left',
  'right',
  'artery',
  'procedure',
  'coronary',
  'history',
  'normal',
  'chest',
  'catheter',
  'heart',
  'disease',
  'mg',
  'pulmonary',
  'well',
  'placed',
  'pressure',
  'performed'],
 'ent otolaryngology': ['patient',
  'nasal',
  'left',
  'placed',
  'right',
  'procedure',
  'ear',
  'anesthesia',
  'removed',
  'tube',
  'well',
  'room',
  'used',
  'general',
  'using',
  'incision',
  'history',
  'normal',
  'chronic',
  'performed',
  'also'],
 'gastroenterology': ['patient',
  'procedure',
  'history',
  'placed',
  'normal',
  'colon',
  'abdomen',
  'well',
  'right',
  'pain',
  'abdominal',
  'diagnosis',
  'gallbladder',
  'left',
  'performed',
  'anesthesia',
  'removed',
  'without',
  'mm'],
 'general medicine': ['patient',
  'history',
  'normal',
  'pain',
  'mg',
  'negative',
  'blood',
  'without',
  'left',
  'past',
  'also',
  'right',
  'year',
  'well',
  'chest',
  'clear',
  'old',
  '

In [30]:
import operator

In [31]:
# Match the input string words with each class word whichever gives the highest matching assign that class
def match_words(input_str):
    target_match_count = {}
    input_str_words = input_str.split(' ')
    for target_ in target_word_dist.keys():
        target_words = [wc for wc in target_word_dist[target_]]
        match_count = 0
        for word in input_str_words:
            if word in target_words:
                match_count += 1
        target_match_count[target_] = match_count
    y_pred = max(target_match_count.items(), key=operator.itemgetter(1))[0]
    return y_pred

In [32]:
y_pred = [match_words(input_str) for input_str in X_test]

In [33]:
y_pred = label_encoder.encode(y_pred)

In [34]:
y_pred.shape

(533,)

In [35]:
metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
performance = {"precision": metrics[0], "recal": metrics[1], "f1": metrics[2]}
print(json.dumps(performance, indent=2))

{
  "precision": 0.41824920138329746,
  "recal": 0.38461538461538464,
  "f1": 0.37792602367859063
}


As we can see above we have significantly improved our metrics than our baseline of random predictions.

In [36]:
rulebased_model_run = wandb.init(project="Medical-Transcription", name="Rule Based New")
rulebased_model_run.log({"precision": performance["precision"], "recall": performance["recal"], "f1-score": performance["f1"]})

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1-score,▁
precision,▁
recall,▁

0,1
f1-score,0.0584
precision,0.10794
recall,0.05253
