# Baseline Model Training with Custom Labeler

In this notebook, we will show an end-to-end process of using piton to apply custom labeling function and generate features, and train a simple baseline models such as logistic regression and xgboost. The example dataset used here is the STARR OMOP dataset extracted in the previous notebook. 

In [37]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Please follow the installation guide for piton on readme. You may need to install few other packages such as sklearn and xgboost. Use `pip install <package>`. 

In [38]:
import datetime
import os
from typing import List, Tuple

import numpy as np
from sklearn import metrics

import piton
import piton.datasets
from piton.labelers.core import Label, LabeledPatients, TimeHorizon
from piton.labelers.omop_labeling_functions import CodeLF, MortalityLF, IsMaleLF, DiabetesLF
from piton.featurizers.core import Featurizer, FeaturizerList
from piton.featurizers.featurizers import AgeFeaturizer, CountFeaturizer
from piton.extension import datasets as extension_datasets

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import xgboost as xgb

In [39]:
# Please update this path with your extract of piton as noted in previous notebook. 
PATH_TO_PITON_DB = '/local-scratch/nigam/projects/clmbr_text_assets/data/piton_database_1_perct/'

# Patient database
data = piton.datasets.PatientDatabase(PATH_TO_PITON_DB)

# Ontology 
ontology = data.get_ontology()

In [40]:
help(data)

Help on PatientDatabase in module piton.extension.datasets object:

class PatientDatabase(collections.abc.Sequence, pybind11_builtins.pybind11_object)
 |  Method resolution order:
 |      PatientDatabase
 |      collections.abc.Sequence
 |      collections.abc.Reversible
 |      collections.abc.Collection
 |      collections.abc.Sized
 |      collections.abc.Iterable
 |      collections.abc.Container
 |      pybind11_builtins.pybind11_object
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(...)
 |      __getitem__(self: object, arg0: int) -> object
 |  
 |  __init__(...)
 |      __init__(self: piton.extension.datasets.PatientDatabase, filename: str, read_all: bool = False) -> None
 |  
 |  __len__(...)
 |      __len__(self: piton.extension.datasets.PatientDatabase) -> int
 |  
 |  close(...)
 |      close(self: piton.extension.datasets.PatientDatabase) -> None
 |  
 |  get_code_count(...)
 |      get_code_count(self: piton.extension.datasets.PatientDatabase, 

In [41]:
help(ontology)

Help on Ontology in module piton.extension.datasets object:

class Ontology(pybind11_builtins.pybind11_object)
 |  Method resolution order:
 |      Ontology
 |      pybind11_builtins.pybind11_object
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, /, *args, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  get_all_parents(...)
 |      get_all_parents(self: piton.extension.datasets.Ontology, arg0: int) -> piton.extension.IntSpan
 |  
 |  get_children(...)
 |      get_children(self: piton.extension.datasets.Ontology, arg0: int) -> piton.extension.IntSpan
 |  
 |  get_dictionary(...)
 |      get_dictionary(self: piton.extension.datasets.Ontology) -> Dictionary
 |  
 |  get_parents(...)
 |      get_parents(self: piton.extension.datasets.Ontology, arg0: int) -> piton.extension.IntSpan
 |  
 |  ----------------------------------------------------------------------
 |  Static methods inherited from pybind11_builtins.pybind1

In [18]:
# For sake for running the demo fast, lets only use 100 patients
patients = [data[i] for i in range(0, 10)]
patients = data

#### First, let use use labelers and featurizers provided with Piton. The task is to predict mortality in next 1 year. 

In [19]:
# Define time horizon for labeling purpose based on your use case. 
# Note: Some labeling function may not take any time_horizon

time_horizon = TimeHorizon(
        datetime.timedelta(days=0), datetime.timedelta(days=365)
    )

# Define the mortality labeling function. 
# labeler = MortalityLF(ontology, time_horizon)
labeler = DiabetesLF(ontology, time_horizon)

In [20]:
labeled_patients = labeler.apply(patients)

In [30]:
patients[0].patient_id

0

In [28]:
labeled_patients.pat_idx_to_label(0)

[]

In [8]:
# def save_to_file(object_to_save, path_to_file: str):
#     """Save object to Pickle file."""
#     os.makedirs(os.path.dirname(path_to_file), exist_ok=True)
#     with open(path_to_file, "wb") as fd:
#         pickle.dump(object_to_save, fd)

In [9]:
# import pickle
# save_to_file(labeler, "./test_labeler.pickle")

In [25]:
# Lets use both age and count featurizer 
age = AgeFeaturizer()
count = CountFeaturizer(rollup=True)
featurizer_age_count = FeaturizerList([age, count])

In [26]:
# save_to_file(featurizer_age_count, "./featurizer_age_count_1.pickle")

In [12]:
# Preprocessing the featurizers, which includes processes such as normalizing age. 
featurizer_age_count.preprocess_featurizers(patients, labeled_patients)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 33771/33771 [04:44<00:00, 118.81it/s]


In [13]:
full_matrix, labels, _, _ = featurizer_age_count.featurize(patients, labeler, num_threads=5, database_path=PATH_TO_PITON_DB)

In [30]:
labels

array([0., 0., 0., 0.], dtype=float32)

#### Training baseline model

In [110]:
X_train, X_test, y_train, y_test = train_test_split(full_matrix, labels, train_size = 0.8)

In [111]:
X_train.shape

(5472, 31559)

In [112]:
X_test.shape

(1369, 31559)

In [113]:
y_test.sum()

174.0

In [114]:
# Logistic Regresion
model = LogisticRegression().fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[::,1]
auc = metrics.roc_auc_score(y_test, y_pred_proba)
print(auc)

0.9307122589332948


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [115]:
# XGBoost
params = {
    "n_estimators": 50, 
    "max_depth": 2
}

model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[::,1]
auc = metrics.roc_auc_score(y_test, y_pred_proba)
print(auc)

0.9870076468042129


## Now, let us make a custom labeler

In [59]:
DIABETES_CODE = "SNOMED/44054006"

piton_code = data.get_code_dictionary().index(DIABETES_CODE)

print("Piton Code: ", piton_code)
print("Count in dataset: ", data.get_code_count(piton_code))
print("Prevalence: ", round(data.get_code_count(piton_code)/len(data)*100, 2))

Piton Code:  1243
Count in dataset:  4009
Prevalence:  11.87


An example of how to make a custom labeler. This labeler is also included in `labelers.omop_labeling_functions` 

In [60]:
class DiabetesLF(CodeLF):
    """Apply a label for whether or not a patient has diabetes within the `time_horizon`.

    """

    def __init__(
        self, ontology: extension_datasets.Ontology, time_horizon: TimeHorizon
    ):
        """Create a Diabetes labeler.

        Args:
            ontology (extension_datasets.Ontology): Maps code IDs to concept names
            time_horizon (TimeHorizon): An interval of time. If the event occurs during this time horizon, then
                the label is TRUE. Otherwise, FALSE.

        Raises:
            ValueError: Raised if there are multiple unique codes that map to the death code
        """
        DIABETES_CODE = "SNOMED/44054006"

        diabetes_codes: Set[Tuple[str, int]] = set()
        for code, code_str in enumerate(ontology.get_dictionary()):
            code_str = bytes(code_str).decode('utf-8')
            if code_str == DIABETES_CODE:
                diabetes_codes.add((code_str, code))

        if len(diabetes_codes) != 1:
            raise ValueError(
                f"Could not find exactly one death code -- instead found {len(diabetes_codes)} codes: {str(diabetes_codes)}"
            )
        else:
            diabetes_code: int = list(diabetes_codes)[0][1]
            super().__init__(code=diabetes_code, time_horizon=time_horizon)

In [61]:
time_horizon = TimeHorizon(
        datetime.timedelta(days=0), datetime.timedelta(days=90)
    )

labeler = DiabetesLF(ontology, time_horizon)

age = AgeFeaturizer()
count = CountFeaturizer(ontology=ontology, rollup=True)
featurizer_age_count = FeaturizerList([age, count])

featurizer_age_count.preprocess_featurizers(patients, labeler)
full_matrix, labels, _, _ = featurizer_age_count.featurize(patients, labeler)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(full_matrix, labels, train_size = 0.8)

In [63]:
# Logistic Regression
model = LogisticRegression().fit(X_train, y_train)
model.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.99937700249199

In [64]:
# XGBoost
params = {
    "n_estimators": 50, 
    "max_depth": 2
}

model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9999110003559986