# Using CLMBR to generate features and training models on those features

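We can use a trained CLMBR model to generate features and then use those features in a logistic regression model. The code below loads its checkpoint from the `motor_model` folder saved under the tutorial 6 output directory.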

In [1]:
import shutil
import os

# Scratch directory for this tutorial; it is wiped and recreated on every run.
TARGET_DIR = 'trash/tutorial_7_INSPECT'

# Remove leftovers from a previous run so old and new outputs never mix.
if os.path.exists(TARGET_DIR):
    shutil.rmtree(TARGET_DIR)

# makedirs (not mkdir): the path is nested, and mkdir raises FileNotFoundError
# when the parent 'trash' directory does not exist yet.
os.makedirs(TARGET_DIR)


# Number of worker processes used by the parallel steps of this notebook.
num_proc = 20

In [2]:
import femr.models.transformer
import pyarrow.csv
import datasets
import pickle
import pyarrow as pa
import pyarrow.compute as pc

# First, we compute our features

# Load some labels.
# Alternatives kept for reference:
# labels = pyarrow.csv.read_csv('input/labels.csv').to_pylist()
# label_csv = '/share/pi/nigam/projects/zphuo/data/PE/inspect/cohort_0.2.0_master_file_anon.csv'
label_csv_subset = '/share/pi/nigam/projects/zphuo/data/PE/inspect/timelines_smallfiles_meds/cohort_0.2.0_master_file_anon_subset.csv'

labels_table = pyarrow.csv.read_csv(label_csv_subset)

# Keep only the columns needed to build labels, then drop the rows whose
# 12-month PH outcome is "Censored".
selected_table = labels_table.select(['patient_id', 'procedure_time', '12_month_PH'])
filtered_table = selected_table.filter(pc.field("12_month_PH") != "Censored")

# Cast the remaining outcome values to bool and swap the column in place
# (same position, same name, new type).
outcome_index = filtered_table.schema.get_field_index('12_month_PH')
casted_column = pc.cast(filtered_table.column('12_month_PH'), target_type=pa.bool_())
filtered_table = filtered_table.set_column(outcome_index, pa.field('12_month_PH', pa.bool_()), casted_column)

# Rename the cohort columns to the label field names used in the rest of the
# notebook. NOTE(review): presumably these are the names femr expects for
# labels -- confirm against the femr label schema.
label_column_names = {'procedure_time': 'prediction_time', '12_month_PH': 'boolean_value'}
filtered_table = filtered_table.rename_columns(
    [label_column_names.get(name, name) for name in filtered_table.column_names]
)

# One dict per label row, keyed by column name.
labels = filtered_table.to_pylist()

# Load our data: every parquet file matched by the glob below becomes one dataset.
# (Alternative kept for reference: datasets.Dataset.from_parquet("input/meds/data/*"))
parquet_folder = '/share/pi/nigam/projects/zphuo/data/PE/inspect/timelines_smallfiles_meds/data_subset/*'
dataset = datasets.Dataset.from_parquet(parquet_folder)

# We need an ontology for MOTOR. Here a previously pickled one is read from disk;
# femr.ontology.Ontology will create one given an athena path and code_metadata.
# NOTE(review): pickle.load executes arbitrary code, so only load trusted files.
with open('input/meds/ontology.pkl', 'rb') as ontology_file:
    ontology = pickle.load(ontology_file)

# Folder holding the model checkpoint that is used to compute the features.
# model_folder = '/share/pi/nigam/ethanid/femrv3/femr/tutorials/trash/tutorial_6/motor_model'
model_folder = '/share/pi/nigam/projects/zphuo/repos/femr/tutorials/trash/tutorial_6_INSEPCT/motor_model'


features = femr.models.transformer.compute_features(
    dataset,
    model_folder,
    labels,
    num_proc=num_proc,
    tokens_per_batch=128,
    ontology=ontology,
)

# We have our features: show the name and shape of every returned array
for name, values in features.items():
    print(name, values.shape)



Map (num_proc=20):   0%|          | 0/1916 [00:00<?, ? examples/s]

Some weights of the model checkpoint at /share/pi/nigam/projects/zphuo/repos/femr/tutorials/trash/tutorial_6_INSEPCT/motor_model were not used when initializing FEMRModel: ['task_model.final_layer.bias', 'task_model.final_layer.weight', 'task_model.task_layer.bias', 'task_model.task_layer.weight']
- This IS expected if you are initializing FEMRModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FEMRModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map (num_proc=20):   0%|          | 0/1574 [00:00<?, ? examples/s]

Creating batches 1257


Generating train split: 0 examples [00:00, ? examples/s]

patient_ids (2067,)
feature_times (2067,)
features (2067, 64)


# Joining features and labels

Given a feature set, it's important to be able to join a set of labels to those features.

This can be done with `femr.featurizers.join_labels`.

In [3]:
import femr.featurizers

# Join the labels to the features computed above
features_and_labels = femr.featurizers.join_labels(features, labels)

# Show the name and shape of every joined array
for name, values in features_and_labels.items():
    print(name, values.shape)

boolean_values (1872,)
patient_ids (1872,)
times (1872,)
features (1872, 64)


# Data Splitting

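When using a pretrained CLMBR model, we have to be very careful to reuse the same train/test splits that were used for the original model; otherwise patients seen during pretraining could end up in our test set.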

In [9]:
import femr.splits
import numpy as np

# We split into a global training and test set, reusing the split file stored
# next to the pretrained model.
split = femr.splits.PatientSplit.load_from_csv('/share/pi/nigam/projects/zphuo/repos/femr/tutorials/trash/tutorial_6_INSEPCT/motor_model/main_split.csv')

# Boolean masks selecting the rows whose patient belongs to each split
joined_patient_ids = features_and_labels['patient_ids']
train_mask = np.isin(joined_patient_ids, split.train_patient_ids)
test_mask = np.isin(joined_patient_ids, split.test_patient_ids)

# NOTE(review): percent_train is not referenced by the code visible here -- confirm before removing
percent_train = .70

# Feature matrix and boolean target for each split
X_train = features_and_labels['features'][train_mask]
y_train = features_and_labels['boolean_values'][train_mask]

X_test = features_and_labels['features'][test_mask]
y_test = features_and_labels['boolean_values'][test_mask]

# Building Models

The generated features can then be used to build your standard models. In this case we construct both logistic regression and XGBoost models and evaluate them.

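Performance here is modest rather than perfect: the task is predicting the `12_month_PH` label, and the logistic regression output below shows a test AUROC of about 0.63 with an F1 score of 0.0, meaning no positive cases are correctly identified at the 0.5 threshold.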

In [10]:
import xgboost as xgb
import sklearn.linear_model
import sklearn.metrics
import sklearn.preprocessing

def run_analysis(title: str, y_train, y_train_proba, y_test, y_test_proba):
    """Print a titled report with the metrics of the train and test splits.

    Args:
        title: Name shown in the report header.
        y_train: True labels of the training split.
        y_train_proba: Predicted scores for the training split.
        y_test: True labels of the test split.
        y_test_proba: Predicted scores for the test split.
    """
    print(f"---- {title} ----")
    report_sections = (
        ("Train:", y_train, y_train_proba),
        ("Test:", y_test, y_test_proba),
    )
    # Same heading-then-metrics layout for both splits
    for heading, truth, scores in report_sections:
        print(heading)
        print_metrics(truth, scores)

def print_metrics(y_true, y_proba, threshold=0.5):
    """Print AUROC, average precision, accuracy and F1 for one set of predictions.

    Args:
        y_true: True binary labels.
        y_proba: Predicted scores for the positive class. Must support an
            elementwise ``>`` comparison against a number (e.g. a NumPy array).
        threshold: Scores strictly greater than this value count as positive
            predictions for accuracy and F1. Defaults to 0.5, the cut-off that
            was previously hard-coded.
    """
    # Accuracy and F1 need hard class predictions, so binarise the scores.
    y_pred = y_proba > threshold

    # AUROC and APS are computed from the raw scores, not from y_pred.
    auroc = sklearn.metrics.roc_auc_score(y_true, y_proba)
    aps = sklearn.metrics.average_precision_score(y_true, y_proba)
    accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
    f1 = sklearn.metrics.f1_score(y_true, y_pred)

    print("\tAUROC:", auroc)
    print("\tAPS:", aps)
    print("\tAccuracy:", accuracy)
    print("\tF1 Score:", f1)


# Cross-validated, L2-penalised logistic regression fitted on the training features
model = sklearn.linear_model.LogisticRegressionCV(penalty="l2", solver="liblinear")
model.fit(X_train, y_train)

# Keep only column 1 of predict_proba as the score for each split
y_train_proba, y_test_proba = (
    model.predict_proba(split_features)[:, 1] for split_features in (X_train, X_test)
)

run_analysis("Logistic Regression", y_train, y_train_proba, y_test, y_test_proba)

---- Logistic Regression ----
Train:
	AUROC: 0.5675300138948589
	APS: 0.16495229726359867
	Accuracy: 0.864375
	F1 Score: 0.0
Test:
	AUROC: 0.6306208559373118
	APS: 0.20239493285916538
	Accuracy: 0.8713235294117647
	F1 Score: 0.0
