This notebook uses the `epilepsy_predictions` conda environment. Create and activate it with:

`conda env create -f epilepsy_predictions.yml`  
  
`conda activate epilepsy_predictions`  

Alternatively, you can create your own environment with the necessary packages:
- python=3.9
- scikit-learn=0.24.2
- dill
- numpy
- ipykernel
- polars

In [5]:
from pathlib import Path

import dill
import numpy as np
import polars as pl

# Load the serialized classifier. A context manager guarantees the file handle
# is closed even if deserialization fails.
# NOTE: dill.load (like pickle) can execute arbitrary code while deserializing;
# only load model files that come from a trusted source.
with open('lr_meds_icds_dem_text__no_prodigy_binary_model.sav', 'rb') as model_file:
    model = dill.load(model_file)

# Feature matrix to score; it is passed to the model as-is.
df = pl.read_parquet('features/fm.parquet')

# Score BEFORE appending any output columns, so the model only sees features.
preds = model.predict_proba(df)

# This hard-coded threshold is drawn from the X_train data in the original
# model's code; the precision is the same.
THRESHOLD = np.float64(0.4675610608142482)
threshold = THRESHOLD  # keep the original lowercase name available as well

# Column 0 / column 1 of predict_proba are stored as the NO / YES probabilities.
df = df.with_columns(
    [
        pl.Series(name='prob_NO', values=preds[:, 0]),
        pl.Series(name='prob_YES', values=preds[:, 1]),
    ]
)

# Binary label: 1 when the YES probability reaches the threshold, else 0.
df = df.with_columns(
    pl.when(pl.col('prob_YES') >= THRESHOLD).then(1).otherwise(0).alias('prediction')
)

In [40]:
# Preview the model outputs for the first few rows.
prediction_cols = ['prob_NO', 'prob_YES', 'prediction']
display(df.select(prediction_cols).head())

# Row-level counts: 'prediction' is 0/1, so its sum is the number of YES rows.
n_yes_rows = df.select(pl.col('prediction')).sum().item()
print('Total YES predictions:', n_yes_rows)
print('Total NO predictions:', len(df) - n_yes_rows)

# Attach patient id and note date from the companion parquet file.
# NOTE(review): hstack joins the frames side by side, so this assumes both
# parquet files hold the same rows in the same order — TODO confirm.
id_cols = pl.read_parquet('features/fm_all_cols.parquet').select(['bdsp_patient_id', 'date_note'])
final = id_cols.hstack(df.select(prediction_cols))

# Patient-level counts: a patient counts as YES if any of their rows is YES.
n_patients = final['bdsp_patient_id'].n_unique()
n_patients_yes = final.filter(pl.col('prediction') == 1)['bdsp_patient_id'].n_unique()
print('Total unique patients:', n_patients)
print('Total unique patients WITH YES prediction:', n_patients_yes)
print('Total unique patients WITHOUT YES prediction:', n_patients - n_patients_yes)

prob_NO,prob_YES,prediction
f64,f64,i32
0.946808,0.053192,0
0.995955,0.004045,0
0.68135,0.31865,0
0.98437,0.01563,0
0.299859,0.700141,1


Total YES predictions: 6293
Total NO predictions: 39006
Total unique patients: 267
Total unique patients WITH YES prediction: 88
Total unique patients WITHOUT YES prediction: 179


In [41]:
# Persist the predictions (ids, note date, probabilities, label) to parquet.
# Create the output directory first: write_parquet does not create missing
# parent directories, so a fresh checkout without `predictions/` would fail.
output_path = Path('predictions/predictions.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)
final.write_parquet(output_path)