In [7]:
# --- CONFIGURATION ---

# Processing strategy selector; accepted values here are 'cutoff' or 'index_year'.
PROCESSING_METHOD = 'cutoff'

# Strategy parameters.
CUTOFF = 0.8        # read only when PROCESSING_METHOD == 'cutoff'
MIN_LABS = 5        # part of the tag for both strategies
YEAR_INDEX = 2016   # read only when PROCESSING_METHOD == 'index_year'
MIN_GAP = 30        # appears only in the non-cutoff tag

# Smoothing / plotting settings.
SMOOTH_FRAC = 0.7
TEST_TO_PLOT = 'RBC'

# Auto-naming tag: encodes the chosen strategy and its parameters so that
# artefacts produced under different settings can be told apart.
if PROCESSING_METHOD == "cutoff":
    method_tag = f"cutoff_{CUTOFF}_min_{MIN_LABS}"
else:
    method_tag = f"index_{YEAR_INDEX}_gap_{MIN_GAP}_min_{MIN_LABS}"

In [11]:
%load_ext autoreload
%autoreload 2

import os
import sys
import warnings

import numpy as np
import pandas as pd

sys.path.append('../')
warnings.filterwarnings('ignore')

from models.setpoints import *
from models.bayes import *
from models.smooth import *
from process.config import *
from process.measurements import *
from models.ornstein_uhlenbeck import *

# Load raw measurements, keep only the columns listed in `cols`, derive a
# 'sex' label and order the rows by subject, test and time.
cols = [
    'subject_id',
    'time',
    'test_name',
    'numeric_value',
    'gender_concept_id',
    'age',
]
measurements = (
    pd.read_csv('../data/processed/all_lab_measurements.csv')[cols]
    # gender_concept_id == 0 is labelled 'M'; every other value is labelled 'F'.
    # NOTE(review): a missing gender_concept_id therefore also becomes 'F' — confirm that is intended.
    .assign(sex=lambda frame: frame['gender_concept_id'].map(lambda code: 'M' if code == 0 else 'F'))
    .sort_values(by=['subject_id', 'test_name', 'time'])
)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Process Measurements

In [12]:
def get_test_data(measurements_df, sample_size=None, seed=None):
    """Return a copy of the measurements, optionally limited to a random patient sample.

    Parameters
    ----------
    measurements_df : pandas.DataFrame
        Measurements table; must contain a ``subject_id`` column.
    sample_size : int or None, default None
        Number of distinct ``subject_id`` values to draw (without replacement).
        ``None`` keeps every row.
    seed : int or None, default None
        When given, the sample is drawn from a private
        ``numpy.random.RandomState(seed)`` so repeated calls return the same
        patients. ``None`` draws from NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        A new frame (never a view of the input) holding either all rows or
        only the rows of the sampled subjects.

    Raises
    ------
    ValueError
        Raised by NumPy when ``sample_size`` exceeds the number of distinct
        subjects, because sampling is done without replacement.
    """
    # `is None` rather than `!= None`: identity is the idiomatic check and
    # cannot be hijacked by objects that overload comparison operators.
    if sample_size is None:
        return measurements_df.copy()

    rng = np.random if seed is None else np.random.RandomState(seed)
    test_patients = rng.choice(
        measurements_df['subject_id'].unique(),
        size=sample_size,
        replace=False,
    )
    print(f"Running with {len(test_patients)} sample patients")

    # Filter first, then copy: avoids duplicating the full table just to
    # discard most of it, and hands back an independent frame.
    keep = measurements_df['subject_id'].isin(test_patients)
    return measurements_df[keep].copy()

def process_measurements(df, params):
    """Run the processing routine selected by ``params['method']`` on ``df``.

    ``params`` is copied before the ``'method'`` key is removed, so the
    caller's dict is left untouched. Every remaining entry is forwarded to
    the selected routine as a keyword argument.

    Raises
    ------
    KeyError
        If ``params`` has no ``'method'`` entry.
    ValueError
        If the method name is not one of 'cutoff', 'index' or 'index_foy'.
    """
    kwargs = params.copy()
    method = kwargs.pop('method')

    # Method name -> processing routine.
    dispatch = {
        'cutoff': cutoff_measurements_df,
        'index': filter_measurements_yash,
        'index_foy': filter_measurements_foy,
    }

    if method in dispatch:
        return dispatch[method](df, **kwargs)

    raise ValueError(f"Unknown processing method: {method}")

def save_results(df, output_path, method, params):
    """Write ``df`` to ``<output_path>/<method>/<method>_<tag>.csv``.

    The tag is built from ``params`` as ``key:value`` pairs joined with
    underscores, in the dict's iteration order. The target directory is
    created when it does not exist, the frame is written without its index,
    and the final path is printed.
    """
    target_dir = f'{output_path}/{method}'
    pieces = [f"{key}:{value}" for key, value in params.items()]
    tag = '_'.join(pieces)
    path = f'{target_dir}/{method}_{tag}.csv'

    os.makedirs(target_dir, exist_ok=True)
    df.to_csv(path, index=False)
    print(f"Results saved to {path}")

# One entry per processing run. 'method' selects the processor inside
# process_measurements; the remaining keys are forwarded to that processor
# as keyword arguments.
configs = [
    # Cutoff method config
    {
        'method': 'cutoff',
        'percent': 0.8,
        'min_tests': 5
    },
    # Index year method configs (this line had lost its '#', which made the
    # whole list literal a syntax error)
    {
        'method': 'index',
        'index_year': 2016,
        'min_gap': 30,
        'min_tests': 5
    },
    {
        'method': 'index_foy',
        'index_year': 2016,
        'min_gap': 30,
        'min_tests': 5
    }
]

# Set `sample` to an integer to run on a random subset of patients (results
# are then not saved); None runs on every patient and saves the results.
sample = None
df = get_test_data(measurements, sample_size=sample)
output_path = '../results'

for parameters in configs:
    print('Processing using', parameters['method'], 'method')

    # Recompute the processed frame for every config. With this call
    # commented out the loop silently reused whatever `processed_df` was left
    # in the kernel, so every config saved the same data under a different
    # filename (and a fresh kernel raised NameError).
    processed_df = process_measurements(df, parameters)
    unique_subjects = processed_df['subject_id'].nunique()
    unique_tests = processed_df['test_name'].nunique()

    print(f"Number of unique subjects: {unique_subjects}")
    print(f"Number of unique tests: {unique_tests}")

    # Only the OU model is currently enabled; re-enable a model here together
    # with its matching save_results call below.
    #setpoint_df = run_gmm_model(processed_df)
    #smoothed_df = run_lowess_model(processed_df, smooth_frac=0.7)
    #bayes_df = run_pEB_model(processed_df, prior_method='reference')
    ou_df = run_ou_with_prior(processed_df)

    # Persist only full runs. `is None` (rather than `not sample`) keeps a
    # sample size of 0 from being mistaken for "no sampling".
    if sample is None:
        # save_results(setpoint_df, output_path, 'setpoints', parameters)
        # save_results(smoothed_df, output_path, 'smoothed', parameters)
        # save_results(bayes_df, output_path, 'bayes', parameters)

        save_results(ou_df, output_path, 'ou', parameters)
    

Processing using cutoff method


Filtering Sequences:  96%|█████████▌| 146384/152269 [01:18<00:03, 1854.61it/s]


Number of unique subjects: 3907


100%|██████████| 72925/72925 [26:06<00:00, 46.56it/s]  


Results saved to ../results/ou/ou_method:cutoff_percent:0.8_min_tests:5.csv


In [None]:
# NOTE(review): this cell repeats the body of the processing loop from the
# previous cell. It defines none of its inputs: `processed_df`, `sample`,
# `output_path` and `parameters` must already exist in the kernel, so it
# fails with NameError on a fresh kernel if run on its own.
# `parameters` is whatever the loop variable last held — confirm it is the
# config that actually produced `processed_df` before trusting the filename
# that save_results derives from it.
unique_subjects = processed_df['subject_id'].nunique()
unique_tests = processed_df['test_name'].nunique()

print(f"Number of unique subjects: {unique_subjects}")

# Only the OU model is enabled; the other models are left commented out.
#setpoint_df = run_gmm_model(processed_df)    
#smoothed_df = run_lowess_model(processed_df, smooth_frac=0.7)
#bayes_df = run_pEB_model(processed_df, prior_method='reference')
ou_df = run_ou_with_prior(processed_df)

# Results are written only when no patient sampling was requested.
if not sample:
    # save_results(setpoint_df, output_path, 'setpoints', parameters)
    # save_results(smoothed_df, output_path, 'smoothed', parameters)
    #save_results(bayes_df, output_path, 'bayes', parameters)
    
    save_results(ou_df, output_path, 'ou', parameters)


Number of unique subjects: 3907


  1%|          | 588/72925 [00:10<39:39, 30.40it/s] 