In [1]:
# Auto reload
%reload_ext autoreload
%autoreload 2

import sys
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Seed shared by every train_test_split call in this notebook so the splits are reproducible.
random_state = 42

# Make the project's model/ directory importable, then pull in its helpers.
# NOTE(review): the star import presumably supplies load_data, create_sequences,
# get_stratify_labels and the `np` alias, none of which are imported explicitly
# in this notebook - confirm against utils.py.
sys.path.append('../model/')
from utils import *

# Silence every warning category for the rest of the session.
import warnings
warnings.filterwarnings('ignore')


{'HCT': 0, 'HGB': 1, 'MCH': 2, 'MCHC': 3, 'MPV': 4, 'PLT': 5, 'RBC': 6, 'RDW': 7, 'WBC': 8, 'MCV': 9, 'NA': 10, 'K': 11, 'CL': 12, 'CO2': 13, 'BUN': 14, 'CRE': 15, 'GLU': 16, 'A1C': 17, 'CA': 18, 'ALT': 19, 'GGT': 20, 'AST': 21, 'LDH': 22, 'PT': 23, 'ALP': 24, 'TBIL': 25, 'DBIL': 26, 'ALB': 27, 'TP': 28, 'CRP': 29, 'TC': 30, 'HDL': 31, 'LDL': 32, 'TGL': 33}


In [2]:
# Read the processed lab measurements table.
df = load_data('../../SETPOINT/data/processed/lab_measurements.csv')

# Exploratory look-ups; neither is the cell's last expression, so nothing is rendered.
df.head()
df['test_name'].value_counts()

# Exclude the listed test names from all downstream steps.
drop = ['TGL', 'LDH', 'TBIL']
df = df.loc[~df['test_name'].isin(drop)]

In [3]:
# Turn the filtered measurements into sequence records.
sequences = create_sequences(df)
print(f"Created {len(sequences)} sequences")

# Stage 1: hold out 30% of all sequences as the test set, stratified on the
# labels returned by get_stratify_labels.
labels = get_stratify_labels(sequences)
# Per-label frequencies, kept for inspection (not consumed by the split itself).
labels_counts = pd.Series(labels).value_counts()
train_val_seq, test_seq = train_test_split(
    sequences,
    stratify=labels,
    test_size=0.3,
    random_state=random_state,
)

# Stage 2: cut the remaining sequences in half into train and validation,
# re-deriving the stratification labels for that subset.
labels = get_stratify_labels(train_val_seq)
train_seq, val_seq = train_test_split(
    train_val_seq,
    stratify=labels,
    test_size=0.5,
    random_state=random_state,
)

# Report the size of each split.
print(f"\nTrain sequences: {len(train_seq)}")
print(f"Validation sequences: {len(val_seq)}")
print(f"Test sequences: {len(test_seq)}")

Created 106873 sequences

Train sequences: 37405
Validation sequences: 37406
Test sequences: 32062


In [None]:
def create_mean_baseline_predictions(sequences, split_name):
    """Predict each sequence's final value as the mean of its earlier values.

    Args:
        sequences: Iterable of dict records. Each record must provide 'x'
            (an array supporting slicing and ``.tolist()``), 'subject_id'
            and 'cid'.
        split_name: Label stored in the 'split' column of every row
            (e.g. 'train', 'val', 'test').

    Returns:
        pandas.DataFrame with one row per sequence and the columns
        'subject_id', 'cid', 'split', 'model', 'x_h', 'x_next', 'x_pred',
        'absolute_error' and 'sequence_length'. An empty input produces an
        empty DataFrame.
    """
    results = []

    for seq in sequences:
        # The last observation is the target; everything before it is the history.
        x = seq['x'][:-1]
        x_next = seq['x'][-1]

        # A record with a single observation has an empty history, for which
        # np.mean returns NaN.
        prediction = np.mean(x)

        results.append({
            'subject_id': seq['subject_id'],
            'cid': seq['cid'],
            'split': split_name,
            'model': 'mean',
            'x_h': x.tolist(),
            'x_next': x_next,
            'x_pred': prediction,
            'absolute_error': abs(prediction - x_next),
            'sequence_length': len(x)
        })

    return pd.DataFrame(results)

In [12]:
# Display the first sequence record to see which fields each record carries.
sequences[0]

{'x': array([16., 13., 11., 10., 10., 11., 10., 20., 19., 17.], dtype=float32),
 't': array([0.        , 0.29097223, 1.1625    , 2.2041667 , 3.2284722 ,
        4.2145834 , 5.2395835 , 5.929861  , 6.2180557 , 7.204167  ],
       dtype=float32),
 's': array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1], dtype=int32),
 'sex': np.int64(1),
 'cid': 14,
 'ref_mu': array([14.], dtype=float32),
 'ref_var': array([9.], dtype=float32),
 'subject_id': np.int64(115967095)}

In [18]:
def create_arima_baseline_predictions(sequences, split_name):
    """Predict each sequence's final value with a one-step ARIMA(1, 1, 1) forecast.

    The history is every observation except the last. When the history has
    two or fewer points, or when fitting/forecasting raises an ordinary
    exception, the prediction falls back to the mean of the history.

    Args:
        sequences: Iterable of dict records. Each record must provide 'x'
            (an array supporting slicing and ``.tolist()``), 'subject_id'
            and 'cid'.
        split_name: Label stored in the 'split' column of every row
            (e.g. 'train', 'val', 'test').

    Returns:
        pandas.DataFrame with one row per sequence and the columns
        'subject_id', 'code', 'split', 'baseline_type', 'historical_values',
        'true_next_value', 'prediction', 'absolute_error' and
        'sequence_length'.
    """
    # NOTE(review): these column names differ from the ones emitted by
    # create_mean_baseline_predictions ('cid', 'model', 'x_h', 'x_next',
    # 'x_pred'); confirm which schema downstream consumers expect before
    # unifying them.
    results = []

    for seq in sequences:
        # The last observation is the target; everything before it is the history.
        x = seq['x'][:-1]
        x_next = seq['x'][-1]

        try:
            if len(x) > 2:
                model_fit = ARIMA(x, order=(1, 1, 1)).fit()
                prediction = model_fit.forecast(steps=1)[0]
            else:
                # Too few points to fit the model; use the mean instead.
                prediction = np.mean(x)
        except Exception:
            # Best-effort fallback on fit/forecast failures. Deliberately not a
            # bare `except:` so KeyboardInterrupt/SystemExit still stop the
            # (long-running) loop.
            prediction = np.mean(x)

        results.append({
            'subject_id': seq['subject_id'],
            'code': seq['cid'],
            'split': split_name,
            'baseline_type': 'arima',
            'historical_values': x.tolist(),
            'true_next_value': x_next,
            'prediction': prediction,
            'absolute_error': abs(prediction - x_next),
            'sequence_length': len(x)
        })

    return pd.DataFrame(results)

In [21]:
# Run the mean baseline on the train, validation and test splits (in that order).
print("Creating mean baseline predictions...")
train_mean_df, val_mean_df, test_mean_df = (
    create_mean_baseline_predictions(split_seq, split_label)
    for split_seq, split_label in (
        (train_seq, 'train'),
        (val_seq, 'val'),
        (test_seq, 'test'),
    )
)

# Stack the per-split frames under a fresh index and persist them as one CSV.
mean_baseline_df = pd.concat(
    [train_mean_df, val_mean_df, test_mean_df],
    ignore_index=True,
)
mean_baseline_df.to_csv('predictions/mean_baseline_predictions.csv', index=False)
print(f"Mean baseline predictions created: {len(mean_baseline_df)} total")

Creating mean baseline predictions...
Mean baseline predictions created: 106873 total


In [None]:
# Create ARIMA baseline predictions for all splits
print("Creating ARIMA baseline predictions...")
train_arima_df = create_arima_baseline_predictions(train_seq, 'train')
val_arima_df = create_arima_baseline_predictions(val_seq, 'val')
test_arima_df = create_arima_baseline_predictions(test_seq, 'test')

# Combine all ARIMA baseline results
arima_baseline_df = pd.concat([train_arima_df, val_arima_df, test_arima_df], ignore_index=True)
# Write the ARIMA frame itself; this previously saved mean_baseline_df under the
# ARIMA file name, so the ARIMA predictions were never persisted.
arima_baseline_df.to_csv('predictions/arima_baseline_predictions.csv', index=False)
print(f"ARIMA baseline predictions created: {len(arima_baseline_df)} total")

Creating ARIMA baseline predictions...
