In [18]:
# Auto reload
%reload_ext autoreload
%autoreload 2

import sys
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# set random seed
# (passed as `random_state` to the train_test_split calls further down)
random_state = 42

# Make the modules under ../NORMA/ importable from this notebook.
sys.path.append('../NORMA/')
# NOTE(review): star import. Names used later without an explicit import in
# this cell (load_data, create_sequences, get_stratify_labels, np) presumably
# come from here -- confirm against utils.py.
from utils import *

# Suppress all warnings from this point on.
# NOTE(review): this presumably also hides any warnings emitted while fitting
# the ARIMA models below -- confirm that is intended.
import warnings
warnings.filterwarnings('ignore')


In [17]:
# Load the processed lab measurements.
df = load_data('../../SETPOINT/data/processed/lab_measurements.csv')

# Only the last expression of a cell is rendered automatically, so the two
# previews are passed to display() explicitly; as bare mid-cell expressions
# they were evaluated and silently discarded.
display(df.head())
# Per-test row counts, taken before any test is removed.
display(df.test_name.value_counts())

# Remove the rows of the listed tests before sequences are built from df.
drop = ['TGL', 'LDH']
df = df[~df['test_name'].isin(drop)]

In [20]:
sequences = create_sequences(df)
print(f"Created {len(sequences)} sequences")

# Split fractions: TEST_FRACTION of all sequences is held out for testing,
# then VAL_FRACTION of the remainder becomes the validation set
# (0.3 / 0.5 gives roughly 35% train, 35% validation, 30% test).
TEST_FRACTION = 0.3
VAL_FRACTION = 0.5

labels = get_stratify_labels(sequences)
# A stratified split needs at least two sequences per label. Check up front so
# the failure names the offending labels instead of surfacing from inside
# train_test_split.
labels_counts = pd.Series(labels).value_counts()
rare_labels = labels_counts[labels_counts < 2]
if not rare_labels.empty:
    raise ValueError(
        f"{len(rare_labels)} stratify label(s) occur fewer than 2 times and "
        f"cannot be stratified, e.g. {rare_labels.index.tolist()[:10]}"
    )

train_val_seq, test_seq = train_test_split(
    sequences, test_size=TEST_FRACTION, stratify=labels, random_state=random_state
)

# Labels are recomputed on the remaining sequences for the second split.
labels = get_stratify_labels(train_val_seq)
train_seq, val_seq = train_test_split(
    train_val_seq, test_size=VAL_FRACTION, stratify=labels, random_state=random_state
)

# Sanity check: the three splits together account for every sequence.
assert len(train_seq) + len(val_seq) + len(test_seq) == len(sequences), \
    "train/val/test sizes do not add up to the number of sequences"

print(f"\nTrain sequences: {len(train_seq)}")
print(f"Validation sequences: {len(val_seq)}")
print(f"Test sequences: {len(test_seq)}")

Created 110609 sequences

Train sequences: 38713
Validation sequences: 38713
Test sequences: 33183


In [21]:
def create_mean_baseline_predictions(sequences, split_name):
    """Predict each sequence's next value as the mean of its history.

    Args:
        sequences: Iterable of mappings with the keys 'x' (historical values;
            must support ``len`` and ``.tolist()``), 'target' (indexable, the
            first element is the true next value), 'subject_id' and 's'
            (indexable, element 1 is stored as the 'code' column).
        split_name: Label written to the 'split' column of every row.

    Returns:
        pandas.DataFrame with one row per sequence holding the identifiers,
        the history, the true value, the mean prediction, its absolute error
        and the history length.
    """
    def _to_record(sequence):
        # One output row for a single sequence.
        history = sequence['x']
        truth = sequence['target'][0]
        subject = sequence['subject_id']

        forecast = np.mean(history)

        return {
            'subject_id': subject,
            'code': sequence['s'][1],
            'split': split_name,
            'baseline_type': 'mean',
            'historical_values': history.tolist(),
            'true_next_value': truth,
            'prediction': forecast,
            'absolute_error': abs(forecast - truth),
            'sequence_length': len(history),
        }

    return pd.DataFrame([_to_record(sequence) for sequence in sequences])

In [22]:
def create_arima_baseline_predictions(sequences, split_name):
    """Predict each sequence's next value with a one-step ARIMA(1, 1, 1) forecast.

    Sequences with more than two historical values are fit with ARIMA and the
    one-step-ahead forecast is used. Shorter sequences, and any sequence whose
    fit or forecast raises, fall back to the mean of the history.

    Args:
        sequences: Iterable of mappings with the keys 'x' (historical values;
            must support ``len`` and ``.tolist()``), 'target' (indexable, the
            first element is the true next value), 'subject_id' and 's'
            (indexable, element 1 is stored as the 'code' column).
        split_name: Label written to the 'split' column of every row.

    Returns:
        pandas.DataFrame with one row per sequence and the same columns as
        the mean baseline (including 'code'), with 'baseline_type' set to
        'arima'.
    """
    results = []

    for seq in sequences:
        x = seq['x']
        target = seq['target'][0]
        subject_id = seq['subject_id']

        prediction = None
        if len(x) > 2:
            try:
                model_fit = ARIMA(x, order=(1, 1, 1)).fit()
                prediction = model_fit.forecast(steps=1)[0]
            except Exception:
                # Best-effort baseline: a failed fit falls back to the mean.
                # Only Exception is caught so that KeyboardInterrupt and
                # SystemExit still stop this long-running loop.
                prediction = None

        if prediction is None:
            prediction = np.mean(x)

        error = abs(prediction - target)

        results.append({
            'subject_id': subject_id,
            # Kept in step with the mean baseline so both tables share a schema.
            'code': seq['s'][1],
            'split': split_name,
            'baseline_type': 'arima',
            'historical_values': x.tolist(),
            'true_next_value': target,
            'prediction': prediction,
            'absolute_error': error,
            'sequence_length': len(x)
        })

    return pd.DataFrame(results)

In [23]:
# Run the mean baseline on each split in turn: train, then val, then test.
print("Creating mean baseline predictions...")
train_mean_df = create_mean_baseline_predictions(sequences=train_seq, split_name='train')
val_mean_df = create_mean_baseline_predictions(sequences=val_seq, split_name='val')
test_mean_df = create_mean_baseline_predictions(sequences=test_seq, split_name='test')

# Stack the per-split tables into one frame with a fresh 0..n-1 index.
mean_baseline_df = pd.concat(
    objs=[train_mean_df, val_mean_df, test_mean_df],
    ignore_index=True,
)
print(f"Mean baseline predictions created: {len(mean_baseline_df)} total")

Creating mean baseline predictions...
Mean baseline predictions created: 110609 total


In [24]:
# Run the ARIMA baseline on each split in turn: train, then val, then test.
# The three results are assigned one after another, so the tables already
# computed remain available if a later call is interrupted.
print("Creating ARIMA baseline predictions...")
train_arima_df = create_arima_baseline_predictions(sequences=train_seq, split_name='train')
val_arima_df = create_arima_baseline_predictions(sequences=val_seq, split_name='val')
test_arima_df = create_arima_baseline_predictions(sequences=test_seq, split_name='test')

# Stack the per-split tables into one frame with a fresh 0..n-1 index.
arima_baseline_df = pd.concat(
    objs=[train_arima_df, val_arima_df, test_arima_df],
    ignore_index=True,
)
print(f"ARIMA baseline predictions created: {len(arima_baseline_df)} total")

Creating ARIMA baseline predictions...
ARIMA baseline predictions created: 110609 total
