## Exploratory analysis

In [None]:
import pandas as pd

# Label column name (as it appears in the CSV files) -> short target name
# (as it appears in the dataset directory and file names).
targets = dict([
    ('htn_dx_ia', 'Htndx'),
    ('res_htn_dx_ia', 'ResHtndx'),
    ('htn_hypok_dx_ia', 'HtnHypoKdx'),
    ('HTN_heuristic', 'HtnHeuri'),
    ('res_HTN_heuristic', 'ResHtnHeuri'),
    ('hypoK_heuristic_v4', 'HtnHypoKHeuri'),
])

# Reverse lookup: short target name -> label column name.
targets_inv = dict(zip(targets.values(), targets.keys()))

def _split_stats(df, label_column):
    """Return (row count, label sum, prevalence in percent) for one split."""
    n_samples = len(df)
    n_positives = df[label_column].sum()
    # An empty split has no defined prevalence; report NaN instead of
    # dividing by zero.
    if n_samples:
        prevalence = n_positives / n_samples * 100
    else:
        prevalence = float('nan')
    return n_samples, n_positives, prevalence


def analyze_dataset(target, fold, ds_number=0, data_dir='../data'):
    """Summarize the train and test CSV files of one target/fold pair.

    The files are read from
    ``{data_dir}/Dataset{ds_number}/{target}/{target}{fold}Train.csv`` and
    the matching ``...Test.csv``.

    Args:
        target: Short target name; must be a key of ``targets_inv``, which
            gives the label column to read from the CSV files.
        fold: Fold identifier embedded in the file names (e.g. ``'A'``).
        ds_number: Number of the ``Dataset{ds_number}`` directory.
        data_dir: Root directory that contains the ``Dataset*`` directories.

    Returns:
        A dict with the keys ``target``, ``fold`` and, for each of the
        ``train`` and ``test`` splits, ``<split>_samples`` (row count),
        ``<split>_positives`` (sum of the label column, i.e. the number of
        positives when labels are coded 0/1) and ``<split>_prevalence``
        (positives as a percentage of samples; NaN for an empty split).

    Raises:
        KeyError: If ``target`` is not a key of ``targets_inv``.
    """
    # Resolve the label column first so an unknown target fails before any
    # file is read.
    label_column = targets_inv[target]

    # Both files share everything up to the Train/Test suffix.
    prefix = f'{data_dir}/Dataset{ds_number}/{target}/{target}{fold}'
    df_train = pd.read_csv(f'{prefix}Train.csv')
    df_test = pd.read_csv(f'{prefix}Test.csv')

    train_samples, train_positives, train_prevalence = _split_stats(
        df_train, label_column)
    test_samples, test_positives, test_prevalence = _split_stats(
        df_test, label_column)

    return {
        'target': target,
        'fold': fold,
        'train_samples': train_samples,
        'train_positives': train_positives,
        'train_prevalence': train_prevalence,
        'test_samples': test_samples,
        'test_positives': test_positives,
        'test_prevalence': test_prevalence
    }

# Gather one statistics row per (target, fold) pair of the default dataset
target_names = ('Htndx', 'ResHtndx', 'HtnHypoKdx', 'HtnHeuri', 'ResHtnHeuri', 'HtnHypoKHeuri')
fold_names = ('A', 'B', 'C', 'D', 'E')
results = [
    analyze_dataset(target_name, fold_name)
    for target_name in target_names
    for fold_name in fold_names
]

# A DataFrame gives one column per statistic
results_df = pd.DataFrame(results)

# Show the full table without the row index
table_text = results_df.to_string(index=False)
print(table_text)

In [None]:
# Per-target median of every numeric column, taken over the analyzed folds
rows_by_target = results_df.groupby('target')
rows_by_target.median(numeric_only=True)

In [None]:
# Print the per-target medians as a LaTeX table, floats with two decimals
median_table = results_df.groupby('target').median(numeric_only=True).reset_index()
latex_source = median_table.to_latex(
    index=False,
    float_format=lambda value: '%.2f' % value,
    escape=True,
)
print(latex_source)

In [None]:
# Statistics of the test fold
# Only fold 'A' of dataset number 101 is analyzed, once per target.
results_101 = [
    analyze_dataset(target_name, 'A', 101)
    for target_name in ('Htndx', 'ResHtndx', 'HtnHypoKdx', 'HtnHeuri', 'ResHtnHeuri', 'HtnHypoKHeuri')
]

results_101_df = pd.DataFrame(results_101)
rows_101_by_target = results_101_df.groupby('target')
rows_101_by_target.median(numeric_only=True)