# Imports

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Preprocessing

In [2]:
df = pd.read_excel("../data/albion_dataset_2024.8.xlsx")

In [5]:
# Filter rows with valid diagnosis and visits
df_cleaned = df[df['DIAGNOSIS'].notna() & (df['DIAGNOSIS'] != 2) & (df['VisitNuA'] != 999)]

In [7]:
df_cleaned.replace(999, np.nan, inplace=True)

## Create new target (progression) variable

In [9]:
# Filter individuals who start as healthy (DIAGNOSIS = 0) at VisitNuA = 1
initial_healthy = df_cleaned[(df_cleaned['VisitNuA'] == 1) & (df_cleaned['DIAGNOSIS'] == 0)]

# Get the unique IDs of individuals who start as healthy
healthy_ids = initial_healthy['ID'].unique()

# Filter the data for only these individuals
df_filtered = df_cleaned[df_cleaned['ID'].isin(healthy_ids)]

# Create a target variable
def assign_target(group):
    # Check if the person progresses to MCI (DIAGNOSIS = 1) after VisitNuA = 1
    if any((group['VisitNuA'] > 1) & (group['DIAGNOSIS'] == 1)):
        return 1  # Progresses to MCI
    else:
        return 0  # Remains healthy

# Apply the function group by individual IDs
target_mapping = df_filtered.groupby('ID').apply(assign_target)

# Map the target variable back to the original data
df_cleaned['Progressed'] = df_cleaned['ID'].map(target_mapping)

# Filter the data for only the individuals of interest
df_cleaned = df_cleaned[df_cleaned['Progressed'].notna()]

  target_mapping = df_filtered.groupby('ID').apply(assign_target)


In [10]:
# Filter the data for only the first visit of each individual
df_cleaned = df_cleaned[df_cleaned['VisitNuA'] == 1].reset_index(drop=True)

# Progression Analysis

In [13]:
# Set predictors to analyze, these are the features that the first-visit model showed as most important
predictors = ['MMSE', 'CDR_TOT', 'ZME', 'ZLA', 'ZEX', 'SPEED_1', 'SPEED_2']

# Store results
results = []

# Number of bootstrap samples
n_bootstraps = 1000

# Iterate through each predictor
for predictor in predictors:
    print(f"Analyzing Predictor: {predictor}")
    group_0 = df_cleaned[df_cleaned['Progressed'] == 0][predictor].dropna()
    group_1 = df_cleaned[df_cleaned['Progressed'] == 1][predictor].dropna()
    
    # Step 1: Mann-Whitney U Test
    u_stat, p_value = mannwhitneyu(group_0, group_1, alternative='two-sided')
    print(f"  Mann-Whitney U Test: U={u_stat}, p-value={p_value}")
    
    # If Mann-Whitney U is not significant, skip further analysis
    if p_value >= 0.05:
        results.append({
            'Predictor': predictor,
            'Mann-Whitney U Significant': False,
            'Bootstrapped U p-value': None,
            'Median Difference CI': None,
            'Variance Difference CI': None
        })
        continue

    # Step 2: Bootstrapping the U Statistic
    boot_u_stats = []
    for _ in range(n_bootstraps):
        boot_0 = np.random.choice(group_0, size=len(group_0), replace=True)
        boot_1 = np.random.choice(group_1, size=len(group_1), replace=True)
        boot_stat, _ = mannwhitneyu(boot_0, boot_1, alternative='two-sided')
        boot_u_stats.append(boot_stat)
    
    lower_ci_u = np.percentile(boot_u_stats, 2.5)
    upper_ci_u = np.percentile(boot_u_stats, 97.5)
    boot_u_p_value = np.mean(np.array(boot_u_stats) >= u_stat)
    print(f"  Bootstrapped U p-value: {boot_u_p_value}, CI: [{lower_ci_u}, {upper_ci_u}]")

    # Step 3: Bootstrapping the Median Difference
    boot_diff_medians = []
    for _ in range(n_bootstraps):
        boot_0 = np.random.choice(group_0, size=len(group_0), replace=True)
        boot_1 = np.random.choice(group_1, size=len(group_1), replace=True)
        boot_diff_medians.append(np.median(boot_1) - np.median(boot_0))
    
    lower_ci_median = np.percentile(boot_diff_medians, 2.5)
    upper_ci_median = np.percentile(boot_diff_medians, 97.5)
    print(f"  Bootstrapped Median Difference CI: [{lower_ci_median}, {upper_ci_median}]")

    # Step 4: Bootstrapping the Variance Difference (if median is not significant)
    boot_diff_var = []
    for _ in range(n_bootstraps):
        boot_0 = np.random.choice(group_0, size=len(group_0), replace=True)
        boot_1 = np.random.choice(group_1, size=len(group_1), replace=True)
        boot_diff_var.append(np.var(boot_1) - np.var(boot_0))

    # Confidence Interval for Variance Difference
    lower_ci_var = np.percentile(boot_diff_var, 2.5)
    upper_ci_var = np.percentile(boot_diff_var, 97.5)
    print(f"  Bootstrapped Variance Difference CI: [{lower_ci_var}, {upper_ci_var}]")


    # Store results
    results.append({
        'Predictor': predictor,
        'Mann-Whitney U Significant': True,
        'Bootstrapped U p-value': boot_u_p_value,
        'Median Difference CI': [lower_ci_median, upper_ci_median],
        'Variance Difference CI': [lower_ci_var, upper_ci_var] if boot_diff_var else None
    })

# Convert results to DataFrame for easy viewing
results_df = pd.DataFrame(results)



Analyzing Predictor: MMSE
  Mann-Whitney U Test: U=964.5, p-value=0.007368713670905064
  Bootstrapped U p-value: 0.528, CI: [747.9625, 1138.025]
  Bootstrapped Median Difference CI: [-2.0, 0.0]
  Bootstrapped Variance Difference CI: [-1.1613723026736318, 0.8114273910722193]
Analyzing Predictor: CDR_TOT
  Mann-Whitney U Test: U=394.0, p-value=0.00025954267786194705
  Bootstrapped U p-value: 0.496, CI: [177.9625, 599.05]
  Bootstrapped Median Difference CI: [0.0, 0.5]
  Bootstrapped Variance Difference CI: [-0.022131934739977807, 0.15824638450710257]
Analyzing Predictor: ZME
  Mann-Whitney U Test: U=1009.0, p-value=0.0036826111555860056
  Bootstrapped U p-value: 0.531, CI: [755.0, 1214.05]
  Bootstrapped Median Difference CI: [-1.7775063526419432, 0.17041182283252515]
  Bootstrapped Variance Difference CI: [-0.275175535313192, 0.5111352180169896]
Analyzing Predictor: ZLA
  Mann-Whitney U Test: U=807.5, p-value=0.04552875991675964
  Bootstrapped U p-value: 0.532, CI: [578.4875, 1019.57499