# OmicsNet

## Proteomics

### Merge SHAP values

In [None]:
import os
import pandas as pd
import numpy as np
import random

"""
Reads SHAP value files from multiple runs, calculates the element-wise average,
displays a verification sample for a single data point, and saves the final averaged SHAP values.
"""
# 1. Define paths
# Base path where the SEED directories (1234, 1235, etc.) are located
base_path = "/your path/cardiomicscore/saved/results/SHAP/OmicsNet/Proteomics"

# Output path for the final averaged results
output_path = os.path.join(base_path, "Final")

# Create the output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)
print(f"Results will be saved to: {output_path}")

# 2. Automatically discover SEED directories and outcome files
# Both results start out empty so that a failed discovery leaves the processing
# loop below with nothing to do, instead of raising NameError or silently
# reusing values left behind by a previously executed cell.
seed_dirs = []
shap_files_to_process = []
try:
    # SEED directories are the sub-directories whose name is all digits; this
    # also excludes the "Final" output directory created above.
    seed_dirs = sorted(
        d for d in os.listdir(base_path)
        if d.isdigit() and os.path.isdir(os.path.join(base_path, d))
    )
    if not seed_dirs:
        raise FileNotFoundError("No valid SEED directories found in the base path.")

    print(f"Found {len(seed_dirs)} SEED runs: {seed_dirs}")

    # The first SEED directory defines which outcome files get processed.
    # os.listdir() returns names in arbitrary order, so sort for reproducible runs.
    first_seed_path = os.path.join(base_path, seed_dirs[0])
    shap_files_to_process = sorted(
        f for f in os.listdir(first_seed_path) if f.endswith('.parquet')
    )
    if not shap_files_to_process:
        raise FileNotFoundError(f"No .parquet files found in directory: {first_seed_path}")

    print(f"Found {len(shap_files_to_process)} outcomes to process: {shap_files_to_process}")

except FileNotFoundError as e:
    # shap_files_to_process is still empty here, so nothing will be processed.
    print(f"Error: {e}")

# 3. Loop through each SHAP file name (e.g., 'shap_af.parquet')
for shap_filename in shap_files_to_process:
    print("\n" + "="*80)
    print(f"Processing outcome file: {shap_filename}")

    # Load this outcome's file from every SEED directory. The seed name is
    # recorded alongside each DataFrame so the verification labels stay correct
    # even when one of the runs is skipped.
    loaded_seeds = []
    list_of_dfs = []
    for seed in seed_dirs:
        file_path = os.path.join(base_path, seed, shap_filename)
        if not os.path.exists(file_path):
            print(f"Warning: File not found and will be skipped: {file_path}")
            continue
        try:
            df = pd.read_parquet(file_path)
        except Exception as e:
            # Best effort: an unreadable run is reported and left out of the average.
            print(f"Warning: Could not read file {file_path}. Error: {e}. Skipping.")
            continue
        loaded_seeds.append(seed)
        list_of_dfs.append(df)

    if not list_of_dfs:
        print(f"No valid data found for {shap_filename}. Skipping to next outcome.")
        continue

    # 4. Calculate the element-wise average of the SHAP values.
    # When all runs share the same layout, the result has the same dimensions
    # as each input file.
    final_avg_df = sum(list_of_dfs) / len(list_of_dfs)
    print(f"Successfully averaged {len(list_of_dfs)} files. Resulting shape: {final_avg_df.shape}")

    # DataFrame arithmetic aligns on row and column labels, so runs whose labels
    # differ produce NaN cells instead of an error. Detect that case explicitly.
    reference_df = list_of_dfs[0]
    layouts_match = all(
        run_df.index.equals(reference_df.index) and run_df.columns.equals(reference_df.columns)
        for run_df in list_of_dfs[1:]
    )
    if not layouts_match:
        print("Warning: Runs differ in row index or column labels; the averaged values may contain NaN.")

    # 5. Show the per-run and averaged values of one random row for a manual check.
    print("\n--- Verification for a random sample and 10 random proteins ---")

    if final_avg_df.empty:
        print("Averaged DataFrame is empty, skipping verification.")
    elif not layouts_match:
        # The check below addresses rows by position, which is only meaningful
        # when every run has the same rows in the same order.
        print("Runs are not aligned position-by-position, skipping verification.")
    else:
        # Get the number of samples (rows) and proteins (columns)
        num_samples, num_proteins = final_avg_df.shape

        # Pick one random sample (person/row) to check
        random_sample_index = random.randint(0, num_samples - 1)
        print(f"Verifying with data from a single random sample (row index: {random_sample_index})")

        # Pick up to 10 random proteins (columns) to check
        sample_proteins = random.sample(final_avg_df.columns.tolist(), min(10, num_proteins))

        verification_data = []
        for protein in sample_proteins:
            row_data = {'Protein': protein}

            # Raw SHAP value of each run that was actually loaded, labelled by its seed
            for seed_name, run_df in zip(loaded_seeds, list_of_dfs):
                row_data[f'Run_{seed_name}_SHAP'] = run_df[protein].iloc[random_sample_index]

            # Averaged SHAP value for the same sample and protein
            row_data['Final_Avg_SHAP'] = final_avg_df[protein].iloc[random_sample_index]
            verification_data.append(row_data)

        # Use to_string() to ensure all columns are displayed properly
        print(pd.DataFrame(verification_data).to_string())

    # 6. Save the final averaged DataFrame under the same file name as the inputs.
    output_file_path = os.path.join(output_path, shap_filename)
    try:
        final_avg_df.to_parquet(output_file_path, engine='pyarrow')
    except Exception as e:
        # A failed write is reported; the remaining outcomes are still processed.
        print(f"\nError saving file to {output_file_path}. Error: {e}")
    else:
        print(f"\nSuccessfully saved final average SHAP values to:\n{output_file_path}")

print("\n" + "="*80)
print("All processing complete.")

### Calculate mean absolute SHAP values

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

data_directory = '/your path/cardiomicscore/saved/results/SHAP/OmicsNet/Proteomics/Final'
# Reads every 'shap_*.parquet' file in `data_directory`, computes the mean
# absolute SHAP value of each column, and draws the 30 largest as a horizontal
# bar chart, one file per panel of a 2x3 grid.

# --- Collect input files ---
# Only list the directory when it exists; os.listdir() would otherwise raise.
# The names are sorted because os.listdir() returns them in arbitrary order,
# which would make the panel layout differ between runs.
if os.path.isdir(data_directory):
    parquet_files = sorted(
        f for f in os.listdir(data_directory)
        if f.startswith('shap_') and f.endswith('.parquet')
    )
    if not parquet_files:
        print(f"No 'shap_*.parquet' files found in the directory '{data_directory}'.")
else:
    print(f"Error: Directory '{data_directory}' not found.")
    parquet_files = []

# Nothing is plotted when there is no input.
if parquet_files:
    # --- Plotting Setup ---
    # fig is the entire figure, axes holds the 6 subplot objects (flattened to 1D).
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(22, 14))
    axes = axes.flatten()

    if len(parquet_files) > len(axes):
        print("Warning: More than 6 parquet files found. Only processing the first 6.")

    # Number of panels actually drawn. Counting these (instead of reusing the
    # loop index) keeps panels contiguous when a file has to be skipped.
    panels_used = 0

    # --- Loop through files and plot ---
    for filename in parquet_files[:len(axes)]:
        # The outcome name is the part between the 'shap_' prefix and the
        # '.parquet' suffix; both are guaranteed by the filter above.
        outcome_name = filename[len('shap_'):-len('.parquet')]
        full_path = os.path.join(data_directory, filename)

        print(f"--- Processing: {filename} (Outcome: {outcome_name.upper()}) ---")

        # --- Data Loading and Processing ---
        try:
            df = pd.read_parquet(full_path)
        except FileNotFoundError:
            print(f"  Error: File '{full_path}' not found.")
            continue

        # The identifier column, when present, is not a SHAP value.
        if 'eid' in df.columns:
            df = df.drop(columns=['eid'])
            print("  'eid' column removed.")

        mean_abs_shap = df.abs().mean(axis=0)
        top_30_shap = mean_abs_shap.sort_values(ascending=False).head(30)

        print(f"  Calculated Top 30 proteins for {outcome_name.upper()}.")

        # --- Plot on the next free subplot ---
        ax = axes[panels_used]
        panels_used += 1

        # barh draws the first entry at the bottom, so reverse the order to
        # show the most important protein at the top.
        top_30_shap.iloc[::-1].plot(kind='barh', ax=ax, color='c', zorder=2)

        ax.set_title(f'Top 30 for {outcome_name.upper()}', fontsize=14, weight='bold')
        ax.set_xlabel('Mean Absolute SHAP Value', fontsize=10)
        ax.set_ylabel('Protein', fontsize=10)
        ax.tick_params(axis='y', labelsize=8) # Adjust y-axis label font size
        ax.grid(axis='x', linestyle='--', alpha=0.7, zorder=1) # Add a grid for the x-axis

    # --- Post-processing ---
    if panels_used == 0:
        # Every file was skipped; discard the empty figure.
        print("No SHAP file could be plotted.")
        plt.close(fig)
    else:
        # Hide the panels that received no chart
        for unused_ax in axes[panels_used:]:
            unused_ax.set_visible(False)

        # Main title for the entire figure
        fig.suptitle('Top 30 Proteins by Mean Absolute SHAP Value for Each CVD Outcome', fontsize=20, weight='bold')

        # The rect parameter makes room for the main title (suptitle)
        fig.tight_layout(rect=[0, 0, 1, 0.96])

        # You can uncomment the next line to save the figure to a file
        # plt.savefig('combined_shap_analysis.png', dpi=300)
        plt.show()

print("\nAll files processed.")

## Metabolomics

### Merge SHAP values

In [None]:
import os
import pandas as pd
import numpy as np
import random

"""
Reads SHAP value files from multiple runs, calculates the element-wise average,
displays a verification sample for a single data point, and saves the final averaged SHAP values.
"""
# 1. Define paths
# Base path where the SEED directories (1234, 1235, etc.) are located
base_path = "/your path/cardiomicscore/saved/results/SHAP/OmicsNet/Metabolomics"

# Output path for the final averaged results
output_path = os.path.join(base_path, "Final")

# Create the output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)
print(f"Results will be saved to: {output_path}")

# 2. Automatically discover SEED directories and outcome files
# Both results start out empty so that a failed discovery leaves the processing
# loop below with nothing to do, instead of raising NameError or silently
# reusing values left behind by a previously executed cell.
seed_dirs = []
shap_files_to_process = []
try:
    # SEED directories are the sub-directories whose name is all digits; this
    # also excludes the "Final" output directory created above.
    seed_dirs = sorted(
        d for d in os.listdir(base_path)
        if d.isdigit() and os.path.isdir(os.path.join(base_path, d))
    )
    if not seed_dirs:
        raise FileNotFoundError("No valid SEED directories found in the base path.")

    print(f"Found {len(seed_dirs)} SEED runs: {seed_dirs}")

    # The first SEED directory defines which outcome files get processed.
    # os.listdir() returns names in arbitrary order, so sort for reproducible runs.
    first_seed_path = os.path.join(base_path, seed_dirs[0])
    shap_files_to_process = sorted(
        f for f in os.listdir(first_seed_path) if f.endswith('.parquet')
    )
    if not shap_files_to_process:
        raise FileNotFoundError(f"No .parquet files found in directory: {first_seed_path}")

    print(f"Found {len(shap_files_to_process)} outcomes to process: {shap_files_to_process}")

except FileNotFoundError as e:
    # shap_files_to_process is still empty here, so nothing will be processed.
    print(f"Error: {e}")

# 3. Loop through each SHAP file name (e.g., 'shap_af.parquet')
for shap_filename in shap_files_to_process:
    print("\n" + "="*80)
    print(f"Processing outcome file: {shap_filename}")

    # Load this outcome's file from every SEED directory. The seed name is
    # recorded alongside each DataFrame so the verification labels stay correct
    # even when one of the runs is skipped.
    loaded_seeds = []
    list_of_dfs = []
    for seed in seed_dirs:
        file_path = os.path.join(base_path, seed, shap_filename)
        if not os.path.exists(file_path):
            print(f"Warning: File not found and will be skipped: {file_path}")
            continue
        try:
            df = pd.read_parquet(file_path)
        except Exception as e:
            # Best effort: an unreadable run is reported and left out of the average.
            print(f"Warning: Could not read file {file_path}. Error: {e}. Skipping.")
            continue
        loaded_seeds.append(seed)
        list_of_dfs.append(df)

    if not list_of_dfs:
        print(f"No valid data found for {shap_filename}. Skipping to next outcome.")
        continue

    # 4. Calculate the element-wise average of the SHAP values.
    # When all runs share the same layout, the result has the same dimensions
    # as each input file.
    final_avg_df = sum(list_of_dfs) / len(list_of_dfs)
    print(f"Successfully averaged {len(list_of_dfs)} files. Resulting shape: {final_avg_df.shape}")

    # DataFrame arithmetic aligns on row and column labels, so runs whose labels
    # differ produce NaN cells instead of an error. Detect that case explicitly.
    reference_df = list_of_dfs[0]
    layouts_match = all(
        run_df.index.equals(reference_df.index) and run_df.columns.equals(reference_df.columns)
        for run_df in list_of_dfs[1:]
    )
    if not layouts_match:
        print("Warning: Runs differ in row index or column labels; the averaged values may contain NaN.")

    # 5. Show the per-run and averaged values of one random row for a manual check.
    # The features of this section are metabolites, so they are labelled as such.
    print("\n--- Verification for a random sample and 10 random metabolites ---")

    if final_avg_df.empty:
        print("Averaged DataFrame is empty, skipping verification.")
    elif not layouts_match:
        # The check below addresses rows by position, which is only meaningful
        # when every run has the same rows in the same order.
        print("Runs are not aligned position-by-position, skipping verification.")
    else:
        # Get the number of samples (rows) and metabolites (columns)
        num_samples, num_metabolites = final_avg_df.shape

        # Pick one random sample (person/row) to check
        random_sample_index = random.randint(0, num_samples - 1)
        print(f"Verifying with data from a single random sample (row index: {random_sample_index})")

        # Pick up to 10 random metabolites (columns) to check
        sample_metabolites = random.sample(final_avg_df.columns.tolist(), min(10, num_metabolites))

        verification_data = []
        for metabolite in sample_metabolites:
            row_data = {'Metabolite': metabolite}

            # Raw SHAP value of each run that was actually loaded, labelled by its seed
            for seed_name, run_df in zip(loaded_seeds, list_of_dfs):
                row_data[f'Run_{seed_name}_SHAP'] = run_df[metabolite].iloc[random_sample_index]

            # Averaged SHAP value for the same sample and metabolite
            row_data['Final_Avg_SHAP'] = final_avg_df[metabolite].iloc[random_sample_index]
            verification_data.append(row_data)

        # Use to_string() to ensure all columns are displayed properly
        print(pd.DataFrame(verification_data).to_string())

    # 6. Save the final averaged DataFrame under the same file name as the inputs.
    output_file_path = os.path.join(output_path, shap_filename)
    try:
        final_avg_df.to_parquet(output_file_path, engine='pyarrow')
    except Exception as e:
        # A failed write is reported; the remaining outcomes are still processed.
        print(f"\nError saving file to {output_file_path}. Error: {e}")
    else:
        print(f"\nSuccessfully saved final average SHAP values to:\n{output_file_path}")

print("\n" + "="*80)
print("All processing complete.")

### Calculate mean absolute SHAP values

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

data_directory = '/your path/cardiomicscore/saved/results/SHAP/OmicsNet/Metabolomics/Final'
# Reads every 'shap_*.parquet' file in `data_directory`, computes the mean
# absolute SHAP value of each column, and draws the 30 largest as a horizontal
# bar chart, one file per panel of a 2x3 grid.

# --- Collect input files ---
# Only list the directory when it exists; os.listdir() would otherwise raise.
# The names are sorted because os.listdir() returns them in arbitrary order,
# which would make the panel layout differ between runs.
if os.path.isdir(data_directory):
    parquet_files = sorted(
        f for f in os.listdir(data_directory)
        if f.startswith('shap_') and f.endswith('.parquet')
    )
    if not parquet_files:
        print(f"No 'shap_*.parquet' files found in the directory '{data_directory}'.")
else:
    print(f"Error: Directory '{data_directory}' not found.")
    parquet_files = []

# Nothing is plotted when there is no input.
if parquet_files:
    # --- Plotting Setup ---
    # fig is the entire figure, axes holds the 6 subplot objects (flattened to 1D).
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(22, 14))
    axes = axes.flatten()

    if len(parquet_files) > len(axes):
        print("Warning: More than 6 parquet files found. Only processing the first 6.")

    # Number of panels actually drawn. Counting these (instead of reusing the
    # loop index) keeps panels contiguous when a file has to be skipped.
    panels_used = 0

    # --- Loop through files and plot ---
    for filename in parquet_files[:len(axes)]:
        # The outcome name is the part between the 'shap_' prefix and the
        # '.parquet' suffix; both are guaranteed by the filter above.
        outcome_name = filename[len('shap_'):-len('.parquet')]
        full_path = os.path.join(data_directory, filename)

        print(f"--- Processing: {filename} (Outcome: {outcome_name.upper()}) ---")

        # --- Data Loading and Processing ---
        try:
            df = pd.read_parquet(full_path)
        except FileNotFoundError:
            print(f"  Error: File '{full_path}' not found.")
            continue

        # The identifier column, when present, is not a SHAP value.
        if 'eid' in df.columns:
            df = df.drop(columns=['eid'])
            print("  'eid' column removed.")

        mean_abs_shap = df.abs().mean(axis=0)
        top_30_shap = mean_abs_shap.sort_values(ascending=False).head(30)

        print(f"  Calculated Top 30 metabolites for {outcome_name.upper()}.")

        # --- Plot on the next free subplot ---
        ax = axes[panels_used]
        panels_used += 1

        # barh draws the first entry at the bottom, so reverse the order to
        # show the most important metabolite at the top.
        top_30_shap.iloc[::-1].plot(kind='barh', ax=ax, color='c', zorder=2)

        ax.set_title(f'Top 30 for {outcome_name.upper()}', fontsize=14, weight='bold')
        ax.set_xlabel('Mean Absolute SHAP Value', fontsize=10)
        # Axis label matches the figure title: the features here are metabolites.
        ax.set_ylabel('Metabolite', fontsize=10)
        ax.tick_params(axis='y', labelsize=8) # Adjust y-axis label font size
        ax.grid(axis='x', linestyle='--', alpha=0.7, zorder=1) # Add a grid for the x-axis

    # --- Post-processing ---
    if panels_used == 0:
        # Every file was skipped; discard the empty figure.
        print("No SHAP file could be plotted.")
        plt.close(fig)
    else:
        # Hide the panels that received no chart
        for unused_ax in axes[panels_used:]:
            unused_ax.set_visible(False)

        # Main title for the entire figure
        fig.suptitle('Top 30 Metabolites by Mean Absolute SHAP Value for Each CVD Outcome', fontsize=20, weight='bold')

        # The rect parameter makes room for the main title (suptitle)
        fig.tight_layout(rect=[0, 0, 1, 0.96])

        # You can uncomment the next line to save the figure to a file
        # plt.savefig('combined_shap_analysis.png', dpi=300)
        plt.show()

print("\nAll files processed.")

## Metabolomics no statins

### Merge SHAP values

In [None]:
import os
import pandas as pd
import numpy as np
import random

"""
Reads SHAP value files from multiple runs, calculates the element-wise average,
displays a verification sample for a single data point, and saves the final averaged SHAP values.
"""
# 1. Define paths
# Base path where the SEED directories (1234, 1235, etc.) are located
base_path = "/your path/cardiomicscore/saved/results/SHAP/OmicsNet/Metabolomics_no_statins"

# Output path for the final averaged results
output_path = os.path.join(base_path, "Final")

# Create the output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)
print(f"Results will be saved to: {output_path}")

# 2. Automatically discover SEED directories and outcome files
# Both results start out empty so that a failed discovery leaves the processing
# loop below with nothing to do, instead of raising NameError or silently
# reusing values left behind by a previously executed cell.
seed_dirs = []
shap_files_to_process = []
try:
    # SEED directories are the sub-directories whose name is all digits; this
    # also excludes the "Final" output directory created above.
    seed_dirs = sorted(
        d for d in os.listdir(base_path)
        if d.isdigit() and os.path.isdir(os.path.join(base_path, d))
    )
    if not seed_dirs:
        raise FileNotFoundError("No valid SEED directories found in the base path.")

    print(f"Found {len(seed_dirs)} SEED runs: {seed_dirs}")

    # The first SEED directory defines which outcome files get processed.
    # os.listdir() returns names in arbitrary order, so sort for reproducible runs.
    first_seed_path = os.path.join(base_path, seed_dirs[0])
    shap_files_to_process = sorted(
        f for f in os.listdir(first_seed_path) if f.endswith('.parquet')
    )
    if not shap_files_to_process:
        raise FileNotFoundError(f"No .parquet files found in directory: {first_seed_path}")

    print(f"Found {len(shap_files_to_process)} outcomes to process: {shap_files_to_process}")

except FileNotFoundError as e:
    # shap_files_to_process is still empty here, so nothing will be processed.
    print(f"Error: {e}")

# 3. Loop through each SHAP file name (e.g., 'shap_af.parquet')
for shap_filename in shap_files_to_process:
    print("\n" + "="*80)
    print(f"Processing outcome file: {shap_filename}")

    # Load this outcome's file from every SEED directory. The seed name is
    # recorded alongside each DataFrame so the verification labels stay correct
    # even when one of the runs is skipped.
    loaded_seeds = []
    list_of_dfs = []
    for seed in seed_dirs:
        file_path = os.path.join(base_path, seed, shap_filename)
        if not os.path.exists(file_path):
            print(f"Warning: File not found and will be skipped: {file_path}")
            continue
        try:
            df = pd.read_parquet(file_path)
        except Exception as e:
            # Best effort: an unreadable run is reported and left out of the average.
            print(f"Warning: Could not read file {file_path}. Error: {e}. Skipping.")
            continue
        loaded_seeds.append(seed)
        list_of_dfs.append(df)

    if not list_of_dfs:
        print(f"No valid data found for {shap_filename}. Skipping to next outcome.")
        continue

    # 4. Calculate the element-wise average of the SHAP values.
    # When all runs share the same layout, the result has the same dimensions
    # as each input file.
    final_avg_df = sum(list_of_dfs) / len(list_of_dfs)
    print(f"Successfully averaged {len(list_of_dfs)} files. Resulting shape: {final_avg_df.shape}")

    # DataFrame arithmetic aligns on row and column labels, so runs whose labels
    # differ produce NaN cells instead of an error. Detect that case explicitly.
    reference_df = list_of_dfs[0]
    layouts_match = all(
        run_df.index.equals(reference_df.index) and run_df.columns.equals(reference_df.columns)
        for run_df in list_of_dfs[1:]
    )
    if not layouts_match:
        print("Warning: Runs differ in row index or column labels; the averaged values may contain NaN.")

    # 5. Show the per-run and averaged values of one random row for a manual check.
    # The features of this section are metabolites, so they are labelled as such.
    print("\n--- Verification for a random sample and 10 random metabolites ---")

    if final_avg_df.empty:
        print("Averaged DataFrame is empty, skipping verification.")
    elif not layouts_match:
        # The check below addresses rows by position, which is only meaningful
        # when every run has the same rows in the same order.
        print("Runs are not aligned position-by-position, skipping verification.")
    else:
        # Get the number of samples (rows) and metabolites (columns)
        num_samples, num_metabolites = final_avg_df.shape

        # Pick one random sample (person/row) to check
        random_sample_index = random.randint(0, num_samples - 1)
        print(f"Verifying with data from a single random sample (row index: {random_sample_index})")

        # Pick up to 10 random metabolites (columns) to check
        sample_metabolites = random.sample(final_avg_df.columns.tolist(), min(10, num_metabolites))

        verification_data = []
        for metabolite in sample_metabolites:
            row_data = {'Metabolite': metabolite}

            # Raw SHAP value of each run that was actually loaded, labelled by its seed
            for seed_name, run_df in zip(loaded_seeds, list_of_dfs):
                row_data[f'Run_{seed_name}_SHAP'] = run_df[metabolite].iloc[random_sample_index]

            # Averaged SHAP value for the same sample and metabolite
            row_data['Final_Avg_SHAP'] = final_avg_df[metabolite].iloc[random_sample_index]
            verification_data.append(row_data)

        # Use to_string() to ensure all columns are displayed properly
        print(pd.DataFrame(verification_data).to_string())

    # 6. Save the final averaged DataFrame under the same file name as the inputs.
    output_file_path = os.path.join(output_path, shap_filename)
    try:
        final_avg_df.to_parquet(output_file_path, engine='pyarrow')
    except Exception as e:
        # A failed write is reported; the remaining outcomes are still processed.
        print(f"\nError saving file to {output_file_path}. Error: {e}")
    else:
        print(f"\nSuccessfully saved final average SHAP values to:\n{output_file_path}")

print("\n" + "="*80)
print("All processing complete.")

### Calculate mean absolute SHAP values

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

data_directory = '/your path/cardiomicscore/saved/results/SHAP/OmicsNet/Metabolomics_no_statins/Final'
# Reads every 'shap_*.parquet' file in `data_directory`, computes the mean
# absolute SHAP value of each column, and draws the 30 largest as a horizontal
# bar chart, one file per panel of a 2x3 grid.

# --- Collect input files ---
# Only list the directory when it exists; os.listdir() would otherwise raise.
# The names are sorted because os.listdir() returns them in arbitrary order,
# which would make the panel layout differ between runs.
if os.path.isdir(data_directory):
    parquet_files = sorted(
        f for f in os.listdir(data_directory)
        if f.startswith('shap_') and f.endswith('.parquet')
    )
    if not parquet_files:
        print(f"No 'shap_*.parquet' files found in the directory '{data_directory}'.")
else:
    print(f"Error: Directory '{data_directory}' not found.")
    parquet_files = []

# Nothing is plotted when there is no input.
if parquet_files:
    # --- Plotting Setup ---
    # fig is the entire figure, axes holds the 6 subplot objects (flattened to 1D).
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(22, 14))
    axes = axes.flatten()

    if len(parquet_files) > len(axes):
        print("Warning: More than 6 parquet files found. Only processing the first 6.")

    # Number of panels actually drawn. Counting these (instead of reusing the
    # loop index) keeps panels contiguous when a file has to be skipped.
    panels_used = 0

    # --- Loop through files and plot ---
    for filename in parquet_files[:len(axes)]:
        # The outcome name is the part between the 'shap_' prefix and the
        # '.parquet' suffix; both are guaranteed by the filter above.
        outcome_name = filename[len('shap_'):-len('.parquet')]
        full_path = os.path.join(data_directory, filename)

        print(f"--- Processing: {filename} (Outcome: {outcome_name.upper()}) ---")

        # --- Data Loading and Processing ---
        try:
            df = pd.read_parquet(full_path)
        except FileNotFoundError:
            print(f"  Error: File '{full_path}' not found.")
            continue

        # The identifier column, when present, is not a SHAP value.
        if 'eid' in df.columns:
            df = df.drop(columns=['eid'])
            print("  'eid' column removed.")

        mean_abs_shap = df.abs().mean(axis=0)
        top_30_shap = mean_abs_shap.sort_values(ascending=False).head(30)

        print(f"  Calculated Top 30 metabolites for {outcome_name.upper()}.")

        # --- Plot on the next free subplot ---
        ax = axes[panels_used]
        panels_used += 1

        # barh draws the first entry at the bottom, so reverse the order to
        # show the most important metabolite at the top.
        top_30_shap.iloc[::-1].plot(kind='barh', ax=ax, color='c', zorder=2)

        ax.set_title(f'Top 30 for {outcome_name.upper()}', fontsize=14, weight='bold')
        ax.set_xlabel('Mean Absolute SHAP Value', fontsize=10)
        # Axis label matches the figure title: the features here are metabolites.
        ax.set_ylabel('Metabolite', fontsize=10)
        ax.tick_params(axis='y', labelsize=8) # Adjust y-axis label font size
        ax.grid(axis='x', linestyle='--', alpha=0.7, zorder=1) # Add a grid for the x-axis

    # --- Post-processing ---
    if panels_used == 0:
        # Every file was skipped; discard the empty figure.
        print("No SHAP file could be plotted.")
        plt.close(fig)
    else:
        # Hide the panels that received no chart
        for unused_ax in axes[panels_used:]:
            unused_ax.set_visible(False)

        # Main title for the entire figure
        fig.suptitle('Top 30 Metabolites by Mean Absolute SHAP Value for Each CVD Outcome', fontsize=20, weight='bold')

        # The rect parameter makes room for the main title (suptitle)
        fig.tight_layout(rect=[0, 0, 1, 0.96])

        # You can uncomment the next line to save the figure to a file
        # plt.savefig('combined_shap_analysis.png', dpi=300)
        plt.show()

print("\nAll files processed.")

# OmicsNet_Unweighted

## Proteomics

### Merge SHAP values

In [None]:
import os
import pandas as pd
import numpy as np
import random

"""
Reads SHAP value files from multiple runs, calculates the element-wise average,
displays a verification sample for a single data point, and saves the final averaged SHAP values.
"""
# 1. Define paths
# Base path where the SEED directories (1234, 1235, etc.) are located
base_path = "/your path/cardiomicscore/saved/results/SHAP/OmicsNet_Unweighted/Proteomics"

# Output path for the final averaged results
output_path = os.path.join(base_path, "Final")

# Create the output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)
print(f"Results will be saved to: {output_path}")

# 2. Automatically discover SEED directories and outcome files
# Both results start out empty so that a failed discovery leaves the processing
# loop below with nothing to do, instead of raising NameError or silently
# reusing values left behind by a previously executed cell.
seed_dirs = []
shap_files_to_process = []
try:
    # SEED directories are the sub-directories whose name is all digits; this
    # also excludes the "Final" output directory created above.
    seed_dirs = sorted(
        d for d in os.listdir(base_path)
        if d.isdigit() and os.path.isdir(os.path.join(base_path, d))
    )
    if not seed_dirs:
        raise FileNotFoundError("No valid SEED directories found in the base path.")

    print(f"Found {len(seed_dirs)} SEED runs: {seed_dirs}")

    # The first SEED directory defines which outcome files get processed.
    # os.listdir() returns names in arbitrary order, so sort for reproducible runs.
    first_seed_path = os.path.join(base_path, seed_dirs[0])
    shap_files_to_process = sorted(
        f for f in os.listdir(first_seed_path) if f.endswith('.parquet')
    )
    if not shap_files_to_process:
        raise FileNotFoundError(f"No .parquet files found in directory: {first_seed_path}")

    print(f"Found {len(shap_files_to_process)} outcomes to process: {shap_files_to_process}")

except FileNotFoundError as e:
    # shap_files_to_process is still empty here, so nothing will be processed.
    print(f"Error: {e}")

# 3. Loop through each SHAP file name (e.g., 'shap_af.parquet')
for shap_filename in shap_files_to_process:
    print("\n" + "="*80)
    print(f"Processing outcome file: {shap_filename}")

    # Load this outcome's file from every SEED directory. The seed name is
    # recorded alongside each DataFrame so the verification labels stay correct
    # even when one of the runs is skipped.
    loaded_seeds = []
    list_of_dfs = []
    for seed in seed_dirs:
        file_path = os.path.join(base_path, seed, shap_filename)
        if not os.path.exists(file_path):
            print(f"Warning: File not found and will be skipped: {file_path}")
            continue
        try:
            df = pd.read_parquet(file_path)
        except Exception as e:
            # Best effort: an unreadable run is reported and left out of the average.
            print(f"Warning: Could not read file {file_path}. Error: {e}. Skipping.")
            continue
        loaded_seeds.append(seed)
        list_of_dfs.append(df)

    if not list_of_dfs:
        print(f"No valid data found for {shap_filename}. Skipping to next outcome.")
        continue

    # 4. Calculate the element-wise average of the SHAP values.
    # When all runs share the same layout, the result has the same dimensions
    # as each input file.
    final_avg_df = sum(list_of_dfs) / len(list_of_dfs)
    print(f"Successfully averaged {len(list_of_dfs)} files. Resulting shape: {final_avg_df.shape}")

    # DataFrame arithmetic aligns on row and column labels, so runs whose labels
    # differ produce NaN cells instead of an error. Detect that case explicitly.
    reference_df = list_of_dfs[0]
    layouts_match = all(
        run_df.index.equals(reference_df.index) and run_df.columns.equals(reference_df.columns)
        for run_df in list_of_dfs[1:]
    )
    if not layouts_match:
        print("Warning: Runs differ in row index or column labels; the averaged values may contain NaN.")

    # 5. Show the per-run and averaged values of one random row for a manual check.
    print("\n--- Verification for a random sample and 10 random proteins ---")

    if final_avg_df.empty:
        print("Averaged DataFrame is empty, skipping verification.")
    elif not layouts_match:
        # The check below addresses rows by position, which is only meaningful
        # when every run has the same rows in the same order.
        print("Runs are not aligned position-by-position, skipping verification.")
    else:
        # Get the number of samples (rows) and proteins (columns)
        num_samples, num_proteins = final_avg_df.shape

        # Pick one random sample (person/row) to check
        random_sample_index = random.randint(0, num_samples - 1)
        print(f"Verifying with data from a single random sample (row index: {random_sample_index})")

        # Pick up to 10 random proteins (columns) to check
        sample_proteins = random.sample(final_avg_df.columns.tolist(), min(10, num_proteins))

        verification_data = []
        for protein in sample_proteins:
            row_data = {'Protein': protein}

            # Raw SHAP value of each run that was actually loaded, labelled by its seed
            for seed_name, run_df in zip(loaded_seeds, list_of_dfs):
                row_data[f'Run_{seed_name}_SHAP'] = run_df[protein].iloc[random_sample_index]

            # Averaged SHAP value for the same sample and protein
            row_data['Final_Avg_SHAP'] = final_avg_df[protein].iloc[random_sample_index]
            verification_data.append(row_data)

        # Use to_string() to ensure all columns are displayed properly
        print(pd.DataFrame(verification_data).to_string())

    # 6. Save the final averaged DataFrame under the same file name as the inputs.
    output_file_path = os.path.join(output_path, shap_filename)
    try:
        final_avg_df.to_parquet(output_file_path, engine='pyarrow')
    except Exception as e:
        # A failed write is reported; the remaining outcomes are still processed.
        print(f"\nError saving file to {output_file_path}. Error: {e}")
    else:
        print(f"\nSuccessfully saved final average SHAP values to:\n{output_file_path}")

print("\n" + "="*80)
print("All processing complete.")

### Calculate mean absolute SHAP values

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Directory containing the seed-averaged SHAP parquet files for Proteomics.
data_directory = '/your path/cardiomicscore/saved/results/SHAP/OmicsNet_Unweighted/Proteomics/Final'

# Purpose of this cell:
#   For every 'shap_*.parquet' file in `data_directory`, compute the mean
#   absolute SHAP value of each column and draw the 30 largest as a horizontal
#   bar chart, one outcome per panel of a 2x3 grid.

# --- Discover input files ---
# A missing directory is reported and treated as "no files", so os.listdir is
# never called on a path that does not exist.
if os.path.isdir(data_directory):
    # os.listdir returns entries in arbitrary order; sort them so the panel
    # layout is reproducible from run to run.
    parquet_files = sorted(
        f for f in os.listdir(data_directory)
        if f.startswith('shap_') and f.endswith('.parquet')
    )
    if not parquet_files:
        print(f"No 'shap_*.parquet' files found in the directory '{data_directory}'.")
else:
    print(f"Error: Directory '{data_directory}' not found.")
    parquet_files = []

# Only build a figure when there is something to draw.
if parquet_files:
    # --- Plotting setup ---
    # 2x3 grid; flatten the 2D array of axes so panels can be addressed by a
    # single running index.
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(22, 14))
    axes = axes.flatten()

    # Number of panels actually drawn. Kept separate from the file index so
    # that a file that fails to load does not leave a blank panel in the grid.
    panels_used = 0

    # --- Loop through files and plot ---
    for filename in parquet_files:
        # Stop once every panel of the grid has been filled.
        if panels_used >= len(axes):
            print("Warning: More than 6 parquet files found. Only processing the first 6.")
            break

        # 'shap_<outcome>.parquet' -> '<outcome>'
        outcome_name = filename.replace('shap_', '').replace('.parquet', '')
        full_path = os.path.join(data_directory, filename)

        print(f"--- Processing: {filename} (Outcome: {outcome_name.upper()}) ---")

        # --- Data loading ---
        try:
            df = pd.read_parquet(full_path)
        except FileNotFoundError:
            print(f"  Error: File '{full_path}' not found.")
            continue

        # The participant identifier is not a feature; keep it out of the ranking.
        if 'eid' in df.columns:
            df = df.drop(columns=['eid'])
            print("  'eid' column removed.")

        # Mean |SHAP| per column, then the 30 largest.
        mean_abs_shap = df.abs().mean(axis=0)
        top_30_shap = mean_abs_shap.sort_values(ascending=False).head(30)

        print(f"  Calculated Top 30 proteins for {outcome_name.upper()}.")

        # --- Plot on the next free panel ---
        ax = axes[panels_used]
        panels_used += 1

        # barh draws the first entry at the bottom, so reverse the order to put
        # the most important protein at the top.
        top_30_shap.iloc[::-1].plot(kind='barh', ax=ax, color='c', zorder=2)

        ax.set_title(f'Top 30 for {outcome_name.upper()}', fontsize=14, weight='bold')
        ax.set_xlabel('Mean Absolute SHAP Value', fontsize=10)
        ax.set_ylabel('Protein', fontsize=10)
        ax.tick_params(axis='y', labelsize=8)  # Adjust y-axis label font size
        ax.grid(axis='x', linestyle='--', alpha=0.7, zorder=1)  # Add a grid for the x-axis

    # --- Post-processing ---
    # Hide every panel that was not drawn on.
    for unused_ax in axes[panels_used:]:
        unused_ax.set_visible(False)

    # Main title for the whole figure (this cell plots proteomics features).
    fig.suptitle('Top 30 Proteins by Mean Absolute SHAP Value for Each CVD Outcome', fontsize=20, weight='bold')

    # The rect parameter leaves room at the top for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.96])

    # Uncomment the next line to save the figure to a file.
    # plt.savefig('combined_shap_analysis.png', dpi=300)
    plt.show()

print("\nAll files processed.")

## Metabolomics

### Merge SHAP values

In [None]:
import os
import pandas as pd
import numpy as np
import random

"""
Reads SHAP value files from multiple runs, calculates the element-wise average,
displays a verification sample for a single data point, and saves the final averaged SHAP values.
"""
# 1. Define paths
# Base path where the SEED directories (1234, 1235, etc.) are located
base_path = "/your path/cardiomicscore/saved/results/SHAP/OmicsNet_Unweighted/Metabolomics"

# Output path for the final averaged results
output_path = os.path.join(base_path, "Final")

# Create the output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)
print(f"Results will be saved to: {output_path}")

# 2. Automatically discover SEED directories and outcome files
# Start from empty lists so that a failed discovery can never fall through to
# undefined names, or to lists left over from a previously executed cell.
seed_dirs = []
shap_files_to_process = []
try:
    # Find all directories that are named like a number (our SEEDs)
    seed_dirs = sorted(
        d for d in os.listdir(base_path)
        if d.isdigit() and os.path.isdir(os.path.join(base_path, d))
    )
    if not seed_dirs:
        raise FileNotFoundError("No valid SEED directories found in the base path.")

    print(f"Found {len(seed_dirs)} SEED runs: {seed_dirs}")

    # The first SEED directory defines which outcome files are processed.
    # Sorted because os.listdir returns entries in arbitrary order.
    first_seed_path = os.path.join(base_path, seed_dirs[0])
    shap_files_to_process = sorted(f for f in os.listdir(first_seed_path) if f.endswith('.parquet'))
    if not shap_files_to_process:
        raise FileNotFoundError(f"No .parquet files found in directory: {first_seed_path}")

    print(f"Found {len(shap_files_to_process)} outcomes to process: {shap_files_to_process}")

except FileNotFoundError as e:
    print(f"Error: {e}")
    # Nothing usable was discovered; make sure the loop below does nothing.
    shap_files_to_process = []

# 3. Loop through each SHAP file name (e.g., 'shap_af.parquet')
for shap_filename in shap_files_to_process:
    print("\n" + "="*80)
    print(f"Processing outcome file: {shap_filename}")

    # Frames that were read successfully, and the seed each one came from.
    # The two lists are kept in lockstep so the verification table below can
    # label every value with the run that actually produced it, even when
    # some seeds are skipped.
    list_of_dfs = []
    loaded_seeds = []
    for seed in seed_dirs:
        file_path = os.path.join(base_path, seed, shap_filename)
        if not os.path.exists(file_path):
            print(f"Warning: File not found and will be skipped: {file_path}")
            continue
        try:
            df = pd.read_parquet(file_path)
        except Exception as e:
            # Best effort: one unreadable run should not abort the whole merge.
            print(f"Warning: Could not read file {file_path}. Error: {e}. Skipping.")
            continue
        list_of_dfs.append(df)
        loaded_seeds.append(seed)

    if not list_of_dfs:
        print(f"No valid data found for {shap_filename}. Skipping to next outcome.")
        continue

    # pandas arithmetic aligns on row/column labels, so any label missing from
    # one run silently becomes NaN in the sum. Flag runs whose labels are not
    # identical to the first loaded run instead of averaging them unnoticed.
    reference_df = list_of_dfs[0]
    for seed, df in zip(loaded_seeds[1:], list_of_dfs[1:]):
        if not (df.index.equals(reference_df.index) and df.columns.equals(reference_df.columns)):
            print(f"Warning: Run {seed} does not have the same index/columns as run {loaded_seeds[0]}; "
                  "labels missing from any run will be NaN in the average.")

    # 4. Calculate the element-wise average of the SHAP values.
    # For identically labelled inputs the result has the same dimensions
    # (samples x features) as each original file.
    final_avg_df = sum(list_of_dfs) / len(list_of_dfs)
    print(f"Successfully averaged {len(list_of_dfs)} files. Resulting shape: {final_avg_df.shape}")

    # 5. Spot-check: print the per-run and averaged values for one random
    # sample (row) and up to 10 random metabolites (columns).
    print("\n--- Verification for a random sample and 10 random metabolites ---")

    if final_avg_df.empty:
        print("Averaged DataFrame is empty, skipping verification.")
    else:
        num_samples, num_features = final_avg_df.shape

        # Pick one random sample (person/row) to check
        random_sample_index = random.randint(0, num_samples - 1)
        print(f"Verifying with data from a single random sample (row index: {random_sample_index})")

        # Pick up to 10 random columns to check
        num_to_sample = min(10, num_features)
        sample_features = random.sample(final_avg_df.columns.tolist(), num_to_sample)

        verification_data = []
        for feature in sample_features:
            row_data = {'Metabolite': feature}

            # Raw SHAP value from each run that was actually loaded.
            # Selecting the column first and then the row position avoids
            # materialising a full row for every lookup.
            for seed_name, df in zip(loaded_seeds, list_of_dfs):
                row_data[f'Run_{seed_name}_SHAP'] = df[feature].iloc[random_sample_index]

            # Averaged SHAP value for the same sample and column
            row_data['Final_Avg_SHAP'] = final_avg_df[feature].iloc[random_sample_index]
            verification_data.append(row_data)

        verification_df = pd.DataFrame(verification_data)
        # Use to_string() to ensure all columns are displayed properly
        print(verification_df.to_string())

    # 6. Save the final averaged DataFrame under the same file name.
    output_file_path = os.path.join(output_path, shap_filename)
    try:
        final_avg_df.to_parquet(output_file_path, engine='pyarrow')
        print(f"\nSuccessfully saved final average SHAP values to:\n{output_file_path}")
    except Exception as e:
        # Report and move on so the remaining outcomes are still written.
        print(f"\nError saving file to {output_file_path}. Error: {e}")

print("\n" + "="*80)
print("All processing complete.")

### Calculate mean absolute SHAP values

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Directory containing the seed-averaged SHAP parquet files for Metabolomics.
data_directory = '/your path/cardiomicscore/saved/results/SHAP/OmicsNet_Unweighted/Metabolomics/Final'

# Purpose of this cell:
#   For every 'shap_*.parquet' file in `data_directory`, compute the mean
#   absolute SHAP value of each column and draw the 30 largest as a horizontal
#   bar chart, one outcome per panel of a 2x3 grid.

# --- Discover input files ---
# A missing directory is reported and treated as "no files", so os.listdir is
# never called on a path that does not exist.
if os.path.isdir(data_directory):
    # os.listdir returns entries in arbitrary order; sort them so the panel
    # layout is reproducible from run to run.
    parquet_files = sorted(
        f for f in os.listdir(data_directory)
        if f.startswith('shap_') and f.endswith('.parquet')
    )
    if not parquet_files:
        print(f"No 'shap_*.parquet' files found in the directory '{data_directory}'.")
else:
    print(f"Error: Directory '{data_directory}' not found.")
    parquet_files = []

# Only build a figure when there is something to draw.
if parquet_files:
    # --- Plotting setup ---
    # 2x3 grid; flatten the 2D array of axes so panels can be addressed by a
    # single running index.
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(22, 14))
    axes = axes.flatten()

    # Number of panels actually drawn. Kept separate from the file index so
    # that a file that fails to load does not leave a blank panel in the grid.
    panels_used = 0

    # --- Loop through files and plot ---
    for filename in parquet_files:
        # Stop once every panel of the grid has been filled.
        if panels_used >= len(axes):
            print("Warning: More than 6 parquet files found. Only processing the first 6.")
            break

        # 'shap_<outcome>.parquet' -> '<outcome>'
        outcome_name = filename.replace('shap_', '').replace('.parquet', '')
        full_path = os.path.join(data_directory, filename)

        print(f"--- Processing: {filename} (Outcome: {outcome_name.upper()}) ---")

        # --- Data loading ---
        try:
            df = pd.read_parquet(full_path)
        except FileNotFoundError:
            print(f"  Error: File '{full_path}' not found.")
            continue

        # The participant identifier is not a feature; keep it out of the ranking.
        if 'eid' in df.columns:
            df = df.drop(columns=['eid'])
            print("  'eid' column removed.")

        # Mean |SHAP| per column, then the 30 largest.
        mean_abs_shap = df.abs().mean(axis=0)
        top_30_shap = mean_abs_shap.sort_values(ascending=False).head(30)

        print(f"  Calculated Top 30 metabolites for {outcome_name.upper()}.")

        # --- Plot on the next free panel ---
        ax = axes[panels_used]
        panels_used += 1

        # barh draws the first entry at the bottom, so reverse the order to put
        # the most important metabolite at the top.
        top_30_shap.iloc[::-1].plot(kind='barh', ax=ax, color='c', zorder=2)

        ax.set_title(f'Top 30 for {outcome_name.upper()}', fontsize=14, weight='bold')
        ax.set_xlabel('Mean Absolute SHAP Value', fontsize=10)
        ax.set_ylabel('Metabolite', fontsize=10)
        ax.tick_params(axis='y', labelsize=8)  # Adjust y-axis label font size
        ax.grid(axis='x', linestyle='--', alpha=0.7, zorder=1)  # Add a grid for the x-axis

    # --- Post-processing ---
    # Hide every panel that was not drawn on.
    for unused_ax in axes[panels_used:]:
        unused_ax.set_visible(False)

    # Main title for the whole figure.
    fig.suptitle('Top 30 Metabolites by Mean Absolute SHAP Value for Each CVD Outcome', fontsize=20, weight='bold')

    # The rect parameter leaves room at the top for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.96])

    # Uncomment the next line to save the figure to a file.
    # plt.savefig('combined_shap_analysis.png', dpi=300)
    plt.show()

print("\nAll files processed.")