# OmicsNet

In [None]:
import os
import pandas as pd
import random

"""
Reads score files from multiple runs, calculates the average score for each sample (eid),
displays a verification sample, and saves the final averaged scores.
"""
# 1. Define paths
# Root folder holding one numbered sub-directory per SEED run (1234, 1235, ...).
base_path = "/your path/cardiomicscore/saved/results/Scores/OmicsNet"

# The averaged results are written to a "Final" sub-folder of the base path.
output_path = os.path.join(base_path, "Final")

# Ensure the output folder is present; exist_ok=True makes re-running harmless.
os.makedirs(output_path, exist_ok=True)
print(f"Results will be saved to: {output_path}")

# 2. Automatically discover SEED directories and score files
# Both lists start out empty so that a failed discovery leaves the processing
# loop below with nothing to do, instead of raising a NameError or silently
# reusing values left behind by a previously executed notebook cell.
seed_dirs = []
score_files_to_process = []
try:
    # SEED directories are the sub-directories whose names are purely numeric.
    seed_dirs = sorted(
        d for d in os.listdir(base_path)
        if d.isdigit() and os.path.isdir(os.path.join(base_path, d))
    )
    if not seed_dirs:
        raise FileNotFoundError("No valid SEED directories found in the base path.")

    print(f"Found {len(seed_dirs)} SEED runs: {seed_dirs}")

    # The first SEED directory defines which score files are processed.
    # os.listdir returns entries in arbitrary order, so sort them to keep the
    # processing order reproducible between runs.
    first_seed_path = os.path.join(base_path, seed_dirs[0])
    score_files_to_process = sorted(
        f for f in os.listdir(first_seed_path) if f.endswith('.csv')
    )
    if not score_files_to_process:
        raise FileNotFoundError(f"No .csv files found in directory: {first_seed_path}")

    print(f"Found {len(score_files_to_process)} unique score files to process.")

except FileNotFoundError as e:
    # Report the problem; score_files_to_process is still empty at this point,
    # so the processing loop is skipped.
    print(f"Error: {e}")

# 3. Loop through each unique score file name
for score_filename in score_files_to_process:
    print("\n" + "="*80)
    print(f"Processing score file: {score_filename}")

    # Load this file from every SEED directory. The SEED name of each loaded
    # dataframe is recorded alongside it (same position in both lists) so the
    # verification table is labelled correctly even when some SEED is skipped.
    loaded_seeds = []
    list_of_dfs = []
    for seed in seed_dirs:
        file_path = os.path.join(base_path, seed, score_filename)
        if not os.path.exists(file_path):
            print(f"Warning: File not found and will be skipped: {file_path}")
            continue
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Warning: Could not read file {file_path}. Error: {e}. Skipping.")
            continue
        # Without an 'eid' column the file cannot take part in the averaging;
        # skip it here rather than letting set_index('eid') abort the whole run.
        if 'eid' not in df.columns:
            print(f"Warning: No 'eid' column in file {file_path}. Skipping.")
            continue
        loaded_seeds.append(seed)
        list_of_dfs.append(df)

    if not list_of_dfs:
        print(f"No valid data found for {score_filename}. Skipping to next file.")
        continue

    # 4. Core averaging logic:
    # Stack all runs with 'eid' as the index, then average every column per eid.
    # An eid that is missing from some runs is averaged over the runs that have it.
    combined_df = pd.concat([df.set_index('eid') for df in list_of_dfs])
    final_avg_df = combined_df.groupby('eid').mean().reset_index()

    print(f"Successfully averaged {len(list_of_dfs)} files. Resulting shape: {final_avg_df.shape}")

    # 5. Verification step for 10 random samples
    print("\n--- Verification for 10 random samples (eids) ---")

    # Get the list of score columns (all columns except 'eid')
    score_columns = [col for col in final_avg_df.columns if col != 'eid']
    if not score_columns:
        print("No score columns found for verification. Skipping.")
    else:
        # Only the first score column is shown in the verification table.
        verification_col = score_columns[0]
        print(f"Showing verification for the first score column: '{verification_col}'")

        # Pick up to 10 random eids from the averaged result.
        sample_eids = random.sample(final_avg_df['eid'].tolist(), min(10, len(final_avg_df)))

        verification_data = []
        for eid in sample_eids:
            row_data = {'eid': eid}
            # Per-run score for this eid, labelled with the SEED it was loaded from.
            for seed_name, df in zip(loaded_seeds, list_of_dfs):
                # The averaged table holds the union of eids and columns, so a
                # single run may lack this eid or column; show NaN in that case.
                score = float('nan')
                if verification_col in df.columns:
                    matches = df.loc[df['eid'] == eid, verification_col]
                    if not matches.empty:
                        score = matches.iloc[0]
                row_data[f'Run_{seed_name}_Score'] = score

            # The sampled eid and the column both come from final_avg_df, so
            # this lookup always has a match.
            final_score = final_avg_df.loc[final_avg_df['eid'] == eid, verification_col].values[0]
            row_data['Final_Avg_Score'] = final_score
            verification_data.append(row_data)

        verification_df = pd.DataFrame(verification_data)
        print(verification_df.to_string())

    # 6. Save the final averaged scores to a new CSV file
    output_file_path = os.path.join(output_path, score_filename)
    try:
        final_avg_df.to_csv(output_file_path, index=False)
        print(f"\nSuccessfully saved final average scores to:\n{output_file_path}")
    except Exception as e:
        # A failed write is reported and the remaining files are still processed.
        print(f"\nError saving file to {output_file_path}. Error: {e}")

print("\n" + "="*80)
print("All processing complete.")

# OmicsNet_Unweighted

In [None]:
import os
import pandas as pd
import random

"""
Reads score files from multiple runs, calculates the average score for each sample (eid),
displays a verification sample, and saves the final averaged scores.
"""
# 1. Define paths
# Root folder holding one numbered sub-directory per SEED run (1234, 1235, ...).
base_path = "/your path/cardiomicscore/saved/results/Scores/OmicsNet_Unweighted"

# The averaged results are written to a "Final" sub-folder of the base path.
output_path = os.path.join(base_path, "Final")

# Ensure the output folder is present; exist_ok=True makes re-running harmless.
os.makedirs(output_path, exist_ok=True)
print(f"Results will be saved to: {output_path}")

# 2. Automatically discover SEED directories and score files
# Both lists start out empty so that a failed discovery leaves the processing
# loop below with nothing to do, instead of raising a NameError or silently
# reusing values left behind by a previously executed notebook cell.
seed_dirs = []
score_files_to_process = []
try:
    # SEED directories are the sub-directories whose names are purely numeric.
    seed_dirs = sorted(
        d for d in os.listdir(base_path)
        if d.isdigit() and os.path.isdir(os.path.join(base_path, d))
    )
    if not seed_dirs:
        raise FileNotFoundError("No valid SEED directories found in the base path.")

    print(f"Found {len(seed_dirs)} SEED runs: {seed_dirs}")

    # The first SEED directory defines which score files are processed.
    # os.listdir returns entries in arbitrary order, so sort them to keep the
    # processing order reproducible between runs.
    first_seed_path = os.path.join(base_path, seed_dirs[0])
    score_files_to_process = sorted(
        f for f in os.listdir(first_seed_path) if f.endswith('.csv')
    )
    if not score_files_to_process:
        raise FileNotFoundError(f"No .csv files found in directory: {first_seed_path}")

    print(f"Found {len(score_files_to_process)} unique score files to process.")

except FileNotFoundError as e:
    # Report the problem; score_files_to_process is still empty at this point,
    # so the processing loop is skipped.
    print(f"Error: {e}")

# 3. Loop through each unique score file name
for score_filename in score_files_to_process:
    print("\n" + "="*80)
    print(f"Processing score file: {score_filename}")

    # Load this file from every SEED directory. The SEED name of each loaded
    # dataframe is recorded alongside it (same position in both lists) so the
    # verification table is labelled correctly even when some SEED is skipped.
    loaded_seeds = []
    list_of_dfs = []
    for seed in seed_dirs:
        file_path = os.path.join(base_path, seed, score_filename)
        if not os.path.exists(file_path):
            print(f"Warning: File not found and will be skipped: {file_path}")
            continue
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Warning: Could not read file {file_path}. Error: {e}. Skipping.")
            continue
        # Without an 'eid' column the file cannot take part in the averaging;
        # skip it here rather than letting set_index('eid') abort the whole run.
        if 'eid' not in df.columns:
            print(f"Warning: No 'eid' column in file {file_path}. Skipping.")
            continue
        loaded_seeds.append(seed)
        list_of_dfs.append(df)

    if not list_of_dfs:
        print(f"No valid data found for {score_filename}. Skipping to next file.")
        continue

    # 4. Core averaging logic:
    # Stack all runs with 'eid' as the index, then average every column per eid.
    # An eid that is missing from some runs is averaged over the runs that have it.
    combined_df = pd.concat([df.set_index('eid') for df in list_of_dfs])
    final_avg_df = combined_df.groupby('eid').mean().reset_index()

    print(f"Successfully averaged {len(list_of_dfs)} files. Resulting shape: {final_avg_df.shape}")

    # 5. Verification step for 10 random samples
    print("\n--- Verification for 10 random samples (eids) ---")

    # Get the list of score columns (all columns except 'eid')
    score_columns = [col for col in final_avg_df.columns if col != 'eid']
    if not score_columns:
        print("No score columns found for verification. Skipping.")
    else:
        # Only the first score column is shown in the verification table.
        verification_col = score_columns[0]
        print(f"Showing verification for the first score column: '{verification_col}'")

        # Pick up to 10 random eids from the averaged result.
        sample_eids = random.sample(final_avg_df['eid'].tolist(), min(10, len(final_avg_df)))

        verification_data = []
        for eid in sample_eids:
            row_data = {'eid': eid}
            # Per-run score for this eid, labelled with the SEED it was loaded from.
            for seed_name, df in zip(loaded_seeds, list_of_dfs):
                # The averaged table holds the union of eids and columns, so a
                # single run may lack this eid or column; show NaN in that case.
                score = float('nan')
                if verification_col in df.columns:
                    matches = df.loc[df['eid'] == eid, verification_col]
                    if not matches.empty:
                        score = matches.iloc[0]
                row_data[f'Run_{seed_name}_Score'] = score

            # The sampled eid and the column both come from final_avg_df, so
            # this lookup always has a match.
            final_score = final_avg_df.loc[final_avg_df['eid'] == eid, verification_col].values[0]
            row_data['Final_Avg_Score'] = final_score
            verification_data.append(row_data)

        verification_df = pd.DataFrame(verification_data)
        print(verification_df.to_string())

    # 6. Save the final averaged scores to a new CSV file
    output_file_path = os.path.join(output_path, score_filename)
    try:
        final_avg_df.to_csv(output_file_path, index=False)
        print(f"\nSuccessfully saved final average scores to:\n{output_file_path}")
    except Exception as e:
        # A failed write is reported and the remaining files are still processed.
        print(f"\nError saving file to {output_file_path}. Error: {e}")

print("\n" + "="*80)
print("All processing complete.")