In [None]:
import pandas as pd
import os

protein_path = "dataset/imputed_CruchagaLab_CSF.csv" # Using the imputed dataset
protein_metadata_path = "dataset/ADNI_Cruchaga_lab_CSF.csv"
imaging_path = "dataset/MuSIC_cov.tsv" 

In [20]:
# Paths for different body system protein files
body_system_proteins_paths = {
    "cardiovascular": "dataset/matched/matched_cardiovascular.csv",
    "brain": "dataset/matched/matched_brain_tissue.csv",
    "digestive": "dataset/matched/matched_digestive_system.csv",
    "endocrine": "dataset/matched/matched_endocrine.csv",
    "female_reproductive": "dataset/matched/matched_female_reproductive.csv",
    "hepatic": "dataset/matched/matched_hepatic.csv",
    "kidney": "dataset/matched/matched_kidney.csv",
    "male_reproductive": "dataset/matched/matched_male_reproductive.csv",
    "pulmonary": "dataset/matched/matched_pulmonary.csv",
    "retina": "dataset/matched/matched_retina.csv",
    "skin": "dataset/matched/matched_skin.csv",
}

protein_metadata = pd.read_csv(protein_metadata_path)

def filter_and_save_metadata(body_system_name, proteins_path):
    """
    Filters the protein metadata for a specific body system
    and saves the result as a CSV file.
    """
    # Load the specific body system proteins
    body_system_proteins = pd.read_csv(proteins_path)
    
    # Get unique genes from 'MatchedGene'
    body_system_genes = body_system_proteins["MatchedGene"].unique()
    
    # Filter rows in protein_metadata where 'EntrezGeneSymbol' matches body system genes
    filtered_metadata = protein_metadata[protein_metadata["EntrezGeneSymbol"].isin(body_system_genes)]
    
    # Drop duplicates based on 'EntrezGeneSymbol'
    filtered_metadata = filtered_metadata.drop_duplicates(subset="EntrezGeneSymbol")
    
    # Save the filtered dataset with an appropriate name
    output_path = f"dataset/ADNI_{body_system_name}.csv"
    filtered_metadata.to_csv(output_path, index=False)
    
    print(f"Filtered {body_system_name} dataset saved as '{output_path}'")


# Loop through each body system and process the files
for body_system_name, proteins_path in body_system_proteins_paths.items():
    filter_and_save_metadata(body_system_name, proteins_path)




Filtered cardiovascular dataset saved as 'dataset/ADNI_cardiovascular.csv'
Filtered brain dataset saved as 'dataset/ADNI_brain.csv'
Filtered digestive dataset saved as 'dataset/ADNI_digestive.csv'
Filtered endocrine dataset saved as 'dataset/ADNI_endocrine.csv'
Filtered female_reproductive dataset saved as 'dataset/ADNI_female_reproductive.csv'
Filtered hepatic dataset saved as 'dataset/ADNI_hepatic.csv'
Filtered kidney dataset saved as 'dataset/ADNI_kidney.csv'
Filtered male_reproductive dataset saved as 'dataset/ADNI_male_reproductive.csv'
Filtered pulmonary dataset saved as 'dataset/ADNI_pulmonary.csv'
Filtered retina dataset saved as 'dataset/ADNI_retina.csv'
Filtered skin dataset saved as 'dataset/ADNI_skin.csv'


In [9]:


# Load the datasets
protein = pd.read_csv(protein_path)
imaging_data = pd.read_csv(imaging_path, sep="\t")

# Extract the last 4 digits from participant_id in the imaging dataset
imaging_data["RID_from_participant_id"] = imaging_data["participant_id"].str[-4:].astype(int)

# Merge the datasets on RID
merged_data = pd.merge(protein, imaging_data, left_on="RID", right_on="RID_from_participant_id", how="inner")

# # Identify unmatched row(s)
# merged_with_left_join = pd.merge(protein, imaging_data, left_on="RID", right_on="RID_from_participant_id", how="left")
# unmatched_row = merged_with_left_join[merged_with_left_join["RID_from_participant_id"].isna()]

# # Print the unmatched row
# print("Unmatched Row:")
# print(unmatched_row)

# Drop the intermediate RID column from the imaging dataset if not needed
merged_data.drop(columns=["RID_from_participant_id"], inplace=True)

# Select required columns from imaging_path and protein_path
# Columns from imaging_path
metadata_columns = ["participant_id", "session_id", "age", "sex", "diagnosis"]

# Protein concentration columns from protein_path
protein_columns = [col for col in protein.columns if col.startswith("X")]  # Assuming concentrations start with "X"

# Combine the required columns
required_columns = metadata_columns + protein_columns
filtered_merged_data = merged_data[required_columns]


# Step 5: Save the filtered merged dataset
filtered_merged_data.to_csv("dataset/merged_dataset.csv", index=False)



## Prepare the dataset for input

In [70]:
# Paths to datasets
filtered_merged_path = "dataset/merged_dataset.csv"

# Directory containing all ADNI CSV files
adni_files_directory = "dataset/ADNI_matched"
output_directory = "dataset/final_datasets/" # Save to the final folder

# Load the merged dataset
filtered_merged_data = pd.read_csv(filtered_merged_path)

# Get a list of all ADNI CSV files in the directory
adni_files = [f for f in os.listdir(adni_files_directory) if f.startswith("ADNI_") and f.endswith(".csv")]

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Iterate through each ADNI file and process it
for adni_file in adni_files:
    # Load the ADNI data
    adni_data_path = os.path.join(adni_files_directory, adni_file)
    adni_data = pd.read_csv(adni_data_path)
    
    # Extract the list of analytes for the current body system
    analytes = adni_data["Analytes"].unique()
    
    # Identify protein columns in filtered_merged_data that start with "X"
    protein_columns = [col for col in filtered_merged_data.columns if col.startswith("X")]
    
    # Filter protein columns to keep only those present in the analytes list
    filtered_protein_columns = [col for col in protein_columns if col in analytes]
    
    # Define required columns: metadata + filtered protein columns
    metadata_columns = ["participant_id", "session_id", "age", "diagnosis"]
    required_columns = metadata_columns + filtered_protein_columns
    
    # Filter the merged dataset to keep only required columns
    filtered_data = filtered_merged_data[required_columns]
    
    # Rename "age" column to "diagnosis"
    filtered_data.rename(columns={"age": "diagnosis"}, inplace=True)
    
    # Remove duplicate "diagnosis" column if it exists
    filtered_data = filtered_data.loc[:, ~filtered_data.columns.duplicated()]
    
    # Save the filtered dataset as a TSV file
    output_path = os.path.join(output_directory, f"filtered_{os.path.splitext(adni_file)[0]}.tsv")
    filtered_data.to_csv(output_path, sep="\t", index=False)
    
    print(f"Filtered dataset saved as '{output_path}'")

print("Processing complete for all ADNI datasets.")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.rename(columns={"age": "diagnosis"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.rename(columns={"age": "diagnosis"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.rename(columns={"age": "diagnosis"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

Filtered dataset saved as 'dataset/final/filtered_ADNI_male_reproductive.tsv'
Filtered dataset saved as 'dataset/final/filtered_ADNI_female_reproductive.tsv'
Filtered dataset saved as 'dataset/final/filtered_ADNI_skin.tsv'
Filtered dataset saved as 'dataset/final/filtered_ADNI_digestive.tsv'
Filtered dataset saved as 'dataset/final/filtered_ADNI_hepatic.tsv'
Filtered dataset saved as 'dataset/final/filtered_ADNI_pulmonary.tsv'
Filtered dataset saved as 'dataset/final/filtered_ADNI_brain.tsv'
Filtered dataset saved as 'dataset/final/filtered_ADNI_kidney.tsv'
Filtered dataset saved as 'dataset/final/filtered_ADNI_retina.tsv'
Filtered dataset saved as 'dataset/final/filtered_ADNI_cardiovascular.tsv'
Filtered dataset saved as 'dataset/final/filtered_ADNI_endocrine.tsv'
Processing complete for all ADNI datasets.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.rename(columns={"age": "diagnosis"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.rename(columns={"age": "diagnosis"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.rename(columns={"age": "diagnosis"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [6]:
!nvidia-smi

## Run regression for all the body systems from ADNI_matched

In [4]:
import os
from mlni.adml_regression import regression_roi 
from mlni.adml_regression_rbf import regression_roi as regression_roi_rbf
from mlni.adml_regression_nn import regression_roi as regression_roi_nn
from mlni.adml_regression_mlp import regression_roi as regression_roi_mlp
from mlni.adml_regression_lasso import regression_roi as regression_roi_lasso

# Define paths
final_datasets_directory = "dataset/final_datasets"
output_base_directory = "result_by_type"
cv_repetition = 250  # Number of cross-validation repetitions

# Ensure the output directory exists
os.makedirs(output_base_directory, exist_ok=True)

# Get a list of all TSV files in the final_datasets directory
tsv_files = [f for f in os.listdir(final_datasets_directory) if f.endswith(".tsv")]

# Define regression types and their corresponding functions
regression_types = {
    'svr_linear': regression_roi,
    'svr_rbf': regression_roi_rbf,
    'nn': regression_roi_nn,
    'mlp': regression_roi_mlp,
    'lasso': regression_roi_lasso
}

# Iterate through each TSV file
for tsv_file in tsv_files:
    feature_tsv_path = os.path.join(final_datasets_directory, tsv_file)
    
    # Extract just the organ system name
    # If file is "filtered_ADNI_brain.tsv", this will give us "brain"
    organ_system = tsv_file.split('_')[-1].replace('.tsv', '')
    
    # Run each type of regression
    for reg_type, reg_function in regression_types.items():
        try:
            # Create output directory using organ system name and regression type
            dataset_name = f"{organ_system}_{reg_type}"
            output_dir = os.path.join(output_base_directory, dataset_name)
            os.makedirs(output_dir, exist_ok=True)
            
            print(f"Running {reg_type} regression for {organ_system}...")
            # Run regression
            reg_function(feature_tsv_path, output_dir, cv_repetition)
            print(f"{reg_type} regression completed for {organ_system}. Results saved in {output_dir}.")
            
        except Exception as e:
            print(f"Error running {reg_type} regression for {organ_system}: {str(e)}")
            continue

print("All datasets processed successfully.")

Running svr_linear regression for cardiovascular...
MLNI for a regression with nested CV...
Data split was performed based on validation strategy: hold_out...

Data split has been done!

Starts regression with linear SVR...
		[                                                  ] 0.40%
		[                                                  ] 0.80%
		[                                                  ] 1.20%
		[                                                  ] 1.60%
		[=                                                 ] 2.00%
		[=                                                 ] 2.40%
		[=                                                 ] 2.80%
		[=                                                 ] 3.20%
		[=                                                 ] 3.60%
		[==                                                ] 4.00%
		[==                                                ] 4.40%
		[==                                                ] 4.80%
		[==                                       

KeyboardInterrupt: 

In [3]:
# from mlni.regression_analysis import run_regression_analysis

# # Define paths
# final_datasets_directory = "dataset/final_datasets"
# output_base_directory = "dataset/result_by_type"
# cv_repetition = 250

# # Run the analysis
# results = run_regression_analysis(final_datasets_directory, output_base_directory, cv_repetition)

# # Display results
# print(results)



In [3]:
# import torch
# if torch.backends.mps.is_available():
#     device = torch.device("mps")
#     print("MPS backend available")
# else:
#     print("MPS backend not available")

MPS backend available


In [1]:
# import torch
# print(torch.cuda.is_available())  # Should print True
# print(torch.version.cuda)

False
None
