In [2]:
import os
import sys
# IPython shell capture: assigning from `!cmd` stores the command's output as a list of lines.
git_root = !git rev-parse --show-toplevel #get the root of this repo, regardless of where the notebook is run from
# Keep only the first output line so git_root becomes a single path string.
git_root = git_root[0]
#os.chdir(git_root)
#print(os.getcwd())
# Put extract_features/ on the import path (presumably where helper_functions lives — verify).
sys.path.append(f'{git_root}/extract_features/')
from helper_functions import get_list_of_h5_files, process_csv_to_dict

In [5]:
'''
Get the list of h5 files, select the path of interest to analyze
'''
import os
def get_list_of_h5_files(h5_parent_dirs, allowed_scan_types=None, strings_filtered_out=None, print_bool=False, **kwargs):
    """Collect the paths of all ``.h5`` files found under one or more locations.

    Parameters
    ----------
    h5_parent_dirs : str, os.PathLike, or iterable of those
        Directories to walk recursively. An entry that itself ends in ``.h5``
        and matches ``allowed_scan_types`` is returned as-is without walking.
    allowed_scan_types : str or iterable of str, optional
        Substring(s) that must occur in a file's directory path (or in the
        path of a directly given ``.h5`` file). A match on any one of them is
        enough. When ``None``, the first element of
        ``kwargs['sorting_params']['allowed_scan_types']`` is used, falling
        back to ``''`` (which matches everything).
    strings_filtered_out : str or iterable of str, optional
        Directories whose path contains any of these substrings are skipped.
        Not applied to ``.h5`` files passed directly in ``h5_parent_dirs``.
    print_bool : bool
        Print every path found while walking directories.
    **kwargs
        Only ``sorting_params`` is read (see ``allowed_scan_types``); any other
        keyword is accepted and ignored.

    Returns
    -------
    list of str
        Matching file paths, in the order ``os.walk`` yields them.
    """
    # Explicit type checks instead of `try: assert ... except:`. Asserts are
    # stripped under `python -O`, which would make a bare string be iterated
    # character by character.
    if isinstance(h5_parent_dirs, (str, os.PathLike)):
        h5_parent_dirs = [h5_parent_dirs]

    if allowed_scan_types is None:
        allowed_scan_types = kwargs.get('sorting_params', {}).get('allowed_scan_types', [''])[0]
    # Normalise to a tuple of substrings so a single string and a list behave alike.
    if isinstance(allowed_scan_types, str):
        scan_types = (allowed_scan_types,)
    else:
        scan_types = tuple(allowed_scan_types)

    # A bare string would otherwise be tested one character at a time.
    if isinstance(strings_filtered_out, str):
        strings_filtered_out = [strings_filtered_out]

    def _scan_type_allowed(text):
        return any(scan_type in text for scan_type in scan_types)

    h5_files = []
    for h5_parent_dir in h5_parent_dirs:
        h5_parent_dir = os.fspath(h5_parent_dir)
        if h5_parent_dir.endswith('.h5') and _scan_type_allowed(h5_parent_dir):
            h5_files.append(h5_parent_dir)
            continue
        for root, _dirs, files in os.walk(h5_parent_dir):
            # Both filters depend only on the directory, so evaluate them once
            # per directory rather than once per file.
            if not _scan_type_allowed(root):
                continue
            if strings_filtered_out is not None and any(string in root for string in strings_filtered_out):
                continue
            for file in files:
                if file.endswith('.h5'):
                    h5_path = os.path.join(root, file)
                    h5_files.append(h5_path)
                    if print_bool: print(h5_path)
    return h5_files

import re
import datetime as dt
from collections import defaultdict
def process_csv_to_dict(df, h5_parent_dirs, allowed_scan_types=None, allowed_RBS_scan_types=None):
    """Match every row of a recording-log table to one ``.h5`` file on disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'Date'`` (``MM/DD/YYYY`` string), ``'ID'``,
        ``'Assay'``, ``'Neuron Source'`` (``', '``-separated string),
        ``'Run #'`` and ``'DIV'``.
    h5_parent_dirs : str or list
        Forwarded to ``get_list_of_h5_files``.
    allowed_scan_types : str or list of str, optional
        A file is considered only if the third-from-last component of its path
        is ``in`` this value. ``None`` accepts every scan type.
    allowed_RBS_scan_types : str or list of str, optional
        Rows whose ``'Assay'`` is not ``in`` this value are ignored. Note that
        for a plain string ``in`` is a substring test, for a list it is exact
        membership. ``None`` keeps every row.

    Returns
    -------
    dict
        ``{date 'YYMMDD': {'<chip id>_<run number>': {'path', 'scan_type',
        'RBS_scan_type', 'DIV', 'source'}}}``.

    Rows that were not filtered out but could not be matched to a file are
    printed at the end under ``"Unmatched rows:"``. The reason a candidate
    file was rejected is printed as it is encountered.
    """
    run_id_pattern = re.compile(r'/(\d{6})/data\.raw\.h5')   # 6-digit run folder directly above the data file
    h5_files = get_list_of_h5_files(h5_parent_dirs, allowed_scan_types, allowed_RBS_scan_types=allowed_RBS_scan_types)
    data_dict = defaultdict(dict)   # date_str -> {chip_id_with_run -> info}
    unmatched_rows = []             # rows that passed the assay filter but matched no file

    for _, row in df.iterrows():
        date_str = dt.datetime.strptime(row['Date'], '%m/%d/%Y').strftime('%y%m%d')   # MM/DD/YYYY -> YYMMDD
        chip_id = row['ID']
        RBS_scan_type = row['Assay']
        source = row['Neuron Source'].split(', ')
        run_number = row['Run #']
        run_DIV = row['DIV']

        # Filter on the assay type recorded in the table.
        if allowed_RBS_scan_types is not None and RBS_scan_type not in allowed_RBS_scan_types:
            continue

        # Candidate files: path mentions both this chip and this date.
        relevant_files = [f for f in h5_files if chip_id in f and date_str in f]

        matched = False
        for h5_file in relevant_files:
            # The scan type is the third-from-last path component.
            path_parts = h5_file.split('/')
            if len(path_parts) < 3:
                continue
            maxwell_scan_type = path_parts[-3]
            if allowed_scan_types is not None and maxwell_scan_type not in allowed_scan_types:
                continue

            run_id_match = run_id_pattern.search(h5_file)
            if run_id_match is None:
                # Previously this crashed with AttributeError on .group().
                print(f"no 6-digit run folder followed by data.raw.h5 in {h5_file}")
                continue
            run_id = run_id_match.group(1)

            # Explicit checks instead of assert/except: asserts are removed
            # under `python -O`, which would silently disable these rules.
            rejection = None
            # run number will be continuous while run_id may not be if there are any cancelled runs
            if int(run_number) > int(run_id):
                rejection = f"{run_number} not less than {run_id}"
            elif allowed_scan_types is not None and 'Network' in allowed_scan_types:
                # Rules that apply when Network scans are requested.
                if 'Network' not in maxwell_scan_type:
                    rejection = f"'Network' not in scan type {maxwell_scan_type!r} of {h5_file}"
                elif 'Network Today' not in RBS_scan_type and 'Neuronal Unit' not in RBS_scan_type:
                    rejection = f"{RBS_scan_type!r} is neither a 'Network Today' nor a 'Neuronal Unit' assay"
            if rejection is not None:
                print(rejection)
                continue

            # Key combines the chip id with the run number from the table.
            chip_id_with_run = f"{chip_id}_{run_number}"
            data_dict[date_str][chip_id_with_run] = {
                "path": h5_file,
                "scan_type": maxwell_scan_type,
                "RBS_scan_type": RBS_scan_type,
                "DIV": run_DIV,
                "source": source
            }
            matched = True
            break  # first accepted file wins

        if not matched:
            unmatched_rows.append(row)

    # If there are any unmatched rows, print them out
    if unmatched_rows:
        print("Unmatched rows:")
        for row in unmatched_rows:
            print(row)

    return dict(data_dict)

# Directories whose path contains one of these substrings are left out of the search.
strings_filtered_out = ['analysis']
synology_mnt = '/mnt/ben-shalom_nas'
# List (and print) the h5 files found in 'Network' directories of this project folder.
h5_files = get_list_of_h5_files(
    h5_parent_dirs=f'{synology_mnt}/rbs_maxtwo/rbsmaxtwo/media/rbs-maxtwo/harddisk20tb/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024',
    allowed_scan_types='Network',
    strings_filtered_out=strings_filtered_out,
    print_bool=True,
)
#print(h5_files)

/mnt/ben-shalom_nas/rbs_maxtwo/rbsmaxtwo/media/rbs-maxtwo/harddisk20tb/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024/240520/M07039/Network/000006/data.raw.h5
/mnt/ben-shalom_nas/rbs_maxtwo/rbsmaxtwo/media/rbs-maxtwo/harddisk20tb/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024/240520/M07420/Network/000002/data.raw.h5
/mnt/ben-shalom_nas/rbs_maxtwo/rbsmaxtwo/media/rbs-maxtwo/harddisk20tb/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024/240520/M07427/Network/000004/data.raw.h5
/mnt/ben-shalom_nas/rbs_maxtwo/rbsmaxtwo/media/rbs-maxtwo/harddisk20tb/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024/240520/M08018/Network/000008/data.raw.h5
/mnt/ben-shalom_nas/rbs_maxtwo/rbsmaxtwo/media/rbs-maxtwo/harddisk20tb/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024/240520/M08018/Network/000010/data.raw.h5
/mnt/ben-shalom_nas/rbs_maxtwo/rbsmaxtwo/media/rbs-maxtwo/harddisk20tb/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024/240520/M08018/Network/000011/data.raw.h5
/mnt/ben-shalom_nas/rbs_maxt

In [None]:
import pandas as pd
import json #save as json, save the json so I can easily read it in vscode
# Recording-log CSV(s) to convert into a {date: {chip_run: info}} dictionary.
file_paths = [
    #'/app/extract_features/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024.csv', #if running in docker
    f'{git_root}/extract_features/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024.csv' #if running in vscode
]
synology_mnt = '/mnt/ben-shalom_nas'
#dev_mnt = '/data'

# Settings that do not depend on the CSV being processed.
h5_parent_dirs = [
    f'{synology_mnt}/rbs_maxtwo/rbsmaxtwo/media/rbs-maxtwo/harddisk20tb/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024',
    #f'{dev_mnt}/rbs_maxtwo/rbsmaxtwo/media/rbs-maxtwo/harddisk20tb/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024', #if running in docker
]
allowed_scan_types = 'Network'
allowed_RBS_scan_types = 'Neuronal Units 9'
#extracted_data_path = '/app/extract_features/CDKL5-E6D_T1_C1_05152024/extracted_data.json'
extracted_data_path = f'{git_root}/extract_features/CDKL5-E6D_T1_C1_05152024/extracted_data.json'

for file_path in file_paths:
    data = pd.read_csv(file_path)
    extracted_data = process_csv_to_dict(
        data,
        h5_parent_dirs,
        allowed_scan_types,
        allowed_RBS_scan_types=allowed_RBS_scan_types,
    )
    # Written as indented JSON so the result is easy to read in an editor.
    with open(extracted_data_path, 'w') as json_file:
        json.dump(extracted_data, json_file, indent=4)

In [None]:
# Alternative import location (the '/app' prefix is marked "if running in docker" elsewhere in this notebook).
#sys.path.append('/app/submodules/MEA_Analysis/MEAProcessingLibrary')
# Put the MEAProcessingLibrary submodule directory on the import path before importing from it.
sys.path.append(f'{git_root}/submodules/MEA_Analysis/MEAProcessingLibrary')
import mea_processing_library as MPL

# Define the neuron source of interest
# Used as a substring filter on the per-well 'source' labels further down.
source_of_interest = "WT"

# Function to extract paths for the source of interest
def extract_paths_for_source(data, source_filter):
    """Return the path and source list of every entry that involves ``source_filter``.

    ``data`` is a two-level mapping ``{outer key: {inner key: info}}`` in which
    each ``info`` has a ``'path'`` and a ``'source'`` (list of strings). An
    entry is kept when ``source_filter`` is a substring of at least one of its
    source strings. The result is a list of ``{'path': ..., 'source': ...}``
    dicts in iteration order of ``data``.
    """
    return [
        {'path': entry['path'], 'source': entry['source']}
        for entries_by_key in data.values()
        for entry in entries_by_key.values()
        if any(source_filter in label for label in entry['source'])
    ]

# Select the recordings in which at least one well has the source of interest.
filtered_well_data = extract_paths_for_source(extracted_data, source_of_interest)

# Spike-sort every well of the selected recordings whose source matches.
# NOTE: the previously commented-out code suggested that a host path
# (not a container path) is needed for output_folder when this runs inside Docker.
for well_info in filtered_well_data:
    test_h5_path = well_info['path']

    try:
        # Recording info comes from the path layout
        # .../<date>/<chip id>/<scan type>/<run id>/<file>; it is identical for
        # every well of the file, so it is parsed once per file.
        date_str, chip_id, scan_type, run_id = test_h5_path.split('/')[-5:-1]

        # Load recordings
        MaxID, recordings, expected_well_count, rec_counts = MPL.load_recordings(test_h5_path, stream_select=None, logger=None)

        for well_key, recording in recordings.items():
            # Map the well key to its position in the source list using ALL
            # trailing digits (e.g. well000 -> 0, well010 -> 10); taking only
            # the last character would map well010 to 0.
            well_number_match = re.search(r'\d+$', well_key)
            if well_number_match is None:
                raise ValueError(f"cannot read a well index from key {well_key!r}")
            well_index = int(well_number_match.group())

            # Source label recorded for this well.
            source = well_info['source'][well_index]

            if source_of_interest not in source:
                print(f"Skipping well {well_key} as it is not {source_of_interest}.")
                continue

            recording = recording['recording_segments'][0]  # Get the first recording segment

            # Create the output folder only for wells that are actually sorted,
            # so skipped wells do not leave empty directories behind.
            output_folder = f'{git_root}/extract_features/CDKL5-E6D_T1_C1_05152024/sortings/{date_str}/{chip_id}/{scan_type}/{run_id}/{well_key}'
            os.makedirs(output_folder, exist_ok=True)

            sorting = MPL.benshalom_kilosort2_docker_image(
                recording,
                output_folder = output_folder,
            )
            print(f"Sorting complete for well {well_key} in {test_h5_path}")
    except Exception as e:
        # Best effort: report the failure and move on to the next recording file.
        print(f"Error processing {test_h5_path}: {e}")

In [None]:
## Extract features from the sorted data


