In [1]:
# NOTE: these notebooks only work inside of the .devcontainer
# (presumably because the /app and /data paths used below only exist there -- confirm).

# Create a new IPython kernel if needed:
# !python -m ipykernel install --user --name=axonenv
import os
import sys
# Make helper_functions importable; this must run before the import on the next line.
sys.path.append('/app/extract_features/')
# NOTE(review): the following cell defines functions with these same two names,
# so these imported versions are shadowed as soon as that cell is executed.
from helper_functions import get_list_of_h5_files, process_csv_to_dict
# Switch the working directory to /data and echo it as a sanity check.
os.chdir('/data')
print(os.getcwd())

/data


In [2]:
import os
import datetime as dt
from collections import defaultdict
import re

def get_list_of_h5_files(h5_parent_dirs, allowed_scan_types=None, **kwargs):
    """Collect paths of ``.h5`` files whose location mentions an allowed scan type.

    Args:
        h5_parent_dirs: A single path or an iterable of paths. Each entry is
            either a directory that is searched recursively, or the path of one
            ``.h5`` file.
        allowed_scan_types: A substring, or a collection of substrings, to look
            for. A file found while walking is kept when any of them occurs in
            the path of its containing directory; an entry that is itself a
            ``.h5`` path is kept when any of them occurs in that path. The
            empty string matches everything; an empty collection matches
            nothing. When ``None``, the first element of
            ``kwargs['sorting_params']['allowed_scan_types']`` is used, and
            ``''`` if that is not provided.
        **kwargs: Only ``sorting_params`` is read (see ``allowed_scan_types``).

    Returns:
        list[str]: Matching file paths. Directory entries are traversed in
        sorted order so the result does not depend on filesystem ordering.
    """
    if allowed_scan_types is None:
        # Historical fallback: only the first configured scan type is used.
        allowed_scan_types = kwargs.get('sorting_params', {}).get('allowed_scan_types', [''])[0]

    # Normalise to a tuple so a single string and a collection are handled alike.
    if isinstance(allowed_scan_types, str):
        scan_types = (allowed_scan_types,)
    else:
        scan_types = tuple(allowed_scan_types)

    # A bare path would otherwise be iterated character by character.
    if isinstance(h5_parent_dirs, (str, os.PathLike)):
        h5_parent_dirs = [h5_parent_dirs]

    def _mentions_allowed_scan_type(path):
        return any(scan_type in path for scan_type in scan_types)

    h5_files = []
    for h5_parent_dir in h5_parent_dirs:
        h5_parent_dir = os.fspath(h5_parent_dir)

        # An entry may point straight at a recording instead of a directory.
        if h5_parent_dir.endswith('.h5') and _mentions_allowed_scan_type(h5_parent_dir):
            h5_files.append(h5_parent_dir)
            continue

        for root, dirs, files in os.walk(h5_parent_dir):
            # Sorting in place makes os.walk descend in a deterministic order.
            dirs.sort()
            if not _mentions_allowed_scan_type(root):
                continue
            for file in sorted(files):
                if file.endswith('.h5'):
                    h5_files.append(os.path.join(root, file))
    return h5_files

def process_csv_to_dict(df, h5_parent_dirs, allowed_scan_types=None):
    """Match each row of a run log to a recording file and build a nested lookup.

    For every row, the candidate files are those whose path contains both the
    row's chip ID and its date (formatted ``YYMMDD``). A candidate is accepted
    when its scan-type directory is allowed and its run-ID offset equals the
    row's run-number offset (each offset is measured from the lowest value seen
    so far). The first accepted candidate wins.

    Args:
        df: DataFrame with the columns ``'Date'`` (``MM/DD/YYYY`` string),
            ``'ID'``, ``'Assay'``, ``'Neuron Source'`` (``', '``-separated
            string) and ``'Run #'`` (convertible to ``int``).
        h5_parent_dirs: Forwarded unchanged to ``get_list_of_h5_files``.
        allowed_scan_types: Forwarded unchanged to ``get_list_of_h5_files``
            and also used to check the scan-type directory of each candidate.
            ``None`` or ``''`` accepts every directory name, a string must
            equal the directory name, and any other container is tested with
            ``in``.

    Returns:
        dict: ``{date_str: {"<chip_id>_<run_id>": {"path", "scan_type",
        "RBS_scan_type", "source"}}}``.

    Side effects:
        Prints one line per candidate whose run offset does not agree, and
        prints every row that could not be matched.
    """
    def _scan_type_is_allowed(scan_type):
        # None/'' mean "no filter", mirroring how the file listing treats them.
        if allowed_scan_types is None or allowed_scan_types == '':
            return True
        # Compare names exactly; `in` on a string would be a substring test.
        if isinstance(allowed_scan_types, str):
            return scan_type == allowed_scan_types
        return scan_type in allowed_scan_types

    h5_files = get_list_of_h5_files(h5_parent_dirs, allowed_scan_types)

    # Run ID = 6-digit directory that directly contains data.raw.h5.
    run_id_pattern = re.compile(r'/(\d{6})/data\.raw\.h5')

    data_dict = {}
    unmatched_rows = []  # rows for which no file was accepted

    # NOTE(review): both baselines are running minima, so the result depends on
    # the order of rows in `df` and of paths in `h5_files`. A lower value that
    # shows up later shifts the baseline for subsequent rows only -- confirm
    # whether the baselines should instead be computed up front.
    lowest_run_num = None
    lowest_run_id = None

    for _, row in df.iterrows():
        # Convert the date to the YYMMDD form expected in the file paths.
        date_str = dt.datetime.strptime(row['Date'], '%m/%d/%Y').strftime('%y%m%d')
        chip_id = str(row['ID'])
        RBS_scan_type = row['Assay']
        source = row['Neuron Source'].split(', ')
        run_number = int(row['Run #'])

        if lowest_run_num is None or run_number < lowest_run_num:
            lowest_run_num = run_number
        run_num_diff = run_number - lowest_run_num

        matched = False
        for h5_file in h5_files:
            # Only files that mention this chip and this date are candidates.
            if chip_id not in h5_file or date_str not in h5_file:
                continue

            # .../<scan type>/<run id>/<file>: the scan type is two levels up.
            path_parts = h5_file.split('/')
            if len(path_parts) < 3:
                continue
            maxwell_scan_type = path_parts[-3]
            if not _scan_type_is_allowed(maxwell_scan_type):
                continue

            run_id_match = run_id_pattern.search(h5_file)
            if run_id_match is None:
                # Path does not follow the <6 digits>/data.raw.h5 layout.
                continue
            run_id = run_id_match.group(1)
            run_id_num = int(run_id)

            if lowest_run_id is None or run_id_num < lowest_run_id:
                lowest_run_id = run_id_num
            run_id_diff = run_id_num - lowest_run_id

            # Explicit comparison rather than `assert`, which is removed
            # when Python runs with -O.
            if run_id_diff != run_num_diff:
                print(f"{run_id_diff} not equal to {run_num_diff}")
                continue

            # Key entries by chip and run ID, e.g. M08018_000120.
            chip_id_with_run = f"{chip_id}_{run_id}"
            data_dict.setdefault(date_str, {})[chip_id_with_run] = {
                "path": h5_file,
                "scan_type": maxwell_scan_type,
                "RBS_scan_type": RBS_scan_type,
                "source": source
            }
            matched = True
            break  # first accepted candidate wins

        if not matched:
            unmatched_rows.append(row)

    # Report rows that could not be tied to any file.
    if unmatched_rows:
        print("Unmatched rows:")
        for row in unmatched_rows:
            print(row)

    return data_dict

In [4]:
import pandas as pd

# Directory holding the recordings, and the CSV run log that describes them.
data_path = '/data/rbs_maxtwo/rbsmaxtwo/media/rbs-maxtwo/harddisk20tb/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024'
file_path = '/app/extract_features/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024.csv'

# Scan type handed to process_csv_to_dict as its filter.
allowed_scan_types = 'Network'

# The recordings directory is the only root searched for .h5 files.
h5_parent_dirs = [data_path]

data = pd.read_csv(file_path)
processed_data = process_csv_to_dict(data, h5_parent_dirs, allowed_scan_types)
processed_data
# To display only the first two entries of the result for verification:
# processed_data_example = {k: processed_data[k] for k in list(processed_data)[:2]}
# processed_data_example

0 not equal to 1
2 not equal to 3
4 not equal to 5
6 not equal to 7
8 not equal to 7
9 not equal to 7
10 not equal to 7
11 not equal to 7
6 not equal to 8
6 not equal to 9
8 not equal to 9
6 not equal to 10
8 not equal to 10
9 not equal to 10
6 not equal to 11
8 not equal to 11
9 not equal to 11
10 not equal to 11
6 not equal to 12
8 not equal to 12
9 not equal to 12
10 not equal to 12
11 not equal to 12
6 not equal to 13
8 not equal to 13
9 not equal to 13
10 not equal to 13
11 not equal to 13
17 not equal to 16
17 not equal to 18
19 not equal to 20
21 not equal to 22
23 not equal to 31
24 not equal to 31
25 not equal to 31
26 not equal to 31
23 not equal to 32
24 not equal to 32
25 not equal to 32
26 not equal to 32
31 not equal to 32
23 not equal to 33
24 not equal to 33
25 not equal to 33
26 not equal to 33
31 not equal to 33
32 not equal to 33
23 not equal to 34
24 not equal to 34
25 not equal to 34
26 not equal to 34
31 not equal to 34
32 not equal to 34
33 not equal to 34
23 not

KeyboardInterrupt: 