In [1]:
# NOTE: these notebooks only work inside the .devcontainer; the absolute
# paths used below (/app/extract_features/ and /data) must exist.

# Optional one-time setup: register an IPython kernel named "axonenv".
# !python -m ipykernel install --user --name=axonenv
import os
import sys
# The helper module lives outside the default import path, so its directory
# must be appended to sys.path before the import below can succeed.
sys.path.append('/app/extract_features/')
# NOTE(review): functions with these same two names are defined again further
# down in this notebook; once that cell runs, its definitions shadow these imports.
from helper_functions import get_list_of_h5_files, process_csv_to_dict
# Switch the working directory to /data and print it as a sanity check.
os.chdir('/data')
print(os.getcwd())

/data


In [2]:
import os
import datetime as dt
from collections import defaultdict
import re

def get_list_of_h5_files(h5_parent_dirs, allowed_scan_types=None, **kwargs):
    """Collect the paths of ``.h5`` files found under the given locations.

    Args:
        h5_parent_dirs: Iterable of directory paths to search recursively
            and/or direct paths to ``.h5`` files. A single path (``str`` or
            ``os.PathLike``) is treated as a one-element list instead of
            being iterated character by character.
        allowed_scan_types: A substring, or an iterable of substrings. A file
            is kept when at least one of them occurs in its containing
            directory path (or, for a direct ``.h5`` entry, in the entry
            itself). When ``None``, the first element of
            ``kwargs['sorting_params']['allowed_scan_types']`` is used; if
            that is missing or empty, the empty string is used, which
            matches every path.
        **kwargs: Only ``sorting_params`` is read; any other keyword is
            accepted and ignored.

    Returns:
        list[str]: Matching file paths, in the order they were discovered.
    """
    if allowed_scan_types is None:
        configured = kwargs.get('sorting_params', {}).get('allowed_scan_types', [''])
        # Only the first configured entry is honoured (historical behaviour).
        # An empty list falls back to '' (match everything) instead of raising.
        allowed_scan_types = configured[0] if configured else ''

    # Normalise to a tuple of substrings so a list/tuple/set of scan types
    # works as well as a single string.
    if isinstance(allowed_scan_types, str):
        scan_types = (allowed_scan_types,)
    else:
        scan_types = tuple(allowed_scan_types)

    # A bare path would otherwise be iterated one character at a time.
    if isinstance(h5_parent_dirs, (str, os.PathLike)):
        h5_parent_dirs = [h5_parent_dirs]

    def _is_allowed(path):
        return any(scan_type in path for scan_type in scan_types)

    h5_files = []
    for h5_parent_dir in h5_parent_dirs:
        h5_parent_dir = os.fspath(h5_parent_dir)
        # An entry may point directly at an .h5 file rather than a directory.
        if h5_parent_dir.endswith('.h5') and _is_allowed(h5_parent_dir):
            h5_files.append(h5_parent_dir)
            continue
        for root, _dirs, files in os.walk(h5_parent_dir):
            # The scan type is matched against the directory path, not the
            # file name, so the check is hoisted out of the per-file loop.
            if not _is_allowed(root):
                continue
            for file in files:
                if file.endswith('.h5'):
                    h5_files.append(os.path.join(root, file))
    return h5_files

def process_csv_to_dict(df, h5_parent_dirs, allowed_scan_types=None, allowed_RBS_scan_types=None):
    """Match each row of a run-log DataFrame to a recording file on disk.

    For every row, the candidate ``.h5`` files are those whose path contains
    both the row's chip ID and its date (formatted ``YYMMDD``). Candidates are
    tried in the order returned by ``get_list_of_h5_files`` and the first one
    that passes every check is recorded.

    Args:
        df: DataFrame with the columns ``'Date'`` (``MM/DD/YYYY`` string),
            ``'ID'``, ``'Assay'``, ``'Neuron Source'`` (``', '``-separated
            string), ``'Run #'`` and ``'DIV'``.
        h5_parent_dirs: Forwarded to ``get_list_of_h5_files``.
        allowed_scan_types: Forwarded to ``get_list_of_h5_files`` and also
            tested with ``in`` against the scan-type directory name taken
            from each path. ``None`` disables this filter.
        allowed_RBS_scan_types: When not ``None``, rows whose ``'Assay'`` is
            not ``in`` this value are skipped silently (they are not
            reported as unmatched).

    Returns:
        dict: ``{date_str: {"<chip_id>_<run_number>": {"path", "scan_type",
        "RBS_scan_type", "DIV", "source"}}}``.

    Side effects:
        Prints one line per rejected candidate file and, at the end, every
        row that could not be matched.

    Note:
        When ``allowed_scan_types`` / ``allowed_RBS_scan_types`` are plain
        strings, the ``in`` tests are substring tests; pass a list or tuple
        for exact matching.
    """
    data_dict = defaultdict(dict)

    h5_files = get_list_of_h5_files(h5_parent_dirs, allowed_scan_types, allowed_RBS_scan_types=allowed_RBS_scan_types)

    unmatched_rows = []  # Rows for which no file passed every check

    for _, row in df.iterrows():
        # File paths carry the date as YYMMDD
        date_str = dt.datetime.strptime(row['Date'], '%m/%d/%Y').strftime('%y%m%d')
        # IDs that pandas parsed as numbers must be text for the path search
        chip_id = str(row['ID'])
        RBS_scan_type = row['Assay']
        source = row['Neuron Source'].split(', ')
        run_number = row['Run #']
        run_DIV = row['DIV']

        # Rows for assays that were not requested are dropped, not reported
        if allowed_RBS_scan_types is not None and RBS_scan_type not in allowed_RBS_scan_types:
            continue

        relevant_files = [f for f in h5_files if chip_id in f and date_str in f]

        matched = False
        # NOTE(review): the first candidate that passes wins, so the result
        # depends on the order in which get_list_of_h5_files returns paths.
        for h5_file in relevant_files:
            # Expected layout: .../<scan type>/<run id>/data.raw.h5
            path_parts = h5_file.split('/')
            if len(path_parts) < 3:
                continue
            maxwell_scan_type = path_parts[-3]

            # None means "no scan-type restriction" (it used to raise TypeError)
            if allowed_scan_types is not None and maxwell_scan_type not in allowed_scan_types:
                continue

            # The run ID is the 6-digit folder that directly holds data.raw.h5
            run_id_match = re.search(r'/(\d{6})/data\.raw\.h5', h5_file)
            if run_id_match is None:
                continue  # Path does not follow the expected layout
            run_id = run_id_match.group(1)

            # Run numbers are continuous while run IDs may skip values when
            # runs were cancelled, so a run number can never exceed its run ID.
            if int(run_number) > int(run_id):
                print(f"{run_number} not less than {run_id}")
                continue

            # When 'Network' scans are requested, the folder must be a
            # Network scan and the assay must be one recorded as such.
            if allowed_scan_types is not None and 'Network' in allowed_scan_types:
                if 'Network' not in maxwell_scan_type:
                    print(f"'Network' not in scan type {maxwell_scan_type!r} for {h5_file}")
                    continue
                if 'Network Today' not in RBS_scan_type and 'Neuronal Unit' not in RBS_scan_type:
                    print(f"{RBS_scan_type!r} is neither a 'Network Today' nor a 'Neuronal Unit' assay")
                    continue

            # Key combines chip ID and run number, e.g. M08018_13
            chip_id_with_run = f"{chip_id}_{run_number}"
            data_dict[date_str][chip_id_with_run] = {
                "path": h5_file,
                "scan_type": maxwell_scan_type,
                "RBS_scan_type": RBS_scan_type,
                "DIV": run_DIV,
                "source": source
            }
            matched = True
            break  # One file per row is enough

        if not matched:
            unmatched_rows.append(row)

    if unmatched_rows:
        print("Unmatched rows:")
        for row in unmatched_rows:
            print(row)

    return dict(data_dict)

In [3]:
import pandas as pd
import json  # The result is written as JSON so it can be inspected easily in VS Code
# Run-log CSV for this experiment; loaded into a DataFrame for row-by-row matching
file_path = '/app/extract_features/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024.csv'
data = pd.read_csv(file_path) # Load the CSV file to inspect its contents
# Directories that are searched recursively for .h5 recordings
h5_parent_dirs = [
    '/data/rbs_maxtwo/rbsmaxtwo/media/rbs-maxtwo/harddisk20tb/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024',
    # Alternative location, currently disabled:
    #'/data/CDKL5-E6D_T1_C1_05152024/CDKL5-E6D_T1_C1_05152024'
    ]
# NOTE(review): both filters are plain strings, so the `in` checks performed by
# the functions defined in this notebook act as substring tests — confirm that is intended.
allowed_scan_types = 'Network'
allowed_RBS_scan_types = 'Neuronal Units 9'
extracted_data = process_csv_to_dict(data, h5_parent_dirs, allowed_scan_types, allowed_RBS_scan_types=allowed_RBS_scan_types)
# Persist the date -> chip/run -> metadata mapping next to the source CSV
with open('/app/extract_features/CDKL5-E6D_T1_C1_05152024/extracted_data.json', 'w') as json_file:
    json.dump(extracted_data, json_file, indent=4)

13 not less than 000008
13 not less than 000010
13 not less than 000011
13 not less than 000012
36 not less than 000025
36 not less than 000026
36 not less than 000027
36 not less than 000028
36 not less than 000033
36 not less than 000034
36 not less than 000035
49 not less than 000045
49 not less than 000047
49 not less than 000048
63 not less than 000058
63 not less than 000060
63 not less than 000061
63 not less than 000062
77 not less than 000072
77 not less than 000074
77 not less than 000075
77 not less than 000076
95 not less than 000090
95 not less than 000092
95 not less than 000093
110 not less than 000105
110 not less than 000107
110 not less than 000108
110 not less than 000109
127 not less than 000122
127 not less than 000124
127 not less than 000125
127 not less than 000126
