### Import statements

In [1]:
import re

import numpy as np
import pandas as pd

### Functions

In [2]:
def get_row_at_region(df, key):
    """
    Select the rows of ``df`` whose 'region' column equals ``key``.

    Intended for count_df and region_df (cell_region_count.csv and region_counts.csv).
    Returns a (possibly empty) DataFrame with the same columns as ``df``.
    """
    is_requested_region = df['region'].eq(key)
    return df.loc[is_requested_region]

def get_row_at_parcellation_idx(df, idx):
    """
    Select the rows of ``df`` whose 'parcellation_index' column equals ``idx``.

    Intended only for CCF_all_regions / 1_adult_mouse_brain_graph_mapping.csv.
    Returns a (possibly empty) DataFrame with the same columns as ``df``.
    """
    has_requested_index = df['parcellation_index'].eq(idx)
    return df.loc[has_requested_index]

def get_row_at_identifier(df, id):
    """
    Select the rows of ``df`` whose 'identifier' column equals the given id.

    Intended only for summary_structures.csv. ``id`` may be given with or without
    the "MBA:" prefix (e.g. 184, "184" or "MBA:184"); the prefix is added when absent.
    """
    text = str(id)
    identifier = id if "MBA:" in text else "MBA:" + text
    return df.loc[df['identifier'] == identifier]

def get_rows_containing_value_in_path(df, value, column_name='structure_id_path', delimiter='/'):
    """
    Return all rows whose delimited path column contains ``value`` as a whole element.

    Parameters
    ----------
    df : DataFrame holding the path column.
    value : element to look for (e.g. a structure id); converted with ``str``.
    column_name : name of the column with delimited values (e.g. "/997/8/567/").
    delimiter : separator between path elements.

    Returns
    -------
    DataFrame with the matching rows (empty if nothing matches).

    Notes
    -----
    * ``value`` and ``delimiter`` are matched literally: both are escaped before being
      placed in the regular expression, so a delimiter such as '.' or '|' is not
      interpreted as a regex metacharacter.
    * An element also matches at the very start or end of the path, so paths written
      without a leading/trailing delimiter ("997/8/567") are handled as well.
    * Rows with a missing path never match (instead of producing a NaN mask that
      ``.loc`` cannot index with).
    """
    sep = re.escape(str(delimiter))
    element = re.escape(str(value))
    # Non-capturing groups: str.contains warns when a pattern has capture groups.
    pattern = f'(?:^|{sep}){element}(?:{sep}|$)'
    matches = df[column_name].str.contains(pattern, regex=True, na=False)
    return df.loc[matches]

### Load in data

In [3]:
# ---- Input files: our counted data ----
cell_count_df_path = "example files_b0039/cell_count.csv"
# cell_region_count.csv: counts for each specific region only
count_df_path = "example files_b0039/cell_region_count.csv"
# region_counts.csv: counts of all of the levels combined
region_df_path = "example files_b0039/region_counts.csv"

# ---- Input files: tables provided by the Allen CCF ----
ccf_all_regions_df_path = "../../CCF_DATA/1_adult_mouse_brain_graph_mapping.csv"
ccf_summary_structures_df_path = "../../CCF_DATA/300_summary_structures.csv"

# ---- Load the tables ----
count_df = pd.read_csv(count_df_path)
# region_df gets an extra (initially empty) 'id' column for keeping track of ids
region_df = pd.read_csv(region_df_path).assign(id=np.nan)
# Both CCF tables start out with a 'count' column filled with zeros
ccf_all_regions_df = pd.read_csv(ccf_all_regions_df_path).assign(count=0)
ccf_summary_structures_df = pd.read_csv(ccf_summary_structures_df_path).assign(count=0)

___
### Visualize table data

In [4]:
# Display region_df as loaded above, including the added (still empty) 'id' column
region_df

Unnamed: 0,level_0,level_1,level_2,level_3,level_4,level_5,level_6,level_7,level_8,level_9,level_10,region,count,region name,id
0,root,,,,,,,,,,,987,6825,root,
1,root,Basic cell groups and regions,Brain stem,Hindbrain,Medulla,"Medulla, sensory related",Nucleus of the solitary tract,,,,,641,1,Nucleus of the solitary tract,
2,root,Basic cell groups and regions,Brain stem,Hindbrain,Pons,"Pons, behavioral state related",Superior central nucleus raphe,,,,,669,1,Superior central nucleus raphe,
3,root,Basic cell groups and regions,Brain stem,Hindbrain,Pons,"Pons, motor related",Tegmental reticular nucleus,,,,,564,1,Tegmental reticular nucleus,
4,root,Basic cell groups and regions,Brain stem,Interbrain,Hypothalamus,,,,,,,1086,3,Hypothalamus,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,root,fiber tracts,lateral forebrain bundle system,corpus callosum,"corpus callosum, anterior forceps",,,,,,,946,3,"corpus callosum, anterior forceps",
127,root,fiber tracts,medial forebrain bundle system,cerebrum related,fornix system,alveus,,,,,,456,1,alveus,
128,root,fiber tracts,medial forebrain bundle system,cerebrum related,fornix system,fimbria,,,,,,593,1,fimbria,
129,root,ventricular systems,lateral ventricle,,,,,,,,,76,10,lateral ventricle,


In [5]:
# Display the CCF graph-mapping table, including the added 'count' column (all zeros at this point)
ccf_all_regions_df

Unnamed: 0,id,acronym,color_hex_triplet,name,graph_order,parent_structure_id,structure_id_path,parcellation_index,count
0,997,root,FFFFFF,root,0,-1,/997/,987,0
1,8,grey,BFDAE3,Basic cell groups and regions,1,997,/997/8/,7,0
2,567,CH,B0F0FF,Cerebrum,2,8,/997/8/567/,557,0
3,688,CTX,B0FFB8,Cerebral cortex,3,567,/997/8/567/688/,678,0
4,695,CTXpl,70FF70,Cortical plate,4,688,/997/8/567/688/695/,685,0
...,...,...,...,...,...,...,...,...,...
1322,49,ipf,AAAAAA,intraparafloccular fissure,1322,1040,/997/1024/1040/49/,44,0
1323,57,pms,AAAAAA,paramedian sulcus,1323,1040,/997/1024/1040/57/,52,0
1324,65,pfs,AAAAAA,parafloccular sulcus,1324,1040,/997/1024/1040/65/,60,0
1325,624,IPF,AAAAAA,Interpeduncular fossa,1325,1024,/997/1024/624/,614,0


In [6]:
# Display the summary-structures table, including the added 'count' column (all zeros at this point)
ccf_summary_structures_df

Unnamed: 0,acronym,name,color_hex_triplet,graph_order,identifier,red,green,blue,parent_identifier,count
0,FRP,"Frontal pole, cerebral cortex",#268F45,6.0,MBA:184,38,143,69,MBA:315,0
1,MOp,Primary motor area,#1F9D5A,18.0,MBA:985,31,157,90,MBA:500,0
2,MOs,Secondary motor area,#1F9D5A,24.0,MBA:993,31,157,90,MBA:500,0
3,SSp-n,"Primary somatosensory area, nose",#188064,44.0,MBA:353,24,128,100,MBA:322,0
4,SSp-bfd,"Primary somatosensory area, barrel field",#188064,51.0,MBA:329,24,128,100,MBA:322,0
...,...,...,...,...,...,...,...,...,...,...
366,VL-unassigned,"lateral ventricle, unassigned",#AAAAAA,1293.3,MBA:81,170,170,170,,0
367,V3-unassigned,"third ventricle, unassigned",#AAAAAA,1299.3,MBA:129,170,170,170,,0
368,AQ,cerebral aqueduct,#AAAAAA,1300.3,MBA:140,170,170,170,,0
369,V4-unassigned,"fourth ventricle, unassigned",#AAAAAA,1301.3,MBA:145,170,170,170,,0


___
### Process the counted cells into summary structures

In [4]:
# Our counted data
# This step reads cell_region_count.csv, which holds the counts for each specific region only;
# those per-region counts are what gets summed per summary structure below.
# (region_counts.csv, the alternative kept here for reference, holds the counts of all levels combined.)
#region_df_path = "example files_b0039/region_counts.csv"
region_df_path = "example files_b0039/cell_region_count.csv"

# Allen CCF provided data
ccf_all_regions_df_path = "../../CCF_DATA/1_adult_mouse_brain_graph_mapping.csv"
ccf_summary_structures_df_path = "../../CCF_DATA/300_summary_structures.csv"

# Outputted new summary structures path
output_path = "updated_summary_structures (cell_region_count step 2).csv"

# Read in the data
region_df = pd.read_csv(region_df_path)
ccf_all_regions_df = pd.read_csv(ccf_all_regions_df_path)
ccf_summary_structures_df = pd.read_csv(ccf_summary_structures_df_path)

# Add an extra column to region_df for keeping track of ids
region_df['id'] = np.nan

# Fill in default values of CCF counts
ccf_summary_structures_df['count'] = 0

################################
# STEP 1 (superseded, no longer executed)
# The first approach matched every counted region to a summary structure directly:
# region -> parcellation_index -> id -> "MBA:<id>" identifier. That only works for regions
# that are themselves one of the summary structures, so it was replaced by STEP 2, which
# also covers the direct matches (a structure's own id is part of its structure_id_path).

################################
# STEP 2
# Some regions in the counted data cannot be found in the summary structures because the
# region is at a lower level than the summary structures we want (e.g. dentate gyrus,
# granule cell layer: region number/parcellation_index 622, id=632).
# For every summary structure we therefore collect all rows of
# 1_adult_mouse_brain_graph_mapping.csv whose structure_id_path contains the structure's id
# and add up the counts of the corresponding regions.

# Go through each row of summary structures
for index, row in ccf_summary_structures_df.iterrows():
    # Skip rows without an identifier. pd.isna() is used rather than `is not np.nan`:
    # the identity test only recognises the np.nan singleton itself, so any other
    # NaN object would slip through and crash on .replace() below.
    if pd.isna(row['identifier']):
        continue
    curr_identifier = int(row['identifier'].replace("MBA:", ""))  # Remove the MBA: string

    # Get all of the rows in 1_adult_mouse_brain_graph_mapping.csv that contain the current
    # identifier in the structure_id_path column
    all_region_rows = get_rows_containing_value_in_path(ccf_all_regions_df, curr_identifier)
    parcellation_indices = all_region_rows['parcellation_index'].tolist()

    # Get the counts of all of the matching parcellation indices and combine them in total_count.
    # Regions that were not counted (no row in region_df) contribute nothing.
    total_count = 0
    for p in parcellation_indices:
        region_df_row = get_row_at_region(region_df, p)
        if not region_df_row.empty:
            total_count += int(region_df_row['count'].iloc[0])

    # If the counts would replace an existing non-zero value that doesn't match, print a warning.
    # (With the counts reset to 0 above this only triggers when counts were pre-filled.)
    curr_count = ccf_summary_structures_df.at[index, 'count']
    if curr_count != total_count and curr_count != 0:
        print("Mismatching counts with", ccf_summary_structures_df.at[index, 'name'])
        print("\tCurrent count:", curr_count)
        print("\tComputed count:", total_count)

    # Save the combined count to the corresponding row in summary structures
    ccf_summary_structures_df.at[index, 'count'] = total_count

# Save the updated summary structures to a new file
ccf_summary_structures_df.to_csv(output_path, index=False)

In [24]:
# Print name, identifier and count of every summary structure with a non-zero count,
# one value per line, followed by a blank line.
# (The former bare `ccf_summary_structures_df` expression was dropped: it was not the
# last expression of the cell, so it had no effect.)
nonzero_count_rows = ccf_summary_structures_df.loc[ccf_summary_structures_df['count'] != 0]
for _, row in nonzero_count_rows.iterrows():
    print(row['name'], row['identifier'], row['count'], '', sep='\n')

### Sanity check for results

In [59]:
# NOTE: .iloc[54] is a positional lookup, so it depends on the row order of the summary-structures table
ccf_summary_structures_df.iloc[54]  # CA1 should equal 1810

acronym                    CA1
name                 Field CA1
color_hex_triplet      #7ED04B
graph_order              457.0
identifier             MBA:382
red                        126
green                      208
blue                        75
parent_identifier      MBA:375
count                     1810
Name: 54, dtype: object

In [61]:
# NOTE: .iloc[57] is a positional lookup, so it depends on the row order of the summary-structures table
ccf_summary_structures_df.iloc[57]  # Dentate gyrus should equal 95

acronym                         DG
name                 Dentate gyrus
color_hex_triplet          #7ED04B
graph_order                  473.0
identifier                 MBA:726
red                            126
green                          208
blue                            75
parent_identifier         MBA:1080
count                           95
Name: 57, dtype: object

In [62]:
# NOTE: .iloc[60] is a positional lookup, so it depends on the row order of the summary-structures table
ccf_summary_structures_df.iloc[60]  # Entorhinal area, lateral part should equal 8

acronym                                       ENTl
name                 Entorhinal area, lateral part
color_hex_triplet                          #32B825
graph_order                                  494.0
identifier                                 MBA:918
red                                             50
green                                          184
blue                                            37
parent_identifier                          MBA:909
count                                            8
Name: 60, dtype: object