In [None]:
#from aPhN2-SA_Activation import set_1
#%pip install statsmodels

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from scipy.optimize import curve_fit
from venn import venn
import seaborn as sns
from matplotlib.colors import LogNorm
import statsmodels.api as sm

In [None]:
# Start from seaborn's plain white style; the matplotlib overrides below are applied on top of it
sns.set_theme(style='white')

# Matplotlib overrides, grouped by the part of the figure they control
_grid_style = {'axes.grid': True,
               'grid.linestyle': '--'}

_legend_style = {'legend.framealpha': 1,
                 'legend.facecolor': 'white',
                 'legend.shadow': False,
                 'legend.fontsize': 14,
                 'legend.title_fontsize': 14}

_text_style = {'font.sans-serif': 'Helvetica',
               'xtick.labelsize': 8,
               'ytick.labelsize': 8,
               'axes.labelsize': 12,
               'axes.titlesize': 16}

# Single dictionary holding every override, in the same key order as before
andy_theme = {**_grid_style, **_legend_style, **_text_style, 'figure.dpi': 300}

plt.rcParams.update(andy_theme)

# Uncomment the next 2 lines if matplotlib cannot find the Helvetica font
#plt.rcParams['font.family'] = 'DejaVu Serif'
#plt.rcParams['font.sans-serif'] = ['Arial']

## 1. FIRST ORDER ANALYSES

### Load the datasets with neurons and connections.

* This script assumes that the **CSV** files (`.csv.gz`) and **aPhN2-SAs lists** (`_new.csv` files) are in the same folder as this notebook or script. If they are stored elsewhere, update the file paths in the loading cell below.
* These files include four CSVs containing manually curated aPhN2-SAs lists and four connectome datasets from FlyWire:
  1. **`classification.csv.gz`**
  2. **`connections.csv.gz`**
  3. **`neuropil_synapse_table.csv.gz`**
  4. **`neurons.csv.gz`**
* **Axon lists** were curated manually as described in the paper.
* **Connectome datasets** were downloaded from the FlyWire website using **snapshot 783** (previous snapshot 630).
* We focus on putative sensory axons from the Drosophila **pharyngeal nerve** in this analysis.


In [None]:
# Connections dataset and additional data sets

# Folder locations are declared once so that the notebook can be pointed at
# another machine by editing two lines instead of every read_csv call.
DATA_DIR = '/Users/yaolab/Library/CloudStorage/OneDrive-UniversityofFlorida/YaoLabUF/YaoLab/Drosophila_brain_model'
SET_DIR = '/Users/yaolab/Downloads/taste-connectome-main/aPhN2-SA_v1'

# Load the connections dataset
# columns: pre_root_id, post_root_id, neuropil, syn_count, nt_type
connections = pd.read_csv(f'{DATA_DIR}/connections.csv.gz')

# Neuropil synapses
# columns: root_id, input synapses, input partners, output synapses, output partners, etc
# Keep only root_id, input synapses, output synapses, renamed with underscores
neuropil_synapse = (
    pd.read_csv(f'{DATA_DIR}/neuropil_synapse_table.csv.gz')
    [['root_id', 'input synapses', 'output synapses']]
    .rename(columns={'input synapses': 'input_synapses', 'output synapses': 'output_synapses'})
)

# Load classification table
# columns: root_id, flow, super_class, side, etc
# The file is parsed once and two column subsets are taken from it:
#   classification       -> root_id and side
#   classification_other -> root_id, super_class and class
_classification_full = pd.read_csv(f'{DATA_DIR}/classification.csv.gz')
classification = _classification_full[['root_id', 'side']]
classification_other = _classification_full[['root_id', 'super_class', 'class']]

# Load data about each neuron
# columns: root_id, group, nt_type, etc
# Keep only root_id, nt_type
neurons = pd.read_csv(f'{DATA_DIR}/neurons.csv.gz')[['root_id', 'nt_type']]

# Merging additional data in one data set.
# Outer joins keep every root_id that appears in any of the three tables.
neurons_data = pd.merge(
    neurons,
    pd.merge(classification, neuropil_synapse, on='root_id', how='outer'),
    on='root_id',
    how='outer'
)

# Load putative PSA lists
set_1 = pd.read_csv(f'{SET_DIR}/set_1.csv')
set_2 = pd.read_csv(f'{SET_DIR}/set_2.csv')
set_3 = pd.read_csv(f'{SET_DIR}/set_3.csv')

### Find downstream connections of aPhN2-SAs
- includes all neurons downstream of aPhN2-SAs - we will filter out set-set connections later
- minimum of 5 synapses between the two neurons

In [None]:
# Define function to get outputs of aPhN2-SAs
def neuronal_outputs(aph1_sa, min_syn_count=5):
    """
    Return the downstream connections of a list of neurons.

    The lookup is done against the module-level `connections` DataFrame
    (columns used: pre_root_id, post_root_id, neuropil, syn_count, nt_type).

    Parameters
    ----------
    aph1_sa : pd.DataFrame
        Table with a 'root_id' column holding the pre-synaptic neuron IDs.
    min_syn_count : int, optional
        Minimum 'syn_count' a connection row must have to be kept.
        Defaults to 5, the threshold used throughout this notebook.

    Returns
    -------
    pd.DataFrame
        One row per retained connection with the columns pre_root_id,
        post_root_id, neuropil, syn_count, nt_type and
        'location_of_connection' ('local' or 'outside_SEZ').
    """
    # Keep only the connection rows whose pre-synaptic neuron is in aph1_sa
    connectivity = pd.merge(
        aph1_sa['root_id'],
        connections[['pre_root_id', 'post_root_id', 'neuropil', 'syn_count', 'nt_type']],
        left_on='root_id',
        right_on='pre_root_id',
        how='inner'
    )

    # Apply the synapse threshold and drop the helper 'root_id' column, which
    # duplicates 'pre_root_id' after the merge
    strong_enough = connectivity['syn_count'] >= min_syn_count
    connectivity = connectivity[strong_enough].drop(columns='root_id')

    # Neuropils counted as local (SEZ-related) regions.
    # NOTE(review): FlyWire neuropil tables usually label the cantle as
    # 'CAN_L'/'CAN_R'; a bare 'CAN' may never match. Confirm against
    # connections['neuropil'].unique() before relying on this label.
    local_neuropils = ['GNG', 'PRW', 'SAD', 'FLA_L', 'FLA_R', 'CAN']

    # Vectorized labelling: anything not in the list (including missing
    # neuropil values) is tagged 'outside_SEZ'
    connectivity['location_of_connection'] = np.where(
        connectivity['neuropil'].isin(local_neuropils), 'local', 'outside_SEZ'
    )

    return connectivity

In [None]:
# Trace the downstream connections of every aPhN2-SA set in one pass
set_1_outputs, set_2_outputs, set_3_outputs = (
    neuronal_outputs(axon_set) for axon_set in (set_1, set_2, set_3)
)

### Quantifying aPhN2-SA to aPhN2-SA communication
This creates a heatmap showing the number of synapses between each set of pharyngeal sensory axons.

In [None]:
def aphn1_sa_heatmap_matrix(outputs_list, sets_list):
    """
    Build the set-to-set synapse matrix for the aPhN2-SA sets.

    matrix[i, j] is the sum of 'syn_count' over the rows of outputs_list[i]
    whose 'post_root_id' belongs to sets_list[j]. Each connection row is
    counted at most once per target set, even if the set table lists the same
    'root_id' more than once or carries extra columns of its own.

    Parameters
    ----------
    outputs_list : list of pd.DataFrame
        [set_1_outputs, set_2_outputs, set_3_outputs, ...]; each needs the
        columns 'post_root_id' and 'syn_count'.
    sets_list : list of pd.DataFrame
        [set_1, set_2, set_3, ...]; each needs a 'root_id' column.

    Returns
    -------
    np.ndarray
        An NxN integer matrix with syn_count sums (N = number of sets).

    Raises
    ------
    ValueError
        If outputs_list and sets_list do not have the same length.
    """
    if len(outputs_list) != len(sets_list):
        raise ValueError(
            "outputs_list and sets_list must have the same length "
            f"(got {len(outputs_list)} and {len(sets_list)})"
        )

    n = len(sets_list)
    syn_matrix = np.zeros((n, n), dtype=int)

    # The member IDs of each set do not depend on the upstream set, so they
    # are extracted once instead of inside the double loop
    member_ids = [set_df['root_id'].unique() for set_df in sets_list]

    for i, out_df in enumerate(outputs_list):      # upstream side
        for j, ids in enumerate(member_ids):       # downstream side
            # A membership test (rather than a merge) cannot duplicate rows
            # and is unaffected by other columns in the set table
            lands_in_set = out_df['post_root_id'].isin(ids)
            syn_matrix[i, j] = out_df.loc[lands_in_set, 'syn_count'].sum()

    return syn_matrix


# Inputs from earlier cells:
#   set_1, set_2, set_3                          -> the aPhN2-SA lists
#   set_1_outputs, set_2_outputs, set_3_outputs  -> their downstream connections
all_sets = [set_1, set_2, set_3]
all_outputs = [set_1_outputs, set_2_outputs, set_3_outputs]

# Total synapse counts from each upstream set (rows) to each downstream set (columns)
syn_matrix = aphn1_sa_heatmap_matrix(all_outputs, all_sets)

# Tick labels: rows are upstream sets, columns are downstream sets
row_labels = ['set_1', 'set_2', 'set_3']
col_labels = ['set_1', 'set_2', 'set_3']

# Draw the annotated heatmap on a fresh figure
plt.figure(figsize=(6, 5))
heatmap_ax = sns.heatmap(
    syn_matrix,
    vmin=0,
    square=True,
    annot=True,
    fmt="d",
    cmap='viridis',
    yticklabels=row_labels,
    xticklabels=col_labels,
)
heatmap_ax.set_title("Set-to-Set Synapses (aPhN2-SAs)")
heatmap_ax.set_ylabel("Upstream Set")
heatmap_ax.set_xlabel("Downstream Set")
plt.show()


## 2. SECOND ORDER NEURON ANALYSES

## 2a. Identify 2Ns and their connections and organize the data

### Define and run a function to identify second order neurons (2Ns)
- neuron must be downstream of aPhN2-SAs
- minimum of 5 synapses between the two neurons
- we will filter out aPhN2-SAs from the list later

In [None]:
# Define function
def second_order(aph1_sa, set_label):
    """
    Find the neurons directly downstream of one aPhN2-SA set.

    Connections are looked up in the module-level `connections` DataFrame and
    kept only when they have at least 5 synapses (syn_count >= 5).

    Parameters:
        aph1_sa : pd.DataFrame
            Table with a 'root_id' column listing the aPhN2-SAs of the set.
        set_label : str
            Label such as 'set_1' that is embedded in the output column names.

    Returns:
        connectivity : pd.DataFrame
            The retained connection rows (pre_root_id, post_root_id, neuropil,
            syn_count, nt_type).
        second_orders : pd.DataFrame
            One row per post-synaptic neuron with:
              - 'root_id': the post-synaptic neuron ID,
              - 'upstream_<set_label>_aPhN2_SAs': number of distinct pre-synaptic aPhN2-SAs,
              - '<set_label>_syn_count': summed syn_count,
              - 'const': column of ones, kept for a later OLS regression.
    """
    wanted_columns = ['pre_root_id', 'post_root_id', 'neuropil', 'syn_count', 'nt_type']

    # Restrict the connection table to rows whose pre-synaptic neuron is in the set
    from_set = pd.merge(
        aph1_sa['root_id'],
        connections[wanted_columns],
        left_on='root_id',
        right_on='pre_root_id',
        how='inner'
    )

    # Enforce the synapse threshold; 'root_id' only duplicates 'pre_root_id' here
    connectivity = from_set.query("syn_count >= 5").drop(columns='root_id')

    # Collapse to one row per downstream neuron and give the columns set-specific names
    renamed = {
        'post_root_id': 'root_id',
        'pre_root_id': 'upstream_' + set_label + '_aPhN2_SAs',
        'syn_count': set_label + '_syn_count',
    }
    second_orders = (
        connectivity
        .groupby("post_root_id")
        .agg({'pre_root_id': 'nunique', 'syn_count': 'sum'})
        .reset_index()
        .rename(columns=renamed)
    )
    second_orders['const'] = 1  # intercept column for the OLS regression later

    return connectivity, second_orders


In [None]:
# For every set, compute (connectivity table, list of second-order neurons) and
# unpack the results into the per-set variables used by the following cells
(
    (set_1_second_order_connectivity, set_1_2Ns),
    (set_2_second_order_connectivity, set_2_2Ns),
    (set_3_second_order_connectivity, set_3_2Ns),
) = [
    second_order(axon_set, set_label)
    for axon_set, set_label in ((set_1, 'set_1'), (set_2, 'set_2'), (set_3, 'set_3'))
]


In [None]:
# Take a look at set_1_2Ns as an example
set_1_2Ns

In [None]:
# Report the number of distinct 2N root IDs found for each set
for set_name, two_ns in (("Set_1", set_1_2Ns), ("Set_2", set_2_2Ns), ("Set_3", set_3_2Ns)):
    print(f"{set_name} 2Ns:", np.unique(two_ns['root_id'].to_numpy()).shape)

### Getting more information about 2Ns from flywire

In [None]:
# Attach the per-neuron annotations (nt_type, side, input_synapses, output_synapses)
# from 'neurons_data' to each 2N table. The inner join keeps only 2Ns that have
# an entry in neurons_data.
# NOTE: this cell overwrites its own inputs, so run it once per kernel session;
# re-running it merges the annotations a second time.
set_1_2Ns = set_1_2Ns.merge(neurons_data, how='inner', on='root_id')
set_2_2Ns = set_2_2Ns.merge(neurons_data, how='inner', on='root_id')
set_3_2Ns = set_3_2Ns.merge(neurons_data, how='inner', on='root_id')

In [None]:

# Sanity check: the distinct 2N counts should match the ones printed before the merge
for set_name, two_ns in (("Set_1", set_1_2Ns), ("Set_2", set_2_2Ns), ("Set_3", set_3_2Ns)):
    print(f"{set_name} 2Ns (after merge):", np.unique(two_ns['root_id'].to_numpy()).shape)

### Remove aPhN2-SAs from 2N lists

In [None]:
# Pool the three aPhN2-SA lists so membership can be tested against all of them at once
all_sets = pd.concat((set_1, set_2, set_3))

# A neuron that is itself one of the pharyngeal sensory axons is not a 2N: remove it
set_1_2Ns = set_1_2Ns.loc[~set_1_2Ns['root_id'].isin(all_sets['root_id'])]
set_2_2Ns = set_2_2Ns.loc[~set_2_2Ns['root_id'].isin(all_sets['root_id'])]
set_3_2Ns = set_3_2Ns.loc[~set_3_2Ns['root_id'].isin(all_sets['root_id'])]

# Likewise discard the set-to-set connections, i.e. rows whose post-synaptic
# neuron belongs to one of the sets
set_1_second_order_connectivity = set_1_second_order_connectivity.loc[
    ~set_1_second_order_connectivity['post_root_id'].isin(all_sets['root_id'])
]
set_2_second_order_connectivity = set_2_second_order_connectivity.loc[
    ~set_2_second_order_connectivity['post_root_id'].isin(all_sets['root_id'])
]
set_3_second_order_connectivity = set_3_second_order_connectivity.loc[
    ~set_3_second_order_connectivity['post_root_id'].isin(all_sets['root_id'])
]


In [None]:
# Report how many distinct 2Ns remain in each set after removing the sensory axons
for set_name, two_ns in (("Set_1", set_1_2Ns), ("Set_2", set_2_2Ns), ("Set_3", set_3_2Ns)):
    print(f"{set_name} 2Ns:", np.unique(two_ns['root_id'].to_numpy()).shape)

### Add the data from the model simulations to the 2N dataframes

This will be necessary to analyze activated vs. non-activated 2Ns.

First, import the data and add the activation status

In [None]:
# Stack the per-set 2N tables into one DataFrame. Each table only carries the
# count columns of its own set, so the other sets' columns are NaN in its rows.
all_2N = pd.concat([set_1_2Ns, set_2_2Ns, set_3_2Ns])

# Per-set column names as produced by second_order()
cols_merge_syn = ['set_1_syn_count', 'set_2_syn_count', 'set_3_syn_count']
cols_upstream = [
    'upstream_set_1_aPhN2_SAs',
    'upstream_set_2_aPhN2_SAs',
    'upstream_set_3_aPhN2_SAs',
]

# Fold the per-set columns into two totals (NaNs are skipped by sum), then
# discard the per-set columns
all_2N = all_2N.assign(
    total_syn_count=all_2N[cols_merge_syn].sum(axis=1),
    total_upstream=all_2N[cols_upstream].sum(axis=1),
).drop(columns=cols_merge_syn + cols_upstream)

# all_2N now holds the stacked second-order neurons with:
#   - the annotation columns merged in earlier from neurons_data
#   - total_syn_count: row-wise sum of the per-set syn_count columns
#   - total_upstream:  row-wise sum of the per-set upstream aPhN2-SA counts
# NOTE(review): the tables are stacked, not grouped by root_id, so a neuron
# downstream of several sets appears in one row per set and the totals are
# per row. Confirm this is the intended granularity.

In [None]:
# Inspecting the new data frame (the combined 2N table built in the previous cell)
all_2N

In [None]:
# Look up the class / super class of every 2N.
# This is the first use of "classification_other" (root_id, super_class, class).
# Each set's distinct 2N root IDs are inner-joined with it, so only 2Ns that
# have an entry in classification_other are kept.
set_1_2Ns_classified = pd.DataFrame({'root_id': set_1_2Ns['root_id'].unique()}).merge(
    classification_other, on='root_id'
)
set_2_2Ns_classified = pd.DataFrame({'root_id': set_2_2Ns['root_id'].unique()}).merge(
    classification_other, on='root_id'
)
set_3_2Ns_classified = pd.DataFrame({'root_id': set_3_2Ns['root_id'].unique()}).merge(
    classification_other, on='root_id'
)


In [None]:
# List the distinct super classes present in classification_other (missing values excluded)
categories = classification_other['super_class'].dropna().unique()
print("Superclasses found:", categories)

In [None]:
# Fix the order in which the super classes are reported and plotted
categories = [
    'sensory',
    'ascending',
    'central',
    'descending',
    'motor',
    'endocrine',
    'optic',
    'visual_projection',
    'visual_centrifugal',
]
print("Ordered superclasses:", categories)

In [None]:
# Stacked bar chart: number of 2Ns per super class, one bar per aPhN2-SA set.
# Needs set_1_2Ns_classified .. set_3_2Ns_classified, each with a 'super_class'
# column taken from classification_other.

datasets = [set_1_2Ns_classified, set_2_2Ns_classified, set_3_2Ns_classified]

# Tally the super classes of each set; neurons without a super class are left out
datasets_counts = [ds['super_class'].dropna().value_counts() for ds in datasets]

# Stacking order of the super classes, bottom to top
categories = ['sensory', 'ascending', 'central',
              'descending', 'motor', 'endocrine',
              'optic', 'visual_projection', 'visual_centrifugal']

# Count table: one row per super class, one column per set.
# A super class that does not occur in a set contributes 0.
array_plot = np.array([
    [ds_count.get(superclass, 0) for ds_count in datasets_counts]
    for superclass in categories
])

# One tick label per set
set_labels = ["Set 1", "Set 2", "Set 3"]

fig, ax = plt.subplots(figsize=(5, 3))

# One bar per set, each made of one layer per super class
bar_positions = 1 + np.arange(len(set_labels))  # [1, 2, 3]
bar_width = 0.6

# One colour per super class, in the same order as `categories`
palette = ['#dc143c',    # sensory
           '#ffa500',    # ascending
           'green',      # central
           '#069af3',    # descending
           '#0000ff',    # motor
           '#9a0eea',    # endocrine
           '#c79fef',    # optic
           '#ffc0cb',    # visual_projection
           '#ff81c0']    # visual_centrifugal

# Bottom layer: the first super class sits directly on the x axis
ax1 = ax.bar(bar_positions, array_plot[0], color=palette[0], width=bar_width)
stack_totals = array_plot[0].copy()  # running height of every bar

# Remaining layers: each one starts where the stack currently ends
for layer_counts, layer_color in zip(array_plot[1:], palette[1:]):
    ax_i = ax.bar(bar_positions, layer_counts, bottom=stack_totals,
                  color=layer_color, width=bar_width)
    stack_totals = stack_totals + layer_counts

ax.set_title('Superclasses of 2Ns', fontsize=16)
ax.set_ylabel('# 2Ns', fontsize=16)

# Legend placed outside the axes, listing the super classes in stacking order
ax.legend(categories, frameon=False, fontsize=13,
          loc='upper left', bbox_to_anchor=(1, 1.1))

plt.xticks(bar_positions, set_labels, fontsize=7)
plt.yticks(fontsize=14)

# Leave 10% headroom above the tallest bar
plt.ylim(0, stack_totals.max()*1.1)
plt.tight_layout()
plt.show()

In [None]:
# Express each set's super-class counts as a percentage of that set's total.
# array_plot has one row per super class and one column per set, so summing
# over axis 0 yields one total per set.
col_sums = np.sum(array_plot, axis=0)
array_plot_prop = np.multiply(np.divide(array_plot, col_sums), 100.0)

print("Proportions (as %):")
print(array_plot_prop)

# array_plot_prop can be plotted exactly like array_plot: use it for the bar
# heights and label the y-axis "% 2Ns".


In [None]:
# Print the raw count table and the percentage table one after the other
# so they can be compared directly:

print("array_plot (raw counts)\n", array_plot)
print("\narray_plot_prop (percent):\n", array_plot_prop)


In [None]:
# Fraction (0-1) of each set's 2Ns that falls into each super class
array_plot / array_plot.sum(axis=0)

In [None]:
import pandas as pd

# Row and column labels for the count table (same order as used for array_plot)
categories = ['sensory', 'ascending', 'central', 'descending', 'motor',
              'endocrine', 'optic', 'visual_projection', 'visual_centrifugal']
set_labels = ["Set 1", "Set 2", "Set 3"]

# Tabulate the counts (rows = super classes, columns = sets) and turn the
# super class names into an ordinary 'super_class' column for the CSV
df_counts = (
    pd.DataFrame(array_plot, index=categories, columns=set_labels)
    .rename_axis('super_class')
    .reset_index()
)

# Write the table to disk
df_counts.to_csv("datasets_counts.csv", index=False)
print("CSV table saved as 'datasets_counts.csv'")


In [None]:
# Annotate every set_1 output connection with the super class of its
# post-synaptic neuron. The left join keeps connections whose target has no
# entry in classification_other (their super class is NaN). The helper
# 'root_id' column from the join is dropped and 'super_class' is renamed to
# 'output_super_class'.
set_1_outputs_with_class = (
    set_1_outputs
    .merge(
        classification_other[['root_id', 'super_class']],
        how='left',
        left_on='post_root_id',
        right_on='root_id',
    )
    .drop(columns=['root_id'])
    .rename(columns={'super_class': 'output_super_class'})
)

# Write the annotated connection table to disk
set_1_outputs_with_class.to_csv("set_1_opt_conns_superclass.csv", index=False)
print("CSV table saved as 'set_1_opt_conns_superclass.csv'")


In [None]:
# ------------------------------------------------------------------------------
# Hop 1: outputs of the neurons that set_1 connects to.
# ------------------------------------------------------------------------------
# 'set_1_outputs' holds the first-round connections (pre_root_id, post_root_id,
# neuropil, syn_count, nt_type, location_of_connection). Its distinct
# post-synaptic IDs become the pre-synaptic neurons of this round.
new_pre_neurons = pd.DataFrame({'root_id': set_1_outputs['post_root_id'].unique()})

# Downstream connections of those neurons (neuronal_outputs needs a 'root_id' column)
new_outputs = neuronal_outputs(new_pre_neurons)

# Add the super class of each downstream (post-synaptic) neuron. The left join
# keeps connections whose target is missing from classification_other; the
# helper 'root_id' column is dropped and 'super_class' becomes 'output_super_class'.
new_outputs_with_class = (
    new_outputs
    .merge(
        classification_other[['root_id', 'super_class']],
        how='left',
        left_on='post_root_id',
        right_on='root_id',
    )
    .drop(columns=['root_id'])
    .rename(columns={'super_class': 'output_super_class'})
)

# Write the annotated table to disk
new_outputs_with_class.to_csv("set_1_hop_1_opt_conns_superclass.csv", index=False)
print("CSV table saved as 'set_1_hop_1_opt_conns_superclass.csv'")


Next, we make a Sankey plot of this data: axis 1 is Set 1, axis 2 shows the super classes of its direct outputs, and axis 3 shows the super classes of the next-hop outputs. For context, here is the head of one of the CSVs:

| pre_root_id | post_root_id | neuropil | syn_count | nt_type | location_of_connection | output_super_class |
|---|---|---|---|---|---|---|
| 720575940617034713 | 720575940628071211 | PRW | 5 | SER | local | central |
| 720575940617034713 | 720575940633548128 | FLA_R | 7 | SER | local | ascending |
| 720575940617034713 | 720575940621662332 | FLA_R | 8 | SER | local | central |
| 720575940617034713 | 720575940630672938 | FLA_R | 8 | SER | local | central |
| 720575940617034713 | 720575940630672938 | PRW | 16 | ACH | local | central |

In [None]:
import pandas as pd

# ------------------------------------------------------------------------------
# Hop 2: outputs of the neurons reached in hop 1.
# ------------------------------------------------------------------------------
# Read the hop-1 table back from disk; its distinct post-synaptic IDs become
# the pre-synaptic neurons of this round.
prev_outputs = pd.read_csv("set_1_hop_1_opt_conns_superclass.csv")
new_pre_neurons_2 = pd.DataFrame({'root_id': prev_outputs['post_root_id'].unique()})

# Downstream connections of those neurons
new_outputs_2 = neuronal_outputs(new_pre_neurons_2)

# Add the super class of each new downstream neuron. The left join keeps
# connections whose target is missing from classification_other; the helper
# 'root_id' column is dropped and 'super_class' becomes 'output_super_class'.
new_outputs_2_with_class = (
    new_outputs_2
    .merge(
        classification_other[['root_id', 'super_class']],
        how='left',
        left_on='post_root_id',
        right_on='root_id',
    )
    .drop(columns=['root_id'])
    .rename(columns={'super_class': 'output_super_class'})
)

# Write the annotated table to disk
new_outputs_2_with_class.to_csv("set_1_hop_2_opt_conns_superclass.csv", index=False)
print("CSV table saved as 'set_1_hop_2_opt_conns_superclass.csv'")


In [None]:
# ------------------------------------------------------------------------------
# Outputs of the set_1 second-order neurons (i.e. the 3Ns).
# ------------------------------------------------------------------------------
# "set_1_2Ns" is the table of second-order neurons of set 1 built earlier in
# the pipeline; its distinct root IDs are the pre-synaptic neurons here.
# NOTE: this cell reuses the names new_pre_neurons, new_outputs and
# new_outputs_with_class, replacing the values assigned by an earlier cell.
# NOTE(review): the output file is named "hop_3" although it holds the direct
# outputs of the 2Ns; confirm the naming is intended.
new_pre_neurons = pd.DataFrame({'root_id': set_1_2Ns['root_id'].unique()})

# All downstream connections of those neurons (neuronal_outputs needs a 'root_id' column)
new_outputs = neuronal_outputs(new_pre_neurons)

# Add the super class of each downstream (post-synaptic) neuron. The left join
# keeps connections whose target is missing from classification_other; the
# helper 'root_id' column is dropped and 'super_class' becomes 'output_super_class'.
new_outputs_with_class = (
    new_outputs
    .merge(
        classification_other[['root_id', 'super_class']],
        how='left',
        left_on='post_root_id',
        right_on='root_id',
    )
    .drop(columns=['root_id'])
    .rename(columns={'super_class': 'output_super_class'})
)

# Write the annotated table to disk
new_outputs_with_class.to_csv("set_1_hop_3_opt_conns_superclass.csv", index=False)
print("CSV table saved as 'set_1_hop_3_opt_conns_superclass.csv'")


In [None]:
import pandas as pd
import plotly.graph_objects as go

# Load the first round data: set 1 outputs (first-order)
df1 = pd.read_csv("set_1_opt_conns_superclass.csv")
# Load the second round data: outputs where the first round outputs act as pre neurons
df2 = pd.read_csv("set_1_hop_1_opt_conns_superclass.csv")

# --- Flow 1: "Set 1" -> first-level super classes ---
# Number of connection rows per first-level super class
# (groupby leaves out rows whose super class is missing)
flow1 = df1.groupby('output_super_class').size().reset_index(name='count')

# --- Flow 2: first-level -> second-level super classes ---
# Link each first-round connection to the second-round connections that start
# at its post-synaptic neuron. Both tables share all column names, so every
# column gets the '_first' / '_second' suffix.
merged = pd.merge(df1, df2, left_on='post_root_id', right_on='pre_root_id', suffixes=('_first', '_second'))
flow2 = merged.groupby(['output_super_class_first', 'output_super_class_second']).size().reset_index(name='count')

# --- Define Sankey nodes ---
# Column 1: "Set 1" (a single node)
# Column 2: first-level super classes from df1
# Column 3: second-level super classes from the merged table
first_super_nodes = flow1['output_super_class'].unique().tolist()
second_super_nodes = flow2['output_super_class_second'].unique().tolist()
nodes = ["Set 1"] + first_super_nodes + second_super_nodes

# The same super class name (e.g. 'central') can occur in column 2 and in
# column 3. Keying the lookup by label alone would map both to one node,
# merging the two columns and creating self-links, so the key is (column, label).
node_index = {("set", "Set 1"): 0}
node_index.update(
    {("first", label): 1 + k for k, label in enumerate(first_super_nodes)}
)
node_index.update(
    {("second", label): 1 + len(first_super_nodes) + k for k, label in enumerate(second_super_nodes)}
)

# --- Create Sankey link data ---

# Flow 1: every link starts at "Set 1" and ends at a column-2 node
source1 = [node_index[("set", "Set 1")]] * len(flow1)
target1 = [node_index[("first", label)] for label in flow1['output_super_class']]
value1 = flow1['count'].tolist()

# Flow 2: links run from a column-2 node to a column-3 node
source2 = [node_index[("first", label)] for label in flow2['output_super_class_first']]
target2 = [node_index[("second", label)] for label in flow2['output_super_class_second']]
value2 = flow2['count'].tolist()

# Combine flows from both steps
source = source1 + source2
target = target1 + target2
value  = value1  + value2

# --- Build and display the Sankey diagram ---
fig = go.Figure(data=[go.Sankey(
    node = dict(
        pad = 15,
        thickness = 20,
        line = dict(color = "black", width = 0.5),
        label = nodes,
    ),
    link = dict(
        source = source,
        target = target,
        value = value
    ))])

fig.update_layout(title_text="Sankey Diagram: Set 1 to First and Second Level Super Classes", font_size=10)
fig.show()


In [None]:
import pandas as pd
import plotly.graph_objects as go

# ---------------------------
# Load CSV files
# ---------------------------
df1 = pd.read_csv("set_1_opt_conns_superclass.csv")
df2 = pd.read_csv("set_1_hop_1_opt_conns_superclass.csv")
df3 = pd.read_csv("set_1_hop_2_opt_conns_superclass.csv")

# ---------------------------
# Build flows using vectorized operations
# ---------------------------
# Flow 1: From "Set 1" to first-level super classes (from df1)
flow1 = df1.groupby('output_super_class').size().reset_index(name='count')
flow1['source'] = "Set 1"  # all connections originate from "Set 1"

# Flow 2: From first-level to second-level super classes
# Merge df1 and df2 using the connection where first round's post_root_id becomes df2's pre_root_id.
merged1 = pd.merge(df1, df2, left_on='post_root_id', right_on='pre_root_id',
                   suffixes=('_first', '_second'))
flow2 = merged1.groupby(['output_super_class_first', 'output_super_class_second']).size().reset_index(name='count')

# Flow 3: From second-level to third-level super classes
# Merge df2 and df3 similarly (df2.post_root_id becomes df3.pre_root_id).
merged2 = pd.merge(df2, df3, left_on='post_root_id', right_on='pre_root_id',
                   suffixes=('_second', '_third'))
flow3 = merged2.groupby(['output_super_class_second', 'output_super_class_third']).size().reset_index(name='count')

# ---------------------------
# Define nodes for the Sankey diagram
# ---------------------------
# Four columns:
# Column 1: "Set 1" (a single node)
# Column 2: Unique first-level super classes from df1
# Column 3: Unique second-level super classes (targets of flow2 and sources of flow3)
# Column 4: Unique third-level super classes from flow3
first_nodes = flow1['output_super_class'].unique().tolist()
second_nodes = pd.concat([flow2['output_super_class_second'], flow3['output_super_class_second']]).unique().tolist()
third_nodes = flow3['output_super_class_third'].unique().tolist()

node_columns = [["Set 1"], first_nodes, second_nodes, third_nodes]
nodes = [label for column in node_columns for label in column]

# The same super class name can appear in several columns, so one global
# label -> index dictionary would silently map all of them onto a single node.
# Build one lookup table per column instead; indices follow the order of `nodes`.
column_index = []
offset = 0
for column in node_columns:
    column_index.append({label: offset + pos for pos, label in enumerate(column)})
    offset += len(column)
set_index, first_index, second_index, third_index = column_index

# ---------------------------
# Vectorized mapping to get link indices
# ---------------------------
flow1['source_idx'] = set_index["Set 1"]
flow1['target_idx'] = flow1['output_super_class'].map(first_index)

flow2['source_idx'] = flow2['output_super_class_first'].map(first_index)
flow2['target_idx'] = flow2['output_super_class_second'].map(second_index)

flow3['source_idx'] = flow3['output_super_class_second'].map(second_index)
flow3['target_idx'] = flow3['output_super_class_third'].map(third_index)

# Combine flows into link lists for the Sankey diagram
source = pd.concat([flow1['source_idx'], flow2['source_idx'], flow3['source_idx']]).tolist()
target = pd.concat([flow1['target_idx'], flow2['target_idx'], flow3['target_idx']]).tolist()
value  = pd.concat([flow1['count'], flow2['count'], flow3['count']]).tolist()

# ---------------------------
# Compute incoming and outgoing values for each node
# ---------------------------
# Totals are accumulated per node INDEX (not per label) so that equally named
# nodes in different columns keep separate sums.
node_incoming = [0] * len(nodes)
node_outgoing = [0] * len(nodes)

for s, t, v in zip(source, target, value):
    node_outgoing[s] += v
    node_incoming[t] += v

customdata = [
    f"Incoming: {incoming}<br>Outgoing: {outgoing}<br>Total: {max(incoming, outgoing)}"
    for incoming, outgoing in zip(node_incoming, node_outgoing)
]

# ---------------------------
# Build and display the Sankey diagram with custom hover text
# ---------------------------
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes,
        customdata=customdata,
        hovertemplate='%{customdata}<extra>%{label}</extra>'
    ),
    link=dict(
        source=source,
        target=target,
        value=value
    )
)])

fig.update_layout(title_text="Sankey Diagram: Neural Pathway Hops", font_size=10)
fig.show()

# ---------------------------
# Determine hop counts for a target super class (e.g., "motor")
# ---------------------------
# These sums count rows of the (merged) connection tables whose final target
# belongs to `target_class`, i.e. connections/paths rather than distinct neurons.
target_class = "motor"
hop1_motor = flow1[flow1['output_super_class'] == target_class]['count'].sum()
hop2_motor = flow2[flow2['output_super_class_second'] == target_class]['count'].sum()
hop3_motor = flow3[flow3['output_super_class_third'] == target_class]['count'].sum()

print(f"Connections reaching '{target_class}' neurons at 1 hop (direct):", hop1_motor)
print(f"Connections reaching '{target_class}' neurons at 2 hops:", hop2_motor)
print(f"Connections reaching '{target_class}' neurons at 3 hops:", hop3_motor)


In [None]:
#fig.show(renderer="browser")
#fig.write_html("sankey_diagram.html")

In [None]:
import pandas as pd
import plotly.graph_objects as go

# ---------------------------
# Load CSV files and filter out weak connections (< 5 synapses)
# ---------------------------
df1 = pd.read_csv("set_1_opt_conns_superclass.csv")
df1 = df1[df1['syn_count'] >= 5]  # only keep connections with at least 5 synapses

df2 = pd.read_csv("set_1_hop_1_opt_conns_superclass.csv")
df2 = df2[df2['syn_count'] >= 5]

df3 = pd.read_csv("set_1_hop_2_opt_conns_superclass.csv")
df3 = df3[df3['syn_count'] >= 5]

# ---------------------------
# Build flows (using summed synapse counts)
# ---------------------------
# Flow 1: From "Set 1" to first-level super classes (from df1)
flow1 = df1.groupby('output_super_class')['syn_count'].sum().reset_index(name='count')
flow1['source'] = "Set 1"  # all connections originate from "Set 1"

# Flow 2: From first-level to second-level super classes
merged1 = pd.merge(df1, df2, left_on='post_root_id', right_on='pre_root_id',
                   suffixes=('_first', '_second'))
# Sum the synapse counts from df2 (now in 'syn_count_second') for each pair.
flow2 = merged1.groupby(['output_super_class_first', 'output_super_class_second'])['syn_count_second'] \
    .sum().reset_index(name='count')

# Flow 3: From second-level to third-level super classes
merged2 = pd.merge(df2, df3, left_on='post_root_id', right_on='pre_root_id',
                   suffixes=('_second', '_third'))
# Sum the synapse counts from df3 (now in 'syn_count_third') for each pair.
flow3 = merged2.groupby(['output_super_class_second', 'output_super_class_third'])['syn_count_third'] \
    .sum().reset_index(name='count')

# ---------------------------
# Build node lists directly from the data sources for each hop.
# This ensures that nodes appear in the column corresponding to their data source.
# ---------------------------
# Rows without a super class (NaN) are dropped before building labels: groupby
# already excludes them from the flows, and sorting / concatenating a NaN together
# with strings would raise a TypeError.
col1 = ["Set 1"]  # Column 1: the source
col2 = ["1: " + label for label in sorted(df1['output_super_class'].dropna().unique())]  # Column 2 from df1
col3 = ["2: " + label for label in sorted(df2['output_super_class'].dropna().unique())]  # Column 3 from df2
col4 = ["3: " + label for label in sorted(df3['output_super_class'].dropna().unique())]  # Column 4 from df3

nodes = col1 + col2 + col3 + col4
node_index = {node: i for i, node in enumerate(nodes)}

# ---------------------------
# Update flows to use the new, prefixed node labels.
# ---------------------------
# Flow 1: Source is "Set 1"; target from df1 gets prefix "1: "
flow1['source_idx'] = node_index["Set 1"]
flow1['target_idx'] = ("1: " + flow1['output_super_class']).map(node_index)

# Flow 2: Source is from df1 (first hop, prefix "1: ") and target is from df2 (second hop, prefix "2: ")
flow2['source_idx'] = ("1: " + flow2['output_super_class_first']).map(node_index)
flow2['target_idx'] = ("2: " + flow2['output_super_class_second']).map(node_index)

# Flow 3: Source is from df2 (second hop, prefix "2: ") and target is from df3 (third hop, prefix "3: ")
flow3['source_idx'] = ("2: " + flow3['output_super_class_second']).map(node_index)
flow3['target_idx'] = ("3: " + flow3['output_super_class_third']).map(node_index)

# Combine flows into link lists.
source = pd.concat([flow1['source_idx'], flow2['source_idx'], flow3['source_idx']]).tolist()
target = pd.concat([flow1['target_idx'], flow2['target_idx'], flow3['target_idx']]).tolist()
value  = pd.concat([flow1['count'], flow2['count'], flow3['count']]).tolist()

# ---------------------------
# Compute custom hover info for nodes.
# For each node, we sum incoming and outgoing flows based on the link lists.
# ---------------------------
node_incoming = {node: 0 for node in nodes}
node_outgoing = {node: 0 for node in nodes}

for s, t, v in zip(source, target, value):
    node_outgoing[nodes[s]] += v
    node_incoming[nodes[t]] += v

# The hover text shows both sums and their maximum as "Total".
customdata = [
    f"Incoming: {node_incoming[node]}<br>Outgoing: {node_outgoing[node]}"
    f"<br>Total: {max(node_incoming[node], node_outgoing[node])}"
    for node in nodes
]

# ---------------------------
# Define fixed x positions so that the diagram is laid out in a linear, successive fashion.
# Column 1 ("Set 1") at x=0.0, Column 2 (first-level nodes) at x=0.33,
# Column 3 (second-level nodes) at x=0.66, Column 4 (third-level nodes) at x=1.0.
# ---------------------------
x_positions = (
    [0.0] * len(col1)
    + [0.33] * len(col2)
    + [0.66] * len(col3)
    + [1.0] * len(col4)
)

# ---------------------------
# Build and display the Sankey diagram with custom hover text for nodes.
# ---------------------------
fig = go.Figure(data=[go.Sankey(
    arrangement="freeform",  # you can use "fixed" or "freeform" as desired
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes,
        x=x_positions,
        customdata=customdata,
        hovertemplate='%{customdata}<extra>%{label}</extra>'
    ),
    link=dict(
        source=source,
        target=target,
        value=value
    )
)])
fig.update_layout(title_text="Linear Sankey Diagram: Neural Pathway Hops", font_size=10)
fig.show()


In [None]:
import pandas as pd
import plotly.graph_objects as go
import colorsys

# ---------------------------
# Helper functions for colors
# ---------------------------
def get_color(i, n):
    """Return the i-th of n evenly spaced hues as an 'rgb(r,g,b)' string.

    The hue is i / n; saturation (0.6) and value (0.9) are fixed. Each channel
    is scaled to 0-255 and truncated to an integer.
    """
    channels = colorsys.hsv_to_rgb(i / n, 0.6, 0.9)
    red, green, blue = (int(channel * 255) for channel in channels)
    return "rgb({},{},{})".format(red, green, blue)

def make_rgba(rgb_str, alpha=0.5):
    """Convert an 'rgb(r,g,b)' string to an 'rgba(r,g,b,alpha)' string.

    Parameters
    ----------
    rgb_str : str
        Color such as 'rgb(31,119,180)'. An 'rgba(...)' string is accepted too;
        its existing alpha component is replaced.
    alpha : float, optional
        Opacity written as the fourth component (default 0.5).

    Returns
    -------
    str
        'rgba(r,g,b,alpha)' with whitespace around the channel values removed.

    Raises
    ------
    ValueError
        If the string has no parenthesised list of at least three components.
    """
    # Slice between the parentheses instead of using str.strip("rgb("): strip()
    # removes a *set of characters*, not a prefix, and breaks on 'rgba(' input.
    open_paren = rgb_str.find("(")
    close_paren = rgb_str.rfind(")")
    if open_paren == -1 or close_paren < open_paren:
        raise ValueError(f"Not an rgb()/rgba() color string: {rgb_str!r}")

    channels = [part.strip() for part in rgb_str[open_paren + 1:close_paren].split(",")]
    if len(channels) < 3:
        raise ValueError(f"Expected at least three color components in {rgb_str!r}")

    red, green, blue = channels[:3]
    return f"rgba({red},{green},{blue},{alpha})"

# ---------------------------
# Load CSV files and filter out weak connections (< 5 synapses)
# ---------------------------
df1 = pd.read_csv("set_1_opt_conns_superclass.csv")
df1 = df1[df1['syn_count'] >= 5]  # only keep connections with at least 5 synapses

df2 = pd.read_csv("set_1_hop_1_opt_conns_superclass.csv")
df2 = df2[df2['syn_count'] >= 5]

df3 = pd.read_csv("set_1_hop_2_opt_conns_superclass.csv")
df3 = df3[df3['syn_count'] >= 5]

# ---------------------------
# Build flows (using summed synapse counts)
# ---------------------------
# Flow 1: From "Set 1" to first-level super classes (from df1)
flow1 = df1.groupby('output_super_class')['syn_count'].sum().reset_index(name='count')
flow1['source'] = "Set 1"  # all connections originate from "Set 1"

# Flow 2: From first-level to second-level super classes
merged1 = pd.merge(df1, df2, left_on='post_root_id', right_on='pre_root_id',
                   suffixes=('_first', '_second'))
flow2 = merged1.groupby(['output_super_class_first', 'output_super_class_second'])['syn_count_second'] \
    .sum().reset_index(name='count')

# Flow 3: From second-level to third-level super classes
merged2 = pd.merge(df2, df3, left_on='post_root_id', right_on='pre_root_id',
                   suffixes=('_second', '_third'))
flow3 = merged2.groupby(['output_super_class_second', 'output_super_class_third'])['syn_count_third'] \
    .sum().reset_index(name='count')

# ---------------------------
# Build node lists directly from the data sources for each hop.
# ---------------------------
# Define columns:
# Column 1: "Set 1"
# Column 2: Unique first-level super classes from df1 (prefixed for clarity)
# Column 3: Unique second-level super classes from df2 (prefixed)
# Column 4: Unique third-level super classes from df3 (prefixed)
# Rows without a super class (NaN) are dropped before building labels: groupby
# already excludes them from the flows, and sorting / concatenating a NaN together
# with strings would raise a TypeError.
col1 = ["Set 1"]
col2 = ["1: " + label for label in sorted(df1['output_super_class'].dropna().unique())]
col3 = ["2: " + label for label in sorted(df2['output_super_class'].dropna().unique())]
col4 = ["3: " + label for label in sorted(df3['output_super_class'].dropna().unique())]

nodes = col1 + col2 + col3 + col4
node_index = {node: i for i, node in enumerate(nodes)}

# ---------------------------
# Assign each node a unique color.
# ---------------------------
n_nodes = len(nodes)
node_colors = [get_color(i, n_nodes) for i in range(n_nodes)]

# ---------------------------
# Update flows to use the new, prefixed node labels.
# ---------------------------
# Flow 1: Source is "Set 1"; target from df1 gets prefix "1: "
flow1['source_idx'] = node_index["Set 1"]
flow1['target_idx'] = ("1: " + flow1['output_super_class']).map(node_index)

# Flow 2: Source is from df1 (first hop, prefix "1: ") and target is from df2 (prefix "2: ")
flow2['source_idx'] = ("1: " + flow2['output_super_class_first']).map(node_index)
flow2['target_idx'] = ("2: " + flow2['output_super_class_second']).map(node_index)

# Flow 3: Source is from df2 (prefix "2: ") and target is from df3 (prefix "3: ")
flow3['source_idx'] = ("2: " + flow3['output_super_class_second']).map(node_index)
flow3['target_idx'] = ("3: " + flow3['output_super_class_third']).map(node_index)

# Combine flows into link lists.
source = pd.concat([flow1['source_idx'], flow2['source_idx'], flow3['source_idx']]).tolist()
target = pd.concat([flow1['target_idx'], flow2['target_idx'], flow3['target_idx']]).tolist()
value  = pd.concat([flow1['count'], flow2['count'], flow3['count']]).tolist()

# ---------------------------
# Compute custom hover info for nodes.
# ---------------------------
node_incoming = {node: 0 for node in nodes}
node_outgoing = {node: 0 for node in nodes}
for s, t, v in zip(source, target, value):
    node_outgoing[nodes[s]] += v
    node_incoming[nodes[t]] += v

# The hover text shows both sums and their maximum as "Total".
customdata = [
    f"Incoming: {node_incoming[node]}<br>Outgoing: {node_outgoing[node]}"
    f"<br>Total: {max(node_incoming[node], node_outgoing[node])}"
    for node in nodes
]

# ---------------------------
# Build link colors based on the source node's unique color (with 50% opacity).
# ---------------------------
link_colors = [make_rgba(node_colors[s], 0.5) for s in source]

# ---------------------------
# Define fixed x positions so that the diagram is laid out in a linear, successive fashion.
# ---------------------------
x_positions = (
    [0.0] * len(col1)
    + [0.33] * len(col2)
    + [0.66] * len(col3)
    + [1.0] * len(col4)
)

# ---------------------------
# Build and display the Sankey diagram with custom hover text and link colors.
# ---------------------------
fig = go.Figure(data=[go.Sankey(
    arrangement="freeform",
    node=dict(
        pad=80,
        thickness=50,
        line=dict(color="black", width=0.9),
        label=nodes,
        x=x_positions,
        color=node_colors,
        customdata=customdata,
        hovertemplate='%{customdata}<extra>%{label}</extra>'
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color=link_colors
    )
)])

fig.update_layout(title_text="Linear Sankey Diagram: Neural Pathway Hops", font_size=10)
fig.show()


In [None]:
# Inspect the current first-hop flow table. `flow1` is reassigned by every Sankey
# cell in this notebook, so what is shown here depends on which of them ran last.
flow1

In [None]:
import pandas as pd

# Load connectivity CSV files (one connection table per hop)
df1 = pd.read_csv("set_1_opt_conns_superclass.csv")
df2 = pd.read_csv("set_1_hop_1_opt_conns_superclass.csv")
df3 = pd.read_csv("set_1_hop_2_opt_conns_superclass.csv")

target_class = "motor"

# The chains are resolved BACKWARDS from the target class with `isin` filters.
# Joining df1 -> df2 -> df3 with full many-to-many merges would materialise every
# individual 2- and 3-hop path only to keep the distinct starting neurons, which
# needs far more memory and time while yielding the same set of ids.

# ---------------------------
# 1-hop: Direct connections from Set 1 to a motor neuron.
# ---------------------------
df1_motor = (
    df1.loc[df1['output_super_class'] == target_class, ['pre_root_id']]
    .drop_duplicates()
    .assign(hop=1)
)

# ---------------------------
# 2-hop: Chain from Set 1 → Neuron2 → Neuron3 where the second hop ends in a motor neuron.
# ---------------------------
# Neuron2 candidates: presynaptic partners (in df2) of a motor neuron.
hop2_relays = df2.loc[df2['output_super_class'] == target_class, 'pre_root_id'].unique()
df12_motor = (
    df1.loc[df1['post_root_id'].isin(hop2_relays), ['pre_root_id']]
    .drop_duplicates()
    .assign(hop=2)
)

# ---------------------------
# 3-hop: Chain from Set 1 → Neuron2 → Neuron3 → Neuron4 where the third hop ends in a motor neuron.
# ---------------------------
# Neuron3 candidates: presynaptic partners (in df3) of a motor neuron.
hop3_last_relays = df3.loc[df3['output_super_class'] == target_class, 'pre_root_id'].unique()
# Neuron2 candidates: neurons in df2 that contact one of those Neuron3 candidates.
hop3_first_relays = df2.loc[df2['post_root_id'].isin(hop3_last_relays), 'pre_root_id'].unique()
df123_motor = (
    df1.loc[df1['post_root_id'].isin(hop3_first_relays), ['pre_root_id']]
    .drop_duplicates()
    .assign(hop=3)
)

# ---------------------------
# Combine and determine the minimum hop count per starting neuron.
# ---------------------------
# Starting neurons that do not reach the target class within 3 hops are absent
# from `min_hops`.
df_hops = pd.concat([df1_motor, df12_motor, df123_motor], ignore_index=True)
min_hops = df_hops.groupby('pre_root_id', as_index=False)['hop'].min()

print(min_hops)


In [None]:
# Summarise the per-neuron minimum hop counts stored in min_hops['hop'].
hops = min_hops['hop']

mean_hops, median_hops, std_hops = hops.mean(), hops.median(), hops.std()
min_hop, max_hop = hops.min(), hops.max()
count = hops.count()

# Assemble the report first, then emit it in one go.
report_lines = [
    "Descriptive statistics for hops to reach a motor neuron:",
    f"Number of neurons: {count}",
    f"Mean hops: {mean_hops:.2f}",
    f"Median hops: {median_hops}",
    f"Standard Deviation: {std_hops:.2f}",
    f"Minimum hops: {min_hop}",
    f"Maximum hops: {max_hop}",
]
print("\n".join(report_lines))

# pandas' built-in summary of the same column, for comparison.
print("\nSummary statistics:")
print(hops.describe())


In [None]:
import plotly.express as px

# Coerce the hop column to a numeric dtype and take its maximum as a plain int.
min_hops['hop'] = pd.to_numeric(min_hops['hop'])
max_hop = int(min_hops['hop'].max())

# Axis label shared by all three figures.
hop_labels = {"hop": "Minimum Hops"}

# --- Histogram ---
# nbins = max_hop + 1 and an x range from 0.5 to max_hop + 0.5 aim at one bin per
# integer hop; dtick=1 puts a tick on every integer.
histogram_options = dict(
    x="hop",
    nbins=max_hop + 1,
    title="Distribution of Minimum Hops to Reach a Motor Neuron",
    labels={**hop_labels, "count": "Number of Neurons"},
    range_x=[0.5, max_hop + 0.5],
)
fig_hist = px.histogram(min_hops, **histogram_options)
fig_hist.update_xaxes(dtick=1)
fig_hist.show()

# --- Box Plot ---
fig_box = px.box(
    min_hops,
    y="hop",
    title="Box Plot of Minimum Hops to Reach a Motor Neuron",
    labels=hop_labels,
)
fig_box.show()

# --- Violin Plot ---
# Embedded box plot plus every individual data point.
violin_options = dict(
    y="hop",
    box=True,
    points="all",
    title="Violin Plot of Minimum Hops to Reach a Motor Neuron",
    labels=hop_labels,
)
fig_violin = px.violin(min_hops, **violin_options)
fig_violin.show()


In [None]:
##fig.show(renderer="browser")
#fig.write_html("sankey_diagram.html")

### Plot proportions of ascending and descending neurons

This figure is not included in the paper as it is redundant with the superclass figure above.

In [None]:
# Annotate every Set 2 output connection with the super class of its postsynaptic
# neuron, in a single chain:
#   1. left-join the classification table on post_root_id == root_id
#      (assumes set_2_outputs has 'post_root_id' and classification_other has
#      'root_id' and 'super_class'),
#   2. drop the join key 'root_id' that the merge added,
#   3. rename 'super_class' to 'output_super_class' for clarity.
set_2_outputs_with_class = (
    set_2_outputs
    .merge(
        classification_other[['root_id', 'super_class']],
        how='left',
        left_on='post_root_id',
        right_on='root_id',
    )
    .drop(columns=['root_id'])
    .rename(columns={'super_class': 'output_super_class'})
)

# Persist the annotated table for the Sankey cells further down.
set_2_outputs_with_class.to_csv("set_2_opt_conns_superclass.csv", index=False)
print("CSV table saved as 'set_2_opt_conns_superclass.csv'")


In [None]:
# ------------------------------------------------------------------------------
# Second round for Set 2: the postsynaptic neurons of the first round become the
# new presynaptic neurons.
# ------------------------------------------------------------------------------
# 'set_2_outputs' is the first-round connection table (it must provide 'post_root_id').

# Step 1: unique first-round output neurons, shaped as neuronal_outputs expects
# (a DataFrame with a 'root_id' column).
first_round_targets = set_2_outputs['post_root_id'].unique()
new_pre_neurons = pd.DataFrame({'root_id': first_round_targets})

# Step 2: downstream connections of those neurons.
new_outputs = neuronal_outputs(new_pre_neurons)

# Step 3: attach the super class of each downstream (post) neuron, drop the join
# key added by the merge and rename the class column for clarity.
new_outputs_with_class = (
    new_outputs
    .merge(
        classification_other[['root_id', 'super_class']],
        how='left',
        left_on='post_root_id',
        right_on='root_id',
    )
    .drop(columns=['root_id'])
    .rename(columns={'super_class': 'output_super_class'})
)

# Step 4: save the new table as CSV.
new_outputs_with_class.to_csv("set_2_hop_1_opt_conns_superclass.csv", index=False)
print("CSV table saved as 'set_2_hop_1_opt_conns_superclass.csv'")


We now want to make a Sankey plot of this data, with Set 2 on axis 1, the first-level super classes on axis 2, and the second-level super classes on axis 3. For context, here is the head of one of the CSVs:

| pre_root_id | post_root_id | neuropil | syn_count | nt_type | location_of_connection | output_super_class |
|---|---|---|---|---|---|---|
| 720575940617034713 | 720575940628071211 | PRW | 5 | SER | local | central |
| 720575940617034713 | 720575940633548128 | FLA_R | 7 | SER | local | ascending |
| 720575940617034713 | 720575940621662332 | FLA_R | 8 | SER | local | central |
| 720575940617034713 | 720575940630672938 | FLA_R | 8 | SER | local | central |
| 720575940617034713 | 720575940630672938 | PRW | 16 | ACH | local | central |

In [None]:
import pandas as pd

# ------------------------------------------------------------------------------
# Third round for Set 2: the postsynaptic neurons stored in the hop-1 CSV become
# the new presynaptic neurons.
# ------------------------------------------------------------------------------
# Step 1: reload the previous table and collect its unique post ids in the
# single-column ('root_id') layout passed to neuronal_outputs.
prev_outputs = pd.read_csv("set_2_hop_1_opt_conns_superclass.csv")
previous_targets = prev_outputs['post_root_id'].unique()
new_pre_neurons_2 = pd.DataFrame({'root_id': previous_targets})

# Step 2: downstream connections of these neurons.
new_outputs_2 = neuronal_outputs(new_pre_neurons_2)

# Step 3: add the super class of every new downstream neuron, remove the join key
# introduced by the merge and rename the class column for clarity.
new_outputs_2_with_class = (
    new_outputs_2
    .merge(
        classification_other[['root_id', 'super_class']],
        how='left',
        left_on='post_root_id',
        right_on='root_id',
    )
    .drop(columns=['root_id'])
    .rename(columns={'super_class': 'output_super_class'})
)

# Step 4: save the new table as a CSV file.
new_outputs_2_with_class.to_csv("set_2_hop_2_opt_conns_superclass.csv", index=False)
print("CSV table saved as 'set_2_hop_2_opt_conns_superclass.csv'")


In [None]:
# ------------------------------------------------------------------------------
# Outputs of the Set 2 second-order neurons (2Ns), i.e. their downstream 3Ns.
# ------------------------------------------------------------------------------
# NOTE: this cell reuses the names new_pre_neurons / new_outputs /
# new_outputs_with_class and therefore overwrites the values from the hop-1 cell.

# Step 1: unique second-order neurons. "set_2_2Ns" is expected to exist already
# (generated earlier in the pipeline) and to carry a 'root_id' column.
second_order_ids = set_2_2Ns['root_id'].unique()
new_pre_neurons = pd.DataFrame({'root_id': second_order_ids})

# Step 2: all downstream connections of these neurons.
new_outputs = neuronal_outputs(new_pre_neurons)

# Step 3: attach the super class of each downstream (post) neuron, drop the join
# key added by the merge and rename the class column for clarity.
new_outputs_with_class = (
    new_outputs
    .merge(
        classification_other[['root_id', 'super_class']],
        how='left',
        left_on='post_root_id',
        right_on='root_id',
    )
    .drop(columns=['root_id'])
    .rename(columns={'super_class': 'output_super_class'})
)

# Step 4: save the new table as CSV.
new_outputs_with_class.to_csv("set_2_hop_3_opt_conns_superclass.csv", index=False)
print("CSV table saved as 'set_2_hop_3_opt_conns_superclass.csv'")


In [None]:
import pandas as pd
import plotly.graph_objects as go

# Load the first round data: Set 2 outputs (first-order)
df1 = pd.read_csv("set_2_opt_conns_superclass.csv")
# Load the second round data: outputs where we treat the first round outputs as new pre neurons
df2 = pd.read_csv("set_2_hop_1_opt_conns_superclass.csv")

# --- Build flows from "Set 2" to first-level super classes ---
# Group df1 by the first-level output super_class and count connections
flow1 = df1.groupby('output_super_class').size().reset_index(name='count')

# --- Build flows from first-level to second-level super classes ---
# Merge the two dataframes by linking the first round's post_root_id (the output neuron)
# with the second round's pre_root_id (the same neuron now acting as input)
merged = pd.merge(df1, df2, left_on='post_root_id', right_on='pre_root_id', suffixes=('_first', '_second'))

# Group by first-level and second-level super classes and count connections
flow2 = merged.groupby(['output_super_class_first', 'output_super_class_second']).size().reset_index(name='count')

# --- Define Sankey nodes ---
# Three columns:
#   Column 1: "Set 2" (a single node)
#   Column 2: Unique first-level super classes from df1
#   Column 3: Unique second-level super classes from the merged table
# The same super class name can occur in column 2 AND column 3. A single
# label -> index dictionary would keep only the last index for such a name, which
# collapses the two columns and turns "class -> same class" flows into self-loops.
# Every column therefore gets its own lookup table; the labels stay unchanged.
first_super_nodes = flow1['output_super_class'].unique().tolist()  # Column 2
second_super_nodes = flow2['output_super_class_second'].unique().tolist()  # Column 3

nodes = ["Set 2"] + first_super_nodes + second_super_nodes

# Node indices follow the order of `nodes`: 0 = "Set 2", then column 2, then column 3.
set_node = 0
first_offset = 1
second_offset = first_offset + len(first_super_nodes)
first_index = {label: first_offset + pos for pos, label in enumerate(first_super_nodes)}
second_index = {label: second_offset + pos for pos, label in enumerate(second_super_nodes)}

# --- Create Sankey link data ---

# Flow 1: from "Set 2" (source) to each first-level super_class node
source1 = [set_node] * len(flow1)
target1 = flow1['output_super_class'].map(first_index).tolist()
value1 = flow1['count'].tolist()

# Flow 2: from first-level super class (column 2) to second-level super class (column 3)
source2 = flow2['output_super_class_first'].map(first_index).tolist()
target2 = flow2['output_super_class_second'].map(second_index).tolist()
value2 = flow2['count'].tolist()

# Combine flows from both steps
source = source1 + source2
target = target1 + target2
value  = value1  + value2

# --- Build and display the Sankey diagram ---
fig = go.Figure(data=[go.Sankey(
    node = dict(
        pad = 15,
        thickness = 20,
        line = dict(color = "black", width = 0.5),
        label = nodes,
    ),
    link = dict(
        source = source,
        target = target,
        value = value
    ))])

fig.update_layout(title_text="Sankey Diagram: Set 2 to First and Second Level Super Classes", font_size=10)
fig.show()


In [None]:
import pandas as pd
import plotly.graph_objects as go

# ---------------------------
# Three-hop Sankey diagram for Set 2 (row counts per super class)
# ---------------------------
# Load the connection tables written by the earlier cells. Each row is one
# connection and 'output_super_class' is the super class of its post neuron.
df1 = pd.read_csv("set_2_opt_conns_superclass.csv")        # Set 2 -> hop 1
df2 = pd.read_csv("set_2_hop_1_opt_conns_superclass.csv")  # hop 1 -> hop 2
df3 = pd.read_csv("set_2_hop_2_opt_conns_superclass.csv")  # hop 2 -> hop 3

# ---------------------------
# Build flows using vectorized operations
# ---------------------------
# Flow 1: From "Set 2" to first-level super classes (row count per class in df1).
flow1 = df1.groupby('output_super_class').size().reset_index(name='count')
flow1['source'] = "Set 2"  # all connections originate from "Set 2"

# Flow 2: From first-level to second-level super classes.
# df1.post_root_id is joined to df2.pre_root_id; the count is the number of
# joined rows (i.e. two-step row pairs), not the number of distinct neurons.
merged1 = pd.merge(df1, df2, left_on='post_root_id', right_on='pre_root_id',
                   suffixes=('_first', '_second'))
flow2 = merged1.groupby(['output_super_class_first', 'output_super_class_second']).size().reset_index(name='count')

# Flow 3: From second-level to third-level super classes (same join, one hop later).
merged2 = pd.merge(df2, df3, left_on='post_root_id', right_on='pre_root_id',
                   suffixes=('_second', '_third'))
flow3 = merged2.groupby(['output_super_class_second', 'output_super_class_third']).size().reset_index(name='count')

# ---------------------------
# Define nodes for the Sankey diagram
# ---------------------------
# Column 1: "Set 2" (a single node)
# Column 2: first-level super classes (from flow1)
# Column 3: second-level super classes (targets of flow2 and sources of flow3)
# Column 4: third-level super classes (targets of flow3)
first_nodes = flow1['output_super_class'].unique().tolist()
second_nodes = pd.concat([flow2['output_super_class_second'], flow3['output_super_class_second']]).unique().tolist()
third_nodes = flow3['output_super_class_third'].unique().tolist()

# The same super class (e.g. "central") can occur at every hop. Nodes are
# therefore keyed by (hop, label); keying by the bare label would map all
# occurrences to one index and turn the diagram into self-links and cycles.
node_keys = [(0, "Set 2")]
node_keys += [(1, label) for label in first_nodes]
node_keys += [(2, label) for label in second_nodes]
node_keys += [(3, label) for label in third_nodes]

# Display labels carry the hop number so that they stay unique and readable.
nodes = ["Set 2"] + [f"{hop}: {label}" for hop, label in node_keys[1:]]

# Mapping from (hop, label) to the node's position in `nodes`.
node_index = {key: i for i, key in enumerate(node_keys)}

# ---------------------------
# Map every flow row to its source / target node index
# ---------------------------
flow1['source_idx'] = node_index[(0, "Set 2")]
flow1['target_idx'] = [node_index[(1, label)] for label in flow1['output_super_class']]

flow2['source_idx'] = [node_index[(1, label)] for label in flow2['output_super_class_first']]
flow2['target_idx'] = [node_index[(2, label)] for label in flow2['output_super_class_second']]

flow3['source_idx'] = [node_index[(2, label)] for label in flow3['output_super_class_second']]
flow3['target_idx'] = [node_index[(3, label)] for label in flow3['output_super_class_third']]

# Combine flows into link lists for the Sankey diagram
source = pd.concat([flow1['source_idx'], flow2['source_idx'], flow3['source_idx']]).tolist()
target = pd.concat([flow1['target_idx'], flow2['target_idx'], flow3['target_idx']]).tolist()
value  = pd.concat([flow1['count'], flow2['count'], flow3['count']]).tolist()

# ---------------------------
# Compute incoming and outgoing values for each node (used as hover text)
# ---------------------------
node_incoming = {node: 0 for node in nodes}
node_outgoing = {node: 0 for node in nodes}

for s, t, v in zip(source, target, value):
    node_outgoing[nodes[s]] += v
    node_incoming[nodes[t]] += v

customdata = []
for node in nodes:
    incoming = node_incoming[node]
    outgoing = node_outgoing[node]
    total = max(incoming, outgoing)
    customdata.append(f"Incoming: {incoming}<br>Outgoing: {outgoing}<br>Total: {total}")

# ---------------------------
# Build and display the Sankey diagram with custom hover text
# ---------------------------
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes,
        customdata=customdata,
        hovertemplate='%{customdata}<extra>%{label}</extra>'
    ),
    link=dict(
        source=source,
        target=target,
        value=value
    )
)])

fig.update_layout(title_text="Sankey Diagram: Neural Pathway Hops", font_size=10)
fig.show()

# ---------------------------
# Determine hop counts for a target super class (e.g., "motor")
# ---------------------------
target_class = "motor"
hop1_motor = flow1[flow1['output_super_class'] == target_class]['count'].sum()
hop2_motor = flow2[flow2['output_super_class_second'] == target_class]['count'].sum()
hop3_motor = flow3[flow3['output_super_class_third'] == target_class]['count'].sum()

print(f"Connections reaching '{target_class}' neurons at 1 hop (direct):", hop1_motor)
print(f"Connections reaching '{target_class}' neurons at 2 hops:", hop2_motor)
print(f"Connections reaching '{target_class}' neurons at 3 hops:", hop3_motor)


In [None]:
#fig.show(renderer="browser")
#fig.write_html("sankey_diagram.html")

In [None]:
import pandas as pd
import plotly.graph_objects as go

# ---------------------------
# Load CSV files and filter out weak connections (< 5 synapses)
# ---------------------------
MIN_SYN_COUNT = 5  # connections with fewer synapses are ignored at every hop

df1 = pd.read_csv("set_2_opt_conns_superclass.csv")
df1 = df1[df1['syn_count'] >= MIN_SYN_COUNT]

df2 = pd.read_csv("set_2_hop_1_opt_conns_superclass.csv")
df2 = df2[df2['syn_count'] >= MIN_SYN_COUNT]

df3 = pd.read_csv("set_2_hop_2_opt_conns_superclass.csv")
df3 = df3[df3['syn_count'] >= MIN_SYN_COUNT]

# ---------------------------
# Build flows (using summed synapse counts)
# ---------------------------
# Flow 1: From "Set 2" to first-level super classes (from df1)
flow1 = df1.groupby('output_super_class')['syn_count'].sum().reset_index(name='count')
flow1['source'] = "Set 2"  # all connections originate from "Set 2"

# Flow 2: From first-level to second-level super classes.
# The upstream side is reduced to one row per reached neuron before joining.
# Joining the raw df1 rows would repeat every df2 connection once per df1 row
# that targets the same neuron and inflate the summed synapse counts.
first_hop_targets = (
    df1[['post_root_id', 'output_super_class']]
    .drop_duplicates()
    .rename(columns={'post_root_id': 'pre_root_id',
                     'output_super_class': 'output_super_class_first'})
)
merged1 = first_hop_targets.merge(
    df2.rename(columns={'output_super_class': 'output_super_class_second'}),
    on='pre_root_id'
)
flow2 = merged1.groupby(['output_super_class_first', 'output_super_class_second'])['syn_count'] \
    .sum().reset_index(name='count')

# Flow 3: From second-level to third-level super classes.
# Only neurons actually reached through retained hop-1 and hop-2 connections
# (the post neurons of merged1) are followed, again one row per neuron.
second_hop_targets = (
    merged1[['post_root_id', 'output_super_class_second']]
    .drop_duplicates()
    .rename(columns={'post_root_id': 'pre_root_id'})
)
merged2 = second_hop_targets.merge(
    df3.rename(columns={'output_super_class': 'output_super_class_third'}),
    on='pre_root_id'
)
flow3 = merged2.groupby(['output_super_class_second', 'output_super_class_third'])['syn_count'] \
    .sum().reset_index(name='count')

# ---------------------------
# Build node lists directly from the data sources for each hop.
# Labels are prefixed with the hop number so the same super class gets a
# separate node in every column. Missing super classes (NaN) are skipped:
# groupby drops them from the flows, and sorting NaN with strings would fail.
# ---------------------------
col1 = ["Set 2"]  # Column 1: the source
col2 = ["1: " + label for label in sorted(df1['output_super_class'].dropna().unique())]
col3 = ["2: " + label for label in sorted(df2['output_super_class'].dropna().unique())]
col4 = ["3: " + label for label in sorted(df3['output_super_class'].dropna().unique())]

nodes = col1 + col2 + col3 + col4
node_index = {node: i for i, node in enumerate(nodes)}

# ---------------------------
# Translate flow endpoints into node indices via the prefixed labels.
# ---------------------------
flow1['source_idx'] = node_index["Set 2"]
flow1['target_idx'] = flow1['output_super_class'].map(lambda x: node_index["1: " + x])

flow2['source_idx'] = flow2['output_super_class_first'].map(lambda x: node_index["1: " + x])
flow2['target_idx'] = flow2['output_super_class_second'].map(lambda x: node_index["2: " + x])

flow3['source_idx'] = flow3['output_super_class_second'].map(lambda x: node_index["2: " + x])
flow3['target_idx'] = flow3['output_super_class_third'].map(lambda x: node_index["3: " + x])

# Combine flows into link lists.
source = pd.concat([flow1['source_idx'], flow2['source_idx'], flow3['source_idx']]).tolist()
target = pd.concat([flow1['target_idx'], flow2['target_idx'], flow3['target_idx']]).tolist()
value  = pd.concat([flow1['count'], flow2['count'], flow3['count']]).tolist()

# ---------------------------
# Compute custom hover info for nodes: summed incoming / outgoing link values.
# ---------------------------
node_incoming = {node: 0 for node in nodes}
node_outgoing = {node: 0 for node in nodes}

for s, t, v in zip(source, target, value):
    node_outgoing[nodes[s]] += v
    node_incoming[nodes[t]] += v

customdata = []
for node in nodes:
    incoming = node_incoming[node]
    outgoing = node_outgoing[node]
    # "Total" is the larger of the two sums.
    total = max(incoming, outgoing)
    customdata.append(f"Incoming: {incoming}<br>Outgoing: {outgoing}<br>Total: {total}")

# ---------------------------
# Requested x positions: one column per hop (0.0, 0.33, 0.66, 1.0).
# NOTE(review): Plotly's Sankey appears to honour node.x only when node.y is
# supplied as well and both are non-zero; confirm these take effect.
# ---------------------------
x_positions = []
x_positions += [0.0] * len(col1)
x_positions += [0.33] * len(col2)
x_positions += [0.66] * len(col3)
x_positions += [1.0] * len(col4)

# ---------------------------
# Build and display the Sankey diagram with custom hover text for nodes.
# ---------------------------
fig = go.Figure(data=[go.Sankey(
    arrangement="freeform",  # you can use "fixed" or "freeform" as desired
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes,
        x=x_positions,
        customdata=customdata,
        hovertemplate='%{customdata}<extra>%{label}</extra>'
    ),
    link=dict(
        source=source,
        target=target,
        value=value
    )
)])
fig.update_layout(title_text="Linear Sankey Diagram: Neural Pathway Hops", font_size=10)
fig.show()


In [None]:
import pandas as pd
import plotly.graph_objects as go
import colorsys

# ---------------------------
# Helper functions for colors
# ---------------------------
def get_color(i, n):
    """Return the i-th of n evenly hue-spaced colors as an 'rgb(r,g,b)' string.

    The hue is i / n; saturation (0.6) and value (0.9) are fixed. Each channel
    is scaled to 0-255 and truncated to an integer.
    """
    channels = colorsys.hsv_to_rgb(i / n, 0.6, 0.9)
    red, green, blue = (int(channel * 255) for channel in channels)
    return f"rgb({red},{green},{blue})"

def make_rgba(rgb_str, alpha=0.5):
    """Convert an 'rgb(r,g,b)' string (e.g., 'rgb(31,119,180)') to 'rgba(r,g,b,alpha)'.

    Parameters:
        rgb_str: color in 'rgb(r,g,b)' form; an 'rgba(r,g,b,a)' string is also
            accepted, in which case its alpha is replaced.
        alpha: opacity written as the fourth component of the result.

    Returns:
        The color as an 'rgba(r,g,b,alpha)' string.

    Raises:
        ValueError: if rgb_str is not an rgb()/rgba() string with at least
            three components.
    """
    # Parse on the parentheses instead of str.strip("rgb("): strip() removes a
    # *set of characters*, not a prefix, and mangles inputs such as "rgba(...)".
    open_idx = rgb_str.find("(")
    close_idx = rgb_str.rfind(")")
    prefix = rgb_str[:open_idx].strip().lower() if open_idx != -1 else ""
    if prefix not in ("rgb", "rgba") or close_idx < open_idx:
        raise ValueError(f"Expected an 'rgb(r,g,b)' color string, got {rgb_str!r}")

    channels = [part.strip() for part in rgb_str[open_idx + 1:close_idx].split(",")]
    if len(channels) < 3 or not all(channels[:3]):
        raise ValueError(f"Expected three color components in {rgb_str!r}")

    red, green, blue = channels[:3]
    return f"rgba({red},{green},{blue},{alpha})"

# ---------------------------
# Load CSV files and filter out weak connections (< 5 synapses)
# ---------------------------
MIN_SYN_COUNT = 5  # connections with fewer synapses are ignored at every hop

df1 = pd.read_csv("set_2_opt_conns_superclass.csv")
df1 = df1[df1['syn_count'] >= MIN_SYN_COUNT]

df2 = pd.read_csv("set_2_hop_1_opt_conns_superclass.csv")
df2 = df2[df2['syn_count'] >= MIN_SYN_COUNT]

df3 = pd.read_csv("set_2_hop_2_opt_conns_superclass.csv")
df3 = df3[df3['syn_count'] >= MIN_SYN_COUNT]

# ---------------------------
# Build flows (using summed synapse counts)
# ---------------------------
# Flow 1: From "Set 2" to first-level super classes (from df1)
flow1 = df1.groupby('output_super_class')['syn_count'].sum().reset_index(name='count')
flow1['source'] = "Set 2"  # all connections originate from "Set 2"

# Flow 2: From first-level to second-level super classes.
# The upstream side is reduced to one row per reached neuron before joining.
# Joining the raw df1 rows would repeat every df2 connection once per df1 row
# that targets the same neuron and inflate the summed synapse counts.
first_hop_targets = (
    df1[['post_root_id', 'output_super_class']]
    .drop_duplicates()
    .rename(columns={'post_root_id': 'pre_root_id',
                     'output_super_class': 'output_super_class_first'})
)
merged1 = first_hop_targets.merge(
    df2.rename(columns={'output_super_class': 'output_super_class_second'}),
    on='pre_root_id'
)
flow2 = merged1.groupby(['output_super_class_first', 'output_super_class_second'])['syn_count'] \
    .sum().reset_index(name='count')

# Flow 3: From second-level to third-level super classes.
# Only neurons actually reached through retained hop-1 and hop-2 connections
# (the post neurons of merged1) are followed, again one row per neuron.
second_hop_targets = (
    merged1[['post_root_id', 'output_super_class_second']]
    .drop_duplicates()
    .rename(columns={'post_root_id': 'pre_root_id'})
)
merged2 = second_hop_targets.merge(
    df3.rename(columns={'output_super_class': 'output_super_class_third'}),
    on='pre_root_id'
)
flow3 = merged2.groupby(['output_super_class_second', 'output_super_class_third'])['syn_count'] \
    .sum().reset_index(name='count')

# ---------------------------
# Build node lists directly from the data sources for each hop.
# ---------------------------
# Column 1: "Set 2"
# Column 2: first-level super classes from df1 (prefixed "1: ")
# Column 3: second-level super classes from df2 (prefixed "2: ")
# Column 4: third-level super classes from df3 (prefixed "3: ")
# Missing super classes (NaN) are skipped: groupby drops them from the flows,
# and sorting NaN together with strings would raise a TypeError.
col1 = ["Set 2"]
col2 = ["1: " + label for label in sorted(df1['output_super_class'].dropna().unique())]
col3 = ["2: " + label for label in sorted(df2['output_super_class'].dropna().unique())]
col4 = ["3: " + label for label in sorted(df3['output_super_class'].dropna().unique())]

nodes = col1 + col2 + col3 + col4
node_index = {node: i for i, node in enumerate(nodes)}

# ---------------------------
# Assign each node a unique color (evenly spaced hues).
# ---------------------------
n_nodes = len(nodes)
node_colors = [get_color(i, n_nodes) for i in range(n_nodes)]

# ---------------------------
# Translate flow endpoints into node indices via the prefixed labels.
# ---------------------------
flow1['source_idx'] = node_index["Set 2"]
flow1['target_idx'] = flow1['output_super_class'].map(lambda x: node_index["1: " + x])

flow2['source_idx'] = flow2['output_super_class_first'].map(lambda x: node_index["1: " + x])
flow2['target_idx'] = flow2['output_super_class_second'].map(lambda x: node_index["2: " + x])

flow3['source_idx'] = flow3['output_super_class_second'].map(lambda x: node_index["2: " + x])
flow3['target_idx'] = flow3['output_super_class_third'].map(lambda x: node_index["3: " + x])

# Combine flows into link lists.
source = pd.concat([flow1['source_idx'], flow2['source_idx'], flow3['source_idx']]).tolist()
target = pd.concat([flow1['target_idx'], flow2['target_idx'], flow3['target_idx']]).tolist()
value  = pd.concat([flow1['count'], flow2['count'], flow3['count']]).tolist()

# ---------------------------
# Compute custom hover info for nodes: summed incoming / outgoing link values.
# ---------------------------
node_incoming = {node: 0 for node in nodes}
node_outgoing = {node: 0 for node in nodes}
for s, t, v in zip(source, target, value):
    node_outgoing[nodes[s]] += v
    node_incoming[nodes[t]] += v

customdata = []
for node in nodes:
    incoming = node_incoming[node]
    outgoing = node_outgoing[node]
    total = max(incoming, outgoing)
    customdata.append(f"Incoming: {incoming}<br>Outgoing: {outgoing}<br>Total: {total}")

# ---------------------------
# Each link takes its source node's color at 50% opacity.
# ---------------------------
link_colors = [make_rgba(node_colors[s], 0.5) for s in source]

# ---------------------------
# Requested x positions: one column per hop (0.0, 0.33, 0.66, 1.0).
# NOTE(review): Plotly's Sankey appears to honour node.x only when node.y is
# supplied as well and both are non-zero; confirm these take effect.
# ---------------------------
x_positions = []
x_positions += [0.0] * len(col1)
x_positions += [0.33] * len(col2)
x_positions += [0.66] * len(col3)
x_positions += [1.0] * len(col4)

# ---------------------------
# Build and display the Sankey diagram with custom hover text and link colors.
# ---------------------------
fig = go.Figure(data=[go.Sankey(
    arrangement="freeform",
    node=dict(
        pad=80,
        thickness=50,
        line=dict(color="black", width=0.9),
        label=nodes,
        x=x_positions,
        color=node_colors,
        customdata=customdata,
        hovertemplate='%{customdata}<extra>%{label}</extra>'
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color=link_colors
    )
)])

fig.update_layout(title_text="Linear Sankey Diagram: Neural Pathway Hops", font_size=10)
fig.show()


In [None]:
# Display the current `flow1` table for inspection.
# NOTE(review): `flow1` is reassigned by several cells, so what is shown here
# depends on which of them ran last — presumably the colored Sankey cell above; confirm.
flow1

In [None]:
import pandas as pd

# Load connectivity CSV files (Set 2 outputs, then the outputs one and two hops further)
df1 = pd.read_csv("set_2_opt_conns_superclass.csv")
df2 = pd.read_csv("set_2_hop_1_opt_conns_superclass.csv")
df3 = pd.read_csv("set_2_hop_2_opt_conns_superclass.csv")

target_class = "motor"

# Only these columns matter for reachability. Dropping everything else and
# de-duplicating at every hop keeps the joins small: merging the full tables
# creates one row per path and per neuropil, which grows combinatorially.
edge_cols = ['pre_root_id', 'post_root_id', 'output_super_class']

# ---------------------------
# 1-hop: Direct connections from Set 2 to a motor neuron.
# ---------------------------
df1_motor = (
    df1.loc[df1['output_super_class'] == target_class, ['pre_root_id']]
    .drop_duplicates()
    .assign(hop=1)
)

# ---------------------------
# 2-hop: Set 2 → Neuron2 → Neuron3, where Neuron3 is a motor neuron.
# ---------------------------
# df12 holds one row per (starting Set 2 neuron, neuron reached after two hops),
# together with the super class of the reached neuron.
df12 = (
    df1[['pre_root_id', 'post_root_id']]
    .drop_duplicates()
    .rename(columns={'pre_root_id': 'start_root_id', 'post_root_id': 'via_root_id'})
    .merge(df2[edge_cols].drop_duplicates(), left_on='via_root_id', right_on='pre_root_id')
    .loc[:, ['start_root_id', 'post_root_id', 'output_super_class']]
    .drop_duplicates()
)

df12_motor = (
    df12.loc[df12['output_super_class'] == target_class, ['start_root_id']]
    .drop_duplicates()
    .rename(columns={'start_root_id': 'pre_root_id'})
    .assign(hop=2)
)

# ---------------------------
# 3-hop: Set 2 → Neuron2 → Neuron3 → Neuron4, where Neuron4 is a motor neuron.
# ---------------------------
# Same construction one hop further: the neurons reached after two hops become
# the intermediate ("via") neurons for the join against df3.
df123 = (
    df12[['start_root_id', 'post_root_id']]
    .drop_duplicates()
    .rename(columns={'post_root_id': 'via_root_id'})
    .merge(df3[edge_cols].drop_duplicates(), left_on='via_root_id', right_on='pre_root_id')
    .loc[:, ['start_root_id', 'post_root_id', 'output_super_class']]
    .drop_duplicates()
)

df123_motor = (
    df123.loc[df123['output_super_class'] == target_class, ['start_root_id']]
    .drop_duplicates()
    .rename(columns={'start_root_id': 'pre_root_id'})
    .assign(hop=3)
)

# ---------------------------
# Combine and determine the minimum hop count per starting neuron.
# Starting neurons that reach no target-class neuron within three hops do not
# appear in min_hops at all.
# ---------------------------
df_hops = pd.concat([df1_motor, df12_motor, df123_motor], ignore_index=True)
min_hops = df_hops.groupby('pre_root_id', as_index=False)['hop'].min()

print(min_hops)


In [None]:
# Descriptive statistics of the minimum hop count per starting neuron.
# Only neurons present in `min_hops` are included.
hop_values = min_hops['hop']

count = hop_values.count()
min_hop, max_hop = hop_values.min(), hop_values.max()
mean_hops = hop_values.mean()
median_hops = hop_values.median()
std_hops = hop_values.std()

# Assemble the report first, then emit it line by line.
report_lines = [
    "Descriptive statistics for hops to reach a motor neuron:",
    f"Number of neurons: {count}",
    f"Mean hops: {mean_hops:.2f}",
    f"Median hops: {median_hops}",
    f"Standard Deviation: {std_hops:.2f}",
    f"Minimum hops: {min_hop}",
    f"Maximum hops: {max_hop}",
]
for line in report_lines:
    print(line)

# pandas' built-in summary of the same column, for comparison.
print("\nSummary statistics:")
print(hop_values.describe())


In [None]:
import plotly.express as px

# Coerce the hop column to a numeric dtype (written back into min_hops) and
# take the largest hop count as a plain Python int for the bin settings.
hop_numeric = pd.to_numeric(min_hops['hop'])
min_hops['hop'] = hop_numeric
max_hop = int(min_hops['hop'].max())

# Axis label shared by the box and violin plots.
hop_axis_label = {"hop": "Minimum Hops"}

# --- Histogram ---
# nbins = max_hop + 1 and an x range of [0.5, max_hop + 0.5] are meant to give
# every integer hop value its own bar; ticks are forced to whole numbers.
histogram_labels = {"hop": "Minimum Hops", "count": "Number of Neurons"}
fig_hist = px.histogram(
    min_hops,
    x="hop",
    nbins=max_hop + 1,
    range_x=[0.5, max_hop + 0.5],
    labels=histogram_labels,
    title="Distribution of Minimum Hops to Reach a Motor Neuron",
)
fig_hist.update_xaxes(dtick=1)
fig_hist.show()

# --- Box Plot ---
fig_box = px.box(
    min_hops,
    y="hop",
    labels=hop_axis_label,
    title="Box Plot of Minimum Hops to Reach a Motor Neuron",
)
fig_box.show()

# --- Violin Plot ---
# Includes an embedded box and shows every individual data point.
fig_violin = px.violin(
    min_hops,
    y="hop",
    points="all",
    box=True,
    labels=hop_axis_label,
    title="Violin Plot of Minimum Hops to Reach a Motor Neuron",
)
fig_violin.show()


In [None]:
##fig.show(renderer="browser")
#fig.write_html("sankey_diagram.html")

In [None]:
# Annotate every Set 3 output connection with the super class of its post neuron.
# (Assumes set_3_outputs has a column 'post_root_id' and classification_other has
# 'root_id' and 'super_class'.)
# The left join keeps every connection; the helper key 'root_id' is dropped and
# 'super_class' is renamed to 'output_super_class'.
super_class_lookup = classification_other[['root_id', 'super_class']]
set_3_outputs_with_class = (
    pd.merge(
        set_3_outputs,
        super_class_lookup,
        how='left',
        left_on='post_root_id',
        right_on='root_id',
    )
    .drop(columns=['root_id'])
    .rename(columns={'super_class': 'output_super_class'})
)

# Persist the annotated table for the Sankey cells further down.
output_csv = "set_3_opt_conns_superclass.csv"
set_3_outputs_with_class.to_csv(output_csv, index=False)
print("CSV table saved as 'set_3_opt_conns_superclass.csv'")


In [None]:
# ------------------------------------------------------------------------------
# Hop 1 for Set 3: follow the outputs of the neurons that Set 3 connects to.
# ------------------------------------------------------------------------------
# 'set_3_outputs' is expected to be the first-round connection table with a
# 'post_root_id' column. Its unique post ids become the new presynaptic
# neurons, wrapped in a one-column ('root_id') frame as neuronal_outputs expects.
first_round_post_ids = set_3_outputs['post_root_id'].unique()
new_pre_neurons = pd.DataFrame({'root_id': first_round_post_ids})

# Downstream connections of those neurons.
new_outputs = neuronal_outputs(new_pre_neurons)

# Attach the super class of each downstream (post) neuron via a left join on
# the classification table, then drop the join key and rename the class column.
new_outputs_with_class = (
    pd.merge(
        new_outputs,
        classification_other[['root_id', 'super_class']],
        how='left',
        left_on='post_root_id',
        right_on='root_id',
    )
    .drop(columns=['root_id'])
    .rename(columns={'super_class': 'output_super_class'})
)

# Write the annotated hop-1 table to disk.
new_outputs_with_class.to_csv("set_3_hop_1_opt_conns_superclass.csv", index=False)
print("CSV table saved as 'set_3_hop_1_opt_conns_superclass.csv'")


I now want to make a Sankey plot of this data with three axes: axis 1 shows Set 3, axis 2 shows the first-level super classes, and axis 3 shows the second-level super classes. For context, here is the head of one of the CSVs:

| pre_root_id | post_root_id | neuropil | syn_count | nt_type | location_of_connection | output_super_class |
|---|---|---|---|---|---|---|
| 720575940617034713 | 720575940628071211 | PRW | 5 | SER | local | central |
| 720575940617034713 | 720575940633548128 | FLA_R | 7 | SER | local | ascending |
| 720575940617034713 | 720575940621662332 | FLA_R | 8 | SER | local | central |
| 720575940617034713 | 720575940630672938 | FLA_R | 8 | SER | local | central |
| 720575940617034713 | 720575940630672938 | PRW | 16 | ACH | local | central |

In [None]:
import pandas as pd

# ------------------------------------------------------------------------------
# Hop 2 for Set 3: follow the outputs of the hop-1 post neurons.
# ------------------------------------------------------------------------------
# Reload the hop-1 table from disk; its unique post ids act as the new
# presynaptic neurons (one-column 'root_id' frame for neuronal_outputs).
prev_outputs = pd.read_csv("set_3_hop_1_opt_conns_superclass.csv")
hop1_post_ids = prev_outputs['post_root_id'].unique()
new_pre_neurons_2 = pd.DataFrame({'root_id': hop1_post_ids})

# Downstream connections of those neurons.
new_outputs_2 = neuronal_outputs(new_pre_neurons_2)

# Left-join the classification table on the post neuron id to add its
# super class, drop the join key and rename the class column.
new_outputs_2_with_class = (
    pd.merge(
        new_outputs_2,
        classification_other[['root_id', 'super_class']],
        how='left',
        left_on='post_root_id',
        right_on='root_id',
    )
    .drop(columns=['root_id'])
    .rename(columns={'super_class': 'output_super_class'})
)

# Write the annotated hop-2 table to disk.
new_outputs_2_with_class.to_csv("set_3_hop_2_opt_conns_superclass.csv", index=False)
print("CSV table saved as 'set_3_hop_2_opt_conns_superclass.csv'")


In [None]:
# ------------------------------------------------------------------------------
# Outputs of the Set 3 second-order neurons (2Ns), saved as the "hop_3" table.
# ------------------------------------------------------------------------------
# 'set_3_2Ns' is expected to be the second-order neuron table produced earlier
# in the pipeline, with a 'root_id' column. Its unique ids are used as the
# presynaptic neurons.
# Note: this rebinds new_pre_neurons / new_outputs / new_outputs_with_class,
# which the hop-1 cell also assigns.
second_order_ids = set_3_2Ns['root_id'].unique()
new_pre_neurons = pd.DataFrame({'root_id': second_order_ids})

# All downstream connections of those neurons.
new_outputs = neuronal_outputs(new_pre_neurons)

# Add the super class of each downstream (post) neuron: left join on the
# classification table, drop the join key, rename the class column.
new_outputs_with_class = (
    pd.merge(
        new_outputs,
        classification_other[['root_id', 'super_class']],
        how='left',
        left_on='post_root_id',
        right_on='root_id',
    )
    .drop(columns=['root_id'])
    .rename(columns={'super_class': 'output_super_class'})
)

# Write the annotated table to disk.
new_outputs_with_class.to_csv("set_3_hop_3_opt_conns_superclass.csv", index=False)
print("CSV table saved as 'set_3_hop_3_opt_conns_superclass.csv'")


In [None]:
import pandas as pd
import plotly.graph_objects as go

# Load the first round data: Set 3 outputs (first-order)
df1 = pd.read_csv("set_3_opt_conns_superclass.csv")
# Load the second round data: outputs where we treat the first round outputs as new pre neurons
df2 = pd.read_csv("set_3_hop_1_opt_conns_superclass.csv")

# --- Build flows from "Set 3" to first-level super classes ---
# Row count per first-level output super class.
flow1 = df1.groupby('output_super_class').size().reset_index(name='count')

# --- Build flows from first-level to second-level super classes ---
# Link the first round's post_root_id (the output neuron) with the second
# round's pre_root_id (the same neuron now acting as input).
merged = pd.merge(df1, df2, left_on='post_root_id', right_on='pre_root_id', suffixes=('_first', '_second'))

# Number of joined rows per (first-level, second-level) super class pair.
flow2 = merged.groupby(['output_super_class_first', 'output_super_class_second']).size().reset_index(name='count')

# --- Define Sankey nodes ---
# Column 1: "Set 3" (a single node)
# Column 2: Unique first-level super classes from df1
# Column 3: Unique second-level super classes from the merged table
first_super_nodes = flow1['output_super_class'].unique().tolist()
second_super_nodes = flow2['output_super_class_second'].unique().tolist()

# A super class can occur in both column 2 and column 3. Nodes are therefore
# keyed by (level, label); keying by the bare label would map both occurrences
# to one index and produce self-links instead of a three-column diagram.
node_keys = [(0, "Set 3")]
node_keys += [(1, label) for label in first_super_nodes]
node_keys += [(2, label) for label in second_super_nodes]

# Display labels carry the level so that they stay unique and readable.
nodes = ["Set 3"] + [f"{level}: {label}" for level, label in node_keys[1:]]

# Mapping from (level, label) to the node's position in `nodes`.
node_index = {key: i for i, key in enumerate(node_keys)}

# --- Create Sankey link data ---

# Flow 1: from "Set 3" (source) to each first-level super_class node
source1 = [node_index[(0, "Set 3")]] * len(flow1)
target1 = [node_index[(1, label)] for label in flow1['output_super_class']]
value1 = flow1['count'].tolist()

# Flow 2: from first-level super class to second-level super class
source2 = [node_index[(1, label)] for label in flow2['output_super_class_first']]
target2 = [node_index[(2, label)] for label in flow2['output_super_class_second']]
value2 = flow2['count'].tolist()

# Combine flows from both steps
source = source1 + source2
target = target1 + target2
value  = value1  + value2

# --- Build and display the Sankey diagram ---
fig = go.Figure(data=[go.Sankey(
    node = dict(
        pad = 15,
        thickness = 20,
        line = dict(color = "black", width = 0.5),
        label = nodes,
    ),
    link = dict(
        source = source,
        target = target,
        value = value
    ))])

fig.update_layout(title_text="Sankey Diagram: Set 3 to First and Second Level Super Classes", font_size=10)
fig.show()


In [None]:
import pandas as pd
import plotly.graph_objects as go

# ---------------------------
# Load CSV files
# ---------------------------
df1 = pd.read_csv("set_3_opt_conns_superclass.csv")
df2 = pd.read_csv("set_3_hop_1_opt_conns_superclass.csv")
df3 = pd.read_csv("set_3_hop_2_opt_conns_superclass.csv")

# ---------------------------
# Build flows using vectorized operations
# ---------------------------
# Flow 1: From "Set 3" to first-level super classes (row counts of df1)
flow1 = df1.groupby('output_super_class').size().reset_index(name='count')
flow1['source'] = "Set 3"  # all connections originate from "Set 3"

# Flow 2: From first-level to second-level super classes.
# df1.post_root_id is matched against df2.pre_root_id; each merged row is one two-step chain.
merged1 = pd.merge(df1, df2, left_on='post_root_id', right_on='pre_root_id',
                   suffixes=('_first', '_second'))
flow2 = merged1.groupby(['output_super_class_first', 'output_super_class_second']).size().reset_index(name='count')

# Flow 3: From second-level to third-level super classes (df2.post_root_id -> df3.pre_root_id).
merged2 = pd.merge(df2, df3, left_on='post_root_id', right_on='pre_root_id',
                   suffixes=('_second', '_third'))
flow3 = merged2.groupby(['output_super_class_second', 'output_super_class_third']).size().reset_index(name='count')

# ---------------------------
# Define nodes for the Sankey diagram
# ---------------------------
# Column 1: "Set 3" (a single node)
# Column 2: Unique first-level super classes from flow1
# Column 3: Unique second-level super classes seen in flow2 or flow3
# Column 4: Unique third-level super classes from flow3
first_nodes = flow1['output_super_class'].unique().tolist()
second_nodes = pd.concat([flow2['output_super_class_second'], flow3['output_super_class_second']]).unique().tolist()
third_nodes = flow3['output_super_class_third'].unique().tolist()
nodes = ["Set 3"] + first_nodes + second_nodes + third_nodes

# The same super class can occur in several columns, so each column gets its own
# label -> node index map. A single label-keyed dict keeps only the last duplicate,
# which collapses the columns onto each other and creates self-loops.
second_offset = 1 + len(first_nodes)
third_offset = second_offset + len(second_nodes)
first_index = {label: 1 + pos for pos, label in enumerate(first_nodes)}
second_index = {label: second_offset + pos for pos, label in enumerate(second_nodes)}
third_index = {label: third_offset + pos for pos, label in enumerate(third_nodes)}

# Combined lookup keyed by (column, label); column 0 is the "Set 3" source node.
node_index = {(0, "Set 3"): 0}
for column, column_index in ((1, first_index), (2, second_index), (3, third_index)):
    node_index.update({(column, label): idx for label, idx in column_index.items()})

# ---------------------------
# Vectorized mapping to get link indices
# ---------------------------
flow1['source_idx'] = node_index[(0, "Set 3")]
flow1['target_idx'] = flow1['output_super_class'].map(first_index)

flow2['source_idx'] = flow2['output_super_class_first'].map(first_index)
flow2['target_idx'] = flow2['output_super_class_second'].map(second_index)

flow3['source_idx'] = flow3['output_super_class_second'].map(second_index)
flow3['target_idx'] = flow3['output_super_class_third'].map(third_index)

# Combine flows into link lists for the Sankey diagram
source = pd.concat([flow1['source_idx'], flow2['source_idx'], flow3['source_idx']]).tolist()
target = pd.concat([flow1['target_idx'], flow2['target_idx'], flow3['target_idx']]).tolist()
value  = pd.concat([flow1['count'], flow2['count'], flow3['count']]).tolist()

# ---------------------------
# Compute incoming and outgoing values for each node
# ---------------------------
# Totals are accumulated per node index (not per label) so that equally named nodes
# in different columns keep separate totals.
node_incoming = [0] * len(nodes)
node_outgoing = [0] * len(nodes)

for s, t, v in zip(source, target, value):
    node_outgoing[s] += v
    node_incoming[t] += v

customdata = [
    f"Incoming: {incoming}<br>Outgoing: {outgoing}<br>Total: {max(incoming, outgoing)}"
    for incoming, outgoing in zip(node_incoming, node_outgoing)
]

# ---------------------------
# Build and display the Sankey diagram with custom hover text
# ---------------------------
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes,
        customdata=customdata,
        hovertemplate='%{customdata}<extra>%{label}</extra>'
    ),
    link=dict(
        source=source,
        target=target,
        value=value
    )
)])

fig.update_layout(title_text="Sankey Diagram: Neural Pathway Hops", font_size=10)
fig.show()

# ---------------------------
# Determine hop counts for a target super class (e.g., "motor")
# ---------------------------
target_class = "motor"
hop1_motor = flow1[flow1['output_super_class'] == target_class]['count'].sum()
hop2_motor = flow2[flow2['output_super_class_second'] == target_class]['count'].sum()
hop3_motor = flow3[flow3['output_super_class_third'] == target_class]['count'].sum()

print(f"Connections reaching '{target_class}' neurons at 1 hop (direct):", hop1_motor)
print(f"Connections reaching '{target_class}' neurons at 2 hops:", hop2_motor)
print(f"Connections reaching '{target_class}' neurons at 3 hops:", hop3_motor)


In [None]:
#fig.show(renderer="browser")
#fig.write_html("sankey_diagram.html")

In [None]:
import pandas as pd
import plotly.graph_objects as go

# ---------------------------
# Read the three connection tables, keeping only rows with at least 5 synapses
# ---------------------------
df1 = pd.read_csv("set_3_opt_conns_superclass.csv")
df1 = df1.loc[lambda frame: frame['syn_count'] >= 5]

df2 = pd.read_csv("set_3_hop_1_opt_conns_superclass.csv")
df2 = df2.loc[lambda frame: frame['syn_count'] >= 5]

df3 = pd.read_csv("set_3_hop_2_opt_conns_superclass.csv")
df3 = df3.loc[lambda frame: frame['syn_count'] >= 5]

# ---------------------------
# Flow tables: summed synapse counts per pair of super classes
# ---------------------------
# Hop 1: "Set 3" -> first-level super class.
flow1 = (
    df1.groupby('output_super_class')['syn_count']
    .sum()
    .reset_index(name='count')
)
flow1['source'] = "Set 3"

# Hop 2: chain df1 rows to the df2 rows leaving the same neuron, then sum df2's synapses.
# NOTE(review): a df2 row appears once per matching df1 row, so its syn_count is summed
# once per upstream partner - confirm this weighting is intended.
merged1 = df1.merge(
    df2,
    left_on='post_root_id',
    right_on='pre_root_id',
    suffixes=('_first', '_second'),
)
flow2 = (
    merged1.groupby(['output_super_class_first', 'output_super_class_second'])['syn_count_second']
    .sum()
    .reset_index(name='count')
)

# Hop 3: same construction between df2 and df3, summing df3's synapses.
merged2 = df2.merge(
    df3,
    left_on='post_root_id',
    right_on='pre_root_id',
    suffixes=('_second', '_third'),
)
flow3 = (
    merged2.groupby(['output_super_class_second', 'output_super_class_third'])['syn_count_third']
    .sum()
    .reset_index(name='count')
)

# ---------------------------
# Node labels, one column per hop. The "<hop>: " prefix keeps equally named
# super classes of different hops apart.
# ---------------------------
col1 = ["Set 3"]
col2, col3, col4 = (
    [f"{hop}: " + label for label in sorted(frame['output_super_class'].unique())]
    for hop, frame in ((1, df1), (2, df2), (3, df3))
)

nodes = [*col1, *col2, *col3, *col4]
node_index = dict(zip(nodes, range(len(nodes))))

# ---------------------------
# Translate the flow tables into node indices via the prefixed labels
# ---------------------------
flow1['source_idx'] = node_index["Set 3"]
flow1['target_idx'] = flow1['output_super_class'].apply(lambda cls: node_index["1: " + cls])

flow2['source_idx'] = flow2['output_super_class_first'].apply(lambda cls: node_index["1: " + cls])
flow2['target_idx'] = flow2['output_super_class_second'].apply(lambda cls: node_index["2: " + cls])

flow3['source_idx'] = flow3['output_super_class_second'].apply(lambda cls: node_index["2: " + cls])
flow3['target_idx'] = flow3['output_super_class_third'].apply(lambda cls: node_index["3: " + cls])

# Link lists: hop 1 rows first, then hop 2, then hop 3.
all_flows = (flow1, flow2, flow3)
source = pd.concat([flow['source_idx'] for flow in all_flows]).tolist()
target = pd.concat([flow['target_idx'] for flow in all_flows]).tolist()
value  = pd.concat([flow['count'] for flow in all_flows]).tolist()

# ---------------------------
# Hover text: per-node incoming and outgoing sums taken from the link lists
# ---------------------------
node_incoming = dict.fromkeys(nodes, 0)
node_outgoing = dict.fromkeys(nodes, 0)

for src, dst, weight in zip(source, target, value):
    node_outgoing[nodes[src]] += weight
    node_incoming[nodes[dst]] += weight

# "Total" is the larger of the two sums.
customdata = [
    f"Incoming: {node_incoming[name]}<br>Outgoing: {node_outgoing[name]}"
    f"<br>Total: {max(node_incoming[name], node_outgoing[name])}"
    for name in nodes
]

# ---------------------------
# One x position per node: 0.0 for "Set 3", then 0.33, 0.66 and 1.0 for hops 1-3.
# NOTE(review): only node.x is passed (no node.y) - confirm plotly honours x on its own.
# ---------------------------
x_positions = [
    x_value
    for x_value, column in zip((0.0, 0.33, 0.66, 1.0), (col1, col2, col3, col4))
    for _ in column
]

# ---------------------------
# Assemble and show the figure
# ---------------------------
node_spec = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=nodes,
    x=x_positions,
    customdata=customdata,
    hovertemplate='%{customdata}<extra>%{label}</extra>',
)
link_spec = dict(source=source, target=target, value=value)

fig = go.Figure(data=[go.Sankey(arrangement="freeform", node=node_spec, link=link_spec)])
fig.update_layout(title_text="Linear Sankey Diagram: Neural Pathway Hops", font_size=10)
fig.show()


In [None]:
import pandas as pd
import plotly.graph_objects as go
import colorsys

# ---------------------------
# Helper functions for colors
# ---------------------------
def get_color(i, n):
    """Return an 'rgb(r,g,b)' string for node ``i`` out of ``n`` nodes.

    The hue is ``i / n``; saturation (0.6) and value (0.9) are fixed. Each
    channel is scaled to 0-255 and truncated to an integer.
    """
    channels = colorsys.hsv_to_rgb(i / n, 0.6, 0.9)
    red, green, blue = (int(channel * 255) for channel in channels)
    return "rgb({},{},{})".format(red, green, blue)

def make_rgba(rgb_str, alpha=0.5):
    """Convert a colour string such as 'rgb(31,119,180)' to 'rgba(31,119,180,<alpha>)'.

    Parameters
    ----------
    rgb_str : str
        Colour in 'rgb(r,g,b)' form. An 'rgba(r,g,b,a)' string is also accepted;
        its own alpha component is discarded.
    alpha : float, optional
        Opacity written into the result (default 0.5).

    Returns
    -------
    str
        'rgba(r,g,b,alpha)' with whitespace around the channel values removed.

    Raises
    ------
    ValueError
        If fewer than three comma-separated channel values are found.
    """
    # Take the text between the parentheses. str.strip("rgb(") would strip a *set* of
    # characters rather than a prefix, which breaks on inputs such as 'rgba(...)'.
    inner = rgb_str.rpartition("(")[2].partition(")")[0]
    channels = [part.strip() for part in inner.split(",")]
    if len(channels) < 3:
        raise ValueError(f"expected an 'rgb(r,g,b)' string, got {rgb_str!r}")
    red, green, blue = channels[:3]
    return f"rgba({red},{green},{blue},{alpha})"

# ---------------------------
# Read the three connection tables, keeping only rows with at least 5 synapses
# ---------------------------
df1 = pd.read_csv("set_3_opt_conns_superclass.csv")
df1 = df1.loc[lambda frame: frame['syn_count'] >= 5]

df2 = pd.read_csv("set_3_hop_1_opt_conns_superclass.csv")
df2 = df2.loc[lambda frame: frame['syn_count'] >= 5]

df3 = pd.read_csv("set_3_hop_2_opt_conns_superclass.csv")
df3 = df3.loc[lambda frame: frame['syn_count'] >= 5]

# ---------------------------
# Flow tables: summed synapse counts per pair of super classes
# ---------------------------
# Hop 1: "Set 3" -> first-level super class.
flow1 = (
    df1.groupby('output_super_class')['syn_count']
    .sum()
    .reset_index(name='count')
)
flow1['source'] = "Set 3"

# Hop 2: df1.post_root_id joined to df2.pre_root_id; df2's synapse counts are summed.
merged1 = df1.merge(
    df2,
    left_on='post_root_id',
    right_on='pre_root_id',
    suffixes=('_first', '_second'),
)
flow2 = (
    merged1.groupby(['output_super_class_first', 'output_super_class_second'])['syn_count_second']
    .sum()
    .reset_index(name='count')
)

# Hop 3: df2.post_root_id joined to df3.pre_root_id; df3's synapse counts are summed.
merged2 = df2.merge(
    df3,
    left_on='post_root_id',
    right_on='pre_root_id',
    suffixes=('_second', '_third'),
)
flow3 = (
    merged2.groupby(['output_super_class_second', 'output_super_class_third'])['syn_count_third']
    .sum()
    .reset_index(name='count')
)

# ---------------------------
# Node labels: "Set 3" followed by one prefixed, sorted column per hop
# ---------------------------
col1 = ["Set 3"]
col2, col3, col4 = (
    [f"{hop}: " + label for label in sorted(frame['output_super_class'].unique())]
    for hop, frame in ((1, df1), (2, df2), (3, df3))
)

nodes = [*col1, *col2, *col3, *col4]
node_index = dict(zip(nodes, range(len(nodes))))

# ---------------------------
# One colour per node, with hues spread evenly over the node list
# ---------------------------
n_nodes = len(nodes)
node_colors = [get_color(position, n_nodes) for position, _ in enumerate(nodes)]

# ---------------------------
# Translate the flow tables into node indices via the prefixed labels
# ---------------------------
flow1['source_idx'] = node_index["Set 3"]
flow1['target_idx'] = flow1['output_super_class'].apply(lambda cls: node_index["1: " + cls])

flow2['source_idx'] = flow2['output_super_class_first'].apply(lambda cls: node_index["1: " + cls])
flow2['target_idx'] = flow2['output_super_class_second'].apply(lambda cls: node_index["2: " + cls])

flow3['source_idx'] = flow3['output_super_class_second'].apply(lambda cls: node_index["2: " + cls])
flow3['target_idx'] = flow3['output_super_class_third'].apply(lambda cls: node_index["3: " + cls])

# Link lists: hop 1 rows first, then hop 2, then hop 3.
all_flows = (flow1, flow2, flow3)
source = pd.concat([flow['source_idx'] for flow in all_flows]).tolist()
target = pd.concat([flow['target_idx'] for flow in all_flows]).tolist()
value  = pd.concat([flow['count'] for flow in all_flows]).tolist()

# ---------------------------
# Hover text: per-node incoming and outgoing sums taken from the link lists
# ---------------------------
node_incoming = dict.fromkeys(nodes, 0)
node_outgoing = dict.fromkeys(nodes, 0)
for src, dst, weight in zip(source, target, value):
    node_outgoing[nodes[src]] += weight
    node_incoming[nodes[dst]] += weight

# "Total" is the larger of the two sums.
customdata = [
    f"Incoming: {node_incoming[name]}<br>Outgoing: {node_outgoing[name]}"
    f"<br>Total: {max(node_incoming[name], node_outgoing[name])}"
    for name in nodes
]

# ---------------------------
# Each link takes the colour of its source node at 50% opacity
# ---------------------------
link_colors = [make_rgba(node_colors[src], 0.5) for src in source]

# ---------------------------
# One x position per node: 0.0 for "Set 3", then 0.33, 0.66 and 1.0 for hops 1-3.
# NOTE(review): only node.x is passed (no node.y) - confirm plotly honours x on its own.
# ---------------------------
x_positions = [
    x_value
    for x_value, column in zip((0.0, 0.33, 0.66, 1.0), (col1, col2, col3, col4))
    for _ in column
]

# ---------------------------
# Assemble and show the figure
# ---------------------------
node_spec = dict(
    pad=80,
    thickness=50,
    line=dict(color="black", width=0.9),
    label=nodes,
    x=x_positions,
    color=node_colors,
    customdata=customdata,
    hovertemplate='%{customdata}<extra>%{label}</extra>',
)
link_spec = dict(source=source, target=target, value=value, color=link_colors)

fig = go.Figure(data=[go.Sankey(arrangement="freeform", node=node_spec, link=link_spec)])

fig.update_layout(title_text="Linear Sankey Diagram: Neural Pathway Hops", font_size=10)
fig.show()


In [None]:
# Display the first-hop flow table. `flow1` is not defined in this cell; it is whatever
# the most recently executed Sankey cell left in the kernel.
flow1

In [None]:
import pandas as pd

# Read the direct, hop-1 and hop-2 connection tables for Set 3
df1, df2, df3 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "set_3_opt_conns_superclass.csv",
        "set_3_hop_1_opt_conns_superclass.csv",
        "set_3_hop_2_opt_conns_superclass.csv",
    )
)

target_class = "motor"

# ---------------------------
# 1 hop: Set 3 neurons with a direct connection onto a target-class neuron
# ---------------------------
df1_motor = (
    df1.loc[df1['output_super_class'] == target_class, ['pre_root_id']]
    .drop_duplicates()
    .assign(hop=1)
)

# ---------------------------
# 2 hops: Set 3 neuron -> intermediate neuron -> target-class neuron
# ---------------------------
df12 = df1[['pre_root_id', 'post_root_id']].merge(
    df2,
    left_on='post_root_id',
    right_on='pre_root_id',
    suffixes=('_df1', '_df2'),
)
# Both inputs carry a 'pre_root_id' column, so the merge suffixes them; the Set 3
# starting neuron is the left-hand one. Fall back to the plain name if no suffix was added.
start_col = 'pre_root_id'
if 'pre_root_id_df1' in df12.columns:
    start_col = 'pre_root_id_df1'

df12_motor = (
    df12.loc[df12['output_super_class'] == target_class, [start_col]]
    .drop_duplicates()
    .rename(columns={start_col: 'pre_root_id'})
    .assign(hop=2)
)

# ---------------------------
# 3 hops: extend every 2-hop chain by the df3 connections leaving its last neuron
# ---------------------------
df123 = df12.merge(
    df3,
    left_on='post_root_id_df2',  # end point of the second leg
    right_on='pre_root_id',
    suffixes=('_df2', '_df3'),
)
# 'output_super_class' exists on both sides of this merge, so df3's copy is suffixed '_df3'.
df123_motor = (
    df123.loc[df123['output_super_class_df3'] == target_class, [start_col]]
    .drop_duplicates()
    .rename(columns={start_col: 'pre_root_id'})
    .assign(hop=3)
)

# ---------------------------
# Stack the three tables and keep the smallest hop count per starting neuron
# ---------------------------
df_hops = pd.concat([df1_motor, df12_motor, df123_motor], ignore_index=True)
min_hops = df_hops.groupby('pre_root_id', as_index=False)['hop'].min()

print(min_hops)


In [None]:
# Summary numbers for the per-neuron minimum hop counts held in `min_hops`
hop_values = min_hops['hop']

mean_hops = hop_values.mean()
median_hops = hop_values.median()
std_hops = hop_values.std()
min_hop = hop_values.min()
max_hop = hop_values.max()
count = hop_values.count()

report_lines = [
    "Descriptive statistics for hops to reach a motor neuron:",
    f"Number of neurons: {count}",
    f"Mean hops: {mean_hops:.2f}",
    f"Median hops: {median_hops}",
    f"Standard Deviation: {std_hops:.2f}",
    f"Minimum hops: {min_hop}",
    f"Maximum hops: {max_hop}",
]
for report_line in report_lines:
    print(report_line)

# The same information as produced by pandas' built-in summary
print("\nSummary statistics:")
print(hop_values.describe())


In [None]:
import plotly.express as px

# Coerce 'hop' to a numeric dtype (in place on min_hops) and take the largest hop as a Python int.
min_hops['hop'] = pd.to_numeric(min_hops['hop'])
max_hop = int(min_hops['hop'].max())

hop_axis_label = {"hop": "Minimum Hops"}

# --- Histogram ---
# nbins is max_hop + 1 and the x range is [0.5, max_hop + 0.5] so that each
# integer hop count is meant to get its own bin.
fig_hist = px.histogram(
    min_hops,
    x="hop",
    nbins=max_hop + 1,
    range_x=[0.5, max_hop + 0.5],
    title="Distribution of Minimum Hops to Reach a Motor Neuron",
    labels={**hop_axis_label, "count": "Number of Neurons"},
)
fig_hist.update_xaxes(dtick=1)
fig_hist.show()

# --- Box Plot ---
fig_box = px.box(
    min_hops,
    y="hop",
    labels=dict(hop_axis_label),
    title="Box Plot of Minimum Hops to Reach a Motor Neuron",
)
fig_box.show()

# --- Violin Plot (embedded box plot, every point drawn) ---
fig_violin = px.violin(
    min_hops,
    y="hop",
    points="all",
    box=True,
    labels=dict(hop_axis_label),
    title="Violin Plot of Minimum Hops to Reach a Motor Neuron",
)
fig_violin.show()


In [None]:
##fig.show(renderer="browser")
#fig.write_html("sankey_diagram.html")

In [None]:
import pandas as pd
import plotly.express as px

def analyze_endocrine_hops(set_prefix, target_class="endocrine"):
    """
    Find, for each starting neuron of a set, the minimum number of hops (1-3)
    needed to reach a neuron of ``target_class``.

    Parameters
    ----------
    set_prefix : str
        Prefix of the three CSV files that are read, e.g. "set_1":
        ``{set_prefix}_opt_conns_superclass.csv`` (direct connections),
        ``{set_prefix}_hop_1_opt_conns_superclass.csv`` and
        ``{set_prefix}_hop_2_opt_conns_superclass.csv``.
        Each file must provide the columns 'pre_root_id', 'post_root_id'
        and 'output_super_class'.
    target_class : str, optional
        The target output super_class (default is "endocrine").

    Returns
    -------
    min_hops : pd.DataFrame
        One row per starting neuron that reaches the target within three hops,
        with columns 'pre_root_id' (the starting neuron) and 'hop' (minimum hop count).
    """
    df1 = pd.read_csv(f"{set_prefix}_opt_conns_superclass.csv")
    df2 = pd.read_csv(f"{set_prefix}_hop_1_opt_conns_superclass.csv")
    df3 = pd.read_csv(f"{set_prefix}_hop_2_opt_conns_superclass.csv")

    # -------------------------------------------------------------------------
    # 1-Hop: direct connections from the set onto a target-class neuron.
    # -------------------------------------------------------------------------
    df1_target = df1[df1['output_super_class'] == target_class][['pre_root_id']].drop_duplicates()
    df1_target = df1_target.assign(hop=1)

    # -------------------------------------------------------------------------
    # 2-Hop: set neuron -> intermediate neuron (df1) -> target neuron (df2).
    # df1.post_root_id is matched against df2.pre_root_id.
    # -------------------------------------------------------------------------
    df12 = pd.merge(
        df1[['pre_root_id', 'post_root_id']],
        df2,
        left_on='post_root_id',
        right_on='pre_root_id',
        suffixes=('_df1', '_df2')
    )
    # Both inputs have a 'pre_root_id' column, so the merge renames them to
    # 'pre_root_id_df1' / 'pre_root_id_df2'. The starting neuron is the df1 one;
    # selecting the unsuffixed name raises KeyError here.
    start_col = 'pre_root_id_df1' if 'pre_root_id_df1' in df12.columns else 'pre_root_id'
    df12_target = df12[df12['output_super_class'] == target_class][[start_col]].drop_duplicates()
    df12_target = df12_target.rename(columns={start_col: 'pre_root_id'})
    df12_target = df12_target.assign(hop=2)

    # -------------------------------------------------------------------------
    # 3-Hop: extend the 2-hop chains with the df3 connections leaving their end point.
    # -------------------------------------------------------------------------
    merge_key = 'post_root_id_df2' if 'post_root_id_df2' in df12.columns else 'post_root_id'

    df123 = pd.merge(
        df12,
        df3,
        left_on=merge_key,
        right_on='pre_root_id',
        suffixes=('_df2', '_df3')
    )
    # df3's class column is suffixed when df12 also carries one.
    out_col = 'output_super_class_df3' if 'output_super_class_df3' in df123.columns else 'output_super_class'
    # In df123 the plain 'pre_root_id' column is df3's join key (the third-leg source),
    # so the starting neuron must again be taken from start_col.
    df123_target = df123[df123[out_col] == target_class][[start_col]].drop_duplicates()
    df123_target = df123_target.rename(columns={start_col: 'pre_root_id'})
    df123_target = df123_target.assign(hop=3)

    # -------------------------------------------------------------------------
    # Combine the results from all hops and keep the minimum hop per starting neuron.
    # -------------------------------------------------------------------------
    df_hops = pd.concat([df1_target, df12_target, df123_target], ignore_index=True)
    min_hops = df_hops.groupby('pre_root_id', as_index=False)['hop'].min()

    return min_hops

# Define the target class to look for ("endocrine")
target_class = "endocrine"
# List the sets (adjust if you have a different naming scheme)
set_numbers = [1, 2, 3]

# Dictionary to store results for each set, keyed by file prefix ("set_1", ...)
results = {}

for set_num in set_numbers:
    set_prefix = f"set_{set_num}"
    print(f"Analyzing endocrine hops for {set_prefix} ...")
    min_hops = analyze_endocrine_hops(set_prefix, target_class=target_class)
    results[set_prefix] = min_hops

    # Print descriptive statistics
    print(f"Descriptive statistics for {set_prefix}:")
    print(min_hops['hop'].describe())

    # With no reachable target the maximum hop is NaN and int() would raise,
    # so report the empty result and move on to the next set.
    if min_hops.empty:
        print(f"No neuron in {set_prefix} reaches a '{target_class}' neuron within 3 hops; skipping histogram.")
        continue

    # Plot histogram of hop counts
    max_hop = int(min_hops['hop'].max())
    fig_hist = px.histogram(
        min_hops,
        x="hop",
        nbins=max_hop + 1,
        title=f"Distribution of Minimum Hops to Reach an Endocrine Neuron ({set_prefix})",
        labels={"hop": "Minimum Hops", "count": "Number of Neurons"}
    )
    fig_hist.update_xaxes(dtick=1)
    fig_hist.show()

# (Optional) You can combine results from all sets or save the results to CSV files.

# (Optional) You can combine results from all sets or save the results to CSV files.


In [None]:
# Build 2-hop chains by joining each df1 connection to the df2 connections that
# leave its target neuron.
# NOTE(review): df1, df2 and target_class are not defined in this cell; it runs on
# whatever earlier cells left in the kernel - confirm which set and class are meant.
df12 = df1[['pre_root_id', 'post_root_id']].merge(
    df2,
    left_on='post_root_id',
    right_on='pre_root_id',
    suffixes=('_df1', '_df2'),
)

# The starting neuron is the df1-side 'pre_root_id', suffixed '_df1' when the merge renamed it.
start_col = 'pre_root_id'
if 'pre_root_id_df1' in df12.columns:
    start_col = 'pre_root_id_df1'

# Starting neurons whose second leg ends in the target class, tagged as 2 hops away.
df12_target = (
    df12.loc[df12['output_super_class'] == target_class, [start_col]]
    .drop_duplicates()
    .rename(columns={start_col: 'pre_root_id'})
    .assign(hop=2)
)


In [None]:
import pandas as pd
import plotly.express as px

def analyze_endocrine_hops(set_prefix, target_class="endocrine"):
    """
    Compute, per starting neuron of a set, the fewest hops (1, 2 or 3) after
    which a neuron of ``target_class`` is reached.

    Parameters
    ----------
    set_prefix : str
        Prefix of the CSV file names (e.g., "set_1"). Three files are read:
        "<prefix>_opt_conns_superclass.csv", "<prefix>_hop_1_opt_conns_superclass.csv"
        and "<prefix>_hop_2_opt_conns_superclass.csv".
    target_class : str, optional
        Value of 'output_super_class' that counts as a hit (default "endocrine").

    Returns
    -------
    min_hops : pd.DataFrame
        Columns 'pre_root_id' (starting neuron) and 'hop' (minimum hop count),
        one row per starting neuron that reaches the target.
    """
    stage_files = (
        f"{set_prefix}_opt_conns_superclass.csv",
        f"{set_prefix}_hop_1_opt_conns_superclass.csv",
        f"{set_prefix}_hop_2_opt_conns_superclass.csv",
    )
    df1, df2, df3 = (pd.read_csv(csv_path) for csv_path in stage_files)

    def _starts_hitting_target(chains, class_col, id_col, hop):
        # Unique starting neurons of the chains whose last leg ends in target_class.
        hits = chains.loc[chains[class_col] == target_class, [id_col]]
        return hits.drop_duplicates().rename(columns={id_col: 'pre_root_id'}).assign(hop=hop)

    # 2-hop chains: df1.post_root_id joined to df2.pre_root_id. The shared column
    # names are suffixed, so the starting neuron normally ends up in 'pre_root_id_df1'.
    df12 = df1[['pre_root_id', 'post_root_id']].merge(
        df2,
        left_on='post_root_id',
        right_on='pre_root_id',
        suffixes=('_df1', '_df2'),
    )
    start_col = 'pre_root_id'
    if 'pre_root_id_df1' in df12.columns:
        start_col = 'pre_root_id_df1'

    # 3-hop chains: continue from the end point of the second leg into df3.
    merge_key = 'post_root_id'
    if 'post_root_id_df2' in df12.columns:
        merge_key = 'post_root_id_df2'
    df123 = df12.merge(
        df3,
        left_on=merge_key,
        right_on='pre_root_id',
        suffixes=('_df2', '_df3'),
    )
    out_col = 'output_super_class'
    if 'output_super_class_df3' in df123.columns:
        out_col = 'output_super_class_df3'

    per_hop = [
        _starts_hitting_target(df1, 'output_super_class', 'pre_root_id', 1),
        _starts_hitting_target(df12, 'output_super_class', start_col, 2),
        _starts_hitting_target(df123, out_col, start_col, 3),
    ]

    # Smallest hop count per starting neuron.
    df_hops = pd.concat(per_hop, ignore_index=True)
    return df_hops.groupby('pre_root_id', as_index=False)['hop'].min()

# Example usage for set_1:
set_prefix = "set_1"
target_class = "endocrine"

min_hops_set1 = analyze_endocrine_hops(set_prefix, target_class=target_class)

# Display descriptive statistics:
print(f"Descriptive statistics for {set_prefix} (endocrine hops):")
print(min_hops_set1['hop'].describe())

# With no reachable target the maximum hop is NaN and int() would raise,
# so the plots are only drawn when there is something to show.
if min_hops_set1.empty:
    print(f"No neuron in {set_prefix} reaches a '{target_class}' neuron within 3 hops; skipping plots.")
else:
    # Plot a histogram:
    max_hop = int(min_hops_set1['hop'].max())
    fig_hist = px.histogram(
        min_hops_set1,
        x="hop",
        nbins=max_hop + 1,
        title=f"Distribution of Minimum Hops to Reach an Endocrine Neuron ({set_prefix})",
        labels={"hop": "Minimum Hops", "count": "Number of Neurons"}
    )
    fig_hist.update_xaxes(dtick=1)
    fig_hist.show()

    # Plot a violin plot:
    fig_violin = px.violin(
        min_hops_set1,
        y="hop",
        box=True,
        points="all",
        title=f"Violin Plot of Minimum Hops to Reach an Endocrine Neuron ({set_prefix})",
        labels={"hop": "Minimum Hops"}
    )
    fig_violin.show()


In [None]:
import pandas as pd
import plotly.express as px

def analyze_endocrine_hops(set_prefix, target_class="endocrine"):
    """
    Find, for each starting neuron of a set, the fewest hops (1, 2 or 3) after
    which a connection with ``output_super_class == target_class`` is reached.

    Three connection tables are read from the working directory:

    * ``{set_prefix}_opt_conns_superclass.csv``        (hop 1: outputs of the set)
    * ``{set_prefix}_hop_1_opt_conns_superclass.csv``  (hop 2)
    * ``{set_prefix}_hop_2_opt_conns_superclass.csv``  (hop 3)

    Each table must provide the columns ``pre_root_id``, ``post_root_id`` and
    ``output_super_class``.

    Parameters
    ----------
    set_prefix : str
        The prefix for the file names (e.g., "set_1").
    target_class : str, optional
        The target output super_class (default is "endocrine").

    Returns
    -------
    min_hops : pd.DataFrame
        Columns ``pre_root_id`` (starting neuron, taken from the hop-1 table)
        and ``hop`` (minimum hop count). Starting neurons that do not reach the
        target within three hops are absent from the result.
    """
    conns_hop1 = pd.read_csv(f"{set_prefix}_opt_conns_superclass.csv")
    conns_hop2 = pd.read_csv(f"{set_prefix}_hop_1_opt_conns_superclass.csv")
    conns_hop3 = pd.read_csv(f"{set_prefix}_hop_2_opt_conns_superclass.csv")

    def _sources_hitting_target(conns):
        """Unique pre_root_ids with at least one output of the target class."""
        on_target = conns['output_super_class'] == target_class
        return conns.loc[on_target, 'pre_root_id'].drop_duplicates()

    def _starts_connected_to(intermediates):
        """Unique starting neurons with a hop-1 connection onto `intermediates`."""
        linked = conns_hop1['post_root_id'].isin(intermediates)
        return conns_hop1.loc[linked, 'pre_root_id'].drop_duplicates()

    # Work with de-duplicated id sets instead of merging the full tables:
    # merging multiplies rows for every duplicate (pre, post) pair, although
    # only the identity of the starting neuron matters for the result.

    # --- 1-Hop: the starting neuron itself contacts the target class ---
    hop1_starts = _sources_hitting_target(conns_hop1)

    # --- 2-Hop: start -> neuron that contacts the target class ---
    hop2_starts = _starts_connected_to(_sources_hitting_target(conns_hop2))

    # --- 3-Hop: start -> relay -> neuron that contacts the target class ---
    relays_to_hitters = conns_hop2['post_root_id'].isin(_sources_hitting_target(conns_hop3))
    relays = conns_hop2.loc[relays_to_hitters, 'pre_root_id'].drop_duplicates()
    hop3_starts = _starts_connected_to(relays)

    # --- Combine and keep the smallest hop count per starting neuron ---
    df_hops = pd.concat(
        [
            starts.to_frame().assign(hop=hop)
            for hop, starts in enumerate((hop1_starts, hop2_starts, hop3_starts), start=1)
        ],
        ignore_index=True,
    )
    min_hops = df_hops.groupby('pre_root_id', as_index=False)['hop'].min()

    return min_hops

# Run the endocrine hop analysis for set_1 .. set_3; print statistics and plot each set.
target_class = "endocrine"
results = {}

for set_num in (1, 2, 3):
    set_prefix = f"set_{set_num}"
    print(f"\nAnalyzing endocrine hops for {set_prefix} ...")

    # Minimum hop count per starting neuron of this set; stored for later cells.
    min_hops = analyze_endocrine_hops(set_prefix, target_class=target_class)
    results[set_prefix] = min_hops

    # Summary statistics of the hop counts.
    print(f"Descriptive statistics for {set_prefix} (endocrine hops):")
    print(min_hops['hop'].describe())

    # Histogram with an x tick at every integer hop value.
    max_hop = int(min_hops['hop'].max())
    fig_hist = px.histogram(
        data_frame=min_hops,
        x="hop",
        nbins=max_hop + 1,
        labels={"hop": "Minimum Hops", "count": "Number of Neurons"},
        title=f"Distribution of Minimum Hops to Reach an Endocrine Neuron ({set_prefix})",
    )
    fig_hist.update_xaxes(dtick=1)
    fig_hist.show()

    # Violin plot with embedded box plot and all points drawn.
    fig_violin = px.violin(
        data_frame=min_hops,
        y="hop",
        points="all",
        box=True,
        labels={"hop": "Minimum Hops"},
        title=f"Violin Plot of Minimum Hops to Reach an Endocrine Neuron ({set_prefix})",
    )
    fig_violin.show()

# (Optional) Stack the per-set tables into one frame labelled by set.
all_results = pd.concat(
    list(results.values()),
    keys=list(results),
    names=["set", "index"],
).reset_index()
print("Combined results for all sets:")
print(all_results.head())


In [None]:
import pandas as pd
import plotly.express as px

# This cell expects `results` to map 'set_1', 'set_2' and 'set_3' to the
# DataFrames (columns 'pre_root_id' and 'hop') returned by
#    analyze_endocrine_hops(set_prefix, target_class="endocrine")
# for example:
# results = {
#     "set_1": min_hops_set1,
#     "set_2": min_hops_set2,
#     "set_3": min_hops_set3
# }

# --- Stack the results of sets 1 through 3, tagging every row with its set ---
all_results = []
for s in (1, 2, 3):
    set_prefix = f"set_{s}"
    # assign() returns a new frame, so the entry stored in `results` is untouched
    df_temp = results[set_prefix].assign(set=set_prefix)
    all_results.append(df_temp)

combined_df = pd.concat(all_results, ignore_index=True)

# --- Stacked bar chart ---
# Frequency table: number of neurons for every (set, hop) pair.
freq_df = (
    combined_df
    .groupby(["set", "hop"])
    .size()
    .reset_index(name="count")
)
fig_bar = px.bar(
    data_frame=freq_df,
    x="set",
    y="count",
    color="hop",
    barmode="stack",
    labels={"set": "Set", "count": "Number of Neurons", "hop": "Minimum Hops"},
    title="Stacked Bar Chart of Minimum Endocrine Hops by Set",
)
fig_bar.show()

# --- Violin plots, one facet column per set ---
fig_violin = px.violin(
    data_frame=combined_df,
    y="hop",
    facet_col="set",
    color="set",
    points="all",
    box=True,
    labels={"hop": "Minimum Hops"},
    title="Violin Plots of Minimum Endocrine Hops by Set",
)
fig_violin.update_layout(showlegend=False)
fig_violin.show()


In [None]:
def analyze_target_hops(set_prefix, target_class="motor"):
    """
    Computes the minimum number of hops (1, 2 or 3) needed for neurons from a
    given aPhN2-SA set to reach a connection whose output super_class equals
    target_class.

    Three connection tables are read from the working directory:

    * ``{set_prefix}_opt_conns_superclass.csv``        (hop 1: outputs of the set)
    * ``{set_prefix}_hop_1_opt_conns_superclass.csv``  (hop 2)
    * ``{set_prefix}_hop_2_opt_conns_superclass.csv``  (hop 3)

    Each table must provide the columns ``pre_root_id``, ``post_root_id`` and
    ``output_super_class``.

    Parameters
    ----------
    set_prefix : str
        The prefix for the file names (e.g., "set_1").
    target_class : str, optional
        The target output super_class (for motor, use "motor"; for endocrine, use "endocrine").

    Returns
    -------
    min_hops : pd.DataFrame
        Columns ``pre_root_id`` (starting neuron, taken from the hop-1 table)
        and ``hop`` (its minimum hop count). Starting neurons that do not reach
        the target within three hops are absent from the result.
    """
    import pandas as pd

    first_hop = pd.read_csv(f"{set_prefix}_opt_conns_superclass.csv")
    second_hop = pd.read_csv(f"{set_prefix}_hop_1_opt_conns_superclass.csv")
    third_hop = pd.read_csv(f"{set_prefix}_hop_2_opt_conns_superclass.csv")

    def _pre_ids_on_target(conns):
        """Unique pre_root_ids with at least one output of the target class."""
        return conns.loc[conns['output_super_class'] == target_class, 'pre_root_id'].drop_duplicates()

    def _pre_ids_onto(conns, post_ids):
        """Unique pre_root_ids of `conns` rows whose post_root_id is in `post_ids`."""
        return conns.loc[conns['post_root_id'].isin(post_ids), 'pre_root_id'].drop_duplicates()

    # Id look-ups replace the former table merges: a merge repeats rows for
    # every duplicate (pre, post) pair, while the result only depends on which
    # starting neurons have *some* path to the target.

    # --- 1-Hop: direct connections onto the target class ---
    starts_1 = _pre_ids_on_target(first_hop)

    # --- 2-Hop: start -> neuron with an output of the target class ---
    starts_2 = _pre_ids_onto(first_hop, _pre_ids_on_target(second_hop))

    # --- 3-Hop: start -> relay -> neuron with an output of the target class ---
    relays = _pre_ids_onto(second_hop, _pre_ids_on_target(third_hop))
    starts_3 = _pre_ids_onto(first_hop, relays)

    # --- Combine and compute minimum hops per starting neuron ---
    df_hops = pd.concat(
        [
            starts_1.to_frame().assign(hop=1),
            starts_2.to_frame().assign(hop=2),
            starts_3.to_frame().assign(hop=3),
        ],
        ignore_index=True,
    )
    min_hops = df_hops.groupby('pre_root_id', as_index=False)['hop'].min()
    return min_hops


# (The connectivity CSV files must be in the working directory.)
# Minimum hops to a motor target for each of the three aPhN2-SA sets.
results_motor = {
    f"set_{k}": analyze_target_hops(f"set_{k}", target_class="motor")
    for k in (1, 2, 3)
}
min_hops_motor_set1 = results_motor["set_1"]
min_hops_motor_set2 = results_motor["set_2"]
min_hops_motor_set3 = results_motor["set_3"]

import plotly.express as px
import pandas as pd

# Stack the results of the three sets, tagging every row with its set.
all_results_motor = []
for s in (1, 2, 3):
    set_prefix = f"set_{s}"
    # each frame holds 'pre_root_id' and 'hop'; assign() leaves the stored frame untouched
    df_temp = results_motor[set_prefix].assign(set=set_prefix)
    all_results_motor.append(df_temp)

# Hop values become strings so that they are treated as discrete categories.
combined_motor = pd.concat(all_results_motor, ignore_index=True).astype({'hop': str})
combined_motor['set'] = pd.Categorical(
    combined_motor['set'],
    categories=["set_1", "set_2", "set_3"],
    ordered=True,
)

# --- Stacked Bar Chart ---
# Frequency table: number of neurons per set and hop value.
freq_motor = (
    combined_motor
    .groupby(["set", "hop"])
    .size()
    .reset_index(name="count")
)
# One fixed colour per hop category (1, 2 and 3 hops).
motor_color_map = dict(zip(("1", "2", "3"), ("red", "yellow", "blue")))
fig_bar_motor = px.bar(
    data_frame=freq_motor,
    x="set",
    y="count",
    color="hop",
    barmode="stack",
    color_discrete_map=motor_color_map,
    labels={"set": "aPhN2‑SA Set", "count": "Number of Neurons", "hop": "Minimum Motor Hops"},
    title="Stacked Bar Chart of Minimum Motor Hops by aPhN2‑SA Set",
)
fig_bar_motor.show()

# --- Combined Violin Plot ---
# One facet column per set showing the distribution of hop values.
fig_violin_motor = px.violin(
    data_frame=combined_motor,
    y="hop",
    facet_col="set",
    color="set",
    points="all",
    box=True,
    labels={"hop": "Minimum Motor Hops"},
    title="Violin Plots of Minimum Motor Hops by aPhN2‑SA Set",
)
fig_violin_motor.update_layout(showlegend=False)
fig_violin_motor.show()


In [None]:
# Minimum hops to an endocrine target, computed for set_1, set_2 and set_3 in that order.
(
    min_hops_endocrine_set1,
    min_hops_endocrine_set2,
    min_hops_endocrine_set3,
) = (
    analyze_target_hops(set_name, target_class="endocrine")
    for set_name in ("set_1", "set_2", "set_3")
)


In [None]:
# Per-set results; each value is a DataFrame with columns 'pre_root_id' and 'hop'.
results_endocrine = dict(zip(
    ("set_1", "set_2", "set_3"),
    (min_hops_endocrine_set1, min_hops_endocrine_set2, min_hops_endocrine_set3),
))

import pandas as pd
import plotly.express as px

# Stack the results of the three sets, tagging every row with its set.
all_results_endocrine = []
for s in (1, 2, 3):
    set_prefix = f"set_{s}"
    df_temp = results_endocrine[set_prefix].assign(set=set_prefix)
    all_results_endocrine.append(df_temp)

# Hop values become strings so that Plotly treats them as discrete.
combined_endocrine = pd.concat(all_results_endocrine, ignore_index=True).astype({'hop': str})
# Fix the set order to set_1, set_2, set_3.
combined_endocrine['set'] = pd.Categorical(
    combined_endocrine['set'],
    categories=["set_1", "set_2", "set_3"],
    ordered=True,
)

# --- Stacked bar chart for endocrine hops ---
freq_endocrine = (
    combined_endocrine
    .groupby(["set", "hop"])
    .size()
    .reset_index(name="count")
)

# One fixed colour per endocrine hop category.
endocrine_color_map = dict(zip(("1", "2", "3"), ("red", "yellow", "blue")))

fig_bar_endocrine = px.bar(
    data_frame=freq_endocrine,
    x="set",
    y="count",
    color="hop",
    barmode="stack",
    category_orders={"set": ["set_1", "set_2", "set_3"]},
    color_discrete_map=endocrine_color_map,
    labels={"set": "aPhN2‑SA Set", "count": "Number of Neurons", "hop": "Minimum Endocrine Hops"},
    title="Stacked Bar Chart of Minimum Endocrine Hops by aPhN2‑SA Set",
)
fig_bar_endocrine.show()

# --- Violin plots for endocrine hops, one facet column per set ---
fig_violin_endocrine = px.violin(
    data_frame=combined_endocrine,
    y="hop",
    facet_col="set",
    color="set",
    points="all",
    box=True,
    labels={"hop": "Minimum Endocrine Hops"},
    title="Violin Plots of Minimum Endocrine Hops by aPhN2‑SA Set",
)
fig_violin_endocrine.update_layout(showlegend=False)
fig_violin_endocrine.show()


In [None]:
# Stacked bar of minimum motor hops per set, including a '>3' category for
# aPhN2-SAs that do not reach a motor target within three hops.
# The cell is self-contained: it derives everything from `combined_motor` and the
# set lists (set_1, set_2, set_3) instead of relying on `total_counts`, `go` or a
# pre-extended `freq_motor` that are only created further down the notebook.
import plotly.graph_objects as go

# --- 1) total aPhN2-SAs per set ---
total_counts = {
    'set_1': set_1['root_id'].nunique(),
    'set_2': set_2['root_id'].nunique(),
    'set_3': set_3['root_id'].nunique()
}
all_sets = list(total_counts.keys())
all_hops = ['1','2','3','>3']

# --- 2) neurons per (set, hop); every combination exists, missing ones are 0 ---
# Casting to str first keeps the grouping independent of the categorical 'set'
# column and lets the reindex match plain string labels.
hop_counts = (
    combined_motor
    .astype({'set': str, 'hop': str})
    .groupby(['set', 'hop'])
    .size()
    .reindex(
        pd.MultiIndex.from_product([all_sets, all_hops], names=['set', 'hop']),
        fill_value=0
    )
)

# --- 3) neurons of a set that were not reached within 3 hops go to '>3' ---
for s, total in total_counts.items():
    reached = int(hop_counts.loc[s].sum())   # the '>3' entry is still 0 here
    hop_counts.loc[(s, '>3')] = max(total - reached, 0)

freq_motor = hop_counts.reset_index(name='count')

# force categorical ordering
freq_motor['hop'] = pd.Categorical(freq_motor['hop'], categories=all_hops, ordered=True)

# --- define the motor_color_map (note '>3' is grey) ---
motor_color_map = {
    '1': 'red',
    '2': 'yellow',
    '3': 'blue',
    '>3': 'grey'
}

# --- stacked bar chart ---
fig_bar_motor = px.bar(
    freq_motor,
    x="set",
    y="count",
    color="hop",
    barmode="stack",
    category_orders={"set": all_sets, "hop": all_hops},
    color_discrete_map=motor_color_map,
    title="Stacked Bar Chart of Minimum Motor Hops by aPhN2‑SA Set",
    labels={"set":"aPhN2‑SA Set","count":"# Neurons","hop":"Hops to Motor"}
)
fig_bar_motor.update_layout(xaxis_title=None)

# --- safety net: add a dummy grey '>3' legend entry if no such trace exists ---
if not any(t.name == ">3" for t in fig_bar_motor.data):
    fig_bar_motor.add_trace(
        go.Bar(
            x=[all_sets[0]],      # just needs one x-value
            y=[0],                # zero height
            name=">3",
            marker_color="grey",
            showlegend=True,
            visible="legendonly"  # keeps it out of the actual bars
        )
    )

fig_bar_motor.show()



In [None]:
import pandas as pd
import plotly.express as px

# --- 1) number of distinct aPhN2‑SAs (root_id) in each set ---
total_counts = {
    name: frame['root_id'].nunique()
    for name, frame in (('set_1', set_1), ('set_2', set_2), ('set_3', set_3))
}

# --- 2) frequency table for endocrine: neurons per (set, hop) ---
freq_endocrine = combined_endocrine.groupby(["set","hop"]).size().reset_index(name="count")

# --- 3) add a '>3' row for every set with neurons not reached within 3 hops ---
extra = []
for s, total in total_counts.items():
    in_set = freq_endocrine['set']==s
    reached = freq_endocrine.loc[in_set, 'count'].sum()
    unreached = total - reached
    if unreached > 0:
        extra.append({'set': s, 'hop': '>3', 'count': unreached})
if extra:
    freq_endocrine = pd.concat(
        [freq_endocrine, pd.DataFrame(extra)],
        ignore_index=True
    )

# --- 3b) make every set × hop combination exist (missing ones are filled with 0) ---
all_sets = list(total_counts.keys())
all_hops = ['1','2','3','>3']
idx = pd.MultiIndex.from_product([all_sets, all_hops], names=['set','hop'])
freq_endocrine = freq_endocrine.set_index(['set','hop']).reindex(idx, fill_value=0).reset_index()

# --- 4) fix the hop ordering and define the colours ('>3' is grey) ---
freq_endocrine['hop'] = pd.Categorical(freq_endocrine['hop'], categories=all_hops, ordered=True)

endocrine_color_map = dict(zip(all_hops, ('red', 'yellow', 'blue', 'grey')))

# --- 5) plot the stacked bar ---
fig_bar_endocrine = px.bar(
    data_frame=freq_endocrine,
    x="set",
    y="count",
    color="hop",
    barmode="stack",
    color_discrete_map=endocrine_color_map,
    category_orders={"set": all_sets, "hop": all_hops},
    labels={"set":"aPhN2‑SA Set","count":"# Neurons","hop":"Hops to Endocrine"},
    title="Stacked Bar Chart of Minimum Endocrine Hops by aPhN2‑SA Set",
)

fig_bar_endocrine.update_layout(xaxis_title=None)
fig_bar_endocrine.show()


In [None]:
import pandas as pd
import plotly.graph_objects as go
import colorsys

# ---------------------------
# Helper functions for colors
# ---------------------------
def get_color(i, n):
    """Return an 'rgb(r,g,b)' string with hue i/n (saturation 0.6, value 0.9)."""
    channels = colorsys.hsv_to_rgb(i / n, 0.6, 0.9)
    # scale each 0..1 channel to 0..255 and truncate to an integer
    return "rgb({},{},{})".format(*(int(channel * 255) for channel in channels))

def make_rgba(rgb_str, alpha=0.5):
    """Convert 'rgb(r,g,b)' to 'rgba(r,g,b,alpha)'.

    An 'rgba(r,g,b,a)' input is accepted as well; its alpha is replaced by
    `alpha`. Leading/trailing whitespace around the whole string is ignored and
    the r, g, b components are copied through unchanged.

    Parameters
    ----------
    rgb_str : str
        Colour written as 'rgb(r,g,b)' or 'rgba(r,g,b,a)'.
    alpha : float, optional
        Opacity placed in the returned string (default 0.5).

    Returns
    -------
    str
        The colour as 'rgba(r,g,b,alpha)'.

    Raises
    ------
    ValueError
        If `rgb_str` is not of the form 'rgb(...)' / 'rgba(...)' with three or
        four comma-separated components.
    """
    # Parse by structure. str.lstrip("rgb(") would strip a *set of characters*,
    # not the prefix, which breaks on 'rgba(...)' or padded input.
    prefix, paren, rest = rgb_str.strip().partition("(")
    if prefix not in ("rgb", "rgba") or not paren or not rest.endswith(")"):
        raise ValueError(f"expected 'rgb(r,g,b)' colour string, got {rgb_str!r}")
    channels = rest[:-1].split(",")
    if len(channels) not in (3, 4):
        raise ValueError(f"expected 3 or 4 colour components, got {rgb_str!r}")
    r, g, b = channels[:3]
    return f"rgba({r},{g},{b},{alpha})"

# ---------------------------
# 1) Load & concatenate all three sets at each hop level
#    df1: direct outputs of the sets, df2: "hop_1" files, df3: "hop_2" files.
#    Only connections with syn_count >= 5 are kept.
#    NOTE(review): the per-set files are concatenated without de-duplication, so a
#    connection listed in more than one set's file is counted once per file —
#    confirm this is intended.
# ---------------------------
df1 = pd.concat([
    pd.read_csv("set_1_opt_conns_superclass.csv"),
    pd.read_csv("set_2_opt_conns_superclass.csv"),
    pd.read_csv("set_3_opt_conns_superclass.csv")
], ignore_index=True)
df1 = df1[df1['syn_count'] >= 5]

df2 = pd.concat([
    pd.read_csv("set_1_hop_1_opt_conns_superclass.csv"),
    pd.read_csv("set_2_hop_1_opt_conns_superclass.csv"),
    pd.read_csv("set_3_hop_1_opt_conns_superclass.csv")
], ignore_index=True)
df2 = df2[df2['syn_count'] >= 5]

df3 = pd.concat([
    pd.read_csv("set_1_hop_2_opt_conns_superclass.csv"),
    pd.read_csv("set_2_hop_2_opt_conns_superclass.csv"),
    pd.read_csv("set_3_hop_2_opt_conns_superclass.csv")
], ignore_index=True)
df3 = df3[df3['syn_count'] >= 5]

# ---------------------------
# 2) Build flows (summing syn_count)
#    flow1: "All Sets" -> superclass of the direct outputs.
#    flow2 / flow3: superclass -> superclass between consecutive hop levels,
#    chained by matching post_root_id of one level to pre_root_id of the next.
#    NOTE(review): the inner merges repeat a downstream row once per matching
#    upstream row, so its syn_count enters the sum once per match — confirm
#    this weighting is intended.
# ---------------------------
flow1 = (
    df1.groupby('output_super_class')['syn_count']
       .sum().reset_index(name='count')
)
flow1['source'] = "All Sets"

m12 = pd.merge(df1, df2,
               left_on='post_root_id', right_on='pre_root_id',
               suffixes=('_1','_2'))
flow2 = (
    m12.groupby(['output_super_class_1','output_super_class_2'])
       ['syn_count_2'].sum().reset_index(name='count')
)

m23 = pd.merge(df2, df3,
               left_on='post_root_id', right_on='pre_root_id',
               suffixes=('_2','_3'))
flow3 = (
    m23.groupby(['output_super_class_2','output_super_class_3'])
       ['syn_count_3'].sum().reset_index(name='count')
)

# ---------------------------
# 3) Build node lists for each column
#    Labels are prefixed with the hop level ("1: ", "2: ", "3: ") so the same
#    superclass gets a separate node in every column.
# ---------------------------
col1 = ["All Sets"]
col2 = [f"1: {c}" for c in sorted(df1['output_super_class'].unique())]
col3 = [f"2: {c}" for c in sorted(df2['output_super_class'].unique())]
col4 = [f"3: {c}" for c in sorted(df3['output_super_class'].unique())]

nodes = col1 + col2 + col3 + col4
node_index = {n:i for i,n in enumerate(nodes)}

# ---------------------------
# 4) Assign colors (one HSV-spaced color per node; link colors are filled in step 6)
# ---------------------------
n_nodes = len(nodes)
node_colors = [get_color(i, n_nodes) for i in range(n_nodes)]
link_colors = []

# ---------------------------
# 5) Map flows to indices (positions in `nodes`)
# ---------------------------
# Flow1 → 0→1
flow1['source_idx'] = node_index["All Sets"]
flow1['target_idx'] = flow1['output_super_class'].map(lambda x: node_index[f"1: {x}"])
# Flow2 → 1→2
flow2['source_idx'] = flow2['output_super_class_1'].map(lambda x: node_index[f"1: {x}"])
flow2['target_idx'] = flow2['output_super_class_2'].map(lambda x: node_index[f"2: {x}"])
# Flow3 → 2→3
flow3['source_idx'] = flow3['output_super_class_2'].map(lambda x: node_index[f"2: {x}"])
flow3['target_idx'] = flow3['output_super_class_3'].map(lambda x: node_index[f"3: {x}"])

# Flatten the three flow tables into the parallel source/target/value lists of the Sankey links.
source = (
    flow1['source_idx'].tolist() +
    flow2['source_idx'].tolist() +
    flow3['source_idx'].tolist()
)
target = (
    flow1['target_idx'].tolist() +
    flow2['target_idx'].tolist() +
    flow3['target_idx'].tolist()
)
value = (
    flow1['count'].tolist() +
    flow2['count'].tolist() +
    flow3['count'].tolist()
)

# ---------------------------
# 6) Build link colors (each link takes its source node's color at 50% alpha)
# ---------------------------
for s in source:
    link_colors.append(make_rgba(node_colors[s], 0.5))

# ---------------------------
# 7) Compute hover stats (summed incoming / outgoing link values per node)
# ---------------------------
node_in  = dict.fromkeys(nodes, 0)
node_out = dict.fromkeys(nodes, 0)
for s, t, v in zip(source, target, value):
    node_out[nodes[s]] += v
    node_in [nodes[t]] += v

# "Total" shown on hover is the larger of the incoming and outgoing sums.
customdata = [
    f"Incoming: {node_in[n]}<br>"
    f"Outgoing: {node_out[n]}<br>"
    f"Total: {max(node_in[n], node_out[n])}"
    for n in nodes
]

# ---------------------------
# 8) Define fixed x-positions (one value per node, in the order of `nodes`)
#    NOTE(review): only node x is supplied (no y) and the outer columns use exactly
#    0.0 / 1.0 — verify against the plotly Sankey docs that these are honored.
# ---------------------------
x_positions = (
    [0.0]*len(col1) +
    [0.33]*len(col2) +
    [0.66]*len(col3) +
    [1.0] *len(col4)
)

# ---------------------------
# 9) Draw the Sankey
# ---------------------------
fig = go.Figure(go.Sankey(
    arrangement="snap",
    node=dict(
        label=nodes,
        x=x_positions,
        color=node_colors,
        pad=20,
        thickness=20,
        line=dict(color="black", width=0.5),
        customdata=customdata,
        hovertemplate='%{customdata}<extra>%{label}</extra>'
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color=link_colors
    )
))

fig.update_layout(
    title_text="Combined aPhN2-SA Sets: 3-Hop Superclass Sankey",
    font_size=12
)
fig.show()


In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import colorsys

# ---------------------------
# Helper functions for colors
# ---------------------------
def get_color(i, n):
    """Return the color for index i out of n as 'rgb(r,g,b)': hue i/n in HSV with saturation 0.6 and value 0.9."""
    red, green, blue = (
        int(component * 255)  # 0..1 float -> 0..255 integer (truncated)
        for component in colorsys.hsv_to_rgb(i / n, 0.6, 0.9)
    )
    return f"rgb({red},{green},{blue})"

def make_rgba(rgb_str, alpha=0.5):
    """Convert 'rgb(r,g,b)' to 'rgba(r,g,b,alpha)'; fallback for non-rgb.

    An 'rgba(r,g,b,a)' input is accepted as well; its alpha is replaced by
    `alpha`. The r, g, b components are copied through unchanged (including
    any inner spaces, e.g. 'rgb(136, 204, 238)').

    Parameters
    ----------
    rgb_str : str
        Colour string, normally 'rgb(r,g,b)'.
    alpha : float, optional
        Opacity placed in the returned string (default 0.5).

    Returns
    -------
    str
        'rgba(r,g,b,alpha)', or light grey 'rgba(200,200,200,alpha)' when
        `rgb_str` is not a parsable rgb()/rgba() string (e.g. a named colour).
    """
    fallback = f"rgba(200,200,200,{alpha})"
    # Parse by structure. str.lstrip("rgb(") would strip a *set of characters*,
    # not the prefix, and fails on 'rgba(...)' input.
    prefix, paren, rest = rgb_str.strip().partition("(")
    if prefix not in ("rgb", "rgba") or not paren or not rest.endswith(")"):
        return fallback
    channels = rest[:-1].split(",")
    if len(channels) not in (3, 4):
        return fallback
    r, g, b = channels[:3]
    return f"rgba({r},{g},{b},{alpha})"

# ---------------------------
# 1) Load & concatenate all three sets at each hop level
#    df1: direct outputs of the sets, df2: "hop_1" files, df3: "hop_2" files.
#    Only connections with syn_count >= 5 are kept.
#    NOTE(review): the per-set files are concatenated without de-duplication, so a
#    connection listed in more than one set's file is counted once per file —
#    confirm this is intended.
# ---------------------------
df1 = pd.concat([
    pd.read_csv("set_1_opt_conns_superclass.csv"),
    pd.read_csv("set_2_opt_conns_superclass.csv"),
    pd.read_csv("set_3_opt_conns_superclass.csv")
], ignore_index=True)
df1 = df1[df1['syn_count'] >= 5]

df2 = pd.concat([
    pd.read_csv("set_1_hop_1_opt_conns_superclass.csv"),
    pd.read_csv("set_2_hop_1_opt_conns_superclass.csv"),
    pd.read_csv("set_3_hop_1_opt_conns_superclass.csv")
], ignore_index=True)
df2 = df2[df2['syn_count'] >= 5]

df3 = pd.concat([
    pd.read_csv("set_1_hop_2_opt_conns_superclass.csv"),
    pd.read_csv("set_2_hop_2_opt_conns_superclass.csv"),
    pd.read_csv("set_3_hop_2_opt_conns_superclass.csv")
], ignore_index=True)
df3 = df3[df3['syn_count'] >= 5]

# ---------------------------
# 2) Build flows by summing syn_count
#    flow1: "All Sets" -> superclass of the direct outputs.
#    flow2 / flow3: superclass -> superclass between consecutive hop levels,
#    chained by matching post_root_id of one level to pre_root_id of the next.
#    NOTE(review): the inner merges repeat a downstream row once per matching
#    upstream row, so its syn_count enters the sum once per match — confirm
#    this weighting is intended.
# ---------------------------
flow1 = (
    df1.groupby('output_super_class')['syn_count']
       .sum().reset_index(name='count')
)
flow1['source'] = "All Sets"

m12 = pd.merge(df1, df2,
               left_on='post_root_id', right_on='pre_root_id',
               suffixes=('_1','_2'))
flow2 = (
    m12.groupby(['output_super_class_1','output_super_class_2'])
       ['syn_count_2'].sum().reset_index(name='count')
)

m23 = pd.merge(df2, df3,
               left_on='post_root_id', right_on='pre_root_id',
               suffixes=('_2','_3'))
flow3 = (
    m23.groupby(['output_super_class_2','output_super_class_3'])
       ['syn_count_3'].sum().reset_index(name='count')
)

# ---------------------------
# 3) Node lists for each column
#    Labels are prefixed with the hop level ("1: ", "2: ", "3: ") so the same
#    superclass gets a separate node in every column.
# ---------------------------
col1 = ["All Sets"]
col2 = [f"1: {c}" for c in sorted(df1['output_super_class'].unique())]
col3 = [f"2: {c}" for c in sorted(df2['output_super_class'].unique())]
col4 = [f"3: {c}" for c in sorted(df3['output_super_class'].unique())]

nodes = col1 + col2 + col3 + col4
node_index = {n:i for i,n in enumerate(nodes)}

# ---------------------------
# 4) Assign a color per superclass from the Safe palette
#    The same superclass keeps its color in every column; the palette is cycled
#    (i % len(palette)) if there are more superclasses than palette entries.
# ---------------------------
palette = px.colors.qualitative.Safe
all_classes = sorted({
    *df1['output_super_class'].unique(),
    *df2['output_super_class'].unique(),
    *df3['output_super_class'].unique()
})
color_map = {cls: palette[i % len(palette)] for i, cls in enumerate(all_classes)}

node_colors = []
for label in nodes:
    if label == "All Sets":
        node_colors.append("lightgrey")
    else:
        # strip the "<hop>: " prefix to recover the superclass name
        cls = label.split(": ",1)[1]
        node_colors.append(color_map[cls])

# ---------------------------
# 5) Map flows to source/target indices (positions in `nodes`)
# ---------------------------
flow1['source_idx'] = node_index["All Sets"]
flow1['target_idx'] = flow1['output_super_class'] \
                      .map(lambda x: node_index[f"1: {x}"])

flow2['source_idx'] = flow2['output_super_class_1'] \
                      .map(lambda x: node_index[f"1: {x}"])
flow2['target_idx'] = flow2['output_super_class_2'] \
                      .map(lambda x: node_index[f"2: {x}"])

flow3['source_idx'] = flow3['output_super_class_2'] \
                      .map(lambda x: node_index[f"2: {x}"])
flow3['target_idx'] = flow3['output_super_class_3'] \
                      .map(lambda x: node_index[f"3: {x}"])

# Flatten the three flow tables into the parallel source/target/value lists of the Sankey links.
source = (
    flow1['source_idx'].tolist() +
    flow2['source_idx'].tolist() +
    flow3['source_idx'].tolist()
)
target = (
    flow1['target_idx'].tolist() +
    flow2['target_idx'].tolist() +
    flow3['target_idx'].tolist()
)
value = (
    flow1['count'].tolist() +
    flow2['count'].tolist() +
    flow3['count'].tolist()
)

# ---------------------------
# 6) Build link colors (50% alpha from the source node's color)
# ---------------------------
link_colors = [make_rgba(node_colors[s], 0.5) for s in source]

# ---------------------------
# 7) Compute hover info (summed incoming / outgoing link values per node)
# ---------------------------
node_in  = dict.fromkeys(nodes, 0)
node_out = dict.fromkeys(nodes, 0)
for s, t, v in zip(source, target, value):
    node_out[nodes[s]] += v
    node_in [nodes[t]] += v

# "Total" shown on hover is the larger of the incoming and outgoing sums.
customdata = [
    f"Incoming: {node_in[n]}<br>Outgoing: {node_out[n]}<br>"
    f"Total: {max(node_in[n], node_out[n])}"
    for n in nodes
]

# ---------------------------
# 8) Define x-positions for columns (one value per node, in the order of `nodes`)
#    NOTE(review): only node x is supplied (no y) and the outer columns use exactly
#    0.0 / 1.0 — verify against the plotly Sankey docs that these are honored.
# ---------------------------
x_positions = (
    [0.0]*len(col1) +
    [0.33]*len(col2) +
    [0.66]*len(col3) +
    [1.0] *len(col4)
)

# ---------------------------
# 9) Draw the Sankey with arrangement="snap"
# ---------------------------
fig = go.Figure(go.Sankey(
    arrangement="snap",
    node=dict(
        label=nodes,
        x=x_positions,
        color=node_colors,
        pad=20,
        thickness=20,
        line=dict(color="black", width=0.5),
        customdata=customdata,
        hovertemplate='%{customdata}<extra>%{label}</extra>'
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color=link_colors
    )
))

fig.update_layout(
    title_text="aPhN2-SA Sankey Diagram",
    font_size=20
)
fig.show()


In [None]:
# Save the figure currently bound to `fig` as a PNG in the working directory
# (1200 x 800 layout, rendered at scale=2).
# NOTE(review): plotly static image export presumably needs an export engine
# such as kaleido installed in the kernel environment — confirm.
fig.write_image("aphn2_sa_sankey.png", width=1200, height=800, scale=2)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1) Load & concatenate all three sets at each hop level
#    df1: direct outputs of the sets, df2: "hop_1" files, df3: "hop_2" files.
#    Only connections with syn_count >= 5 are kept.
#    NOTE(review): the per-set files are concatenated without de-duplication, so a
#    connection listed in more than one set's file is counted once per file —
#    confirm this is intended.
df1 = pd.concat([
    pd.read_csv("set_1_opt_conns_superclass.csv"),
    pd.read_csv("set_2_opt_conns_superclass.csv"),
    pd.read_csv("set_3_opt_conns_superclass.csv")
], ignore_index=True)
df1 = df1[df1['syn_count'] >= 5]

df2 = pd.concat([
    pd.read_csv("set_1_hop_1_opt_conns_superclass.csv"),
    pd.read_csv("set_2_hop_1_opt_conns_superclass.csv"),
    pd.read_csv("set_3_hop_1_opt_conns_superclass.csv")
], ignore_index=True)
df2 = df2[df2['syn_count'] >= 5]

df3 = pd.concat([
    pd.read_csv("set_1_hop_2_opt_conns_superclass.csv"),
    pd.read_csv("set_2_hop_2_opt_conns_superclass.csv"),
    pd.read_csv("set_3_hop_2_opt_conns_superclass.csv")
], ignore_index=True)
df3 = df3[df3['syn_count'] >= 5]

# 2) Build the full list of superclasses (union over the three hop levels, sorted)
#    so that all matrices share the same row/column order.
all_classes = sorted(
    set(df1.output_super_class)
  | set(df2.output_super_class)
  | set(df3.output_super_class)
)

# 3) Build the three heatmap matrices (entries are summed syn_count values)

# Hop 0: "All Sets" → superclass (a single-row matrix)
flow0 = df1.groupby('output_super_class')['syn_count'] \
           .sum() \
           .reindex(all_classes, fill_value=0)
mat0 = pd.DataFrame([flow0.values],
                    index=["All Sets"],
                    columns=all_classes)

# Hop 1: superclass → superclass
# Rows: superclass recorded on the df1 connection; columns: superclass recorded on
# the df2 connection leaving the same neuron (df1.post_root_id == df2.pre_root_id).
# NOTE(review): the inner merge repeats a df2 row once per matching df1 row, so
# its syn_count enters the sum once per match — confirm this weighting is intended.
m12 = pd.merge(
    df1[['post_root_id','output_super_class','syn_count']],
    df2[['pre_root_id','output_super_class','syn_count']],
    left_on='post_root_id', right_on='pre_root_id',
    suffixes=('_from','_to')
)
mat1 = ( m12
    .groupby(['output_super_class_from','output_super_class_to'])['syn_count_to']
    .sum()
    .unstack(fill_value=0)
    .reindex(index=all_classes, columns=all_classes, fill_value=0)
)

# Hop 2: superclass → superclass (same construction, chaining df2 to df3)
m23 = pd.merge(
    df2[['post_root_id','output_super_class','syn_count']],
    df3[['pre_root_id','output_super_class','syn_count']],
    left_on='post_root_id', right_on='pre_root_id',
    suffixes=('_from','_to')
)
mat2 = ( m23
    .groupby(['output_super_class_from','output_super_class_to'])['syn_count_to']
    .sum()
    .unstack(fill_value=0)
    .reindex(index=all_classes, columns=all_classes, fill_value=0)
)

# 4) Plotting: three heatmaps side by side, each on its own color scale
#    (0 .. that matrix's maximum), so colors are not comparable across panels.
sns.set_theme(style='white')
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Hop 0 heatmap
vmax0 = mat0.values.max()
sns.heatmap(
    mat0,
    cmap='turbo',
    annot=False,
    square=True,
    vmin=0, vmax=vmax0,
    cbar_kws={'label':'synapse count'},
    ax=axes[0]
)
axes[0].set_title("Hop 0: All Sets → Superclass")
axes[0].set_ylabel("Source")
axes[0].set_xlabel("Target Superclass")

# Hop 1 heatmap
vmax1 = mat1.values.max()
sns.heatmap(
    mat1,
    cmap='turbo',
    annot=False,
    square=True,
    vmin=0, vmax=vmax1,
    cbar_kws={'label':'synapse count'},
    ax=axes[1]
)
axes[1].set_title("Hop 1: Superclass → Superclass")
axes[1].set_ylabel("Source Superclass")
axes[1].set_xlabel("Target Superclass")

# Hop 2 heatmap
vmax2 = mat2.values.max()
sns.heatmap(
    mat2,
    cmap='turbo',
    annot=False,
    square=True,
    vmin=0, vmax=vmax2,
    cbar_kws={'label':'synapse count'},
    ax=axes[2]
)
axes[2].set_title("Hop 2: Superclass → Superclass")
axes[2].set_ylabel("Source Superclass")
axes[2].set_xlabel("Target Superclass")

plt.tight_layout()
plt.show()



In [None]:
import pandas as pd
import plotly.express as px

def analyze_target_hops_combined(target_class="motor", set_ids=(1, 2, 3)):
    """
    Find the fewest hops (1, 2 or 3) each starting neuron needs to reach a superclass.

    For every set number in `set_ids`, three CSVs are read from the current
    working directory and stacked per stage:
      * set_<i>_opt_conns_superclass.csv        (direct outputs, hop 1)
      * set_<i>_hop_1_opt_conns_superclass.csv  (outputs of those targets, hop 2)
      * set_<i>_hop_2_opt_conns_superclass.csv  (one step further, hop 3)
    Each file must contain the columns pre_root_id, post_root_id and
    output_super_class; all other columns are ignored.

    Parameters
    ----------
    target_class : str
        Value of output_super_class that counts as a hit.
    set_ids : iterable of int
        Set numbers to load. Defaults to sets 1-3.

    Returns
    -------
    pandas.DataFrame
        Columns ['pre_root_id', 'hop'], sorted by pre_root_id, with one row per
        starting neuron that reaches `target_class` within three hops. Neurons
        that never reach it are not listed.
    """
    set_ids = tuple(set_ids)  # allow one-shot iterables; three stages are loaded
    edge_cols = ["pre_root_id", "post_root_id", "output_super_class"]

    def load_stage(pattern):
        # Only ids and the class label matter for reachability, and repeated
        # edges add nothing, so trim and de-duplicate before any join.
        stacked = pd.concat(
            [pd.read_csv(pattern.format(i)) for i in set_ids],
            ignore_index=True,
        )
        return stacked[edge_cols].drop_duplicates()

    def sources_hitting_target(edges):
        # pre_root_ids that have at least one output in the target class
        is_hit = edges["output_super_class"] == target_class
        return edges.loc[is_hit, "pre_root_id"].unique()

    direct = load_stage("set_{}_opt_conns_superclass.csv")
    first = load_stage("set_{}_hop_1_opt_conns_superclass.csv")
    second = load_stage("set_{}_hop_2_opt_conns_superclass.csv")

    # --- hop 1: the starting neuron itself targets the class ---
    hop1_ids = sources_hitting_target(direct)

    # --- hop 2: a direct partner targets the class ---
    start_mid = (
        direct[["pre_root_id", "post_root_id"]]
        .drop_duplicates()
        .rename(columns={"pre_root_id": "start_id", "post_root_id": "mid_id"})
    )
    via_mid = start_mid["mid_id"].isin(sources_hitting_target(first))
    hop2_ids = start_mid.loc[via_mid, "start_id"].unique()

    # --- hop 3: a partner-of-partner targets the class ---
    mid_next = (
        first[["pre_root_id", "post_root_id"]]
        .drop_duplicates()
        .rename(columns={"pre_root_id": "mid_id", "post_root_id": "next_id"})
    )
    # Keep only distinct (start, next) pairs so the join result stays small.
    start_next = (
        start_mid.merge(mid_next, on="mid_id")[["start_id", "next_id"]]
        .drop_duplicates()
    )
    via_next = start_next["next_id"].isin(sources_hitting_target(second))
    hop3_ids = start_next.loc[via_next, "start_id"].unique()

    # --- combine and pick minimum hop per neuron ---
    all_hits = pd.concat(
        [
            pd.DataFrame({"pre_root_id": ids}).assign(hop=hop)
            for hop, ids in ((1, hop1_ids), (2, hop2_ids), (3, hop3_ids))
        ],
        ignore_index=True,
    )
    min_hops = all_hits.groupby("pre_root_id", as_index=False)["hop"].min()
    return min_hops

if __name__ == "__main__":
    # Shortest route to “motor” for every starting neuron
    min_h = analyze_target_hops_combined("motor")

    # Plotly only colours discretely when the hop is given as a string
    min_h["hop_str"] = min_h["hop"].astype(int).astype(str)

    # --- stacked bar chart: a single bar split into hop-1/2/3 counts ---
    freq = (
        min_h["hop_str"]
        .value_counts()
        .sort_index()
        .rename_axis("hop")
        .reset_index(name="count")
    )
    # constant x value so that all segments land in the same bar
    freq["all_sets"] = "All Sets"

    bar_labels = {
        "all_sets": "",
        "count": "Number of Neurons",
        "hop": "Hops to Motor"
    }
    fig_bar = px.bar(
        freq,
        x="all_sets",
        y="count",
        color="hop",
        barmode="stack",
        title="All-Sets Minimum Motor Hops (Stacked)",
        labels=bar_labels
    )
    fig_bar.update_layout(xaxis={"visible": False})
    fig_bar.show()

    # --- violin plot of the per-neuron minimum hop ---
    violin_labels = {"hop_str":"Hops to Motor"}
    fig_violin = px.violin(
        min_h,
        y="hop_str",
        box=True,
        points="all",
        title="Distribution of Minimum Motor Hops (All Sets)",
        labels=violin_labels
    )
    # draw only the right-hand half of the violin
    fig_violin.update_traces(side="positive")
    fig_violin.show()


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def analyze_target_hops_combined(target_class="motor", set_ids=(1, 2, 3)):
    """
    Find the fewest hops (1, 2 or 3) each starting neuron needs to reach a superclass.

    For every set number in `set_ids`, three CSVs are read from the current
    working directory and stacked per stage:
      * set_<i>_opt_conns_superclass.csv        (direct outputs, hop 1)
      * set_<i>_hop_1_opt_conns_superclass.csv  (outputs of those targets, hop 2)
      * set_<i>_hop_2_opt_conns_superclass.csv  (one step further, hop 3)
    Each file must contain the columns pre_root_id, post_root_id and
    output_super_class; all other columns are ignored.

    Parameters
    ----------
    target_class : str
        Value of output_super_class that counts as a hit.
    set_ids : iterable of int
        Set numbers to load. Defaults to sets 1-3.

    Returns
    -------
    pandas.DataFrame
        Columns ['pre_root_id', 'hop'], sorted by pre_root_id, with one row per
        starting neuron that reaches `target_class` within three hops. Neurons
        that never reach it are not listed.
    """
    set_ids = tuple(set_ids)  # allow one-shot iterables; three stages are loaded
    edge_cols = ["pre_root_id", "post_root_id", "output_super_class"]

    def load_stage(pattern):
        # Only ids and the class label matter for reachability, and repeated
        # edges add nothing, so trim and de-duplicate before any join.
        stacked = pd.concat(
            [pd.read_csv(pattern.format(i)) for i in set_ids],
            ignore_index=True,
        )
        return stacked[edge_cols].drop_duplicates()

    def sources_hitting_target(edges):
        # pre_root_ids that have at least one output in the target class
        is_hit = edges["output_super_class"] == target_class
        return edges.loc[is_hit, "pre_root_id"].unique()

    direct = load_stage("set_{}_opt_conns_superclass.csv")
    first = load_stage("set_{}_hop_1_opt_conns_superclass.csv")
    second = load_stage("set_{}_hop_2_opt_conns_superclass.csv")

    # --- hop=1 hits: the starting neuron itself targets the class ---
    hop1_ids = sources_hitting_target(direct)

    # --- hop=2 hits: a direct partner targets the class ---
    start_mid = (
        direct[["pre_root_id", "post_root_id"]]
        .drop_duplicates()
        .rename(columns={"pre_root_id": "start_id", "post_root_id": "mid_id"})
    )
    via_mid = start_mid["mid_id"].isin(sources_hitting_target(first))
    hop2_ids = start_mid.loc[via_mid, "start_id"].unique()

    # --- hop=3 hits: a partner-of-partner targets the class ---
    mid_next = (
        first[["pre_root_id", "post_root_id"]]
        .drop_duplicates()
        .rename(columns={"pre_root_id": "mid_id", "post_root_id": "next_id"})
    )
    # Keep only distinct (start, next) pairs so the join result stays small.
    start_next = (
        start_mid.merge(mid_next, on="mid_id")[["start_id", "next_id"]]
        .drop_duplicates()
    )
    via_next = start_next["next_id"].isin(sources_hitting_target(second))
    hop3_ids = start_next.loc[via_next, "start_id"].unique()

    # --- combine and take minimum hop per neuron ---
    all_hits = pd.concat(
        [
            pd.DataFrame({"pre_root_id": ids}).assign(hop=hop)
            for hop, ids in ((1, hop1_ids), (2, hop2_ids), (3, hop3_ids))
        ],
        ignore_index=True,
    )
    min_hops = all_hits.groupby("pre_root_id", as_index=False)["hop"].min()
    return min_hops

# -------------------------
# Main
# -------------------------
if __name__ == "__main__":
    # Compute min hops to “motor” across all sets
    min_h = analyze_target_hops_combined("motor")
    # String labels are used for the bar legend only; the numeric `hop`
    # column is kept because the violin plot needs numeric values.
    min_h["hop_str"] = min_h["hop"].astype(int).astype(str)

    # --- Prepare frequency table for stacked bar (rows ordered by hop) ---
    freq = (
        min_h["hop_str"]
        .value_counts()
        .sort_index(key=lambda idx: idx.astype(int))
        .rename_axis("hop")
        .reset_index(name="count")
    )
    # single x-position
    freq["dummy"] = "All Sets"

    # --- Plot with seaborn/matplotlib ---
    sns.set_theme(style="whitegrid")
    fig, (ax_bar, ax_viol) = plt.subplots(1, 2, figsize=(12, 5))

    # 1) Stacked bar chart: one segment per hop, each starting where the
    #    previous one ended.
    bottom = np.zeros(1)
    for hop, cnt in zip(freq["hop"], freq["count"]):
        ax_bar.bar(
            x=["All Sets"],
            height=cnt,
            bottom=bottom,
            label=f"{hop} hop" + ("s" if hop != "1" else "")
        )
        bottom = bottom + cnt  # update bottom for next segment

    ax_bar.set_ylabel("Number of Neurons")
    ax_bar.set_title("All-Sets Minimum Motor Hops (Stacked)")
    ax_bar.set_xticks([])  # hide the single x-tick
    # One legend call only, placed outside the axes on the right.
    ax_bar.legend(
        title="Hops to Motor",
        loc="upper left",
        bbox_to_anchor=(1.05, 1)
    )

    # 2) Violin plot of hop distribution.
    #    The value variable must be numeric: passing the string column either
    #    fails or plots category positions instead of hop counts, depending on
    #    the seaborn version.
    sns.violinplot(
        y="hop",
        data=min_h,
        inner="box",
        ax=ax_viol,
    )
    # Hops are whole numbers, so tick only at the values that occur.
    ax_viol.set_yticks(sorted(int(h) for h in min_h["hop"].unique()))
    ax_viol.set_ylabel("Hops to Motor")
    ax_viol.set_title("Distribution of Minimum Motor Hops\n(All Sets)")
    ax_viol.invert_yaxis()

    # tighten but leave space on the right
    plt.tight_layout(rect=[0,0,0.85,1])    # [left, bottom, right, top] in figure coords
    plt.show()


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def analyze_target_hops_combined(target_class="motor", set_ids=(1, 2, 3)):
    """
    Find the fewest hops (1, 2 or 3) each starting neuron needs to reach a superclass.

    For every set number in `set_ids`, three CSVs are read from the current
    working directory and stacked per stage:
      * set_<i>_opt_conns_superclass.csv        (direct outputs, hop 1)
      * set_<i>_hop_1_opt_conns_superclass.csv  (outputs of those targets, hop 2)
      * set_<i>_hop_2_opt_conns_superclass.csv  (one step further, hop 3)
    Each file must contain the columns pre_root_id, post_root_id and
    output_super_class; all other columns are ignored.

    Parameters
    ----------
    target_class : str
        Value of output_super_class that counts as a hit.
    set_ids : iterable of int
        Set numbers to load. Defaults to sets 1-3.

    Returns
    -------
    pandas.DataFrame
        Columns ['pre_root_id', 'hop'], sorted by pre_root_id, with one row per
        starting neuron that reaches `target_class` within three hops. Neurons
        that never reach it are not listed.
    """
    set_ids = tuple(set_ids)  # allow one-shot iterables; three stages are loaded
    edge_cols = ["pre_root_id", "post_root_id", "output_super_class"]

    def load_stage(pattern):
        # Only ids and the class label matter for reachability, and repeated
        # edges add nothing, so trim and de-duplicate before any join.
        stacked = pd.concat(
            [pd.read_csv(pattern.format(i)) for i in set_ids],
            ignore_index=True,
        )
        return stacked[edge_cols].drop_duplicates()

    def sources_hitting_target(edges):
        # pre_root_ids that have at least one output in the target class
        is_hit = edges["output_super_class"] == target_class
        return edges.loc[is_hit, "pre_root_id"].unique()

    direct = load_stage("set_{}_opt_conns_superclass.csv")
    first = load_stage("set_{}_hop_1_opt_conns_superclass.csv")
    second = load_stage("set_{}_hop_2_opt_conns_superclass.csv")

    # --- hop=1 hits: the starting neuron itself targets the class ---
    hop1_ids = sources_hitting_target(direct)

    # --- hop=2 hits: a direct partner targets the class ---
    start_mid = (
        direct[["pre_root_id", "post_root_id"]]
        .drop_duplicates()
        .rename(columns={"pre_root_id": "start_id", "post_root_id": "mid_id"})
    )
    via_mid = start_mid["mid_id"].isin(sources_hitting_target(first))
    hop2_ids = start_mid.loc[via_mid, "start_id"].unique()

    # --- hop=3 hits: a partner-of-partner targets the class ---
    mid_next = (
        first[["pre_root_id", "post_root_id"]]
        .drop_duplicates()
        .rename(columns={"pre_root_id": "mid_id", "post_root_id": "next_id"})
    )
    # Keep only distinct (start, next) pairs so the join result stays small.
    start_next = (
        start_mid.merge(mid_next, on="mid_id")[["start_id", "next_id"]]
        .drop_duplicates()
    )
    via_next = start_next["next_id"].isin(sources_hitting_target(second))
    hop3_ids = start_next.loc[via_next, "start_id"].unique()

    # --- combine and take minimum hop per neuron ---
    all_hits = pd.concat(
        [
            pd.DataFrame({"pre_root_id": ids}).assign(hop=hop)
            for hop, ids in ((1, hop1_ids), (2, hop2_ids), (3, hop3_ids))
        ],
        ignore_index=True,
    )
    min_hops = all_hits.groupby("pre_root_id", as_index=False)["hop"].min()
    return min_hops

# -------------------------
# Main
# -------------------------
if __name__ == "__main__":
    # 1) Minimum hop to “motor” per starting neuron
    min_h = analyze_target_hops_combined("motor")

    # 2) Text version of the hop, used as the category label
    min_h["hop_str"] = min_h["hop"].astype(int).astype(str)

    # 3) Starting neurons listed in the direct-connection files but absent from
    #    the result never reached “motor” within 3 hops; label them ">3"
    all_pres = pd.concat(
        [
            pd.read_csv(csv_name)[["pre_root_id"]]
            for csv_name in (
                "set_1_opt_conns_superclass.csv",
                "set_2_opt_conns_superclass.csv",
                "set_3_opt_conns_superclass.csv",
            )
        ],
        ignore_index=True,
    ).drop_duplicates()

    reached = all_pres["pre_root_id"].isin(min_h["pre_root_id"])
    missed = all_pres.loc[~reached, ["pre_root_id"]]
    if len(missed) > 0:
        missed = missed.assign(hop_str=">3")
        # from here on only the id and the string label are carried along
        min_h = pd.concat([min_h[["pre_root_id","hop_str"]], missed], ignore_index=True)

    # 4) Count neurons per category; categories with no neurons get a 0 row
    categories = ["1","2","3",">3"]
    freq = (
        min_h["hop_str"]
        .value_counts()
        .reindex(categories, fill_value=0)
        .rename_axis("hop")
        .reset_index(name="count")
    )
    freq["dummy"] = "All Sets"

    # 5) Plot
    sns.set_theme(style="whitegrid")
    fig, ax_bar = plt.subplots(1, figsize=(12, 5))

    # ---- Stacked bar chart: each segment sits on top of the previous ones ----
    heights = freq.set_index("hop")["count"]
    bottom = np.zeros(1)
    for hop in categories:
        cnt = heights.loc[[hop]].to_numpy()
        suffix = "" if hop == "1" else "s"
        ax_bar.bar(
            x=["All Sets"],
            height=cnt,
            bottom=bottom,
            label=f"{hop} hop" + suffix
        )
        bottom += cnt

    ax_bar.set_ylabel("Number of Neurons")
    ax_bar.set_title("aPhN2 Motor Hops")
    ax_bar.set_xticks([])  # hide the single x-tick
    ax_bar.legend(
        title="Hops to Motor",
        loc="upper left",
        bbox_to_anchor=(1.05, 1)
    )

    # 6) Layout tweak
    plt.tight_layout(rect=[0, 0, 0.85, 1])  # leave room on the right for legend
    plt.show()


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def analyze_target_hops_combined(target_class="endocrine", set_ids=(1, 2, 3)):
    """
    Find the fewest hops (1, 2 or 3) each starting neuron needs to reach a superclass.

    For every set number in `set_ids`, three CSVs are read from the current
    working directory and stacked per stage:
      * set_<i>_opt_conns_superclass.csv        (direct outputs, hop 1)
      * set_<i>_hop_1_opt_conns_superclass.csv  (outputs of those targets, hop 2)
      * set_<i>_hop_2_opt_conns_superclass.csv  (one step further, hop 3)
    Each file must contain the columns pre_root_id, post_root_id and
    output_super_class; all other columns are ignored.

    Parameters
    ----------
    target_class : str
        Value of output_super_class that counts as a hit.
    set_ids : iterable of int
        Set numbers to load. Defaults to sets 1-3.

    Returns
    -------
    pandas.DataFrame
        Columns ['pre_root_id', 'hop'], sorted by pre_root_id, with one row per
        starting neuron that reaches `target_class` within three hops. Neurons
        that never reach it are not listed.
    """
    set_ids = tuple(set_ids)  # allow one-shot iterables; three stages are loaded
    edge_cols = ["pre_root_id", "post_root_id", "output_super_class"]

    def load_stage(pattern):
        # Only ids and the class label matter for reachability, and repeated
        # edges add nothing, so trim and de-duplicate before any join.
        stacked = pd.concat(
            [pd.read_csv(pattern.format(i)) for i in set_ids],
            ignore_index=True,
        )
        return stacked[edge_cols].drop_duplicates()

    def sources_hitting_target(edges):
        # pre_root_ids that have at least one output in the target class
        is_hit = edges["output_super_class"] == target_class
        return edges.loc[is_hit, "pre_root_id"].unique()

    direct = load_stage("set_{}_opt_conns_superclass.csv")
    first = load_stage("set_{}_hop_1_opt_conns_superclass.csv")
    second = load_stage("set_{}_hop_2_opt_conns_superclass.csv")

    # --- hop=1 hits: the starting neuron itself targets the class ---
    hop1_ids = sources_hitting_target(direct)

    # --- hop=2 hits: a direct partner targets the class ---
    start_mid = (
        direct[["pre_root_id", "post_root_id"]]
        .drop_duplicates()
        .rename(columns={"pre_root_id": "start_id", "post_root_id": "mid_id"})
    )
    via_mid = start_mid["mid_id"].isin(sources_hitting_target(first))
    hop2_ids = start_mid.loc[via_mid, "start_id"].unique()

    # --- hop=3 hits: a partner-of-partner targets the class ---
    mid_next = (
        first[["pre_root_id", "post_root_id"]]
        .drop_duplicates()
        .rename(columns={"pre_root_id": "mid_id", "post_root_id": "next_id"})
    )
    # Keep only distinct (start, next) pairs so the join result stays small.
    start_next = (
        start_mid.merge(mid_next, on="mid_id")[["start_id", "next_id"]]
        .drop_duplicates()
    )
    via_next = start_next["next_id"].isin(sources_hitting_target(second))
    hop3_ids = start_next.loc[via_next, "start_id"].unique()

    # --- combine and take minimum hop per neuron ---
    all_hits = pd.concat(
        [
            pd.DataFrame({"pre_root_id": ids}).assign(hop=hop)
            for hop, ids in ((1, hop1_ids), (2, hop2_ids), (3, hop3_ids))
        ],
        ignore_index=True,
    )
    min_hops = all_hits.groupby("pre_root_id", as_index=False)["hop"].min()
    return min_hops

# -------------------------
# Main
# -------------------------
if __name__ == "__main__":
    # 1) Minimum hop to “endocrine” per starting neuron
    min_h = analyze_target_hops_combined("endocrine")

    # 2) Hop as text, used as the category label
    min_h["hop_str"] = min_h["hop"].astype(int).astype(str)

    # 3) Every pre_root_id that appears in the direct-connection files ...
    direct_files = (
        "set_1_opt_conns_superclass.csv",
        "set_2_opt_conns_superclass.csv",
        "set_3_opt_conns_superclass.csv",
    )
    id_frames = [pd.read_csv(csv_name)[["pre_root_id"]] for csv_name in direct_files]
    all_pres = pd.concat(id_frames, ignore_index=True).drop_duplicates()

    #    ... minus those with a hit within 1–3 hops = neurons to tag as ">3"
    has_hit = all_pres["pre_root_id"].isin(min_h["pre_root_id"])
    missed = all_pres.loc[~has_hit, ["pre_root_id"]]
    if len(missed) > 0:
        missed = missed.assign(hop_str=">3")
        # from here on only the id and the string label are carried along
        labelled = min_h[["pre_root_id","hop_str"]]
        min_h = pd.concat([labelled, missed], ignore_index=True)

    # 4) Frequency table with all four categories present, zero-filled if absent
    categories = ["1","2","3",">3"]
    counts_by_hop = min_h["hop_str"].value_counts().reindex(categories, fill_value=0)
    freq = counts_by_hop.rename_axis("hop").reset_index(name="count")
    freq["dummy"] = "All Sets"

    # 5) Plot
    sns.set_theme(style="whitegrid")
    fig, ax_bar = plt.subplots(1, figsize=(12, 5))

    # ---- Stacked bar chart: segments are piled up in category order ----
    bottom = np.zeros(1)
    for hop in categories:
        cnt = freq.loc[freq["hop"] == hop, "count"].to_numpy()
        plural = "s" if hop != "1" else ""
        ax_bar.bar(
            x=["All Sets"],
            height=cnt,
            bottom=bottom,
            label=f"{hop} hop" + plural
        )
        bottom += cnt

    ax_bar.set_ylabel("Number of Neurons")
    ax_bar.set_title("aPhN2 Endocrine Hops")
    ax_bar.set_xticks([])  # hide the single x-tick
    ax_bar.legend(
        title="Hops to Endocrine",
        loc="upper left",
        bbox_to_anchor=(1.05, 1)
    )

    # 6) Layout tweak
    plt.tight_layout(rect=[0, 0, 0.85, 1])  # leave room on the right for legend
    plt.show()


In [None]:
# build a single DataFrame with a hop column
def _hop_summary(edges, hop_index):
    """Total syn_count per output_super_class for one hop table, plus each class's share in %."""
    per_class = edges.groupby("output_super_class")["syn_count"].sum().reset_index()
    per_class["hop"] = f"hop {hop_index}"
    # share of this hop's total synapse count
    per_class["pct"] = 100 * per_class["syn_count"] / per_class["syn_count"].sum()
    return per_class


all_summaries = [
    _hop_summary(hop_edges, hop_index)
    for hop_index, hop_edges in enumerate((df1, df2, df3))
]

big = pd.concat(all_summaries, ignore_index=True)

# one group of bars per superclass, one bar per hop within the group
fig = px.bar(
    big,
    x="output_super_class",
    y="pct",
    color="hop",
    barmode="group",
    title="Synapse % by Superclass Across Hops",
    labels={"output_super_class":"Superclass","pct":"% of synapses"},
)
fig.update_layout(xaxis_tickangle=45)
fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# 1) Build the percent summary for each hop
stage_tables = {
    "All Sets→SC": df1,
    "1st hop→SC": df2,
    "2nd hop→SC": df3,
}

all_hops = []
for stage_label, stage_edges in stage_tables.items():
    per_class = (
        stage_edges.groupby("output_super_class")["syn_count"]
        .sum()
        .reset_index()
    )
    # each superclass's share of this stage's total synapse count
    per_class["pct"] = 100 * per_class["syn_count"] / per_class["syn_count"].sum()
    per_class["hop"] = stage_label
    all_hops.append(per_class)

big = pd.concat(all_hops, ignore_index=True)

# 2) Plot a stacked bar chart: one bar per stage, one segment per superclass
axis_labels = {
    "hop": "Connection stage",
    "pct": "% of synapses",
    "output_super_class": "Superclass"
}
fig = px.bar(
    big,
    x="hop",
    y="pct",
    color="output_super_class",
    title="Stacked Bar: % of Synapses by Superclass Across Hops",
    labels=axis_labels,
    # keep the bars in pipeline order
    category_orders={"hop": list(stage_tables)}
)
fig.update_layout(
    barmode="stack",
    xaxis_title=None,
    yaxis_range=[0,100],
    legend_title_text="Superclass",
    yaxis=dict(ticksuffix="%")
)
fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# 1) Build the raw‐count summary for each hop
stage_frames = [
    (df1, "All Sets → SC"),
    (df2, "1st hop → SC"),
    (df3, "2nd hop → SC"),
]

# total syn_count per superclass, tagged with the stage it came from
all_hops = [
    stage_edges.groupby("output_super_class")["syn_count"]
    .sum()
    .reset_index()
    .assign(hop=stage_label)
    for stage_edges, stage_label in stage_frames
]

big = pd.concat(all_hops, ignore_index=True)

# 2) Plot a stacked bar chart of raw counts
stage_order = [stage_label for _, stage_label in stage_frames]
fig = px.bar(
    big,
    x="hop",
    y="syn_count",
    color="output_super_class",
    title="Stacked Bar: Synapse Counts by Superclass Across Hops",
    labels={
        "hop": "Connection stage",
        "syn_count": "Total synapses",
        "output_super_class": "Superclass"
    },
    # keep the bars in pipeline order
    category_orders={"hop": stage_order}
)
fig.update_layout(
    barmode="stack",
    xaxis_title=None,
    legend_title_text="Superclass"
)
fig.show()
