In [None]:
import math
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
from scipy.sparse import csc_matrix, coo_matrix, diags
from tqdm import tqdm

# for making legends 
import matplotlib.lines as mlines
import matplotlib.patches as mpatches

# get axon-dendrite connectome 
The connectivity matrix is taken from [this paper](https://www.science.org/doi/10.1126/science.add9330).

In [None]:
# Raw connectivity matrix, read from the copy hosted on GitHub.
# The first CSV column is used as the row index.
AD_URL = 'https://raw.githubusercontent.com/YijieYin/interpret_connectome/main/data/ad_connectivity_matrix.csv'
ad = pd.read_csv(AD_URL, index_col=0)
ad

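The rows and columns are both labelled by skid (skeleton ID): rows are the presynaptic neurons and columns are the postsynaptic neurons. The values are synapse counts, not input proportions. To convert counts into input proportions we therefore need, for each postsynaptic skid, the total number of synapses onto its dendrites (i.e. the sum of its column).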

In [None]:
# Total number of postsynapses per neuron: sum each column of `ad`
# (i.e. add up over the rows), then show the totals as a 50-bin histogram.
postsynapse_totals = ad.sum(axis=0)
postsynapse_totals.hist(bins=50)

In [None]:
# Reshape the square matrix into a long edge list: one row per (pre, post) pair.
# ignore_index=False keeps the row labels through melt, and reset_index turns
# them into the first column.
ad_edgelist = ad.melt(ignore_index=False).reset_index()
ad_edgelist.columns = ['pre', 'post', 'weight']

# Keep only real connections and store `pre` as a string so that it can be
# looked up with the same keys as `post` (whose values come from the column
# labels of `ad`).
# The filter and the cast are chained so that the result is a fresh frame;
# assigning a column on a filtered slice triggers SettingWithCopyWarning.
ad_edgelist = (
    ad_edgelist.loc[ad_edgelist.weight > 0]
    .assign(pre=lambda df: df.pre.astype(str))
)
ad_edgelist

In [None]:
# Every skid that appears as a row (pre) or a column (post) of `ad`.
# dict.fromkeys removes duplicates while keeping first-seen order, so the
# skid -> matrix index mapping is the same on every run. Enumerating a plain
# set of strings would give an order that can change between kernel sessions,
# making the saved matrix and the saved `idx` column non-reproducible.
nodes = list(dict.fromkeys([*ad.index.astype(str), *ad.columns]))
nodes_to_idx = {node: num for num, node in enumerate(nodes)}

# Create COO matrix: entry (i, j) holds the weight from pre i to post j
row = ad_edgelist.pre.map(nodes_to_idx).values
col = ad_edgelist.post.map(nodes_to_idx).values
data = ad_edgelist.weight.values
matrix_size = len(nodes)
coo = coo_matrix((data, (row, col)), shape=(matrix_size, matrix_size))

# then turn it into csc matrix
csc = coo.tocsc()

# Memory footprint of the CSC representation: values + row indices + column pointers
csc_size = csc.data.nbytes + csc.indices.nbytes + csc.indptr.nbytes
# number of MB
csc_size / 1e6

In [None]:
# number of distinct skids that appear as a row or a column of `ad`
len(nodes)

In [None]:
# Total input weight of each postsynaptic neuron (one sum per column).
col_sums = csc.sum(axis=0)
# Flatten the 1 x N result of the sparse sum into a plain 1-D float array.
col_totals = np.asarray(col_sums, dtype=float).ravel()

# Reciprocal of every non-zero column sum. Columns that sum to zero (neurons
# without incoming synapses) keep the 0 provided through `out`; without `out`,
# the entries skipped by `where` would be left as uninitialised memory.
col_sums_with_inversion = np.reciprocal(
    col_totals,
    out=np.zeros_like(col_totals),
    where=col_totals != 0,
)

# Multiply each column by the inverse of its sum to get input proportions,
# then reduce the precision to float32 to save memory.
inprop = csc.multiply(col_sums_with_inversion).astype(np.float32)

In [None]:
# show the raw connectivity matrix again
ad

In [None]:
# show the column-normalised sparse matrix
inprop

In [None]:
# Make sure the output directory exists first, so that saving also works in a
# fresh environment where `data/` has not been created yet.
os.makedirs('data', exist_ok=True)
sp.sparse.save_npz('data/larva_inprop.npz', inprop)

In [None]:
# Distribution of the stored connection strengths, drawn on log-log axes.
fig, ax = plt.subplots(figsize=(8, 5))

# 40 logarithmically spaced bin edges between 1e-3 and 1
log_bins = np.logspace(-3, 0, 40)
ax.hist(inprop.data, bins=log_bins)

# Axis scales, labels and title in a single call
ax.set(
    xscale='log',
    yscale='log',
    xlabel='Strength of connection (Log Scale)',
    ylabel='Number of connections (Log Scale)',
    title='Number of connections vs. strength of connection',
)

# Render the figure
plt.show()

# get meta info

In [None]:
# Per-neuron annotation table, read from the copy hosted on GitHub.
META_URL = 'https://raw.githubusercontent.com/YijieYin/interpret_connectome/main/data/brain-neurons_meta-data.csv'
meta = pd.read_csv(META_URL)
meta.head()

In [None]:
# Tally of rows per value of the `celltype` column
meta['celltype'].value_counts()

In [None]:
# Same tally for the finer-grained `annotated_name` column
meta['annotated_name'].value_counts()

In [None]:
# Number of rows whose leftid is 'no pair', i.e. neurons on the right
# that have no contralateral homologue.
int((meta.leftid == 'no pair').sum())

In [None]:
# Number of rows whose rightid is 'no pair'
int((meta.rightid == 'no pair').sum())

In [None]:
# Annotated names found among the rows whose celltype is 'sensory'
is_sensory = meta.celltype.isin(['sensory'])
meta.loc[is_sensory, 'annotated_name'].value_counts()

## make a type dictionary

In [None]:
# Map every skid (from leftid and from rightid) to the cell type of its row.
# Many rows have 'no pair' in leftid or rightid, so 'no pair' is assigned as a
# key several times and only the last value is retained in the dictionary.
# That is fine, because the 'no pair' entry is not needed and is deleted.
skids_both_sides = pd.concat([meta.leftid, meta.rightid])
celltype_both_sides = pd.concat([meta.celltype, meta.celltype])
types = dict(zip(skids_both_sides, celltype_both_sides))
del types['no pair']
# have a look at the first five entries
dict(list(types.items())[:5])

In [None]:
# skid -> neuron name: leftid is paired with left_name, rightid with right_name
name_keys = pd.concat([meta.leftid, meta.rightid])
name_values = pd.concat([meta.left_name, meta.right_name])
names = dict(zip(name_keys, name_values))

In [None]:
# skid -> side. Entries from rightid are merged in last, so a key that occurs
# in both columns ends up labelled 'right'.
sides = {
    **dict.fromkeys(meta.leftid, 'left'),
    **dict.fromkeys(meta.rightid, 'right'),
}

In [None]:
# skid -> annotated_name; both the left and the right skid of a row receive
# that row's annotated_name.
types_add = {
    skid: annotation
    for skid, annotation in zip(
        pd.concat([meta.leftid, meta.rightid]),
        pd.concat([meta.annotated_name, meta.annotated_name]),
    )
}

In [None]:
# Strip side markers from the neuron names so that the left and right member
# of a pair share one name. The last alternative, \(.*\), also removes
# anything written in brackets.
# The pattern is a raw string: in a normal string literal '\(' is an invalid
# escape sequence and produces a warning on recent Python versions.
side_pattern = re.compile(r'( left| right|;right|;left|_left|_right|-R-|-L-|l$|r$|l |r |\(.*\))')
names_noside = {skid: side_pattern.sub('', name) for skid, name in names.items()}

# map the side-free name to the cell type ...
nametotype = {names_noside[skid]: thistype for skid, thistype in types.items()}
# ... and to the annotated name, skipping the 'no pair' placeholder key
nametotype_add = {
    names_noside[skid]: thistype
    for skid, thistype in types_add.items()
    if skid != 'no pair'
}

In [None]:
# One row per skid in `types`, with the other per-skid dictionaries joined on.
# A skid that is missing from a dictionary gets NaN in that column; if that
# happens for `idx`, the column cannot be cast to np.int64.
meta_out = (
    pd.DataFrame(types.items(), columns=['skid', 'cell_type'])
    .assign(
        idx=lambda df: df.skid.map(nodes_to_idx),
        name=lambda df: df.skid.map(names),
        name_noside=lambda df: df.skid.map(names_noside),
        side=lambda df: df.skid.map(sides),
        annotated_name=lambda df: df.skid.map(types_add),
    )
)
meta_out

In [None]:
# Make sure the output directory exists first, so that writing also works in a
# fresh environment where `data/` has not been created yet.
os.makedirs('data', exist_ok=True)
meta_out.to_csv('data/larva_type_meta.csv')