In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import scipy as sp
from scipy.sparse import csc_matrix, coo_matrix, diags

In [2]:
# Directory holding the raw MANC exports. Edit this single constant to run the
# notebook on another machine instead of touching every read call below.
RAW_DATA_DIR = Path('C:\\Users\\44745\\Documents')

# Two feather tables of connections (one per direction) and the per-neuron
# metadata table; the first CSV column is used as the metadata index.
inputs = pd.read_feather(RAW_DATA_DIR / 'malevnc_inputs.feather')
output = pd.read_feather(RAW_DATA_DIR / 'malevnc_outputs.feather')
meta = pd.read_csv(RAW_DATA_DIR / 'manc_meta.csv', index_col=0)

In [3]:
# Cast both id columns of each connection table to pandas' nullable integer
# dtype so that they compare cleanly against the ids in `meta`.
id_dtypes = {'bodyid': 'Int64', 'partner': 'Int64'}
inputs = inputs.astype(id_dtypes)
output = output.astype(id_dtypes)

# Keep a connection only when BOTH of its endpoints are listed in meta.
known_ids = meta.bodyid
inputs_in_meta = inputs.bodyid.isin(known_ids) & inputs.partner.isin(known_ids)
output_in_meta = output.bodyid.isin(known_ids) & output.partner.isin(known_ids)
inputs = inputs[inputs_in_meta]
output = output[output_in_meta]

inputs

Unnamed: 0,bodyid,partner,prepost,weight,name,type
0,10439,10725,0.0,1080,IN05B008_A1_R,IN05B008
1,11104,10480,0.0,1072,IN05B008_A1_L,IN05B008
2,10302,10146,0.0,1001,IN19A002_T2_R,IN19A002
3,13006,10352,0.0,960,IN19A005_T3_R,IN19A005
4,10220,10256,0.0,946,IN19A002_T1_L,IN19A002
...,...,...,...,...,...,...
5299425,99990,15978,0.0,1,INXXX257_A8_R,INXXX257
5299426,99990,15860,0.0,1,INXXX243_A8_L,INXXX243
5299427,99990,15681,0.0,1,INXXX262_A9_R,INXXX262
5299428,99990,14512,0.0,1,IN18B033_A6_L,IN18B033


In [4]:
# How many neurons carry each predicted neurotransmitter label.
meta['predictedNt'].value_counts()

predictedNt
acetylcholine    11512
glutamate         6225
gaba              5719
unknown             55
unclear             39
Name: count, dtype: int64

In [5]:
# use name if available, otherwise use bodyid
# The filled Series is assigned back to the column explicitly: calling
# `meta.combined_type.fillna(..., inplace=True)` is chained assignment, which
# emits a FutureWarning in pandas 2.x and no longer modifies `meta` once
# copy-on-write is active.
meta['combined_type'] = meta['name'].fillna(meta['bodyid'].astype(str))

# only excitatory if acetylcholine: +1 for acetylcholine, -1 for every other
# label (including 'unknown', 'unclear' and missing values)
meta['nt_binary'] = meta['predictedNt'].eq('acetylcholine').map({True: 1, False: -1})
meta

Unnamed: 0,bodyid,hemilineage,class,predictedNt,somaNeuromere,rootSide,systematicType,type,somaSide,name,combined_type,nt_binary
1,10000,,descending neuron,acetylcholine,,RHS,DNlt002,DNp01,,DNlt002_CvC_R,DNlt002_CvC_R,1
2,100000,12B,intrinsic neuron,gaba,T3,,IN12B033,IN12B033,RHS,IN12B033_T3_R,IN12B033_T3_R,-1
3,100002,09A,intrinsic neuron,gaba,T1,,IN09A063,IN09A063,RHS,IN09A063_T1_R,IN09A063_T1_R,-1
4,10001,,sensory neuron,glutamate,,,SNxxxx,SNxxxx,,SNxxxx_ProLN_R,SNxxxx_ProLN_R,-1
5,10002,,descending neuron,acetylcholine,,LHS,DNlt002,DNp01,,DNlt002_CvC_L,DNlt002_CvC_L,1
...,...,...,...,...,...,...,...,...,...,...,...,...
23546,99425,,sensory neuron,glutamate,,LHS,SNta39,SNta39,,SNta39_MesoLN_L,SNta39_MesoLN_L,-1
23547,99431,,sensory neuron,glutamate,,LHS,SNxxxx,SNxxxx,,SNxxxx_MetaLN_L,SNxxxx_MetaLN_L,-1
23548,99612,,sensory neuron,glutamate,,LHS,SNta39,SNta39,,SNta39_MetaLN_L,SNta39_MetaLN_L,-1
23549,99837,,sensory ascending,acetylcholine,,LHS,SNxx29,SNxx29,,SNxx29_DProN_L,SNxx29_DProN_L,1


In [6]:
# Number of distinct nt_binary signs found among the neurons sharing each name.
ntcount_perinstance = meta['nt_binary'].groupby(meta['name']).nunique()
# Names whose neurons disagree on the sign (more than one distinct value).
diffnt = ntcount_perinstance.loc[ntcount_perinstance.gt(1)]
diffnt

name
AN07B101_T1_R      2
AN08B018_T3_L      2
AN08B018_T3_R      2
AN08B069_T1_L      2
AN09B017_A10_R     2
                  ..
SNxxxx_MesoLN_L    2
SNxxxx_MesoLN_R    2
SNxxxx_MetaLN_R    2
SNxxxx_ProLN_L     2
SNxxxx_ProLN_R     2
Name: nt_binary, Length: 342, dtype: int64

In [7]:
# bodyid -> combined_type lookup (one entry per neuron id).
id_to_combined_type = {
    bodyid: combined for bodyid, combined in zip(meta['bodyid'], meta['combined_type'])
}
# name -> nt_binary lookup; when several neurons share a name the last row wins.
name_to_nt_binary = {
    name: sign for name, sign in zip(meta['name'], meta['nt_binary'])
}

In [8]:
# standardise nt_binary for neurons with multiple nt

# Seeded generator for the tie-break below, so that Restart & Run All assigns
# the same sign to tied names on every run (an unseeded sample would not).
TIE_BREAK_SEED = 0
tie_rng = np.random.default_rng(TIE_BREAK_SEED)

# count the number of +/- neurons for each name
nt_count_per_name = (
    meta[meta.name.isin(diffnt.index)]
    .groupby('name')['nt_binary']
    .value_counts()
    .reset_index(name='n_neurons')
)

# One pass over the per-name groups instead of re-filtering the whole table
# for every name. Each group has exactly two rows (+1 and -1) because only
# names with more than one distinct nt_binary are in `diffnt`.
for inst, df in nt_count_per_name.groupby('name', sort=False):
    df = df.sort_values('n_neurons', ascending=False)
    # if there is a max count, use the corresponding nt_binary
    if df.n_neurons.iloc[0] > df.n_neurons.iloc[1]:
        name_to_nt_binary[inst] = df.nt_binary.iloc[0]
    # if there is a tie, randomly (but reproducibly) choose one
    else:
        name_to_nt_binary[inst] = df.nt_binary.sample(1, random_state=tie_rng).values[0]

# Apply the per-name consensus. Rows with a missing name are skipped by the
# lookup (na_action='ignore') and keep their own nt_binary; otherwise every
# unnamed neuron would be mapped through the dict's single NaN key and
# receive the same sign.
meta['nt_binary'] = (
    meta.name.map(name_to_nt_binary, na_action='ignore')
    .fillna(meta.nt_binary)
    .astype('int64')
)

In [9]:
edge_cols = ['pre', 'post', 'weight']

# In `inputs` the partner is the presynaptic side; in `output` the partner is
# the postsynaptic side. Rename both tables to a common pre/post convention.
in_conn = inputs.rename(columns={'bodyid': 'post', 'partner': 'pre'})
out_conn = output.rename(columns={'bodyid': 'pre', 'partner': 'post'})

# Stack both views, drop rows that are exact (pre, post, weight) repeats and
# keep only connections with a positive weight.
# NOTE(review): a pair listed in both tables with different weights survives
# as two rows - confirm the two tables always agree.
conn = pd.concat([in_conn[edge_cols], out_conn[edge_cols]], ignore_index=True)
conn = conn.drop_duplicates()
conn = conn.loc[conn.weight > 0]
conn

Unnamed: 0,pre,post,weight
0,10725,10439,1080
1,10480,11104,1072
2,10146,10302,1001
3,10352,13006,960
4,10256,10220,946
...,...,...,...
5299425,15978,99990,1
5299426,15860,99990,1
5299427,15681,99990,1
5299428,14512,99990,1


In [10]:
# Set to True to build a combined_type x combined_type matrix instead of a
# neuron x neuron (bodyid) matrix.
type_level = False


def _index_edgelist(edges, pre_col, post_col, node_labels):
    """Attach integer matrix indices to an edge list.

    Parameters
    ----------
    edges : pd.DataFrame
        Edge list containing the columns `pre_col` and `post_col`.
    pre_col, post_col : str
        Columns holding the source / target node label of each edge.
    node_labels : iterable
        Every node label that should get a row/column in the matrix. The
        sorted unique labels define the index order.

    Returns
    -------
    tuple
        `(indexed_edges, sorted_nodes, nodes_to_idx)` where `indexed_edges` is
        a new frame with added `pre_idx` / `post_idx` columns, `sorted_nodes`
        is the sorted list of unique labels and `nodes_to_idx` maps each label
        to its position in that list.

    Raises
    ------
    ValueError
        If an edge endpoint is not among `node_labels`; such an edge would
        otherwise reach the sparse constructor with a NaN index.
    """
    sorted_nodes = sorted(set(node_labels))
    nodes_to_idx = {node: idx for idx, node in enumerate(sorted_nodes)}
    # `assign` returns a new frame, so no columns are written onto a filtered
    # slice of another frame (which triggers SettingWithCopyWarning).
    indexed = edges.assign(
        pre_idx=edges[pre_col].map(nodes_to_idx),
        post_idx=edges[post_col].map(nodes_to_idx),
    )
    unmapped = indexed[['pre_idx', 'post_idx']].isna().any(axis=1)
    if unmapped.any():
        raise ValueError(
            f'{int(unmapped.sum())} edge(s) reference a node that is missing from the node list'
        )
    return indexed, sorted_nodes, nodes_to_idx


# instead of making a dense matrix based on the edgelist, make a sparse one
# from the edgelist directly
if type_level:
    # connectivity between combined_types: sum the weights of all neuron-level
    # edges that share the same (pre_type, post_type) pair
    conntt = conn.assign(
        pre_type=conn.pre.map(id_to_combined_type),
        post_type=conn.post.map(id_to_combined_type),
    )
    conntt = conntt.groupby(['pre_type', 'post_type']).weight.sum().reset_index()
    conntt, sorted_nodes, nodes_to_idx = _index_edgelist(
        conntt, 'pre_type', 'post_type', meta.combined_type)
    edge_table = conntt
else:
    # neuron to neuron connectivity, indexed by bodyid
    conn, sorted_nodes, nodes_to_idx = _index_edgelist(
        conn, 'pre', 'post', meta.bodyid)
    edge_table = conn

nodes = set(sorted_nodes)
matrix_size = len(sorted_nodes)

# first make a coo matrix: rows are presynaptic, columns are postsynaptic
row = edge_table['pre_idx'].values
col = edge_table['post_idx'].values
data = edge_table['weight'].values
coo = coo_matrix((data, (row, col)), shape=(matrix_size, matrix_size))
# then turn it into csc matrix (entries sharing a (row, col) pair are summed)
csc = coo.tocsc()

# memory used by the three arrays backing the csc matrix, in MB
csc_size = csc.data.nbytes + csc.indices.nbytes + csc.indptr.nbytes
csc_size/1e6

42.489644

In [11]:
# Total incoming weight of every column (postsynaptic node).
col_sums = csc.sum(axis=0)
# Flatten the 1 x n result into a plain 1-D float array.
col_totals = np.asarray(col_sums, dtype=float).ravel()

# Handling division by zero in case some columns have a sum of zero,
# that is, where a neuron doesn't have incoming synapses.
# The output buffer is pre-filled with zeros: with `where=` and no `out=`,
# numpy leaves the skipped positions uninitialised (arbitrary memory).
col_sums_with_inversion = np.zeros_like(col_totals)
np.reciprocal(col_totals, out=col_sums_with_inversion, where=col_totals != 0)

# Multiply each column by the inverse of its sum
inprop = csc.multiply(col_sums_with_inversion)
# and then reduce the precision to float32 to save memory
inprop = inprop.astype(np.float32)

In [12]:
# Write the normalised matrix to the file matching the chosen granularity.
inprop_path = 'data/manc_type_inprop.npz' if type_level else 'data/manc_inprop.npz'
sp.sparse.save_npz(inprop_path, inprop)

In [13]:
# Record each row's matrix index, keyed by whichever label indexed the matrix.
idx_labels = meta.combined_type if type_level else meta.bodyid
meta['idx'] = idx_labels.map(nodes_to_idx)

if type_level:
    # Drop the per-neuron id, then collapse rows that are otherwise identical.
    meta_type = meta.drop(columns='bodyid').drop_duplicates()
    meta_type.to_csv('data/manc_type_meta.csv')
else:
    meta.to_csv('data/manc_meta.csv')