In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import scipy as sp
from scipy.sparse import csc_matrix, coo_matrix, diags

In [None]:
# Directory that holds the three raw input files. Kept in one place so the
# notebook can be pointed at another location by editing a single line
# (the default is the location the notebook was originally written against).
RAW_DATA_DIR = Path('C:\\Users\\44745\\Documents')

# Edge tables: each has (at least) `bodyid`, `partner` and `weight` columns,
# which are the ones used further down.
inputs = pd.read_feather(RAW_DATA_DIR / 'malevnc_inputs.feather')
output = pd.read_feather(RAW_DATA_DIR / 'malevnc_outputs.feather')
# Per-neuron annotations; the first CSV column is used as the index.
meta = pd.read_csv(RAW_DATA_DIR / 'manc_meta.csv', index_col=0)

In [None]:
# Store both id columns with pandas' nullable integer dtype in each edge table.
id_dtypes = {'bodyid': 'Int64', 'partner': 'Int64'}
inputs = inputs.astype(id_dtypes)
output = output.astype(id_dtypes)

# Keep only the rows whose two endpoints are both listed in meta.bodyid.
known_ids = meta.bodyid
inputs_in_meta = inputs.bodyid.isin(known_ids) & inputs.partner.isin(known_ids)
output_in_meta = output.bodyid.isin(known_ids) & output.partner.isin(known_ids)
inputs = inputs[inputs_in_meta]
output = output[output_in_meta]

inputs

In [None]:
# combined_type is the neuron's `instance` when it has one, otherwise its own
# bodyid (as a string), so that every neuron carries a non-missing label.
# The fill is done in a single assignment: calling
# `meta.combined_type.fillna(..., inplace=True)` operates on a temporary
# Series and does not reliably write back into `meta` (it is a no-op under
# pandas copy-on-write and warns in pandas 2.x).
meta['combined_type'] = meta.instance.fillna(meta.bodyid.astype(str))

# nt_binary is +1 when predicted_nt is exactly 'acetylcholine' and -1 for
# every other value, including a missing prediction.
# NOTE(review): presumably this encodes the sign of the connection
# (excitatory vs. not) — confirm that missing predictions should be -1.
meta['nt_binary'] = meta.predicted_nt.eq('acetylcholine').map({True: 1, False: -1})
meta

In [None]:
# Number of distinct nt_binary values found among the neurons of each instance.
ntcount_perinstance = meta.groupby('instance').nt_binary.nunique()
# Instances whose neurons do not all share the same nt_binary.
diffnt = ntcount_perinstance.loc[ntcount_perinstance.gt(1)]
diffnt

In [None]:
# Lookup tables built row by row from meta; when a key occurs on several rows
# the value from the last such row is the one that is kept.
id_to_combined_type = {
    bodyid: ctype for bodyid, ctype in zip(meta['bodyid'], meta['combined_type'])
}
instance_to_nt_binary = {
    inst: sign for inst, sign in zip(meta['instance'], meta['nt_binary'])
}

In [None]:
# Standardise nt_binary for instances whose neurons disagree (see `diffnt`):
# every neuron of such an instance receives the majority nt_binary.

# Seeded generator so that tie-breaking is reproducible across kernel restarts.
NT_TIE_SEED = 0
tie_rng = np.random.default_rng(NT_TIE_SEED)

# One row per (instance, nt_binary) with the number of neurons in that group.
nt_count_per_instance = (
    meta[meta.instance.isin(diffnt.index)]
    .groupby('instance')['nt_binary']
    .value_counts()
    .reset_index(name='n_neurons')
)

# instance -> nt_binary chosen for that instance (conflicting instances only).
resolved_nt = {}
for inst, counts in nt_count_per_instance.groupby('instance', sort=False):
    # nt_binary value(s) carried by the largest number of neurons.
    winners = counts.loc[counts.n_neurons == counts.n_neurons.max(), 'nt_binary']
    if len(winners) == 1:
        # clear majority
        resolved_nt[inst] = winners.iloc[0]
    else:
        # tie: pick one of the tied values at random (reproducibly)
        resolved_nt[inst] = winners.sample(1, random_state=tie_rng).iloc[0]

# Keep the lookup table in sync with the decisions taken above.
instance_to_nt_binary.update(resolved_nt)

# Only rewrite the neurons that belong to a conflicting instance. Mapping the
# whole `instance` column would also overwrite neurons without an instance
# (NaN), discarding their own per-neuron nt_binary.
if resolved_nt:
    conflicted = meta.instance.isin(list(resolved_nt))
    meta.loc[conflicted, 'nt_binary'] = (
        meta.loc[conflicted, 'instance'].map(resolved_nt).to_numpy()
    )

In [None]:
# Express both tables as (pre, post, weight) edges: in `inputs` the partner is
# the upstream neuron, in `output` the partner is the downstream neuron.
edge_cols = ['pre', 'post', 'weight']
in_conn = inputs.rename(columns={'bodyid': 'post', 'partner': 'pre'})
out_conn = output.rename(columns={'bodyid': 'pre', 'partner': 'post'})

# Stack the two views, drop rows that are exact (pre, post, weight) repeats,
# and keep only edges with a positive weight.
conn = pd.concat([in_conn[edge_cols], out_conn[edge_cols]], ignore_index=True)
conn = conn.drop_duplicates()
conn = conn.loc[conn.weight > 0]
conn

In [None]:
# True: one matrix row/column per combined_type (weights summed within types).
# False: one matrix row/column per bodyid.
type_level = False

if type_level:
    # Label each edge with the combined_type of its two endpoints, then sum
    # the weights of all edges that share the same (pre_type, post_type).
    conntt = conn.assign(
        pre_type=conn.pre.map(id_to_combined_type),
        post_type=conn.post.map(id_to_combined_type),
    )
    conntt = conntt.groupby(['pre_type', 'post_type']).weight.sum().reset_index()
    edges, pre_col, post_col = conntt, 'pre_type', 'post_type'
    nodes = set(meta.combined_type)
else:
    edges, pre_col, post_col = conn, 'pre', 'post'
    nodes = set(meta.bodyid)

# Give every node a stable position in the matrix: nodes are sorted and
# numbered 0..n-1 in that order.
sorted_nodes = sorted(nodes)
nodes_to_idx = {node: idx for idx, node in enumerate(sorted_nodes)}

# `assign` returns a new frame, so the index columns are never written onto a
# filtered view of another DataFrame.
edges = edges.assign(
    pre_idx=edges[pre_col].map(nodes_to_idx),
    post_idx=edges[post_col].map(nodes_to_idx),
)

# Every endpoint must have a matrix position; fail loudly rather than handing
# NaN indices to scipy.
unmapped = edges['pre_idx'].isna() | edges['post_idx'].isna()
if unmapped.any():
    raise ValueError(
        f'{int(unmapped.sum())} edges reference a node that is missing from meta'
    )

# Expose the indexed edge list under the same name as before.
if type_level:
    conntt = edges
else:
    conn = edges

# Build the sparse matrix straight from the edge list (rows = pre, cols = post)
# instead of materialising a dense matrix first.
row = edges['pre_idx'].to_numpy(dtype=np.int64)
col = edges['post_idx'].to_numpy(dtype=np.int64)
data = edges['weight'].to_numpy()
matrix_size = len(sorted_nodes)
coo = coo_matrix((data, (row, col)), shape=(matrix_size, matrix_size))

# Convert to CSC. Note that the conversion sums duplicate (row, col) entries,
# so a (pre, post) pair listed more than once is accumulated.
csc = coo.tocsc()

# Memory held by the three arrays that make up the CSC matrix, in MB.
csc_size = csc.data.nbytes + csc.indices.nbytes + csc.indptr.nbytes
csc_size / 1e6

In [None]:
# Total incoming weight of each column (postsynaptic node).
col_sums = csc.sum(axis=0)
# Flatten the 1 x n result into a plain float vector of length n.
col_sums_dense = np.asarray(col_sums, dtype=float).ravel()

# Reciprocal of every non-zero column sum. The output buffer is pre-filled
# with zeros: when `where=` is used without `out=`, numpy leaves the masked
# positions uninitialised, so columns without any input would otherwise carry
# arbitrary values.
col_sums_with_inversion = np.zeros_like(col_sums_dense)
np.reciprocal(
    col_sums_dense, out=col_sums_with_inversion, where=col_sums_dense != 0
)

# Scale each column by the inverse of its sum, so every column that has input
# sums to 1, then store as float32 to save memory.
inprop = csc.multiply(col_sums_with_inversion)
inprop = inprop.astype(np.float32)

In [None]:
# Write the normalised matrix to disk; the file name records whether it was
# built per combined_type or per bodyid.
inprop_path = 'data/manc_type_inprop.npz' if type_level else 'data/manc_inprop.npz'
sp.sparse.save_npz(inprop_path, inprop)

In [None]:
# Record each row's position in the saved matrix, using the same key
# (combined_type or bodyid) that was used to build nodes_to_idx.
node_key = 'combined_type' if type_level else 'bodyid'
meta['idx'] = meta[node_key].map(nodes_to_idx)

if not type_level:
    meta.to_csv('data/manc_meta.csv')
else:
    # Drop the per-neuron id and keep only the distinct remaining rows.
    meta_type = meta.drop(columns='bodyid').drop_duplicates()
    meta_type.to_csv('data/manc_type_meta.csv')