In [None]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import time
from scipy.sparse import csc_matrix, coo_matrix, diags
import scipy as sp
from tqdm import tqdm


In [None]:
import seaserpent as ss

# reading data

In [None]:
# FlyWire neuron annotation table (tab-separated), read straight from GitHub
ANNOTATIONS_URL = 'https://raw.githubusercontent.com/flyconnectome/flywire_annotations/main/supplemental_files/Supplemental_file1_neuron_annotations.tsv'
info = pd.read_csv(ANNOTATIONS_URL, sep='\t')
# list the annotation columns that are available
info.columns

In [None]:
# neurons per super class; missing values are kept as their own bucket
info['super_class'].value_counts(dropna=False)

In [None]:
# Shrink the matrix: visual projection neurons stand in for the visual input, so
#   1. every optic-lobe neuron is dropped, and
#   2. neurons of the 'visual' cell class are dropped too, except the ocellar
#      retinula cells.
cb = info[
    ~info.super_class.isin(['optic'])
    & ((info.cell_class != 'visual') | (info.cell_type == 'ocellar retinula cell'))
]
cb

In [None]:
# sensory modalities present: cell_class counts among the sensory neurons
cb.loc[cb.super_class == 'sensory', 'cell_class'].value_counts()

## no type

In [None]:
# neurons counted by which of the two type annotations is missing
# (True = missing); one row per (cell_type, hemibrain_type) combination
cb.loc[:, ['cell_type', 'hemibrain_type']].isna().value_counts()

In [None]:
# neurons for which both hemibrain_type and cell_type are missing
cbnotype = cb[cb[['hemibrain_type', 'cell_type']].isna().all(axis=1)]
# status values among them (missing status counted as well)
cbnotype['status'].value_counts(dropna=False)

In [None]:
# hemilineage breakdown of the untyped neurons that have no status set
cbnotype.loc[cbnotype.status.isna(), 'ito_lee_hemilineage'].value_counts(dropna=False)

In [None]:
# super class breakdown of the untyped neurons that have no status set
cbnotype.loc[cbnotype.status.isna(), 'super_class'].value_counts()

In [None]:
# root ids of untyped 'central' neurons that also lack a hemilineage annotation
cbnotype.loc[
    cbnotype.ito_lee_hemilineage.isna() & (cbnotype.super_class == 'central'),
    'root_id',
].to_numpy()

# add CB types from live info?

In [None]:
# load the 'info' table through seaserpent and convert it to a DataFrame
infolive = ss.Table('info', base='main')
infodf = infolive.to_frame()
# drop every row whose super_class is 'not_a_neuron' or whose status is
# 'bad_nucleus' / 'not_a_neuron'
infodf = infodf[
    ~(
        infodf.super_class.isin(['not_a_neuron'])
        | infodf.status.isin(['bad_nucleus','not_a_neuron'])
    )
]

In [None]:
# live-table rows whose root_783 id (compared as a string) belongs to one of the
# neurons without a type
infodfnotype = infodf.loc[infodf['root_783'].isin(cbnotype['root_id'].astype(str))]
# which cb_type labels do they carry?
infodfnotype['cb_type'].value_counts()

In [None]:
# spot check: root ids (as 64-bit integers) of one example cb_type
infodfnotype.loc[infodfnotype.cb_type == 'AN_6_22,29,36,37', 'root_783'].astype(np.int64).to_numpy()

In [None]:
# rows without a cb_type have to go first: the substring test below only works
# on actual labels
infodfnotype = infodfnotype.dropna(subset=['cb_type'])
# then drop labels containing 'None' - they don't seem good
infodfnotype = infodfnotype[infodfnotype.cb_type.map(lambda cbtype: 'None' not in cbtype)]
infodfnotype.cb_type.value_counts()

In [None]:
# (rows, columns) of the filtered table, i.e. how many neurons gain a cb_type label
infodfnotype.shape

So we can add additional labels to ~1700 neurons.

In [None]:
# lookup: root_783 id (as 64-bit integer) -> cb_type label
cbtypes = {
    root_id: cb_type
    for root_id, cb_type in zip(infodfnotype.root_783.astype(np.int64), infodfnotype.cb_type)
}

# Combine types into one column

In [None]:
# Build one type label per neuron, taking the first available of:
#   1. cell_type
#   2. hemibrain_type
#   3. the cb_type looked up by root id in `cbtypes`
#   4. the root id itself (as a string), so that every neuron ends up labelled
# The column is built as a single expression and attached with `assign`.
# Filling through `cb.combined_type.fillna(..., inplace=True)` is chained
# assignment on a filtered frame: pandas warns about it, and with Copy-on-Write
# it does not update `cb` at all.
cb = cb.assign(
    combined_type=(
        cb.cell_type
        .fillna(cb.hemibrain_type)
        .fillna(cb.root_id.map(cbtypes))
        .fillna(cb.root_id.astype(str))
    )
)
cb.combined_type.value_counts()

In [None]:
# lookups keyed by root id: combined type label and side of each neuron
typedict = dict(cb[['root_id', 'combined_type']].itertuples(index=False, name=None))
sidedict = dict(cb[['root_id', 'side']].itertuples(index=False, name=None))

In [None]:
# synapse table in feather format
# NOTE(review): machine-specific absolute path - point this at your own copy
conn_path = '/Users/yijieyin/Downloads/syn_proof_analysis_filtered_consolidated_783.feather'
conn = pd.read_feather(conn_path)
conn

In [None]:
# keep only connections where both partners are in the central-brain table `cb`.
# The explicit copy makes the filtered table an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
conn = conn[conn.pre_pt_root_id.isin(cb.root_id) & conn.post_pt_root_id.isin(cb.root_id)].copy()

# add meta information: type and side of the pre- and postsynaptic neuron
conn['pre_type'] = conn.pre_pt_root_id.map(typedict)
conn['pre_side'] = conn.pre_pt_root_id.map(sidedict)
conn['post_type'] = conn.post_pt_root_id.map(typedict)
conn['post_side'] = conn.post_pt_root_id.map(sidedict)

# and put type and side info into one column
# this separates the neurons of the same type from different sides, and
# keeps the possibility of looking at circuits where the side of the neuron makes a difference
conn['pre_type_side'] = conn.pre_type +'_'+ conn.pre_side
conn['post_type_side'] = conn.post_type +'_'+ conn.post_side
conn

In [None]:
# type-to-type edge list: total syn_count for every (pre_type_side, post_type_side)
# pair, i.e. summed over all other columns such as neuropil and individual root ids
conntt = conn.groupby(['pre_type_side','post_type_side'], as_index=False)['syn_count'].sum()
conntt

In [None]:
# rough size of the problem: distinct presynaptic ids, distinct type labels, and
# distinct presynaptic type_side labels
for label, values in [
    ('Total number of unique ids: ', conn.pre_pt_root_id),
    ('Total number of unique types (including where root_ids are used as a type): ', cb.combined_type),
    ('Total number of unique type_sides: ', conn.pre_type_side),
]:
    print(label, len(set(values)))

In [None]:
# memory a dense square matrix over the presynaptic type_sides would need, in Gb:
# number of entries * 64 bits per entry / 8 bits per byte / 1e9 bytes per Gb
n_type_sides = len(set(conn.pre_type_side))
(n_type_sides**2)*64/8/1e9

- COO Matrix (Coordinate format):
1. In COO format, a sparse matrix is represented by three arrays: row indices, column indices, and the values themselves.  
2. This format is ideal for constructing sparse matrices when you have the coordinates of the non-zero elements (like from an edge list).  
3. It's efficient for matrix construction but not for matrix operations (like multiplication or indexing).  

- CSC Matrix (Compressed Sparse Column format):
1. In CSC format, the matrix is represented by column pointers, row indices, and the non-zero values.  
2. This format is efficient for arithmetic operations, column slicing, and matrix-vector products.  
3. However, constructing a CSC matrix directly from an edge list isn't as straightforward as with a COO matrix.

In [None]:
# instead of making a dense matrix based on the edgelist above, let's make a sparse one from the edgelist directly
# first collect every type_side label that appears as a source or a target
nodes = set(conntt.pre_type_side).union(set(conntt.post_type_side))
# Number the labels in sorted order. Iterating the bare set of strings can give
# a different order in every kernel session, which would make the row/column
# numbering (and any matrix saved from it) impossible to reproduce.
nodes_to_idx = {node:num for num, node in enumerate(sorted(nodes))}

# type to type connectivity: translate the labels into matrix indices
conntt['pre_idx'] = conntt['pre_type_side'].map(nodes_to_idx)
conntt['post_idx'] = conntt['post_type_side'].map(nodes_to_idx)

# Create COO matrix: rows are presynaptic, columns are postsynaptic
row = conntt['pre_idx'].values
col = conntt['post_idx'].values
data = conntt['syn_count'].values
matrix_size = len(nodes)
coo = coo_matrix((data, (row, col)), shape=(matrix_size, matrix_size))

# then turn it into csc matrix
csc = coo.tocsc()

# memory held by the three arrays backing the csc matrix
csc_size = csc.data.nbytes  # Size of the data array
csc_size += csc.indices.nbytes  # Size of the indices array
csc_size += csc.indptr.nbytes  # Size of the index pointer array
# number of MB
csc_size/1e6

In [None]:
# matrix dimensions: one row and one column per type_side node
csc.shape

In [None]:
# show the sparse matrix summary (its repr) rather than its contents
csc

In [None]:
# Turn synapse counts into input proportions: every column (= postsynaptic node)
# is divided by its column sum, i.e. by the total input that node receives.
col_sums = csc.sum(axis=0)
# flatten the 1 x n column-sum matrix into a plain float vector
col_totals = np.asarray(col_sums, dtype=float).ravel()
# Handling division by zero in case some columns have a sum of zero
# that is, where a neuron doesn't have incoming synapses.
# `where=` only writes the entries it selects, so a zero-filled `out` is
# supplied; without it the skipped entries would hold uninitialised memory.
col_sums_with_inversion = np.reciprocal(col_totals, out=np.zeros_like(col_totals), where=col_totals != 0)
# Multiply each column by the inverse of its sum
inprop = csc.multiply(col_sums_with_inversion)
# and then reduce the precision to float32 to save memory
inprop = inprop.astype(np.float32)

In [None]:
# write the input-proportion matrix to disk in scipy's .npz format
# (this call stores the matrix only)
sp.sparse.save_npz(file='data/adult_inprop.npz', matrix=inprop)

# float64 vs. float32 vs. float16

In [None]:
# sensitivity analysis - how much precision is lost when the matrix is stored as
# float32 or float16 instead of float64?
import numpy as np
from scipy.sparse import random as sparse_random, csc_matrix

# Example matrix size
n = 100
# fixed seed so that the numbers printed below are reproducible
SEED = 0

# Generate a random sparse matrix (for demonstration purposes); the same seeded
# generator drives both the sparsity pattern and the values
rng = np.random.default_rng(SEED)
matrix_float64 = sparse_random(n, n, density=0.1, format='csc', random_state=rng, data_rvs=rng.random).astype(np.float64)
matrix_float32 = matrix_float64.astype(np.float32)
# float16 is emulated: the values are rounded through float16 and then held as
# float32, so this leg measures the float16 storage rounding of the inputs.
# NOTE(review): done this way because scipy.sparse does not appear to support
# float16 matrices natively (to confirm, newer releases may reject the dtype).
half_values = matrix_float64.data.astype(np.float16).astype(np.float32)
matrix_float16 = csc_matrix(
    (half_values, matrix_float64.indices, matrix_float64.indptr),
    shape=matrix_float64.shape,
)

# Function to perform a series of matrix multiplications
def multiply_matrix(matrix, steps):
    """Multiply `matrix` by itself `steps` times.

    Starts from `matrix` and applies `result @ matrix` once per step, so the
    return value is `matrix` raised to the power `steps + 1`.
    """
    result = matrix
    for _ in range(steps):
        result = result @ matrix
    return result

# Perform the operations
steps = 5
result_64 = multiply_matrix(matrix_float64, steps)
result_32 = multiply_matrix(matrix_float32, steps)
result_16 = multiply_matrix(matrix_float16, steps)

# Compare the results against the float64 reference
diff_64_32 = np.abs(result_64 - result_32.astype(np.float64))
diff_64_16 = np.abs(result_64 - result_16.astype(np.float64))

# Print maximum differences
print(f"Max difference between float64 and float32: {diff_64_32.max()}")
print(f"Max difference between float64 and float16: {diff_64_16.max()}")

So we need float32.

# parallelising vs. not

In [None]:
import multiprocessing
import time

import numpy as np

In [None]:
# number of CPUs reported by the system
num_cores = multiprocessing.cpu_count()

In [None]:
# Worker used for the parallel run: multiplies one packed pair of operands
def matrix_multiply(args):
    """Return ``np.dot(left, right)`` for a packed operand pair.

    args: a two-element sequence ``(left, right)``. Both operands travel in a
        single argument so the function fits single-argument map-style APIs.
    """
    left, right = args
    product = np.dot(left, right)
    return product

In [None]:
# Two random 1000x1000 operands
A = np.random.rand(1000, 1000)
B = np.random.rand(1000, 1000)

# Cut each operand into 4 pieces: A into column blocks, B into row blocks
A_parts = np.array_split(A, indices_or_sections=4, axis=1)
B_parts = np.array_split(B, indices_or_sections=4, axis=0)

In [None]:
from concurrent.futures import ProcessPoolExecutor


In [None]:
# Threads instead of worker processes: processes started with "spawn" cannot
# look up functions defined inside the notebook (this is what makes
# ProcessPoolExecutor fail with "Can't get attribute 'matrix_multiply'"),
# whereas threads share the notebook's namespace.
# NOTE(review): any speed-up relies on np.dot releasing the GIL - confirm for
# the BLAS build in use.
from concurrent.futures import ThreadPoolExecutor

start = time.time()

with ThreadPoolExecutor() as executor:
    # matrix_multiply expects ONE argument, an (A_part, B_part) tuple, so the
    # pairs are handed over as tuples instead of two positional arguments
    C_parts = list(executor.map(matrix_multiply, zip(A_parts, B_parts)))

# A was split into column blocks and B into the matching row blocks, so every
# A_part @ B_part is a full-size partial product and A @ B is their sum
# (concatenating them would give a 1000x4000 array instead)
C = np.sum(C_parts, axis=0)

end = time.time()
print(end - start)

Process SpawnProcess-1:
Traceback (most recent call last):
  File "/Users/yijieyin/.pyenv/versions/3.8.11/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/yijieyin/.pyenv/versions/3.8.11/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/yijieyin/.pyenv/versions/3.8.11/lib/python3.8/concurrent/futures/process.py", line 233, in _process_worker
    call_item = call_queue.get(block=True)
  File "/Users/yijieyin/.pyenv/versions/3.8.11/lib/python3.8/multiprocessing/queues.py", line 116, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'matrix_multiply' on <module '__main__' (built-in)>
Process SpawnProcess-2:
Traceback (most recent call last):
  File "/Users/yijieyin/.pyenv/versions/3.8.11/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/yijieyin/.pyenv/versions/3.8.11/lib/python3.8/multiprocessi

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.