## Preprocess annotations

In [1]:
import os
import sys
import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import colorcet as cc
from ipywidgets import interactive

import numpy as np
import pandas as pd
import logging
import pickle
import pickle

# Font sizes (in points) for the matplotlib defaults set below.
SMALL_SIZE = 12
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

# Default font sizes, set in one pass through the global rcParams mapping.
mpl.rcParams.update({
    'font.size': SMALL_SIZE,            # default text size
    'axes.titlesize': SMALL_SIZE,       # axes title
    'axes.labelsize': MEDIUM_SIZE,      # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,      # x tick labels
    'ytick.labelsize': SMALL_SIZE,      # y tick labels
    'legend.fontsize': SMALL_SIZE,      # legend text
    'figure.titlesize': BIGGER_SIZE,    # figure title
})
plt.style.use('dark_background')

# Root logger at INFO level; log_info is an alias for the built-in print.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
log_info = print

## Configuration

In [2]:
# Directory of the processed data: meta.csv is read from it and meta_ext.csv is written to it.
proc_data_path = '../../data/MICrONS__L23__8_8_40__processed'
# Directory of tables: annotated_synapse_features.csv is read from it and
# meta_ext_loc_scale.csv is written to it.
tables_path = '../../tables'

In [3]:
# Read the synapse metadata table and the annotation table.
meta_csv_path = os.path.join(proc_data_path, 'meta.csv')
annotations_csv_path = os.path.join(tables_path, 'annotated_synapse_features.csv')

meta_df = pd.read_csv(meta_csv_path)
# The first CSV column is used as the index of annotations_df.
annotations_df = pd.read_csv(annotations_csv_path, index_col=0)

In [4]:
# Drop every ROW that has a NaN in any column, printing the NaN count per column first.
# DataFrame.isna is used rather than np.isnan so that non-numeric columns do not raise.
nan_flags_df = annotations_df.isna()
for column in annotations_df.columns:
    print(column, nan_flags_df[column].sum())

# True for rows with at least one NaN.
bad_mask = nan_flags_df.any(axis=1).values

# Build a new, re-indexed (0..n-1) frame instead of mutating in place.
annotations_df = annotations_df[~bad_mask].reset_index(drop=True)

syn_id 0
cleft_size_vx 0
presyn_soma_dist_um 25
postsyn_soma_dist_um 151
n_mitos_pre 90
n_mitos_post 90
pre_cell_type 0
post_cell_type 0
mito_size_pre_vx 90
mito_size_post_vx 90


In [5]:
# One row per kept annotation:
#   (column read from annotations_df, annotation type, name of the resulting output column)
_annotation_specs = [
    ('cleft_size_vx',        'continuous',  'cleft_size_log1p_zscore'),
    ('presyn_soma_dist_um',  'continuous',  'presyn_soma_dist_log1p_zscore'),
    ('postsyn_soma_dist_um', 'continuous',  'postsyn_soma_dist_log1p_zscore'),
    ('pre_cell_type',        'categorical', 'pre_cell_type'),
    ('post_cell_type',       'categorical', 'post_cell_type'),
    ('n_mitos_pre',          'continuous',  'n_mitos_pre'),
    ('n_mitos_post',         'continuous',  'n_mitos_post'),
    ('mito_size_pre_vx',     'continuous',  'mito_size_pre_vx_log1p_zscore_zi'),
    ('mito_size_post_vx',    'continuous',  'mito_size_post_vx_log1p_zscore_zi'),
]

# Parallel lists derived from the table above; position i of each list describes
# the same annotation.
kept_annotations_list = [spec[0] for spec in _annotation_specs]
kept_annotations_type = [spec[1] for spec in _annotation_specs]
final_annotations_column_names = [spec[2] for spec in _annotation_specs]

def identity(x):
    """Return ``x`` unchanged (no-op transform)."""
    return x

def log1p_zscore(values):
    """Apply ``log1p`` to ``values`` and standardize the result.

    The mean and standard deviation are computed with ``np.mean`` / ``np.std``
    (default arguments) over the log1p-transformed values.

    Returns:
        ``(log1p(values) - mean) / std``.
    """
    log_values = np.log1p(values)
    center = np.mean(log_values)
    spread = np.std(log_values)
    return (log_values - center) / spread

def log1p_zscore_zero_inflated(values):
    """Log1p + z-score the non-zero entries of ``values``; map zeros to ``-inf``.

    The mean and standard deviation are computed over ``log1p`` of the
    non-zero entries only.

    Args:
        values: array-like of numbers. It is converted with ``np.asarray`` so
            that lists and other sequences are handled element-wise; without
            the conversion, ``values == 0.`` on a plain list is a single
            ``False`` and the result is silently wrong.

    Returns:
        A float array with the same shape as ``values``: the z-score at
        non-zero positions and ``-inf`` at zero positions.
    """
    values = np.asarray(values)
    zero_mask = (values == 0.)
    nonzero_mask = ~zero_mask

    v = np.log1p(values[nonzero_mask])
    m, s = np.mean(v), np.std(v)

    # Start from -inf everywhere, then overwrite the non-zero positions.
    out = np.full(values.shape, float("-inf"))
    out[nonzero_mask] = (v - m) / s
    return out

def log1p_zscore_meta(values):
    """Return the ``(mean, std)`` of ``log1p(values)``.

    Both statistics are computed with ``np.mean`` / ``np.std`` using their
    default arguments.
    """
    log_values = np.log1p(values)
    return np.mean(log_values), np.std(log_values)

def log1p_zscore_zero_inflated_meta(values):
    """Return the ``(mean, std)`` of ``log1p`` over the non-zero entries of ``values``.

    Args:
        values: array-like of numbers. It is converted with ``np.asarray`` so
            that lists and other sequences are masked element-wise; without
            the conversion, ``values == 0.`` on a plain list is a single
            ``False`` and the statistics are silently wrong.

    Returns:
        Tuple ``(mean, std)`` computed with ``np.mean`` / ``np.std`` (default
        arguments) on ``log1p(values[values != 0])``.
    """
    values = np.asarray(values)
    zero_mask = (values == 0.)
    v = np.log1p(values[~zero_mask])
    return np.mean(v), np.std(v)

# One transform per kept annotation, in the same order as kept_annotations_list:
# three log1p z-scored columns, four columns passed through unchanged, and two
# zero-inflated log1p z-scored columns.
transforms = (
    [log1p_zscore] * 3
    + [identity] * 4
    + [log1p_zscore_zero_inflated] * 2
)

In [6]:
# Display the annotations table as it stands after the NaN rows were dropped.
annotations_df

Unnamed: 0,syn_id,cleft_size_vx,presyn_soma_dist_um,postsyn_soma_dist_um,n_mitos_pre,n_mitos_post,pre_cell_type,post_cell_type,mito_size_pre_vx,mito_size_post_vx
0,1484,798,221.313279,85.491890,0.0,0.0,0,0,0.0,0.0
1,2254,129,279.652318,84.258586,0.0,1.0,0,0,0.0,563600.0
2,3785,62,130.542692,90.183479,0.0,0.0,0,0,0.0,0.0
3,3863,62,336.784967,241.501515,0.0,1.0,0,0,0.0,4264516.0
4,4075,62,154.577558,135.108827,0.0,0.0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
6178,1533469,603,117.594227,162.934700,1.0,0.0,1,1,66332.0,0.0
6179,287537,276,202.799738,111.211712,0.0,1.0,1,1,0.0,157080.0
6180,3032966,181,119.304778,13.846323,3.0,3.0,1,1,528440.0,2785792.0
6181,531494,107,217.056124,24.085480,2.0,3.0,1,1,375388.0,395452.0


In [7]:
# Drop rows that have a NaN in any of the kept annotation columns.
# A boolean mask is used so the selection does not depend on the index being
# 0..n-1 (index labels were previously fed to .iloc) and the original row
# order is preserved (no round-trip through a set).
keep_mask = annotations_df[kept_annotations_list].notna().all(axis=1)
annotations_df = annotations_df[keep_mask]

In [8]:
# Output column name -> array of transformed values (one entry per row of annotations_df).
final_annotations_dict = {}

annotation_specs = zip(
    kept_annotations_list, kept_annotations_type, final_annotations_column_names, transforms)

for source_column, kind, target_column, transform_fn in annotation_specs:
    assert kind in {'continuous', 'categorical'}

    raw_values = annotations_df[source_column].values

    if kind == 'categorical':
        # Categorical annotations are stored as integers; their transform entry is not applied.
        final_annotations_dict[target_column] = raw_values.astype(int)
    else:
        # Continuous annotations go through their per-column transform.
        final_annotations_dict[target_column] = transform_fn(raw_values)

In [9]:
# The zero-inflated transform marks "no mitochondrion" with -inf, so a finite
# z-score means at least one mitochondrion is present.
mito_pre_zscores = final_annotations_dict['mito_size_pre_vx_log1p_zscore_zi']
mito_post_zscores = final_annotations_dict['mito_size_post_vx_log1p_zscore_zi']
final_annotations_dict['has_mito_pre'] = np.isfinite(mito_pre_zscores).astype(int)
final_annotations_dict['has_mito_post'] = np.isfinite(mito_post_zscores).astype(int)

# Consistency check: the flag must agree with a positive mitochondria count.
expected_has_mito_pre = (final_annotations_dict['n_mitos_pre'] > 0).astype(int)
assert np.all(final_annotations_dict['has_mito_pre'] == expected_has_mito_pre)
expected_has_mito_post = (final_annotations_dict['n_mitos_post'] > 0).astype(int)
assert np.all(final_annotations_dict['has_mito_post'] == expected_has_mito_post)

In [10]:
from collections import Counter

# Count each (pre_cell_type, post_cell_type) pair and print it as a fraction of all rows.
cell_type_pairs = zip(
    final_annotations_dict['pre_cell_type'],
    final_annotations_dict['post_cell_type'],
)
counts = Counter(cell_type_pairs)
for key, value in counts.items():
    print(f'type: {key}, fraction: {value / len(annotations_df):.2f}')

type: (0, 0), fraction: 0.31
type: (1, 0), fraction: 0.49
type: (0, 1), fraction: 0.17
type: (1, 1), fraction: 0.03


In [11]:
# Synapse ids present in both the metadata table and the annotation table.
em_syn_ids = meta_df['synapse_id'].values
annotated_syn_ids = annotations_df['syn_id'].values
mutual_syn_ids = sorted(set(em_syn_ids) & set(annotated_syn_ids))

# Metadata rows restricted to the shared ids.
mutual_meta_df = meta_df[meta_df['synapse_id'].isin(mutual_syn_ids)]
mutual_meta_df_synapse_ids = mutual_meta_df['synapse_id'].values

# Positional row of every syn_id in annotations_df, then the rows to pick
# (in metadata order) for the shared ids.
synapse_id_to_annotations_df_row_index = dict(
    zip(annotations_df['syn_id'], range(len(annotations_df))))
annotations_df_row_indices = [
    synapse_id_to_annotations_df_row_index.get(synapse_id)
    for synapse_id in mutual_meta_df_synapse_ids
]

# Transformed annotations as a frame, row-aligned with annotations_df.
final_annotations_df = pd.DataFrame(
    data=dict(synapse_id=annotations_df['syn_id'].values, **final_annotations_dict))
mutual_meta_df_aligned_final_annotations_df = final_annotations_df.iloc[annotations_df_row_indices]

# Join metadata and annotations side by side on synapse_id.
first_df = mutual_meta_df.set_index('synapse_id')
second_df = mutual_meta_df_aligned_final_annotations_df.set_index('synapse_id')
meta_ext_df = pd.concat((first_df, second_df), axis=1).reset_index()

In [12]:
# Write the extended metadata table next to meta.csv (row index not saved).
meta_ext_csv_path = os.path.join(proc_data_path, 'meta_ext.csv')
meta_ext_df.to_csv(meta_ext_csv_path, index=False)

In [13]:
# Display the metadata table extended with the transformed annotation columns.
meta_ext_df

Unnamed: 0,synapse_id,filename,n_cutout_sections,post_synaptic_volume,pre_synaptic_volume,synaptic_cleft_volume,cleft_size_log1p_zscore,presyn_soma_dist_log1p_zscore,postsyn_soma_dist_log1p_zscore,pre_cell_type,post_cell_type,n_mitos_pre,n_mitos_post,mito_size_pre_vx_log1p_zscore_zi,mito_size_post_vx_log1p_zscore_zi,has_mito_pre,has_mito_post
0,1000004,1000004__2_256_256_52.npy,1,2855510.0,1382710.0,5805.0,0.221381,-0.087308,0.110199,1,0,1.0,1.0,0.361396,2.663820,1,1
1,1001064,1001064__2_256_256_52.npy,7,757705.0,1228365.0,35695.0,2.527689,-0.955527,0.452097,0,0,1.0,0.0,-1.579014,-inf,1,0
2,1001959,1001959__2_256_256_52.npy,3,7581970.0,1306600.0,10230.0,0.932481,-0.033998,-1.494799,1,0,1.0,2.0,0.306555,0.365641,1,1
3,1003796,1003796__2_256_256_52.npy,2,131530.0,302620.0,3070.0,-0.541478,1.540747,1.279091,0,0,0.0,0.0,-inf,-inf,0,0
4,100426,100426__2_256_256_52.npy,0,443370.0,344255.0,1410.0,-1.557111,-1.791284,0.243304,0,0,0.0,1.0,-inf,-0.591276,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5659,995738,995738__2_256_256_52.npy,1,1090620.0,283245.0,3060.0,-0.549512,0.285049,0.785046,0,0,0.0,1.0,-inf,1.833237,0,1
5660,998414,998414__2_256_256_52.npy,0,656790.0,861390.0,2365.0,-0.900531,-0.427890,0.545759,1,0,1.0,0.0,1.172815,-inf,1,0
5661,998624,998624__2_256_256_52.npy,2,1744330.0,1349180.0,3110.0,-0.549512,1.018483,0.531163,1,0,1.0,1.0,0.571826,1.677105,1,1
5662,998959,998959__2_256_256_52.npy,2,6834865.0,1799490.0,2750.0,-0.722024,-0.162076,-0.831938,1,0,1.0,1.0,0.927955,-0.830449,1,1


## Transformation metadata

In [14]:
# "<output column>__loc" / "<output column>__scale" -> one-element list holding the
# mean / std used by that column's transform.
trans_meta_dict = {}

# Transform -> function returning its (loc, scale); transforms not listed here
# get loc=0, scale=1.
meta_fn_by_transform = {
    log1p_zscore: log1p_zscore_meta,
    log1p_zscore_zero_inflated: log1p_zscore_zero_inflated_meta,
}

annotation_specs = zip(
    kept_annotations_list, kept_annotations_type, final_annotations_column_names, transforms)

for source_column, kind, target_column, transform_fn in annotation_specs:
    assert kind in {'continuous', 'categorical'}

    raw_values = annotations_df[source_column].values

    # Only continuous annotations get loc/scale entries.
    if kind != 'continuous':
        continue

    meta_fn = meta_fn_by_transform.get(transform_fn)
    if meta_fn is None:
        loc, scale = 0., 1.
    else:
        loc, scale = meta_fn(raw_values)

    trans_meta_dict[target_column + "__loc"] = [loc]
    trans_meta_dict[target_column + "__scale"] = [scale]

In [15]:
# Single-row table: one "__loc" and one "__scale" column per continuous annotation.
trans_meta_df = pd.DataFrame(trans_meta_dict)
trans_meta_df

Unnamed: 0,cleft_size_log1p_zscore__loc,cleft_size_log1p_zscore__scale,presyn_soma_dist_log1p_zscore__loc,presyn_soma_dist_log1p_zscore__scale,postsyn_soma_dist_log1p_zscore__loc,postsyn_soma_dist_log1p_zscore__scale,n_mitos_pre__loc,n_mitos_pre__scale,n_mitos_post__loc,n_mitos_post__scale,mito_size_pre_vx_log1p_zscore_zi__loc,mito_size_pre_vx_log1p_zscore_zi__scale,mito_size_post_vx_log1p_zscore_zi__loc,mito_size_post_vx_log1p_zscore_zi__scale
0,5.486893,0.795317,5.193572,0.370629,3.816027,0.754296,0.0,1.0,0.0,1.0,11.628451,1.018936,13.295805,1.109729


In [16]:
# Write the loc/scale table to the tables directory (row index not saved).
trans_meta_csv_path = os.path.join(tables_path, 'meta_ext_loc_scale.csv')
trans_meta_df.to_csv(trans_meta_csv_path, index=False)