In [1]:
from scipy.io import mmread
import pandas as pd
import anndata
from tqdm.notebook import tqdm
import numpy as np
from numba import njit

## Read Variant Calling Data

In [2]:
NUM_EMBRYOS = 3
VARIANT_MTX_PATHS = ['data/emb1/out_mtrix.mtx', 'data/emb2/out_mtrix.mtx', 'data/emb3/out_mtrix.mtx']
MUTATION_ANNO_PATHS = ['data/emb1/mouse_1.var', 'data/emb2/mouse_2.var', 'data/emb3/mouse_3.var']
CELL_BARCODE_PATHS = ['data/emb1/mouse_1_cell_barcodes.tsv', 'data/emb2/mouse_2_cell_barcodes.tsv', 'data/emb3/mouse_3_cell_barcodes.tsv']

In [3]:
variant_mats = []
mutation_annos = []
cell_barcodes = []

for i in range(NUM_EMBRYOS):
    # Read variant call matrices
    variant_mat = mmread(VARIANT_MTX_PATHS[i])
    variant_mats.append(variant_mat)
    
    # Read the identifiers for the mutations
    with open(MUTATION_ANNO_PATHS[i]) as file:
        lines = file.readlines()
        lines = [line.rstrip() for line in lines]
        
    mutation_annos.append(lines)
    
    # Read the barcodes for the cells
    with open(CELL_BARCODE_PATHS[i]) as file:
        lines = file.readlines()
        lines = [line.rstrip() for line in lines]
        
    cell_barcodes.append(lines)

In [4]:
variant_adatas = []

# Convert each of the variant datasets to anndata
for i in range(NUM_EMBRYOS):
    obs = pd.DataFrame(index=cell_barcodes[i])
    obs['embryo'] = i
    var = pd.DataFrame(index=mutation_annos[i])
    
    adata = anndata.AnnData(obs=obs, var=var, X=variant_mats[i].T)
    variant_adatas.append(adata)

In [5]:
variants_joined = anndata.concat(variant_adatas, join='outer')

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


## Compute Adjusted Hamming Distances and Cluster

In [12]:
@njit
def get_adj_ham_dist_dense(cell1, cell2):
    # Find the shared mutations on which we have a call
    shared_inds = (cell1 != 0) & (cell2 != 0)
    num_shared = np.sum(shared_inds)
    
    # Find the number of variants called differently
    dif_call = cell1[shared_inds] != cell2[shared_inds]
    num_dif = np.sum(dif_call)
    
    return num_dif/num_shared

# Computes the distance matrix using adjusted hamming distance
# Assumes rows are cells
def compute_adj_hamming_dists(mtx):
    num_cells = mtx.shape[0]
    
    dists = np.zeros((num_cells, num_cells))
    
    for i in tqdm(range(num_cells)):   
        for j in range(i+1, num_cells):
            dists[i][j] = get_adj_ham_dist_dense(mtx[i], mtx[j])
            
    return dists + dists.T

In [None]:
compute_adj_hamming_dists(variants_joined.X.toarray())

  0%|          | 0/39827 [00:00<?, ?it/s]