In [1]:
from scipy.io import mmread
import pandas as pd
import anndata
from tqdm.notebook import tqdm
import numpy as np
import gc

## Preprocess Embryos

Read each embryo's variant calls and the associated read counts, build AnnData objects from them, then join the embryos and save the results as `.h5ad` files.

In [2]:
# Number of embryos to process; every path list below is indexed by embryo
# position (0 .. NUM_EMBRYOS - 1), so each list must have at least this many entries.
NUM_EMBRYOS = 3

# Variant call matrices, one per embryo (loaded with mmread).
VARIANT_MTX_PATHS = [
    'data/emb1/out_mtrix.mtx',
    'data/emb2/out_mtrix.mtx',
    'data/emb3/out_mtrix.mtx',
]

# Mutation identifiers, one per line.
MUTATION_ANNO_PATHS = [
    'data/emb1/mouse_1.var',
    'data/emb2/mouse_2.var',
    'data/emb3/mouse_3.var',
]

# Cell barcodes, one per line.
CELL_BARCODE_PATHS = [
    'data/emb1/mouse_1_cell_barcodes.tsv',
    'data/emb2/mouse_2_cell_barcodes.tsv',
    'data/emb3/mouse_3_cell_barcodes.tsv',
]

# Read-count matrices supporting the reference allele.
REF_READS_PATHS = [
    'data/emb1/mouse_1_coverage.ref.mtx',
    'data/emb2/mouse_2_coverage.ref.mtx',
    'data/emb3/mouse_3_coverage.ref.mtx',
]

# Read-count matrices supporting the variant allele.
VAR_READS_PATHS = [
    'data/emb1/mouse_1_coverage.out.mtx',
    'data/emb2/mouse_2_coverage.out.mtx',
    'data/emb3/mouse_3_coverage.out.mtx',
]

# Output locations for the joined variant calls and the joined read counts.
CALLS_SAVE_PATH = 'data/calls_2022_07_06.h5ad'
READS_SAVE_PATH = 'data/reads_2022_07_06.h5ad'

In [3]:
def _read_stripped_lines(path):
    """Return the lines of the text file at ``path`` with trailing whitespace removed."""
    with open(path) as handle:
        return [raw_line.rstrip() for raw_line in handle]


# Per-embryo inputs; position k in every list belongs to embryo k.
variant_mats = []
mutation_annos = []
cell_barcodes = []
ref_reads_mats = []
var_reads_mats = []

for embryo_idx in range(NUM_EMBRYOS):
    # Variant call matrix
    variant_mats.append(mmread(VARIANT_MTX_PATHS[embryo_idx]))

    # Read counts supporting the reference and the variant allele
    ref_reads_mats.append(mmread(REF_READS_PATHS[embryo_idx]))
    var_reads_mats.append(mmread(VAR_READS_PATHS[embryo_idx]))

    # Mutation identifiers, then cell barcodes (one entry per line in each file)
    mutation_annos.append(_read_stripped_lines(MUTATION_ANNO_PATHS[embryo_idx]))
    cell_barcodes.append(_read_stripped_lines(CELL_BARCODE_PATHS[embryo_idx]))

In [4]:
def _build_embryo_adata(X, embryo_idx):
    """Wrap a dense matrix for one embryo in an AnnData object.

    Parameters
    ----------
    X : array
        Dense matrix with one row per cell barcode and one column per
        mutation of embryo ``embryo_idx``.
    embryo_idx : int
        Position of the embryo in the per-embryo input lists; also stored
        in ``obs['embryo']`` so cells stay attributable after joining.

    Returns
    -------
    anndata.AnnData
        Observations indexed by the embryo's cell barcodes, variables
        indexed by its mutation identifiers.
    """
    obs = pd.DataFrame(index=cell_barcodes[embryo_idx])
    obs['embryo'] = embryo_idx
    var = pd.DataFrame(index=mutation_annos[embryo_idx])
    return anndata.AnnData(obs=obs, var=var, X=X)


# Convert each of the variant datasets to anndata.
# The matrices are transposed because obs (rows) are indexed by cell barcode.
variant_adatas = [
    _build_embryo_adata(variant_mats[i].T.toarray(), i)
    for i in range(NUM_EMBRYOS)
]

# Convert the number of reads for each cell/variant into anndata.
# Note that we add together the number of reference reads and variant reads.
read_adatas = [
    _build_embryo_adata((ref_reads_mats[i].T + var_reads_mats[i].T).toarray(), i)
    for i in range(NUM_EMBRYOS)
]

In [5]:
def _cast_checked(values, dtype):
    """Cast ``values`` to the integer ``dtype``, refusing to wrap around.

    A plain ``astype`` silently wraps values that do not fit the target
    integer type, which would corrupt the saved data without any warning.

    Raises
    ------
    OverflowError
        If any entry of ``values`` lies outside the range of ``dtype``.
    """
    limits = np.iinfo(dtype)
    if values.size:
        lowest, highest = values.min(), values.max()
        if lowest < limits.min or highest > limits.max:
            raise OverflowError(
                f"values in [{lowest}, {highest}] do not fit into {dtype} "
                f"([{limits.min}, {limits.max}])"
            )
    return values.astype(dtype)


# Join the data from each embryo.
# fill_value=0 pads mutations that an embryo does not have with zeros directly,
# instead of introducing NaN that has to be cleaned up afterwards.
# NOTE: cell barcodes repeat across embryos, so the joined obs names are not
# unique (anndata warns about this); obs['embryo'] tells the cells apart.
# Pass `index_unique` to `anndata.concat` if unique names are needed downstream.
variants_joined = anndata.concat(variant_adatas, join='outer', fill_value=0)
reads_joined = anndata.concat(read_adatas, join='outer', fill_value=0)

# Replace any missing data with zeros (covers NaN already present in the inputs)
variants_joined.X[np.isnan(variants_joined.X)] = 0
reads_joined.X[np.isnan(reads_joined.X)] = 0

# Fix dtypes, failing loudly if a value would not fit the compact integer type
variants_joined.X = _cast_checked(variants_joined.X, 'int8')
reads_joined.X = _cast_checked(reads_joined.X, 'int16')

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [6]:
# Persist the joined calls first, then the joined reads, as gzip-compressed h5ad files.
for joined_adata, save_path in (
    (variants_joined, CALLS_SAVE_PATH),
    (reads_joined, READS_SAVE_PATH),
):
    joined_adata.write_h5ad(save_path, compression='gzip')