In [1]:
import numpy as np
import pandas as pd
import gzip
import pickle
import anndata
from scipy import sparse
import mygene

In [2]:
# Directory prefix for all input and output files of this notebook.
# It is concatenated with file names as-is, so it has to end with '/'.
basepath = "data/reads_per_umi_tables/"

In [3]:
# Gzipped, tab-separated input tables (one per dataset), located in `basepath`.
filename_multiexp = "Ziegenhain2017.hd1.txt.gz"
filename_smartseq3 = "Hagemann-Jensen2020_Smartseq3_SE.hd1.txt.gz"
filename_smartseq3_PE = "Johnsson2022_Smartseq3_PE.hd1.txt.gz"
filename_smartseq3xpress = "Hagemann-Jensen2022_Smartseq3xpress.hd1.txt.gz"

In [4]:
def preload_long_data(path,filename):
    """Load a gzipped, tab-separated long-format table into a DataFrame.

    The first line of the file is used as the column header; every following
    line becomes one row. All columns are kept as strings, except ``N`` which
    is converted to integers.

    Parameters
    ----------
    path : str
        Directory prefix. It is concatenated with ``filename`` as-is, so it
        must already end with a path separator.
    filename : str
        Name of the gzip-compressed, UTF-8 encoded, tab-separated file.

    Returns
    -------
    pandas.DataFrame
        One row per data line, columns named after the header line.

    Raises
    ------
    ValueError
        If the file does not contain a header line.
    """
    header = None
    rows = []

    # Text mode decodes UTF-8 for us; newline='\n' means lines are split on
    # '\n' only and are returned untranslated.
    with gzip.open(f'{path}{filename}','rt',encoding='utf-8',newline='\n') as f:
        for line in f:
            # Strip '\r' as well so that CRLF files do not leave a stray '\r'
            # on the last field (which would also corrupt the 'N' header).
            fields = line.rstrip('\r\n').split('\t')

            if header is None:
                header = fields
            else:
                rows.append(fields)

    if header is None:
        raise ValueError(f'{path}{filename} is empty: no header line found')

    # Build the frame straight from the list of rows. Going through
    # np.array(rows) would first materialise a fixed-width unicode array
    # (every cell padded to the longest string), which is very memory hungry
    # for large tables and fails for header-only files.
    df_long = pd.DataFrame(rows,columns=header)
    df_long['N'] = df_long['N'].astype(int)

    return df_long

In [5]:
%%time
# 17min
df_multiexp = preload_long_data(basepath,filename_multiexp)
df_smartseq3 = preload_long_data(basepath,filename_smartseq3)  
df_smartseq3_PE = preload_long_data(basepath,filename_smartseq3_PE)
df_smartseq3xpress = preload_long_data(basepath,filename_smartseq3xpress)

tcmalloc: large alloc 10691682304 bytes == 0x344bc000 @ 
tcmalloc: large alloc 8619941888 bytes == 0x7b1ae000 @ 
tcmalloc: large alloc 22113992704 bytes == 0x2ece26000 @ 
tcmalloc: large alloc 1922957312 bytes == 0xb1540000 @ 
tcmalloc: large alloc 2221088768 bytes == 0x93344000 @ 


CPU times: user 9min 13s, sys: 1min 37s, total: 10min 51s
Wall time: 10min 54s


In [6]:
# Harmonise the experiment label: the multi-experiment table spells the
# column 'experiment', and every row of the Smartseq3xpress table is
# labelled 'Smartseq3xpressA'.
df_multiexp.rename(columns={'experiment': 'Experiment'}, inplace=True)
df_smartseq3xpress['Experiment'] = 'Smartseq3xpressA'

# NOTE: the frames are modified in place, so this cell cannot be re-run
# without reloading them (the 'Experiment' column is moved into the index).
per_dataset_frames = [df_multiexp, df_smartseq3, df_smartseq3_PE, df_smartseq3xpress]
for dataset_frame in per_dataset_frames:
    dataset_frame.set_index('Experiment', inplace=True)

# join='inner' keeps only the columns that all four tables share.
df_all = pd.concat(per_dataset_frames, join='inner')

tcmalloc: large alloc 2550333440 bytes == 0x17fa3a000 @ 


In [7]:
# One DataFrame per experiment; groupby yields (label, sub-frame) pairs and
# only the sub-frames are kept.
dfs = [df_experiment for _, df_experiment in df_all.groupby('Experiment')]

tcmalloc: large alloc 2550333440 bytes == 0x31f8e2000 @ 


In [8]:
%%time
#4min
with open(f'{basepath}reads_per_umi_dfs_hd1_preproc.pickle','wb') as f:
    pickle.dump(dfs,f) 

tcmalloc: large alloc 4294967296 bytes == 0x49ce8a000 @ 
tcmalloc: large alloc 8589934592 bytes == 0x59ce8a000 @ 
tcmalloc: large alloc 17179869184 bytes == 0x827e90000 @ 


CPU times: user 3min 24s, sys: 54.8 s, total: 4min 19s
Wall time: 4min 20s


### Prepare anndata objects for each experiment

In [9]:
# Each frame in `dfs` is indexed by its experiment label, so the first index
# entry identifies the experiment.
experiments = [df_experiment.index[0] for df_experiment in dfs]

In [10]:
# Display the experiment labels (same order as the frames in `dfs`).
experiments

['CELseq2A',
 'CELseq2B',
 'DropSeqA',
 'DropSeqB',
 'MARSseqA',
 'MARSseqB',
 'SCRBseqA',
 'SCRBseqB',
 'Smartseq3_Fibroblast',
 'Smartseq3_Fibroblast_PE',
 'Smartseq3xpressA']

In [None]:
%%time
ads = []
for exp,df_exp in zip(experiments,dfs):
    df_exp.reset_index(inplace=True)
    print(exp)
   
    #set index as multilevel index
    df_idxd = df_exp.set_index(keys = ['Experiment','RG','GE','UB'])
    #change remaining count column to sparse datatype
    series_sparse = df_idxd['N'].astype('Sparse[int32]')
    #unpack long raw data into sparse coo format (which is implicitly wide) 
    print('unpacking..')
    mtx_sparse, rows_idxs, column_idxs = series_sparse.sparse.to_coo(row_levels=["Experiment", "RG", 'UB'], column_levels=["GE"],sort_labels=True)
            
    #extract row and column annotations
    genes = column_idxs
    reads_per_umi_cells = [r[1] for r in rows_idxs]
    reads_per_umi_umis =  [r[2] for r in rows_idxs]
    
    #make sparse dataframe to extract read/UMI counts per cell
    df_sparse = pd.DataFrame.sparse.from_spmatrix(mtx_sparse,index=[reads_per_umi_cells,reads_per_umi_umis],columns=column_idxs)
    df_sparse.index.set_names(['cells','umis'],inplace=True)
    
    #group by cell
    df_grouped = df_sparse.groupby(by='cells')
    #extract read and UMI counts per cell
    UMIs = []
    reads = []
    cells = []
    print('extracting cells',end='')
    for idx,group in df_grouped:
        print('.',end='')
        UMI = np.sum(group>0,axis=0) #count number of UMIs per gene
        read = np.sum(group,axis=0)  #sum number of reads per gene
        UMIs.append(UMI.values)
        reads.append(read.values)
        cells.append(idx)
    UMIs = np.vstack(UMIs)
    reads= np.vstack(reads)


    #make read/umi count adata
    ad_readcounts_umicounts = anndata.AnnData(sparse.csr_matrix(UMIs),
                             var=dict(genes=genes),
                             obs=dict(cells=cells),
                             uns=dict(experiment=exp,
                                      type='UMI counts in X, read counts in `layers`'),
                             layers=dict(reads=sparse.csr_matrix(reads)))
    
    ad_readcounts_umicounts.var.set_index('genes',inplace=True)

    print('read/umi counts:',ad_readcounts_umicounts)
    ad_readcounts_umicounts.write_h5ad(f'{basepath}ad_readcounts_umicounts_hd1_{exp}')
    
    ads.append(ad_readcounts_umicounts)

CELseq2A
unpacking..
extracting cells..................................



read/umi counts: AnnData object with n_obs × n_vars = 34 × 23555
    obs: 'cells'
    uns: 'experiment', 'type'
    layers: 'reads'
CELseq2B
unpacking..
extracting cells.....................................



read/umi counts: AnnData object with n_obs × n_vars = 37 × 25479
    obs: 'cells'
    uns: 'experiment', 'type'
    layers: 'reads'
DropSeqA
unpacking..
extracting cells..........................................



read/umi counts: AnnData object with n_obs × n_vars = 42 × 23530
    obs: 'cells'
    uns: 'experiment', 'type'
    layers: 'reads'
DropSeqB
unpacking..
extracting cells..................................



read/umi counts: AnnData object with n_obs × n_vars = 34 × 22104
    obs: 'cells'
    uns: 'experiment', 'type'
    layers: 'reads'
MARSseqA
unpacking..
extracting cells.............................



read/umi counts: AnnData object with n_obs × n_vars = 29 × 20307
    obs: 'cells'
    uns: 'experiment', 'type'
    layers: 'reads'
MARSseqB
unpacking..
extracting cells....................................



read/umi counts: AnnData object with n_obs × n_vars = 36 × 21150
    obs: 'cells'
    uns: 'experiment', 'type'
    layers: 'reads'
SCRBseqA
unpacking..
extracting cells.......................................



read/umi counts: AnnData object with n_obs × n_vars = 39 × 24054
    obs: 'cells'
    uns: 'experiment', 'type'
    layers: 'reads'
SCRBseqB
unpacking..
extracting cells.............................................



read/umi counts: AnnData object with n_obs × n_vars = 45 × 24281
    obs: 'cells'
    uns: 'experiment', 'type'
    layers: 'reads'
Smartseq3_Fibroblast
unpacking..
extracting cells

In [None]:
def add_annotations(ad,loadfile='',savefile='',species='mouse'):
    """Replace ``ad.var`` with gene annotations, modifying ``ad`` in place.

    Annotations are either fetched from mygene (``querymany`` with
    ``scopes='ensemblgene'``) for the gene ids in ``ad.var.index``, or loaded
    from a pickle file written by an earlier call with ``savefile``.
    Genes whose id starts with 'gERCC' are labelled as spike-ins.

    Parameters
    ----------
    ad : object with a ``var`` DataFrame attribute (e.g. anndata.AnnData)
        ``ad.var.index`` holds the gene ids; ``ad.var`` is overwritten with
        the columns 'type_of_gene', 'symbol', 'name' and
        'ensembl.type_of_gene' (those present in the annotation table).
    loadfile : str, optional
        Pickle file to load the annotation table from. If empty (default),
        the annotations are fetched from mygene instead.
    savefile : str, optional
        If given, the full annotation table is pickled to this path.
    species : str, optional
        Passed to ``querymany``; only used when fetching.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a gene in ``ad.var.index`` is missing from the annotation table.
    """
    if not loadfile:
        print('Not loading, re-fetching annotations..')
        # the mygene client is only needed when we actually query the service
        mg = mygene.MyGeneInfo()
        gene_annotations = mg.querymany(list(ad.var.index), scopes='ensemblgene',species=species,fields=['name','symbol','type_of_gene','ensembl.type_of_gene'],as_dataframe=True)

        #if any genes gave duplicate hits, we dont assign the annotations and add the `not_uniquely_assignable` category
        gene_annotation_names,n_hits = np.unique(gene_annotations.index,return_counts=True)
        multi_hits = gene_annotation_names[n_hits>1]
        gene_annotations_dedup = gene_annotations.drop(multi_hits)
        #make and append one row for each of the multi hits
        duplicates_df = pd.DataFrame({clm:{dup:'not_uniquely_assignable' for dup in multi_hits} for clm in gene_annotations.columns})
        gene_annotations = pd.concat([gene_annotations_dedup,duplicates_df])
        #after that the number of genes should match again
        gene_annotations = gene_annotations.fillna('missing')

    else:
        print('Loading annotations from',loadfile)
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were written by this function (via `savefile`).
        with open(loadfile,'rb') as f:
            gene_annotations = pickle.load(f)

    #update ad.var
    columns_to_keep = ['type_of_gene','symbol','name','ensembl.type_of_gene']
    columns_to_drop = [c for c in gene_annotations.columns if c not in columns_to_keep]
    gene_annotations_slim = gene_annotations.drop(columns=columns_to_drop)

    # explicit copy: ad.var must own its data so the assignments below
    # neither touch nor depend on the annotation table
    ad.var = gene_annotations_slim.loc[ad.var.index,:].copy()

    #add ERCC info
    # dtype=bool keeps the mask boolean even when there are no genes at all
    ercc_idx = np.array([s[:5]=='gERCC' for s in ad.var.index],dtype=bool)
    if ercc_idx.any():
        ercc_names = ad.var.index[ercc_idx]
        # single .loc assignments instead of chained indexing
        # (ad.var[col][mask] = ...), which may write to a temporary copy
        ad.var.loc[ercc_idx,['type_of_gene','ensembl.type_of_gene']] = 'artificial/spike-in'
        ad.var.loc[ercc_idx,'name'] = ercc_names.to_numpy()
        ad.var.loc[ercc_idx,'symbol'] = [s.split('-')[1] for s in ercc_names]

    if savefile:
        print('Saving annotations to',savefile)
        with open(savefile,'wb') as f:
            pickle.dump(gene_annotations,f)
        
def add_coarse_gene_annotations_single_exp(ad):
    """Add a 'coarse_types' column to ``ad.var`` (modifies ``ad`` in place).

    The column is derived from ``ad.var['ensembl.type_of_gene']``:
    'protein_coding', 'artificial/spike-in' and 'missing' are kept as they
    are, 'not_uniquely_assignable' is merged into 'missing', every type that
    contains 'pseudogene' becomes 'pseudogene', and all remaining types
    become 'other'.
    """
    fine_types = ad.var['ensembl.type_of_gene']

    # types that are carried over (or merged) explicitly
    fine2coarse_map = {
        'artificial/spike-in': 'artificial/spike-in',
        'missing': 'missing',
        'not_uniquely_assignable': 'missing',
        'protein_coding': 'protein_coding',
    }

    # every other type present in the data is either a pseudogene or 'other'
    for fine in np.unique(fine_types):
        if fine in fine2coarse_map:
            continue
        fine2coarse_map[fine] = 'pseudogene' if 'pseudogene' in fine else 'other'

    ad.var['coarse_types'] = fine_types.replace(fine2coarse_map)

In [None]:
# Annotate all experiments at once: pool them (outer join, so every gene of
# every experiment is included), fetch the annotations once and save them.
annotation_filename = f'{basepath}mygene_annotations.pickle'
ad_all = anndata.concat(ads, join='outer')
add_annotations(ad_all, savefile=annotation_filename, species='human,mouse')

# Re-use the saved annotations for each experiment, add the coarse gene
# types and write the h5ad file again (same path as when it was created).
for exp, ad in zip(experiments, ads):
    add_annotations(ad, loadfile=annotation_filename)
    add_coarse_gene_annotations_single_exp(ad)
    print('read/umi counts:', ad)
    ad.write_h5ad(f'{basepath}ad_readcounts_umicounts_hd1_{exp}')