## Load Tasic data

In [1]:
import numpy as np
import warnings
import pandas as pd

from scipy import sparse
import pickle
import anndata
import scanpy as sc
sc.settings.verbosity = 2
import mygene
mg = mygene.MyGeneInfo()

from readcount_tools import compute_gene_stats

#### Pre-load: pickle sparseload results

In [2]:
# Directory prefix for every input and output file used in this notebook;
# paths are built as f'{basepath}<filename>', so the trailing slash is required.
basepath = 'data/tasic/'

In [3]:
%%time

# Load the Allen institute data. This takes a few minutes

# This function is needed because using Pandas to load these files in one go 
# can eat up a lot of RAM. So we are doing it in chunks, and converting each
# chunk to the sparse matrix format on the fly.
def sparseload(filenames):
    """Load two gene-by-cell count CSVs chunk-wise into one sparse matrix.

    Both files are read in lockstep, 1000 rows at a time; each pair of chunks
    is converted to CSR and stacked side by side, so the dense table never has
    to exist in memory as a whole.

    Parameters
    ----------
    filenames : sequence of str
        ``filenames[0]`` and ``filenames[1]`` are the two CSV files. The first
        column of each file is used as the row (gene) index, the remaining
        columns are cells.

    Returns
    -------
    tuple
        ``(counts, genes, cells, areas)`` where ``counts`` is the sparse matrix
        transposed to cells x genes, ``genes`` is the array of row labels,
        ``cells`` the column labels of file 0 followed by those of file 1, and
        ``areas`` is 0 for cells from file 0 and 1 for cells from file 1.

    Raises
    ------
    ValueError
        If the two files differ in their number of rows or in their row
        labels. Without this check a length mismatch would silently drop the
        extra rows and a label mismatch would combine counts of different
        genes into one row.
    """
    from itertools import zip_longest

    genes = []
    sparseblocks = []
    areas = []
    cells = []
    readers = [pd.read_csv(filename, chunksize=1000, index_col=0, na_filter=False)
               for filename in (filenames[0], filenames[1])]
    # zip_longest (rather than zip) so that a shorter file is detected
    for chunk1, chunk2 in zip_longest(*readers):
        if chunk1 is None or chunk2 is None:
            raise ValueError('the two count files have a different number of rows (genes)')
        if not chunk1.index.equals(chunk2.index):
            raise ValueError('the two count files do not list the same genes in the same order')

        if len(cells)==0:
            # column labels are identical for every chunk, so collect them once
            cells = np.concatenate((chunk1.columns, chunk2.columns))
            areas = [0]*chunk1.columns.size + [1]*chunk2.columns.size

        genes.extend(list(chunk1.index))
        sparseblock1 = sparse.csr_matrix(chunk1.values.astype(float))
        sparseblock2 = sparse.csr_matrix(chunk2.values.astype(float))
        sparseblock = sparse.hstack((sparseblock1,sparseblock2), format='csr')
        # one-element rows so that bmat stacks the blocks vertically
        sparseblocks.append([sparseblock])
        print('.', end='', flush=True)

    print(' done')
    counts = sparse.bmat(sparseblocks)
    return (counts.T, np.array(genes), cells, np.array(areas))

# The order of the two files matters: sparseload marks cells from the first
# file with area 0 and cells from the second file with area 1.
filenames = [f'{basepath}mouse_VISp_2018-06-14_exon-matrix.csv',
             f'{basepath}mouse_ALM_2018-06-14_exon-matrix.csv']
# counts comes back as a sparse cells x genes matrix (sparseload transposes it).
counts, genes, cells, areas = sparseload(filenames)

.............................................. done


tcmalloc: large alloc 1819385856 bytes == 0xdbeb0000 @ 


CPU times: user 2min 49s, sys: 6.24 s, total: 2min 56s
Wall time: 2min 56s


In [4]:
# Report the dimensions of everything sparseload returned.
for label, loaded in (('counts:', counts),
                      ('genes:', genes),
                      ('cells:', cells),
                      ('areas:', areas)):
    print(label, loaded.shape)

counts: (25481, 45768)
genes: (45768,)
cells: (25481,)
areas: (25481,)


In [5]:
# Gene metadata table: one row per gene, including Entrez ID and gene symbol.
genesDF = pd.read_csv(f'{basepath}mouse_VISp_2018-06-14_genes-rows.csv')
gene_ids = genesDF['gene_entrez_id'].tolist()
symbols = genesDF['gene_symbol'].tolist()
id2symbol = dict(zip(gene_ids, symbols))
# Translate the gene IDs that label the count matrix (returned by sparseload)
# into symbols, keeping the matrix order. Looping over gene_ids here instead
# would just reproduce the metadata file's row order, which is only correct if
# both files happen to be sorted identically.
# NOTE: this overwrites `genes`, so the cell cannot be re-run without
# re-running sparseload first (the second lookup would raise a KeyError).
genes = np.array([id2symbol[g] for g in genes])

clusterInfo = pd.read_csv(f'{basepath}sample_heatmap_plot_data.csv')
# a total of 23822 good cells with cluster labels
goodCells  = clusterInfo['sample_name'].values
cluster_ids = clusterInfo['cluster_id'].values
labels     = clusterInfo['cluster_label'].values
colors     = clusterInfo['cluster_color'].values
print('goodCells:',goodCells.shape)
print('cluster_ids:',cluster_ids.shape)
print('labels:',labels.shape)
print('colors:',colors.shape)

goodCells: (23822,)
cluster_ids: (23822,)
labels: (23822,)
colors: (23822,)


In [6]:
#make a list of unique cluster names and cluster colors that maps cluster IDs to Names and Colors
#(cluster IDs in the CSV start at 1; entry k of each array belongs to CSV cluster ID k+1)
clusterNames  = np.array([labels[cluster_ids==i+1][0] for i in range(np.max(cluster_ids))])
clusterColors = np.array([colors[cluster_ids==i+1][0] for i in range(np.max(cluster_ids))])
#let cluster IDs start at 0
clusters   = np.copy(cluster_ids) - 1

#find index that maps from raw data cell order to good-cells order
#A dict lookup replaces one full scan of `cells` per good cell. setdefault keeps
#the FIRST row of a repeated name, matching what np.where(cells==c)[0][0] returned.
row_of_cell = {}
for row, cell_name in enumerate(cells):
    row_of_cell.setdefault(cell_name, row)
good_cells_order_idx = np.array([row_of_cell[c] for c in goodCells])
#NOTE: counts and cells are overwritten, so this cell must not be re-run on its own
counts = counts[good_cells_order_idx, :]
cells = cells[good_cells_order_idx]

tcmalloc: large alloc 1729986560 bytes == 0x17ee3c000 @ 


In [7]:
# Assemble the AnnData pieces separately: per-cell fields, per-gene fields and
# the dataset-wide lookups from cluster ID to cluster color / name.
obs_fields = dict(clusters=clusters, cells=np.array(cells))
var_fields = dict(genes=genes)
uns_fields = dict(clustercolors=clusterColors, clusternames=clusterNames)
adata = anndata.AnnData(X=counts, obs=obs_fields, var=var_fields, uns=uns_fields)

  adata = anndata.AnnData(X=counts,


In [8]:
# Per-cell class label taken row by row from clusterInfo; this is positional,
# and relies on counts/cells having been reordered to the goodCells order,
# which itself comes from clusterInfo.
adata.obs['class'] =clusterInfo['class_label'].values
# True for cells whose class is 'Endothelial' or 'Non-Neuronal'.
adata.obs['non_neuronal_idx'] = np.isin(adata.obs['class'].values,['Endothelial', 'Non-Neuronal'])

In [9]:
# Add scanpy's QC columns to adata in place (inplace=True); the top-percent and
# log1p variants are switched off.
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
# Drop genes that are not detected in any cell.
sc.pp.filter_genes(adata,min_cells=1)

tcmalloc: large alloc 1729986560 bytes == 0x10f79c000 @ 
tcmalloc: large alloc 1729986560 bytes == 0x1e68f8000 @ 


filtered out 2992 genes that are detected in less than 1 cells


In [10]:
# Gene symbols to flag as marker genes.
markers = ['Snap25','Gad1','Slc17a7','Pvalb', 'Sst', 'Vip', 'Aqp4', 
           'Mog', 'Itgam', 'Pdgfra', 'Flt1', 'Bgn', 'Rorb', 'Foxp2']
# Boolean column: True where the gene symbol is one of the markers.
adata.var['marker_idx'] = np.isin(adata.var['genes'],markers)
# Keep the marker list itself with the dataset.
adata.uns['markers']=markers

In [11]:
# Index both tables by gene symbol so they can be aligned on it.
# NOTE: both set_index calls mutate in place, so this cell is not re-runnable.
adata.var.set_index('genes',inplace=True)
genesDF.set_index('gene_symbol',inplace=True)

# Inner-join the gene metadata onto adata.var; computed once and reused below.
var_with_gene_info = pd.concat((genesDF,adata.var),axis=1,join='inner')
# The join must neither drop nor reorder genes. Checking the length first turns
# a size mismatch into a failed assertion instead of a ValueError from `==`.
assert (len(var_with_gene_info.index) == len(adata.var.index)
        and all(adata.var.index == var_with_gene_info.index))
adata.var = var_with_gene_info

### Add gene annotations

In [12]:
# Query mygene.info for annotations of every gene, keyed by Entrez ID.
gene_entrez_id = adata.var['gene_entrez_id']
gene_annotations = mg.querymany(list(gene_entrez_id),
                                scopes='entrezgene',
                                species='mouse',
                                fields=['name','symbol','type_of_gene','ensembl.gene','ensembl.type_of_gene'],
                                as_dataframe=True)

#if any genes gave duplicate hits, we don't assign the annotations and add the `not_uniquely_assignable` category
gene_annotation_names,n_hits = np.unique(gene_annotations.index,return_counts=True)

# Vectorized .any() replaces the builtin max() over a boolean array, which
# iterated in Python and raised on an empty query result.
has_multiple_hits = n_hits > 1
if has_multiple_hits.any():
    multi_hits = gene_annotation_names[has_multiple_hits]
    gene_annotations_dedup = gene_annotations.drop(multi_hits)
    #make and append one row for each of the multi hits
    duplicates_df = pd.DataFrame({clm:{dup:'not_uniquely_assignable' for dup in multi_hits} for clm in gene_annotations.columns})
    gene_annotations = pd.concat([gene_annotations_dedup,duplicates_df])

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
queryin

In [13]:
# Replace every NaN in the annotation table with the literal string 'missing'.
gene_annotations = gene_annotations.fillna('missing')

In [14]:
#update ad.var
# Only these annotation columns are carried over; everything else is dropped.
columns_to_keep = ['type_of_gene','symbol','name','ensembl.type_of_gene']
columns_to_drop = list(filter(lambda column: column not in columns_to_keep,
                              gene_annotations.columns))
gene_annotations_slim = gene_annotations.drop(columns=columns_to_drop)

In [15]:
#make sure gene symbolic names are kept for all genes
# Naming the index 'genes' first means reset_index() moves it into a regular
# column called 'genes' instead of discarding it.
adata.var.index.name='genes'
adata.var.reset_index(inplace=True)

In [16]:
#prepare index for concat
# Index adata.var by Entrez ID and cast the index to str.
# NOTE(review): presumably gene_annotations_slim is indexed by the query IDs as
# strings, which is why the cast is needed for the two indices to align — confirm.
adata.var.set_index('gene_entrez_id',inplace=True)
adata.var.index = adata.var.index.astype(str)
# Column-wise concat aligned on the index (pandas' default outer join).
adata.var = pd.concat((adata.var,gene_annotations_slim),axis=1)

In [17]:
def add_coarse_gene_annotations_tasic(adata):
    """Add a ``coarse_types`` column to ``adata.var`` that groups ``type_of_gene``.

    The mapping is ``'protein-coding' -> 'protein_coding'``,
    ``'pseudo' -> 'pseudogene'``, ``'missing' -> 'missing'``; every other
    gene type becomes ``'other'``. Entries that are NaN stay NaN.

    The gene types are read from ``adata.var['type_of_gene']`` itself rather
    than from a notebook-level ``gene_annotations`` table, so the function
    depends only on its argument.

    Parameters
    ----------
    adata : object with a ``var`` DataFrame
        ``adata.var`` must contain a ``type_of_gene`` column. It is modified in
        place.

    Returns
    -------
    None
    """
    fine2coarse_map = {
        'protein-coding': 'protein_coding',
        'pseudo': 'pseudogene',
        'missing': 'missing',
    }

    fine_types = adata.var['type_of_gene']
    # types without an explicit entry come back as NaN from map() ...
    coarse_types = fine_types.map(fine2coarse_map).astype(object)
    # ... and are labelled 'other', unless the fine type itself was NaN
    needs_other = coarse_types.isna() & fine_types.notna()
    adata.var['coarse_types'] = coarse_types.mask(needs_other, 'other')

In [18]:
# Adds the 'coarse_types' column to adata.var in place.
add_coarse_gene_annotations_tasic(adata)

### Compute gene stats

In [19]:
#gene fields
#index var by gene symbol while keeping 'genes' as a regular column
adata.var.set_index('genes',inplace=True,drop=False)
adata.var.index.name = 'gene_name' #this is needed to be able to work with "drop=False" when saving to h5ad later!
compute_gene_stats(adata)
#Columns listed here are gene annotations and keep their names. Every other var
#column gets the '_allCells' suffix, marking it as computed on all cells, i.e.
#before the data is split into clusters.
annotation_columns = ('genes',
                      'gene_id',
                      'chromosome',
                      'gene_name',
                      'marker_idx',
                      'name',
                      'symbol',
                      'type_of_gene',
                      'ensembl.type_of_gene',
                      'coarse_types')
rename_dict_genes = {key: key+'_allCells'
                     for key in adata.var_keys()
                     if key not in annotation_columns}
adata.var.rename(columns=rename_dict_genes,inplace=True)
#store the marker list under its final name
adata.uns['markernames'] = adata.uns.pop('markers')

tcmalloc: large alloc 4076044288 bytes == 0x24dad0000 @ 
tcmalloc: large alloc 4076044288 bytes == 0x340a08000 @ 


In [20]:
#cell fields
#use the cell ID as obs index
adata.obs.set_index('cells',inplace=True)
#look up every cell's color through its cluster ID
adata.obs['clustercolor'] = adata.uns['clustercolors'][adata.obs['clusters']]
#Columns listed here keep their names. Every other obs column gets the
#'_allGenes' suffix, marking it as computed before splitting into clusters.
cluster_level_columns = ('clusters', 'areas','class', 'non_neuronal_idx', 'clustercolor')
rename_dict_cells = {key: key+'_allGenes'
                     for key in adata.obs_keys()
                     if key not in cluster_level_columns}
adata.obs.rename(columns=rename_dict_cells,inplace=True)

In [21]:
# Full per-cell metadata table, indexed by the cell's sample name.
metadata = (
    pd.read_excel(f'{basepath}Supplementary_Table_10_Full_Metadata.xls')
    .set_index('sample_name')
)

In [22]:
# Metadata columns that are copied into adata.obs.
keep = ['sample_id', 'subclass', 'cluster']

In [23]:
# Select the wanted metadata columns, rename 'cluster' to 'clustername', and
# restrict/reorder the rows to the cells present in adata (this removes the
# metadata of cells for which we don't have cluster info).
metadata_to_keep = (
    metadata[keep]
    .rename(columns={"cluster":"clustername"})
    .loc[adata.obs.index]
)

adata.obs = pd.concat((adata.obs,metadata_to_keep),axis=1)
adata.obs.columns

Index(['clusters', 'class', 'non_neuronal_idx', 'n_genes_by_counts_allGenes',
       'total_counts_allGenes', 'clustercolor', 'sample_id', 'subclass',
       'clustername'],
      dtype='object')

In [24]:
# Make the var index unique, then save the full dataset next to the input files.
adata.var_names_make_unique()
adata.write_h5ad(f'{basepath}adata.h5ad')

### Split by cluster

In [25]:
def split_by_cluster(adata):
    """Split ``adata`` into one AnnData per cluster and build a summary table.

    For every distinct value in ``adata.obs.clusters`` the matching cells are
    copied into their own AnnData, genes detected in no cell of that cluster
    are removed, and the cluster's id, name and color are stored in ``uns``.
    ``compute_gene_stats`` is then run on each per-cluster object with the
    suffix ``'_withinCluster'``.

    Both results are also pickled to ``{basepath}cluster_df.pickle`` and
    ``{basepath}adatas_by_cluster.pickle`` (``basepath`` is a notebook global).

    Returns
    -------
    (cluster_df, adatas_by_cluster)
        ``cluster_df`` has one row per cluster, in the same order as the list
        ``adatas_by_cluster``, with columns name, ncells, ngenes, color,
        level1, level2, cluster_class and cluster_id.

    Raises
    ------
    AssertionError
        If the cells of one cluster carry more than one ``class`` value.
    IndexError
        If a cluster name has fewer than two space-separated parts.
    """
    summary_columns = ('name', 'ncells', 'ngenes', 'color',
                       'level1', 'level2', 'cluster_class', 'cluster_id')
    adatas_by_cluster = []
    summaries = []

    #one AnnData plus one summary record per cluster
    for c in np.unique(adata.obs.clusters):
        subset = adata[adata.obs.clusters==c].copy()

        subset.uns['cluster_id'] = c
        subset.uns['clustername'] = adata.uns['clusternames'][c]
        subset.uns['clustercolor'] = adata.uns['clustercolors'][c]

        #drop genes not detected within this cluster; the 'n_cells' column that
        #filter_genes adds is renamed to mark it as a within-cluster count
        sc.pp.filter_genes(subset,min_cells=1,inplace=True)
        subset.var.rename(columns=dict(n_cells='n_cells_withinCluster'),inplace=True)
        n_cells_kept, n_genes_kept = subset.shape

        adatas_by_cluster.append(subset)

        #the first two words of the cluster name form the two name levels
        name_parts = subset.uns['clustername'].split(' ')
        first_level, second_level = name_parts[0], name_parts[1]

        classes_in_cluster = np.unique(subset.obs['class'])
        assert len(classes_in_cluster)==1 #uniqueness assumption

        summaries.append(dict(name=subset.uns['clustername'],
                              ncells=n_cells_kept,
                              ngenes=n_genes_kept,
                              color=subset.uns['clustercolor'],
                              level1=first_level,
                              level2=second_level,
                              cluster_class=classes_in_cluster[0],
                              cluster_id=subset.uns['cluster_id']))

    #lookup-dataframe whose row order matches the list of per-cluster adatas
    cluster_df = pd.DataFrame(data={column: [summary[column] for summary in summaries]
                                    for column in summary_columns})

    for cluster_adata in adatas_by_cluster:
        compute_gene_stats(cluster_adata,suffix='_withinCluster')

    for target, payload in ((f'{basepath}cluster_df.pickle', cluster_df),
                            (f'{basepath}adatas_by_cluster.pickle', adatas_by_cluster)):
        with open(target,'wb') as f:
            pickle.dump(payload,f)
    return cluster_df, adatas_by_cluster

In [26]:
# record=True makes catch_warnings collect warnings raised inside the block in
# a list (not kept here) instead of displaying them; the previous warning
# configuration is restored when the block exits.
with warnings.catch_warnings(record=True):
    # Ignore FutureWarnings attributed to source line 2487.
    # NOTE(review): which module that line belongs to is not visible here, and a
    # line-number filter will stop matching once that module changes — confirm.
    warnings.filterwarnings("ignore", category=FutureWarning,lineno=2487)
    cluster_df, adatas_by_cluster = split_by_cluster(adata)

filtered out 19179 genes that are detected in less than 1 cells
filtered out 19771 genes that are detected in less than 1 cells
filtered out 17895 genes that are detected in less than 1 cells
filtered out 14889 genes that are detected in less than 1 cells
filtered out 13194 genes that are detected in less than 1 cells
filtered out 11145 genes that are detected in less than 1 cells
filtered out 19271 genes that are detected in less than 1 cells
filtered out 19748 genes that are detected in less than 1 cells
filtered out 19366 genes that are detected in less than 1 cells
filtered out 19817 genes that are detected in less than 1 cells
filtered out 17673 genes that are detected in less than 1 cells
filtered out 20662 genes that are detected in less than 1 cells
filtered out 20764 genes that are detected in less than 1 cells
filtered out 17431 genes that are detected in less than 1 cells
filtered out 16439 genes that are detected in less than 1 cells
filtered out 19891 genes that are detect

### Select cluster for biologically homogeneous dataset

In [27]:
#split by neuronal/non-neuronal types and sort
non_neuronal_types = ['Non-Neuronal','Endothelial']
cluster_df['non_neuronal_idx'] = np.isin(cluster_df['cluster_class'],non_neuronal_types)

cluster_df_neuronal = cluster_df[~cluster_df['non_neuronal_idx']]

#largest neuronal clusters first
cluster_df_neuronal_sorted = cluster_df_neuronal.sort_values('ncells',ascending=False)
neuronal_clusters_ids_sorted = list(cluster_df_neuronal_sorted['cluster_id'])
#Row k of cluster_df describes adatas_by_cluster[k], so the row labels that
#survive filtering and sorting are the right list positions. Indexing the list
#with the cluster ID values instead only works while the IDs are exactly
#0..K-1 with none missing.
neuronal_positions_sorted = list(cluster_df_neuronal_sorted.index)
adatas_neuronal = [adatas_by_cluster[i] for i in neuronal_positions_sorted]

In [28]:
# adatas_neuronal is sorted by descending cell count, so index 1 picks the
# second-largest neuronal cluster.
adata_single_cluster = adatas_neuronal[1]
print(adata_single_cluster.uns['clustername'],adata_single_cluster.shape)

L6 IT VISp Penk Col27a1 (1049, 33914)


### Get gene length data for single cluster data

In [29]:
# Look up the Entrez ID of every gene of the selected cluster in genesDF, using
# the var index as row labels (genesDF was indexed by gene symbol earlier).
adata_single_cluster.var['gene_entrez_id'] = genesDF.loc[adata_single_cluster.var.index,'gene_entrez_id']

In [30]:
# Fetch the 'ensembl.gene' entry for every gene from the annotation table, looked
# up by Entrez ID cast to str; .values drops the index so the assignment is
# positional.
adata_single_cluster.var['ensembl_gene'] = gene_annotations.loc[adata_single_cluster.var.gene_entrez_id.astype(str),'ensembl.gene'].values

In [31]:
# Transcript length table, indexed by 'Gene stable ID'.
# NOTE(review): presumably a BioMart export with one row per transcript — confirm.
df_lengths = pd.read_csv(f'{basepath}mart_export_genelength.txt')
df_lengths.set_index('Gene stable ID',inplace=True)
# Distinct non-NaN Ensembl gene IDs present in the selected cluster.
ensemble_ids = np.unique(adata_single_cluster.var.ensembl_gene.dropna())

In [32]:
# NOTE(review): this statement is identical to the 'ensembl_gene' assignment made
# two cells earlier; it recomputes the same column and looks redundant — confirm
# before removing.
adata_single_cluster.var['ensembl_gene'] = gene_annotations.loc[adata_single_cluster.var.gene_entrez_id.astype(str),'ensembl.gene'].values

In [33]:
# Index the genes of the selected cluster by Ensembl gene ID (not unique: several
# genes can share one entry).
adata_single_cluster.var.set_index('ensembl_gene',inplace=True)

# Per Ensembl gene ID: longest transcript and number of listed transcripts,
# computed in one pass instead of one .loc lookup per gene.
# (groupby max skips NaN lengths, whereas np.max would have returned NaN.)
length_column = 'Transcript length (including UTRs and CDS)'
transcript_stats = df_lengths.groupby(level=0)[length_column].agg(['max', 'size'])

var_ensembl_ids = adata_single_cluster.var.index
# assign results only if the Ensembl ID is unique among the genes
has_unique_id = ~var_ensembl_ids.duplicated(keep=False)
# genes without length info get NaN from reindex
stats_per_gene = transcript_stats.reindex(var_ensembl_ids)

adata_single_cluster.var['transcript_len_max'] = np.where(
    has_unique_id, stats_per_gene['max'].to_numpy(dtype=float), np.nan)
adata_single_cluster.var['n_transcripts'] = np.where(
    has_unique_id, stats_per_gene['size'].to_numpy(dtype=float), np.nan)

In [34]:
# The var index may contain repeated labels at this point; make it unique
# before saving the single-cluster dataset.
adata_single_cluster.var_names_make_unique()
adata_single_cluster.write_h5ad(f'{basepath}adata_single_cluster.h5ad')