In [1]:
import anndata as ad
import networkx as nx
import scanpy as sc
import pandas as pd
import numpy as np
import scglue
from matplotlib import rcParams
import os
# Run from the project scratch directory: every data path used below is
# relative to this location.
os.chdir('/lustre/scratch/kiviaho/spatac/')
# Date stamp that is embedded in the names of the files written by this notebook.
date = '20221107'
# When True, the per-experiment branches further down are executed
# (experiment ids are generated and each experiment is preprocessed separately).
multiple_experiments = False

ModuleNotFoundError: No module named 'scglue'

In [18]:
# Read the two input AnnData files (paths are relative to the working directory).
rna = ad.read_h5ad('synthetic-spatial/synth_adata_real_mg_20220929.h5ad')
atac = ad.read_h5ad('data/share-seq/mouse-brain/share-seq-mouse-brain-atac-data.h5ad')

# Experiment identifiers 'exper0' ... 'exper9'; only defined in multi-experiment mode.
if multiple_experiments:
    experiment_ids = [f'exper{n}' for n in range(10)]



In [19]:
# Restrict the ATAC data to cells whose 'celltype' label contains one of these
# names, then collapse the EN* / IN* subtypes into broad 'EN' / 'IN' classes.
cells_to_keep = ['EN','IN','A1.E1','OG1']

# Work on plain strings so that missing labels become 'nan' (and are dropped)
# instead of producing NaN matches, which np.where would treat as True.
celltype_labels = atac.obs['celltype'].astype(str)
keep_mask = np.zeros(len(celltype_labels), dtype=bool)
for label in cells_to_keep:
    # Literal substring match: the '.' in 'A1.E1' must not act as a regex wildcard.
    keep_mask |= celltype_labels.str.contains(label, regex=False).to_numpy()
cell_idxs = np.where(keep_mask)[0]

# Materialize the subset: writing to .obs of an AnnData view forces an implicit
# copy and emits a warning.
atac = atac[cell_idxs].copy()

aggr_celltypes = atac.obs['celltype'].astype(str)
aggr_celltypes[aggr_celltypes.str.contains('EN', regex=False)] = 'EN'
aggr_celltypes[aggr_celltypes.str.contains('IN', regex=False)] = 'IN'
atac.obs['broad_celltype'] = aggr_celltypes

  atac.obs['broad_celltype'] = aggr_celltypes


In [20]:
# Attach genomic coordinates to the RNA genes from the GENCODE GTF, matching
# on gene name.
scglue.data.get_gene_annotation(
    rna, gtf="gencode.vM30.annotation.gtf.gz",
    gtf_by="gene_name"
)
# Drop unannotated genes (those lacking any of the coordinate columns).
# The subset is copied so that the in-place preprocessing that follows works on
# a real AnnData object rather than on a view (which would be copied implicitly
# with a warning on first modification).
rna = rna[:,rna.var.dropna(subset=['chrom','chromStart','chromEnd']).index].copy()

In [25]:
# Preprocess the RNA data in place. `exp` is an alias of `rna`, not a copy, so
# every step below also modifies `rna`.
exp = rna
# Preserve the current X matrix under layers["counts"] before it is transformed.
exp.layers["counts"] = exp.X.copy()
sc.pp.highly_variable_genes(exp, flavor="seurat_v3", n_top_genes=2000)
# Library-size normalization -> log1p -> scaling, applied in this order.
for step in (sc.pp.normalize_total, sc.pp.log1p, sc.pp.scale):
    step(exp)
sc.tl.pca(exp, svd_solver="auto", n_comps=100)

  exp.layers["counts"] = exp.X.copy()


In [26]:
# Divide the "spatial experiments" into their own AnnData objects.
# Only runs if there are multiple spatial experiments present.

if multiple_experiments:
    synthetic_spatial_exps = list()
    for exp_id in experiment_ids:
        # Observations belonging to an experiment carry its id in their name.
        idxs = [s for s in rna.obs.index if exp_id in s]
        # Copy the slice so the in-place steps below do not act on a view of `rna`.
        exp = rna[idxs,:].copy()
        # `rna.X` has already been normalized and scaled in place by the
        # single-experiment preprocessing; start again from the stored counts
        # when they are available.
        if "counts" in exp.layers:
            exp.X = exp.layers["counts"].copy()
        synthetic_spatial_exps.append(exp)

    # Normalize each spatial experiment individually
    for exp in synthetic_spatial_exps:
        exp.layers["counts"] = exp.X.copy()
        sc.pp.highly_variable_genes(exp, n_top_genes=2000, flavor="seurat_v3")
        sc.pp.normalize_total(exp)
        sc.pp.log1p(exp)
        sc.pp.scale(exp)
        sc.tl.pca(exp, n_comps=100, svd_solver="auto")

    # Save multiexperiments!
    for exp,exp_name in zip(synthetic_spatial_exps,experiment_ids):
        # The file name is built from the experiment id (`exp_name`); using the
        # AnnData object here would raise a TypeError on string concatenation.
        exp.write('data/preprocessed_synthetic_spatial_'+exp_name+'_'+date+'.h5ad',compression='gzip')


In [27]:
# Compute a 100-component LSI embedding of the ATAC data; the notebook later
# shows the result stored as obsm['X_lsi'].
scglue.data.lsi(atac,n_components=100)

  idf = X.shape[0] / X.sum(axis=0)


In [28]:
# Only need to compute one guidance graph for all spatial exps since they all have the same features
# i.e. the same graph
# Expose the ATAC start coordinate under 'chromStart' as well as 'chromBegin'
# (presumably the column name scglue looks for, as in rna.var -- TODO confirm).
atac.var['chromStart'] = atac.var['chromBegin']
guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)
# Validate the graph against both modalities before it is saved.
scglue.graph.check_graph(guidance, [rna, atac])

window_graph:   0%|          | 0/11437 [00:00<?, ?it/s]

[INFO] check_graph: Checking variable coverage...
[INFO] check_graph: Checking edge attributes...
[INFO] check_graph: Checking self-loops...
[INFO] check_graph: Checking graph symmetry...
[INFO] check_graph: All checks passed!


In [35]:
# Persist the preprocessed modalities and the guidance graph in the working
# directory; the date stamp is appended directly to each base name.
atac.write(f"preprocessed_shareseq_atac_data{date}.h5ad", compression="gzip")
rna.write(f"preprocessed_synthetic_spatial_data{date}.h5ad", compression="gzip")
nx.write_graphml(guidance, f"guidance_synth_spatial_atac_shareseq{date}.graphml.gz")

In [31]:
# Display the summary of the preprocessed RNA AnnData object.
rna

AnnData object with n_obs × n_vars = 2500 × 11437
    obs: 'cell_count_EN', 'cell_count_IN', 'cell_count_OG1', 'cell_count_A1.E1', 'cell_abundances_EN', 'cell_abundances_IN', 'cell_abundances_OG1', 'cell_abundances_A1.E1', 'cell_capture_eff_EN', 'cell_capture_eff_IN', 'cell_capture_eff_OG1', 'cell_capture_eff_A1.E1', 'sample', 'UMI_count_EN', 'UMI_count_IN', 'UMI_count_OG1', 'UMI_count_A1.E1'
    var: 'gene_level', 'chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts', 'gene_id', 'gene_type', 'mgi_id', 'havana_gene', 'tag', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'mean', 'std'
    uns: 'design', 'hvg', 'log1p', 'pca'
    obsm: 'X_spatial', 'X_pca'
    varm: 'PCs'
    layers: 'expression_levels', 'counts'

In [32]:
# Display the summary of the preprocessed ATAC AnnData object.
atac

AnnData object with n_obs × n_vars = 2849 × 428041
    obs: 'atac.bc', 'rna.bc', 'celltype', 'broad_celltype'
    var: 'chrom', 'chromBegin', 'chromEnd', 'chromStart', 'highly_variable'
    obsm: 'X_lsi'