In [1]:
import anndata as ad
import networkx as nx
import scanpy as sc
import pandas as pd
import numpy as np
import scglue
from matplotlib import rcParams
import os
# Make the experiment directory the working directory; the relative file
# names used in the cells below are resolved against it.
# NOTE(review): absolute, machine-specific path — the notebook cannot run
# on another system without editing this line.
os.chdir('/lustre/scratch/kiviaho/spatac/integrations/multiome_5k/individual_cells_per_spot_10_from_5_metacells/')
# Date tag embedded in the names of the output files written further down.
date = '20221121'
# NOTE(review): this flag is not read anywhere in the visible cells — confirm
# whether it is still needed.
multiple_experiments = False

In [33]:
# Base names (without the .h5ad extension) of the two input files; they are
# reused later when naming the output files.
rna_name = 'synthetic_spatial_rna'
atac_name = 'simulated_atac'

# Read both modalities from the current working directory.
rna = ad.read_h5ad(f"{rna_name}.h5ad")
atac = ad.read_h5ad(f"{atac_name}.h5ad")

# Simulated data: keep a copy of X in a 'counts' layer before any
# preprocessing touches X (presumably X holds raw counts here — TODO confirm).
rna.layers['counts'] = rna.X.copy()
atac.layers['counts'] = atac.X.copy()

# Single-cell inputs that already carry a 'counts' layer would go the other
# way and restore the raw counts into X instead:
# rna.X = rna.layers['counts'].copy()
# atac.X = atac.layers['counts'].copy()

  utils.warn_names_duplicates("var")


In [34]:
# Annotate the RNA features from the GENCODE GTF, matching GTF records to the
# dataset by gene name. The 'chrom' / 'chromStart' / 'chromEnd' columns used
# by the following cells come from this step.
scglue.data.get_gene_annotation(
    rna,
    gtf="../../../gencode.vM30.annotation.gtf.gz",
    gtf_by="gene_name",
)


In [35]:
# Keep only genes that are uniquely named AND have genomic coordinates:
#   1. for gene names that occur more than once, keep the first occurrence;
#   2. drop genes whose chrom / chromStart / chromEnd annotation is missing.
# Both filters are positional boolean masks, so the selection does not depend
# on label lookups. The final .copy() turns the subset into a real AnnData
# object: the preprocessing cell that follows modifies `rna` in place, and
# doing that on a view forces an implicit copy with a warning.
first_occurrence = ~rna.var.index.duplicated(keep='first')
rna = rna[:, first_occurrence]
has_coordinates = (
    rna.var[['chrom', 'chromStart', 'chromEnd']].notna().all(axis=1).to_numpy()
)
rna = rna[:, has_coordinates].copy()

In [36]:
# RNA preprocessing, applied in place to `rna` in this exact order:
# flag highly variable genes, normalise totals, log1p, scale, then PCA.
sc.pp.highly_variable_genes(
    rna,
    flavor="seurat_v3",
    n_top_genes=2000,
    span=1,
)
sc.pp.normalize_total(rna)
sc.pp.log1p(rna)
sc.pp.scale(rna)
sc.tl.pca(
    rna,
    n_comps=100,
    svd_solver="auto",
)

  self.data[key] = value


In [37]:
# Drop peaks whose total signal over all observations is zero, then compute
# the LSI embedding of the remaining peaks.
# The per-peak totals are flattened to a 1-D array so the mask has the same
# form whether atac.X is a dense array or a sparse matrix (a sparse matrix
# returns a 2-D result from sum(axis=0)).
# .copy() materialises the subset so that the LSI result is stored on a real
# AnnData object rather than on a view.
peak_totals = np.asarray(atac.X.sum(axis=0)).ravel()
atac = atac[:, peak_totals != 0].copy()
scglue.data.lsi(atac, n_components=100)

In [38]:
# Peak names look like "chr1:3094816-3095316"; break each one at ':' and '-'
# and store the first three pieces as chrom / chromStart / chromEnd.
split = atac.var_names.str.split(r"[:-]")
atac.var["chrom"] = [fields[0] for fields in split]
atac.var["chromStart"] = pd.Index([fields[1] for fields in split]).astype(int)
atac.var["chromEnd"] = pd.Index([fields[2] for fields in split]).astype(int)
atac.var.head()

Unnamed: 0,seqnames,start,end,width,strand,score,replicateScoreQuantile,groupScoreQuantile,Reproducibility,GroupReplicate,...,distToGeneStart,peakType,distToTSS,nearestTSS,GC,idx,N,chrom,chromStart,chromEnd
chr1:3094816-3095316,chr1,3094816,3095316,501,*,142.047,0.922,0.729,2,C10._.Rep1,...,119416,Distal,120565,uc007afg.1,0.4271,1,0,chr1,3094816,3095316
chr1:3119745-3120245,chr1,3119745,3120245,501,*,8.4234,0.923,0.694,2,C5._.Rep2,...,94487,Distal,95636,uc007afg.1,0.3812,2,0,chr1,3119745,3120245
chr1:3121251-3121751,chr1,3121251,3121751,501,*,12.8192,0.599,0.212,2,C6._.Rep1,...,92981,Distal,94130,uc007afg.1,0.4411,3,0,chr1,3121251,3121751
chr1:3371495-3371995,chr1,3371495,3371995,501,*,16.6912,0.631,0.246,2,C4._.Rep1,...,157263,Intronic,156112,uc007afg.1,0.3972,4,0,chr1,3371495,3371995
chr1:3399685-3400185,chr1,3399685,3400185,501,*,8.4234,0.923,0.694,2,C5._.Rep2,...,185453,Intronic,184302,uc007afg.1,0.4112,5,0,chr1,3399685,3400185


In [39]:
# All spatial experiments share the same features, so they would all produce
# the same guidance graph: it is built once here and reused.
guidance = scglue.genomics.rna_anchored_guidance_graph(
    rna,
    atac,
)
# Validate the graph against both datasets before it is saved.
scglue.graph.check_graph(
    guidance,
    [rna, atac],
)

window_graph:   0%|          | 0/31527 [00:00<?, ?it/s]

[INFO] check_graph: Checking variable coverage...
[INFO] check_graph: Checking edge attributes...
[INFO] check_graph: Checking self-loops...
[INFO] check_graph: Checking graph symmetry...
[INFO] check_graph: All checks passed!


In [40]:
# Save the preprocessed datasets (gzip-compressed .h5ad) and the guidance
# graph; every file name carries the date tag set in the first cell.
atac.write(f"preprocessed_{atac_name}_{date}.h5ad", compression="gzip")
rna.write(f"preprocessed_{rna_name}_{date}.h5ad", compression="gzip")
nx.write_graphml(guidance, f"guidance_graph_{atac_name}_{rna_name}_{date}.graphml.gz")

In [41]:
# Display the summary of the preprocessed RNA AnnData object.
rna

AnnData object with n_obs × n_vars = 3600 × 31527
    obs: '31', '18', '49', '50', '14'
    var: 'gene_ids', 'feature_types', 'genome', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts', 'gene_id', 'gene_type', 'mgi_id', 'havana_gene', 'tag', 'highly_variable_rank', 'variances', 'variances_norm', 'mean', 'std'
    uns: 'hvg', 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'
    layers: 'counts'

In [42]:
# Display the summary of the preprocessed ATAC AnnData object.
atac

AnnData object with n_obs × n_vars = 9000 × 112344
    obs: '31', '18', '49', '50', '14'
    var: 'seqnames', 'start', 'end', 'width', 'strand', 'score', 'replicateScoreQuantile', 'groupScoreQuantile', 'Reproducibility', 'GroupReplicate', 'nearestGene', 'distToGeneStart', 'peakType', 'distToTSS', 'nearestTSS', 'GC', 'idx', 'N', 'chrom', 'chromStart', 'chromEnd', 'highly_variable'
    obsm: 'X_lsi'
    layers: 'counts'