In [12]:
import anndata as ad
import scanpy as sc
from pathlib import Path
import pandas as pd
from scipy.io import mmwrite

In [13]:
# Output directory for every file exported by this notebook; created on first
# run and reused as-is on later runs.
outdir = Path("MARVEL")
outdir.mkdir(parents=False, exist_ok=True)

In [14]:
# Load the integrated dataset, then rebuild the cell identifiers as
# "<barcode>_<sample>" from the two obs columns of the same names.
adata = ad.read_h5ad("2_integration_combined.h5ad")
barcode_str = adata.obs["barcode"].astype(str)
sample_str = adata.obs["sample"].astype(str)
adata.obs_names = barcode_str + "_" + sample_str
# Bare last expression: show the AnnData summary as the cell output.
adata

AnnData object with n_obs × n_vars = 3119 × 29459
    obs: 'sample', 'sample_type', 'barcode', 'well_position', 'well_row', 'well_column', 'cell_type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_rRNA', 'log1p_total_counts_rRNA', 'pct_counts_rRNA', 'total_counts_lncRNA', 'log1p_total_counts_lncRNA', 'pct_counts_lncRNA', 'n_genes', 'doublet_score', 'predicted_doublet', 'sample_group', 'facs_day'
    var: 'gene_ids', 'feature_types', 'gene_symbol', 'gene_versions', 'gene_type', 'mt', 'rRNA', 'lncRNA', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
  

In [15]:
# Load the "sj_raw_matrix" AnnData written by the scribornaseq run
# (presumably splice-junction counts, judging by the file name — confirm).
sj_h5ad = "../scribornaseq/run/results/combined/sj_raw_matrix.h5ad"
adata_sj = ad.read_h5ad(sj_h5ad)
# Bare last expression: show the AnnData summary as the cell output.
adata_sj

AnnData object with n_obs × n_vars = 12288 × 8035
    obs: 'barcode', 'sample'
    var: 'start', 'end', 'strand', 'motif', 'annotated'

In [16]:
# Restrict the junction matrix to the cells present in `adata`.
#
# Only `adata_sj` is subset here, while the export cells below write `adata`
# unchanged. The exported matrices are therefore row-aligned only if every
# cell of `adata` also exists in `adata_sj`. Check that explicitly instead of
# silently writing files with different numbers of rows.
index_intersect = adata.obs.index.intersection(adata_sj.obs.index)
missing_cells = adata.obs.index.difference(adata_sj.obs.index)
if len(missing_cells) > 0:
    raise ValueError(
        f"{len(missing_cells)} cell(s) in `adata` have no row in `adata_sj` "
        f"(e.g. {list(missing_cells[:5])}); the exported gene and junction "
        "matrices would not be row-aligned."
    )

# Index by adata.obs_names (not by the intersection result) so the row order
# of adata_sj is guaranteed to equal the row order of adata, independent of
# how Index.intersection orders its output.
adata_sj = adata_sj[adata.obs_names, :].copy()
# Bare last expression: show the AnnData summary as the cell output.
adata_sj

AnnData object with n_obs × n_vars = 3119 × 8035
    obs: 'barcode', 'sample'
    var: 'start', 'end', 'strand', 'motif', 'annotated'

In [17]:
# Sanity check: densify and display the top-left 10 x 10 corner of the
# "count" layer.
count_corner = adata.layers["count"][:10, :10]
count_corner.toarray()

array([[ 0,  4,  2,  0,  1,  0,  7,  0,  3,  0],
       [ 0,  1,  2,  0,  1,  1,  3,  0,  1,  0],
       [ 0,  8,  6,  0,  7,  0, 10,  0,  3,  0],
       [ 0,  4,  3,  0,  2,  0,  3,  0,  1,  0],
       [ 0,  1,  8,  0,  8,  0,  8,  0,  4,  0],
       [ 0,  4,  6,  0,  2,  0,  6,  0,  2,  0],
       [ 0,  9,  8,  0,  7,  0, 10,  0,  9,  0],
       [ 0, 49, 90,  0,  0,  0,  1,  0,  1,  0],
       [ 0,  6,  3,  0,  2,  1,  5,  0,  4,  0],
       [ 0,  2,  2,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)

In [18]:
# Normalise the "count" layer without touching it: with inplace=False the
# function hands back a dict, whose "X" entry is stored as a new layer.
normalized = sc.pp.normalize_total(adata, layer="count", inplace=False)
adata.layers["norm_total"] = normalized["X"]
# Display the same 10 x 10 corner as for the raw counts, for comparison.
norm_corner = adata.layers["norm_total"][:10, :10]
norm_corner.toarray()

array([[ 0.        ,  1.01185293,  0.50592646,  0.        ,  0.25296323,
         0.        ,  1.77074262,  0.        ,  0.7588897 ,  0.        ],
       [ 0.        ,  0.3646905 ,  0.72938099,  0.        ,  0.3646905 ,
         0.3646905 ,  1.09407149,  0.        ,  0.3646905 ,  0.        ],
       [ 0.        ,  1.08515468,  0.81386601,  0.        ,  0.94951034,
         0.        ,  1.35644335,  0.        ,  0.406933  ,  0.        ],
       [ 0.        ,  2.65713832,  1.99285374,  0.        ,  1.32856916,
         0.        ,  1.99285374,  0.        ,  0.66428458,  0.        ],
       [ 0.        ,  0.27042927,  2.16343419,  0.        ,  2.16343419,
         0.        ,  2.16343419,  0.        ,  1.08171709,  0.        ],
       [ 0.        ,  1.43745704,  2.15618557,  0.        ,  0.71872852,
         0.        ,  2.15618557,  0.        ,  0.71872852,  0.        ],
       [ 0.        ,  1.55830125,  1.38515667,  0.        ,  1.21201209,
         0.        ,  1.73144584,  0.        

In [19]:
# Export the normalised matrix in Matrix Market format, followed by the
# per-cell (obs) and per-gene (var) tables as tab-separated files.
mmwrite(outdir / "gene_norm.mtx", adata.layers["norm_total"])
for table, filename in ((adata.obs, "pheno.tsv"), (adata.var, "genes.tsv")):
    table.to_csv(outdir / filename, sep="\t")

In [20]:
# Export the "count" layer in Matrix Market format.
gene_count_path = outdir / "gene_count.mtx"
mmwrite(gene_count_path, adata.layers["count"])

In [21]:
# Export the junction matrix (adata_sj.X) in Matrix Market format and its
# per-feature (var) table as a tab-separated file.
sj_matrix_path = outdir / "sj_count.mtx"
sj_table_path = outdir / "sj.tsv"
mmwrite(sj_matrix_path, adata_sj.X)
adata_sj.var.to_csv(sj_table_path, sep="\t")

In [22]:
# Wrap the "X_umap_scvi" embedding in a DataFrame indexed by cell name, write
# it as a tab-separated file, and preview the first rows.
umap_coords = adata.obsm["X_umap_scvi"]
df_umap = pd.DataFrame(
    data=umap_coords,
    index=adata.obs_names,
    columns=["UMAP1", "UMAP2"],
)
df_umap.to_csv(outdir / "umap.tsv", sep="\t")
df_umap.head()

Unnamed: 0,UMAP1,UMAP2
AAACAGGC_combined_5,1.552624,12.190267
AAAGCGGA_combined_5,1.387908,8.696829
AAAGGCTG_combined_5,3.149166,11.693304
AACACGCA_combined_5,2.345846,10.873199
AACCCAAC_combined_5,4.19578,12.637164
