# Create an AnnDataSet from h5ad files using SnapATAC2
**Authorship:** Adam Klie (last updated: 08/22/2023)<br>
***
**Description:** Notebook to convert a set of AnnData objects written to disk as h5ad files into a single AnnDataSet object, which is written to disk as an h5ads file. This is often an optional step in the pipeline, as working with AnnDataSets is not fully functional in SnapATAC2. It needs to be run after `1_create_anndatas_from_frag_files.sh`.

# Set-up

In [13]:
# Imports
import os
import sys
import time
import glob
import logging
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# We will use snapatac2 to load the data
import snapatac2 as snap

In [3]:
# Directories
# h5ad_dir: directory that is searched for the per-sample "adata_atac*.h5ad" files
# out_dir: directory where the merged "adata_atac_all.h5ads" file is written
# NOTE(review): both are absolute, user-specific paths; adjust them before running elsewhere
h5ad_dir = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2"
out_dir =  "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/2023_11_14/snapatac2"

In [6]:
# Get all the h5ad files. The pattern contains no "**" component, so glob's
# `recursive` flag would have no effect and is left out.
h5ad_glob = os.path.join(h5ad_dir, "adata_atac*.h5ad")
h5ad_files = sorted(glob.glob(h5ad_glob))
if not h5ad_files:
    # Fail early with a clear message instead of building an empty dataset later
    raise FileNotFoundError(f"No h5ad files matched: {h5ad_glob}")

# Get all the sample ids: the part of the file name after the last "_" and
# before the first ".", lower-cased (e.g. "adata_atac_DM0B.h5ad" -> "dm0b")
sample_ids = [os.path.basename(file).split("_")[-1].split(".")[0].lower() for file in h5ad_files]

# Sample ids become dictionary keys below, so a duplicate id would silently drop a file
if len(set(sample_ids)) != len(sample_ids):
    raise ValueError(f"Sample ids derived from the file names are not unique: {sample_ids}")

# Create a dict mapping sample id -> h5ad path and log it
h5ad_dict = dict(zip(sample_ids, h5ad_files))
logging.info(f"AnnData dictionary: {h5ad_dict}")

2023-11-15 10:53:10 - INFO - AnnData dictionary: {'dm0b': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM0B.h5ad', 'dm11a': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM11A.h5ad', 'dm12b': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM12B.h5ad', 'dm14b': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM14B.h5ad', 'dm21a': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM21A.h5ad', 'dm23a': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM23A.h5ad', 'dm24a': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM24A.h5ad', 'dm25a': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16

In [7]:
# Load every h5ad file with snapatac2 and collect the resulting AnnData objects
# in the same order as h5ad_files (the objects are merged in a later cell)
adata_atac_list = []
for h5ad_file in tqdm(h5ad_files):
    logging.info(f"Loading {h5ad_file}")
    adata_atac = snap.read(h5ad_file)
    adata_atac_list.append(adata_atac)

  0%|          | 0/27 [00:00<?, ?it/s]

2023-11-15 10:53:12 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM0B.h5ad
2023-11-15 10:53:13 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM11A.h5ad
2023-11-15 10:53:14 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM12B.h5ad
2023-11-15 10:53:14 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM14B.h5ad
2023-11-15 10:53:14 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM21A.h5ad
2023-11-15 10:53:15 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM23A.h5ad
2023-11-15 10:53:15 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotatio

In [9]:
# Display the loaded AnnData objects to inspect them before merging
adata_atac_list

[AnnData object with n_obs x n_vars = 11608 x 0 backed at '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM0B.h5ad'
     obs: 'tsse', 'n_fragment', 'frac_dup', 'frac_mito'
     uns: 'reference_sequences'
     obsm: 'insertion',
 AnnData object with n_obs x n_vars = 11864 x 0 backed at '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM11A.h5ad'
     obs: 'tsse', 'n_fragment', 'frac_dup', 'frac_mito'
     uns: 'reference_sequences'
     obsm: 'insertion',
 AnnData object with n_obs x n_vars = 12272 x 0 backed at '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/adata_atac_DM12B.h5ad'
     obs: 'tsse', 'n_fragment', 'frac_dup', 'frac_mito'
     uns: 'reference_sequences'
     obsm: 'insertion',
 AnnData object with n_obs x n_vars = 8580 x 0 backed at '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/a

In [10]:
# Merge into one object: every AnnData is registered under its sample id and the
# resulting AnnDataSet is backed by an h5ads file inside out_dir.

# zip() silently truncates to the shorter input, so make sure ids and objects line up
assert len(sample_ids) == len(adata_atac_list), "sample_ids and adata_atac_list differ in length"

# out_dir is not the directory the inputs were read from, so make sure it exists
os.makedirs(out_dir, exist_ok=True)

adata_atac_merged = snap.AnnDataSet(
    adatas=list(zip(sample_ids, adata_atac_list)),
    filename=os.path.join(out_dir, "adata_atac_all.h5ads")
)

In [11]:
# Release the file handles: first the merged dataset, then each backed AnnData
# NOTE(review): order of the two steps is assumed to be independent - confirm
adata_atac_merged.close()
for backed_adata in adata_atac_list:
    backed_adata.close()

In [12]:
# Re-open the merged dataset from the h5ads file in out_dir and display it
merged_h5ads_path = os.path.join(out_dir, "adata_atac_all.h5ads")
adata_atac_merged = snap.read_dataset(merged_h5ads_path)
adata_atac_merged

AnnDataSet object with n_obs x n_vars = 248687 x 0 backed at '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/2023_11_14/snapatac2/adata_atac_all.h5ads'
contains 27 AnnData objects with keys: 'dm0b', 'dm11a', 'dm12b', 'dm14b', 'dm21a', 'dm23a', 'dm24a', 'dm25a', 'dm31a', 'dm32a', 'dm33a', 'dm34a', 'dm35a', 'dm42b', 'dm43b', 'dm44a', 'dm45a', 'mo1', 'mo14', 'mo18', 'mo22', 'mo26', 'mo29', 'mo3', 'mo33', 'mo38', 'mo9'
    obs: 'sample'
    uns: 'reference_sequences', 'AnnDataSet'

In [21]:
# Per-cell sample labels taken from the "sample" column of the merged object's obs
cell_sample = np.array(adata_atac_merged.obs["sample"])

# Count the cells per sample once and reuse the result for both displayed values:
# (unique sample names, cells per sample) and the number of distinct samples
sample_names, sample_counts = np.unique(cell_sample, return_counts=True)
(sample_names, sample_counts), len(sample_counts)

((array(['dm0b', 'dm11a', 'dm12b', 'dm14b', 'dm21a', 'dm23a', 'dm24a',
         'dm25a', 'dm31a', 'dm32a', 'dm33a', 'dm34a', 'dm35a', 'dm42b',
         'dm43b', 'dm44a', 'dm45a', 'mo1', 'mo14', 'mo18', 'mo22', 'mo26',
         'mo29', 'mo3', 'mo33', 'mo38', 'mo9'], dtype='<U5'),
  array([11608, 11864, 12272,  8580, 13063,  8741,  8377, 10323, 12605,
         10927, 11845, 12403, 15780,  7596, 13291, 11101, 14623,  3490,
          3022, 10228,  6302,  3163,  2016,  3491,  3743,  8626,  9607])),
 27)

In [22]:
# Copy the merged object's obs_names (the cell barcodes) into a numpy array
# and preview the first five entries
cell_barcodes = np.array(adata_atac_merged.obs_names)
cell_barcodes[:5]

array(['AATACCGGTTGTGACA-1', 'GTTAACGGTAGTTACG-1', 'GGCATTGTCACAGCCA-1',
       'TGGTCATAGTGATTCA-1', 'GCTGATCCAGAACCGA-1'], dtype='<U18')

In [23]:
# Prefix every cell barcode with its sample id, using "#" as the separator
# (e.g. "dm0b#AATACCGGTTGTGACA-1"); the f-string yields plain Python str values
sample_barcode_names = [f"{sample}#{barcode}" for sample, barcode in zip(cell_sample, cell_barcodes)]
sample_barcode_names[:5]

['dm0b#AATACCGGTTGTGACA-1',
 'dm0b#GTTAACGGTAGTTACG-1',
 'dm0b#GGCATTGTCACAGCCA-1',
 'dm0b#TGGTCATAGTGATTCA-1',
 'dm0b#GCTGATCCAGAACCGA-1']

In [24]:

# Replace the barcodes of the merged object with the sample-prefixed names.
# Check uniqueness first so a clash is reported before the names are overwritten.
assert len(set(sample_barcode_names)) == len(sample_barcode_names), "sample-prefixed barcodes are not unique"
adata_atac_merged.obs_names = sample_barcode_names

In [27]:
# Display the merged object again after the obs_names assignment
adata_atac_merged

AnnDataSet object with n_obs x n_vars = 248687 x 0 backed at '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/2023_11_14/snapatac2/adata_atac_all.h5ads'
contains 27 AnnData objects with keys: 'dm0b', 'dm11a', 'dm12b', 'dm14b', 'dm21a', 'dm23a', 'dm24a', 'dm25a', 'dm31a', 'dm32a', 'dm33a', 'dm34a', 'dm35a', 'dm42b', 'dm43b', 'dm44a', 'dm45a', 'mo1', 'mo14', 'mo18', 'mo22', 'mo26', 'mo29', 'mo3', 'mo33', 'mo38', 'mo9'
    obs: 'sample'
    uns: 'reference_sequences', 'AnnDataSet'

In [28]:
# Close the backed AnnDataSet now that the notebook is finished with it
adata_atac_merged.close()

# DONE!

---

# Scratch