# Create AnnDataSet from *processed* h5ad files using SnapATAC2
**Authorship:** Adam Klie (last updated: 08/22/2023)<br>
***
**Description:** Notebook to combine a set of processed AnnData files into a single AnnDataSet object. This is essentially the same as `2_create_anndataset_from_frag_files.ipynb`, but it expects the processed h5ad files produced by `4_preprocess_anndatas.sh`. Future iterations may do both the wrangling from fragment files into AnnData and the processing in one step. I haven't yet determined whether keeping the "raw" AnnData files is useful.

# Set-up

In [1]:
# Imports
import os
import sys
import time
import glob
import logging
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
sys.path.append("/cellar/users/aklie/data/igvf/bin")
from utils import make_dirs

# We will use snapatac2 to load the data
import snapatac2 as snap

  def twobit_to_dna(twobit: int, size: int) -> str:
  def dna_to_twobit(dna: str) -> int:
  def twobit_1hamming(twobit: int, size: int) -> List[int]:


In [2]:
# File paths
# Directory holding the processed per-sample h5ad files (the inputs of this notebook)
h5ad_dir = "/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed"
# The merged dataset is written into the same directory as its inputs
out_dir = h5ad_dir

# Make the directory to output
# NOTE(review): make_dirs comes from the local `utils` module; presumably it
# creates the directory if it is missing -- confirm against its source.
make_dirs(out_dir)

'/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed'

In [3]:
# Locate the processed per-sample h5ad files.
# Expected file name: adata_atac_<sample_id>_processed.h5ad
h5ad_prefix = "adata_atac_"
h5ad_suffix = "_processed.h5ad"
h5ad_glob = os.path.join(out_dir, f"{h5ad_prefix}*{h5ad_suffix}")
# No `recursive=True`: it only has an effect when the pattern contains "**".
h5ad_files = sorted(glob.glob(h5ad_glob))
if not h5ad_files:
    # Fail early with a clear message instead of building an empty dataset later
    raise FileNotFoundError(f"No processed h5ad files match {h5ad_glob}")

# The sample id is the file name with the fixed prefix and suffix stripped.
# Unlike splitting on "_", this stays correct when a sample id itself contains
# an underscore, so two files can never collapse onto the same dictionary key.
sample_ids = [
    os.path.basename(file)[len(h5ad_prefix):-len(h5ad_suffix)]
    for file in h5ad_files
]

# Create a dict (sample id -> file path) and log it
h5ad_dict = dict(zip(sample_ids, h5ad_files))
logging.info(f"AnnData dictionary: {h5ad_dict}")

2023-08-23 08:06:33 - INFO - AnnData dictionary: {'DM0B': '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM0B_processed.h5ad', 'DM11A': '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM11A_processed.h5ad', 'DM12B': '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM12B_processed.h5ad', 'DM14B': '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM14B_processed.h5ad', 'DM21A': '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM21A_processed.h5ad', 'DM23A': '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM23A_processed.h5ad', 'DM24A': '/cellar/users/aklie/data

In [4]:
# Open every processed per-sample AnnData with SnapATAC2.
# The objects are file-backed and stay open until they are closed explicitly
# once the merged dataset has been created.
adata_atac_processed_list = []
for h5ad_file in tqdm(h5ad_files, desc="Loading h5ad files"):
    logging.info(f"Loading {h5ad_file}")
    adata_atac = snap.read(h5ad_file)
    # A sample with no cells still becomes a key of the merged dataset but
    # contributes nothing to it; make that visible instead of silent.
    if adata_atac.n_obs == 0:
        logging.warning(f"{h5ad_file} contains 0 cells")
    adata_atac_processed_list.append(adata_atac)

  0%|          | 0/27 [00:00<?, ?it/s]

2023-08-23 08:06:42 - INFO - Loading /cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM0B_processed.h5ad
2023-08-23 08:06:42 - INFO - Loading /cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM11A_processed.h5ad
2023-08-23 08:06:42 - INFO - Loading /cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM12B_processed.h5ad
2023-08-23 08:06:42 - INFO - Loading /cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM14B_processed.h5ad
2023-08-23 08:06:42 - INFO - Loading /cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM21A_processed.h5ad
2023-08-23 08:06:42 - INFO - Loading /cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Mu

In [5]:
# Combine the per-sample AnnData objects into one AnnDataSet, keyed by sample id
# and stored in a new .h5ads file inside the output directory.
named_adatas = list(zip(sample_ids, adata_atac_processed_list))
merged_path = os.path.join(out_dir, "adata_atac_merged_processed.h5ads")
adata_atac_merged = snap.AnnDataSet(adatas=named_adatas, filename=merged_path)

In [6]:
# Close the merged dataset's per-sample inputs first, then the merged dataset itself
for backed_adata in adata_atac_processed_list:
    backed_adata.close()

adata_atac_merged.close()

In [7]:
# Reopen the merged dataset from disk and display its summary
merged_path = os.path.join(out_dir, "adata_atac_merged_processed.h5ads")
adata_atac_merged = snap.read_dataset(merged_path)
adata_atac_merged

AnnDataSet object with n_obs x n_vars = 170329 x 606219 backed at '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_merged_processed.h5ads'
contains 27 AnnData objects with keys: 'DM0B', 'DM11A', 'DM12B', 'DM14B', 'DM21A', 'DM23A', 'DM24A', 'DM25A', 'DM31A', 'DM32A', 'DM33A', 'DM34A', 'DM35A', 'DM42B', 'DM43B', 'DM44A', 'DM45A', 'MO14', 'MO18', 'MO1', 'MO22', 'MO26', 'MO29', 'MO33', 'MO38', 'MO3', 'MO9'
    obs: 'sample'
    uns: 'AnnDataSet', 'reference_sequences'

In [8]:
# Per-cell sample labels of the merged dataset
cell_sample = np.array(adata_atac_merged.obs["sample"])

# Count the cells of each sample once and reuse the result for the display.
# Only samples that contribute at least one cell show up here, so the number of
# distinct samples can be smaller than the number of merged AnnData objects.
sample_names, cells_per_sample = np.unique(cell_sample, return_counts=True)
(sample_names, cells_per_sample), len(sample_names)

((array(['DM0B', 'DM11A', 'DM12B', 'DM14B', 'DM21A', 'DM23A', 'DM24A',
         'DM25A', 'DM31A', 'DM32A', 'DM33A', 'DM35A', 'DM42B', 'DM43B',
         'DM44A', 'DM45A', 'MO1', 'MO14', 'MO18', 'MO22', 'MO26', 'MO29',
         'MO3', 'MO33', 'MO38', 'MO9'], dtype='<U5'),
  array([ 9391,  7763,  9415,  7233,  1831,  7177,  7186,  7787, 10325,
           289, 10275, 12694,  6171, 10677,  9883,  8351,  2780,  2492,
          8521,  5205,  2527,  1553,  2852,  2972,  7099,  7880])),
 26)

In [9]:
# Copy the obs_names (cell barcodes) of the merged dataset into a NumPy array
cell_barcodes = np.array(adata_atac_merged.obs_names)
# Peek at the first five barcodes
cell_barcodes[:5]

array(['AATACCGGTTGTGACA-1', 'GGCATTGTCACAGCCA-1', 'TGGTCATAGTGATTCA-1',
       'GCTGATCCAGAACCGA-1', 'TGGCTAAGTCATAACG-1'], dtype='<U18')

In [10]:
# Build "<sample>_<barcode>" names by pairing each cell's sample id with its barcode
sample_barcode_names = [
    f"{sample_id}_{barcode}"
    for sample_id, barcode in zip(cell_sample, cell_barcodes)
]
sample_barcode_names[:5]

['DM0B_AATACCGGTTGTGACA-1',
 'DM0B_GGCATTGTCACAGCCA-1',
 'DM0B_TGGTCATAGTGATTCA-1',
 'DM0B_GCTGATCCAGAACCGA-1',
 'DM0B_TGGCTAAGTCATAACG-1']

In [11]:
# Replace the obs_names of the merged dataset with the sample-prefixed barcodes
adata_atac_merged.obs_names = sample_barcode_names

In [12]:
# Close the merged dataset; the scratch section below reopens it from disk
adata_atac_merged.close()

# DONE!

---

# Scratch

In [13]:
# Scratch: reopen the merged dataset from the output directory and display its summary
adata_atac_merged = snap.read_dataset(os.path.join(out_dir, "adata_atac_merged_processed.h5ads"))
adata_atac_merged

AnnDataSet object with n_obs x n_vars = 170329 x 606219 backed at '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_merged_processed.h5ads'
contains 27 AnnData objects with keys: 'DM0B', 'DM11A', 'DM12B', 'DM14B', 'DM21A', 'DM23A', 'DM24A', 'DM25A', 'DM31A', 'DM32A', 'DM33A', 'DM34A', 'DM35A', 'DM42B', 'DM43B', 'DM44A', 'DM45A', 'MO14', 'MO18', 'MO1', 'MO22', 'MO26', 'MO29', 'MO33', 'MO38', 'MO3', 'MO9'
    obs: 'sample'
    uns: 'reference_sequences', 'AnnDataSet'

In [15]:
# Scratch: run SnapATAC2 feature selection on the merged dataset, asking for 50,000 features
snap.pp.select_features(adata_atac_merged, n_features=50000)

2023-08-23 08:15:22 - INFO - Selected 50000 features.


In [16]:
# Scratch: close the merged dataset again
adata_atac_merged.close()

In [50]:
# Scratch: inspect the processed DM34A file on its own.
# NOTE(review): the recorded output reports 0 observations for this file, which
# presumably is why DM34A is absent from the per-sample cell counts -- confirm.
adata_atac = snap.read("/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM34A_processed.h5ad")
adata_atac

AnnData object with n_obs x n_vars = 0 x 606219 backed at '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM34A_processed.h5ad'
    obs: 'tsse', 'n_fragment', 'frac_dup', 'frac_mito', 'doublet_probability', 'doublet_score'
    var: 'count', 'selected'
    uns: 'scrublet_sim_doublet_score', 'reference_sequences'
    obsm: 'insertion'

In [52]:
# Scratch: close the backed AnnData before opening another file under the same name
adata_atac.close()

In [53]:
# Scratch: inspect the DM34A file from the parent (non-"processed") directory for comparison.
# NOTE(review): the recorded output reports 12403 observations here, so presumably
# every cell of this sample was removed during preprocessing -- confirm.
adata_atac = snap.read("/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/adata_atac_DM34A.h5ad")
adata_atac

AnnData object with n_obs x n_vars = 12403 x 0 backed at '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/adata_atac_DM34A.h5ad'
    obs: 'tsse', 'n_fragment', 'frac_dup', 'frac_mito'
    uns: 'reference_sequences'
    obsm: 'insertion'

In [None]:
# Scratch: unfinished cell. The dangling `snap.` expression was removed because
# it is a SyntaxError and stops "Restart Kernel -> Run All" from completing.