# Create AnnDataset from *processed* h5ad files using SnapATAC2
**Authorship:** Adam Klie (last updated: 08/22/2023)<br>
***
**Description:** Notebook to convert a set of processed AnnData files into an AnnDataset object. This is basically the exact same as `2_create_anndataset_from_frag_files.ipynb` but expects processed h5ad files from `4_preprocess_anndatas.sh`. Future iterations may do both the wrangling from fragment into AnnData and the processing in one step. I haven't determined whether keeping the "raw" AnnData files is useful or not.

# Set-up

In [1]:
# Imports
import os
import sys
import time
import glob
import logging
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# We will use snapatac2 to load the data
import snapatac2 as snap

In [4]:
# File paths
h5ad_dir = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed"
out_dir = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/2023_11_14/snapatac2"

# Make the directory to output, if it doesn't exist
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

In [5]:
# Get all the h5ad files
h5ad_glob = os.path.join(h5ad_dir, "adata_atac*processed.h5ad")
h5ad_files = sorted(glob.glob(h5ad_glob, recursive=True))

# Get all the sample ids
sample_ids = [os.path.basename(file).split("_")[-2].split(".")[0].lower() for file in h5ad_files]

# Create a dict and log it
h5ad_dict = dict(zip(sample_ids, h5ad_files))
logging.info(f"AnnData dictionary: {h5ad_dict}")

2023-12-02 11:50:00 - INFO - AnnData dictionary: {'dm0b': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM0B_processed.h5ad', 'dm11a': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM11A_processed.h5ad', 'dm12b': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM12B_processed.h5ad', 'dm14b': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM14B_processed.h5ad', 'dm21a': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM21A_processed.h5ad', 'dm23a': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM23A_processed.h5ad', 'dm24a': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug2

In [6]:
# Create the AnnDataset
adata_atac_processed_list = []
for i, h5ad_file in enumerate(tqdm(h5ad_files)):
    logging.info(f"Loading {h5ad_file}")
    adata_atac = snap.read(h5ad_file)
    adata_atac_processed_list.append(adata_atac)

  0%|          | 0/27 [00:00<?, ?it/s]

2023-12-02 11:50:00 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM0B_processed.h5ad


2023-12-02 11:50:01 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM11A_processed.h5ad
2023-12-02 11:50:02 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM12B_processed.h5ad
2023-12-02 11:50:03 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM14B_processed.h5ad
2023-12-02 11:50:04 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM21A_processed.h5ad
2023-12-02 11:50:04 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM23A_processed.h5ad
2023-12-02 11:50:05 - INFO - Loading /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM24A_p

In [7]:
[(name, adata) for name, adata in zip(sample_ids, adata_atac_processed_list)]

[('dm0b',
  AnnData object with n_obs x n_vars = 9391 x 606219 backed at '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM0B_processed.h5ad'
      obs: 'tsse', 'n_fragment', 'frac_dup', 'frac_mito', 'doublet_probability', 'doublet_score'
      var: 'count', 'selected'
      uns: 'reference_sequences', 'scrublet_sim_doublet_score'
      obsm: 'insertion'),
 ('dm11a',
  AnnData object with n_obs x n_vars = 7763 x 606219 backed at '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug23/snapatac2/processed/adata_atac_DM11A_processed.h5ad'
      obs: 'tsse', 'n_fragment', 'frac_dup', 'frac_mito', 'doublet_probability', 'doublet_score'
      var: 'count', 'selected'
      uns: 'reference_sequences', 'scrublet_sim_doublet_score'
      obsm: 'insertion'),
 ('dm12b',
  AnnData object with n_obs x n_vars = 9415 x 606219 backed at '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/16Aug

In [7]:
# Merge into one object
adata_atac_merged = snap.AnnDataSet(
    adatas=[(name, adata) for name, adata in zip(sample_ids, adata_atac_processed_list)],
    filename=os.path.join(out_dir, "adata_atac_all_processed.h5ads")
)

In [8]:
# Close all the backed anndatas
for adata_atac in adata_atac_processed_list:
    adata_atac.close()
adata_atac_merged.close()

In [9]:
# Read in the merged AnnDataset
adata_atac_merged = snap.read_dataset(os.path.join(out_dir, "adata_atac_all_processed.h5ads"))
adata_atac_merged

AnnDataSet object with n_obs x n_vars = 170329 x 606219 backed at '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/2023_11_14/snapatac2/adata_atac_all_processed.h5ads'
contains 27 AnnData objects with keys: 'dm0b', 'dm11a', 'dm12b', 'dm14b', 'dm21a', 'dm23a', 'dm24a', 'dm25a', 'dm31a', 'dm32a', 'dm33a', 'dm34a', 'dm35a', 'dm42b', 'dm43b', 'dm44a', 'dm45a', 'mo14', 'mo18', 'mo1', 'mo22', 'mo26', 'mo29', 'mo33', 'mo38', 'mo3', 'mo9'
    obs: 'sample'
    uns: 'AnnDataSet', 'reference_sequences'

In [42]:
# Check the number of cells per sample
cell_sample = np.array(adata_atac_merged.obs["sample"])
np.unique(cell_sample, return_counts=True), len(np.unique(cell_sample, return_counts=True)[1])

((array(['dm0b', 'dm11a', 'dm12b', 'dm14b', 'dm21a', 'dm23a', 'dm24a',
         'dm25a', 'dm31a', 'dm32a', 'dm33a', 'dm35a', 'dm42b', 'dm43b',
         'dm44a', 'dm45a', 'mo1', 'mo14', 'mo18', 'mo22', 'mo26', 'mo29',
         'mo3', 'mo33', 'mo38', 'mo9'], dtype='<U5'),
  array([ 9391,  7763,  9415,  7233,  1831,  7177,  7186,  7787, 10325,
           289, 10275, 12694,  6171, 10677,  9883,  8351,  2780,  2492,
          8521,  5205,  2527,  1553,  2852,  2972,  7099,  7880])),
 26)

In [11]:
# Make the barcodes into a numpy array
cell_barcodes = np.array(adata_atac_merged.obs_names)
cell_barcodes[:5]

array(['AATACCGGTTGTGACA-1', 'GGCATTGTCACAGCCA-1', 'TGGTCATAGTGATTCA-1',
       'GCTGATCCAGAACCGA-1', 'TGGCTAAGTCATAACG-1'], dtype='<U18')

In [12]:
# Add the sample id to the cell barcodes with "#" in between
sample_barcode_names = [sample + "#" + barcode for sample, barcode in zip(cell_sample, cell_barcodes)]
sample_barcode_names[:5]

['dm0b#AATACCGGTTGTGACA-1',
 'dm0b#GGCATTGTCACAGCCA-1',
 'dm0b#TGGTCATAGTGATTCA-1',
 'dm0b#GCTGATCCAGAACCGA-1',
 'dm0b#TGGCTAAGTCATAACG-1']

In [13]:
# Make that the obs_names
adata_atac_merged.obs_names = sample_barcode_names

In [15]:
# Add in some project metadata
project_metadata = pd.read_csv("/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/metadata/11Sep23/cleaned_samples_tracking_11Sep23.tsv", sep="\t")
project_metadata["sample_id"] = project_metadata["sample_id"].str.lower()
project_metadata["timepoint"] = project_metadata["sample_description"].str.split("hr_").str[0]
project_metadata["condition"] = project_metadata["sample_description"].str.split("hr_").str[1]
project_metadata["batch"] = project_metadata["batch"].replace({"DM": "A2", "MO": "H1"})
project_metadata

Unnamed: 0,sample_id,sample_description,atac_library_id,rna_library_id,batch,timepoint,condition
0,dm0b,0hr_control,WB_73,WB_74,A2,0,control
1,dm11a,6hr_3-cyt,WB_7,WB_8,A2,6,3-cyt
2,dm12b,6hr_IFNg,WB_75,WB_76,A2,6,IFNg
3,dm14b,6hr_Ex-4_HG,WB_47,WB_48,A2,6,Ex-4_HG
4,dm21a,24hr_3-cyt,WB_11,WB_12,A2,24,3-cyt
5,dm23a,24hr_dex,WB_15,WB_16,A2,24,dex
6,dm24a,24hr_Ex-4_HG,WB_17,WB_18,A2,24,Ex-4_HG
7,dm25a,24hr_control,WB_19,WB_20,A2,24,control
8,dm31a,48hr_3-cyt,WB_29,WB_30,A2,48,3-cyt
9,dm32a,48hr_IFNg,WB_31,WB_32,A2,48,IFNg


In [45]:
# Get the indeces of the obs for certain samples
samples = ['mo1', "mo2"]
sample_indeces = [np.where(cell_sample == sample)[0] for sample in samples][0]
sample_indeces

array([137461, 137462, 137463, ..., 140238, 140239, 140240])

In [None]:
adata_atac_merged.subset(
    obs_indices=sample_indeces,
    out="/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/2023_11_14/snapatac2/H1_control"
)

TypeError: AnnDataSet.subset() got an unexpected keyword argument 'obs_indeces'

In [None]:
snap.read("/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/annotation/2023_11_14/snapatac2/H1_control/anndatas/dm0b.h5ad")

In [47]:
adata_atac_merged.subset?

[0;31mSignature:[0m
[0madata_atac_merged[0m[0;34m.[0m[0msubset[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mobs_indices[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvar_indices[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbackend[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Subsetting the AnnDataSet object.

Note
----
AnnDataSet will not move data across underlying AnnData objects. So the
orders of rows in the resultant AnnDataSet object may not be consistent
with the input `obs_indices`. This function will return a vector that can
be used to reorder the `obs_indices` to match the final order of rows in
the AnnDataSet.

Parameters
----------
obs_indices
    obs indices
var_indices
    var indices
out
    Name of the directory used to store the new files. If provided,


In [18]:
# merge the project metadata with the AnnDataset.obs (which is a polars dataframe)
adata_atac_merged.obs = adata_atac_merged.obs.merge(project_metadata, left_on="sample", right_on="sample_id", how="left")

AttributeError: 'builtins.PyDataFrameElem' object has no attribute 'merge'

In [13]:
adata_atac_merged.close()

# DONE!

---

# Scratch

In [13]:
adata_atac_merged = snap.read_dataset(os.path.join(out_dir, "adata_atac_merged_processed.h5ads"))
adata_atac_merged

AnnDataSet object with n_obs x n_vars = 170329 x 606219 backed at '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_merged_processed.h5ads'
contains 27 AnnData objects with keys: 'DM0B', 'DM11A', 'DM12B', 'DM14B', 'DM21A', 'DM23A', 'DM24A', 'DM25A', 'DM31A', 'DM32A', 'DM33A', 'DM34A', 'DM35A', 'DM42B', 'DM43B', 'DM44A', 'DM45A', 'MO14', 'MO18', 'MO1', 'MO22', 'MO26', 'MO29', 'MO33', 'MO38', 'MO3', 'MO9'
    obs: 'sample'
    uns: 'reference_sequences', 'AnnDataSet'

In [15]:
snap.pp.select_features(adata_atac_merged, n_features=50000)

2023-08-23 08:15:22 - INFO - Selected 50000 features.


In [16]:
adata_atac_merged.close()

In [50]:
adata_atac = snap.read("/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM34A_processed.h5ad")
adata_atac

AnnData object with n_obs x n_vars = 0 x 606219 backed at '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/processed/adata_atac_DM34A_processed.h5ad'
    obs: 'tsse', 'n_fragment', 'frac_dup', 'frac_mito', 'doublet_probability', 'doublet_score'
    var: 'count', 'selected'
    uns: 'scrublet_sim_doublet_score', 'reference_sequences'
    obsm: 'insertion'

In [52]:
adata_atac.close()

In [53]:
adata_atac = snap.read("/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/adata_atac_DM34A.h5ad")
adata_atac

AnnData object with n_obs x n_vars = 12403 x 0 backed at '/cellar/users/aklie/data/igvf/beta_cell_networks/h5ad/igvf_sc-islet_10X-Multiome/16Aug23/snapatac2/adata_atac_DM34A.h5ad'
    obs: 'tsse', 'n_fragment', 'frac_dup', 'frac_mito'
    uns: 'reference_sequences'
    obsm: 'insertion'

In [None]:
snap.