# Creating `h5ad` files
This notebook builds h5ad files for both gene UMI count matrices and guide assignment matrices.

# Set-up

In [3]:
# imports
import os
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.io import mmwrite, mmread
import anndata as ad

In [8]:
# paths
path_guide_assignments = "/cellar/users/aklie/data/datasets/tf_perturb_seq/Huangfu_HUES8-embryonic-stemcell-differentiation_TF-Perturb-seq/Perturbation_information/sgrna_design_matrix_filtered_combined_control_final.csv"
path_id_master = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/sgRNA_id_master.tsv"

path_out = "/cellar/users/aklie/data/datasets/tf_perturb_seq/datasets/Huangfu_HUES8-embryonic-stemcell-differentiation_TF-Perturb-seq/Perturbation_information"
os.makedirs(path_out, exist_ok=True)

# Load auxiliary data

In [9]:
# Load the id mapping
id_master = pd.read_csv(path_id_master, sep="\t")
id_map = id_master.set_index("Huangfu_id")["protospacer_ID"].to_dict()

In [10]:
id_master["type"].value_counts()

type
targeting           13147
non_targeting         600
negative_control      598
positive_control       19
Name: count, dtype: int64

# Load guide assignments

In [8]:
# Read the guide assignments for this dataset
guide_assignments = pd.read_csv(path_guide_assignments, delim_whitespace=True, engine="python")
guide_ids = guide_assignments["gene_ids"]
guide_assignments.shape

In [None]:
# Check how many guides are in the master
in_master_msk = guide_ids.isin(id_master["Huangfu_id"])
in_master_msk.sum()

In [None]:
# Subset the guide assignments to only include those in the master
guide_assignments_subset = guide_assignments[in_master_msk]
guide_assignments_subset.shape

In [None]:
# Any troublesome NaNs?
guide_assignments_subset.isna().sum().sum()

In [None]:
# Convert to sparse matrix
guide_assignments_sparse = csr_matrix(guide_assignments_subset.drop(columns=["gene_ids"]).values)
guide_assignments_sparse.shape

# Write 10X matrix format

In [None]:
# Write features.tsv.gz
features = guide_assignments_subset["gene_ids"]
features = features.map(id_map).values
print(f"Number of features: {len(features)}")
print(f"First 5 features: {features[:5]}")
pd.Index(features).to_frame().to_csv(f"{path_out}/guide_assignment_matrix/features.tsv.gz", sep="\t", header=False, index=False, compression="gzip")

In [None]:
# Barcodes
barcodes = guide_assignments_subset.drop(columns=["gene_ids"]).columns
print(f"Number of barcodes: {len(barcodes)}")
print(f"First 5 barcodes: {barcodes[:5]}")
barcodes.to_frame().to_csv(f"{path_out}/guide_assignment_matrix/barcodes.tsv.gz", sep="\t", header=False, index=False, compression="gzip")

In [None]:
len(features), len(barcodes)

In [None]:
guide_assignments_sparse.T.shape

In [None]:
# Write mtx.mtx, features.tsv.gz, barcodes.tsv.gz
mmwrite(f"{path_out}/guide_assignment_matrix/matrix.mtx", guide_assignments_sparse.T)

# Make AnnData

In [11]:
# Reload the matrix to check
mtx = mmread(f"{path_out}/guide_assignment_matrix/matrix.mtx").tocsr()
mtx.shape

(95717, 13160)

In [12]:
# Reload the features and barcodes
barcodes = pd.read_csv(
    f"{path_out}/guide_assignment_matrix/barcodes.tsv.gz", sep="\t", header=None, index_col=0
)
barcodes.index.name = "barcode"
features = pd.read_csv(
    f"{path_out}/guide_assignment_matrix/features.tsv.gz", sep="\t", header=None, index_col=0
)
features.index.name = "feature"

In [13]:
# Construct AnnData object
adata = ad.AnnData(
    X=mtx,
    obs=barcodes,
    var=features,
)

In [14]:
# add in guide metadata
adata.var = adata.var.merge(id_master, left_index=True, right_on="protospacer_ID", how="left")
adata

AnnData expects .var.index to contain strings, but got values like:
    [7519, 7513, 4418, 4393, 6600]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


AnnData object with n_obs × n_vars = 95717 × 13160
    var: 'protospacer_ID', 'protospacer', 'intended_target_name', 'genomic_element', 'reverse_compliment', 'Huangfu_id', 'Huangfu_name', 'type'

In [15]:
# Write AnnData object
adata.write(f"{path_out}/guide_assignment_matrix.h5ad")

In [None]:
# Write csv for PySpade
adata.to_df().T.to_csv(f"{path_out}/guide_assignment_matrix.csv")

# DONE!

---

In [9]:
path_test = "/cellar/users/aklie/data/datasets/Huangfu_HUES8-embryonic-stemcell-differentiation_TF-Perturb-seq/bin/results/3_pySpade/Singlet_sgRNA_df.h5"

In [10]:
test = pd.read_hdf(path_test, key="df")

In [None]:
test