# Creating `h5ad` files
This notebook builds h5ad files for both gene UMI count matrices and guide assignment matrices.

# Set-up

In [1]:
# imports
import os
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.io import mmwrite, mmread
import anndata as ad

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [2]:
# paths
path_guide_assignments = "/cellar/users/aklie/data/datasets/tf_perturb_seq/datasets/Huangfu_HUES8-definitive-endoderm-differentiation_TF-Perturb-seq/Perturbation_information/sgrna_design_matrix_filtered_combined_control_final.csv"
path_id_master = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/sgRNA_id_master.tsv"

path_out = "/cellar/users/aklie/data/datasets/tf_perturb_seq/datasets/Huangfu_HUES8-definitive-endoderm-differentiation_TF-Perturb-seq/Perturbation_information"
os.makedirs(path_out, exist_ok=True)

# Load auxiliary data

In [3]:
# Load the id mapping
id_master = pd.read_csv(path_id_master, sep="\t")
id_map = id_master.set_index("Huangfu_id")["protospacer_ID"].to_dict()

In [4]:
id_master["type"].value_counts()

type
targeting           13147
non_targeting         600
negative_control      598
positive_control       19
Name: count, dtype: int64

# Load guide assignments

In [5]:
# Read the guide assignments for this dataset
guide_assignments = pd.read_csv(path_guide_assignments, delim_whitespace=True, engine="python")
guide_ids = guide_assignments["gene_ids"]
guide_assignments.shape

  guide_assignments = pd.read_csv(path_guide_assignments, delim_whitespace=True, engine="python")


(13167, 157191)

In [6]:
# Check how many guides are in the master
in_master_msk = guide_ids.isin(id_master["Huangfu_id"])
in_master_msk.sum()

13166

In [9]:
# which ones are not in the master?
guide_assignments[~in_master_msk]

Unnamed: 0,gene_ids,AAACCCAAGACTACCT-1,AAACCCAAGCATCAGG-1,AAACCCAAGCCTGTCG-1,AAACCCAAGGGAGATA-1,AAACCCAAGTGTTCAC-1,AAACCCAAGTTAGTGA-1,AAACCCACACCCAAGC-1,AAACCCACAGACGCTC-1,AAACCCACAGGTGAGT-1,...,TTTGTTGCAGTGTGCC-8,TTTGTTGGTACTAAGA-8,TTTGTTGGTGGAACCA-8,TTTGTTGGTTTGCCGG-8,TTTGTTGTCCAACCGG-8,TTTGTTGTCGCCAATA-8,TTTGTTGTCGTTGTGA-8,TTTGTTGTCTCGACGG-8,TTTGTTGTCTCGTGGG-8,TTTGTTGTCTCTGGTC-8
13166,control,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# 
guide_assignments[~in_master_msk].iloc[0].drop("gene_ids").sum()

14513

In [15]:
# Subset the guide assignments to only include those in the master
guide_assignments_subset = guide_assignments[in_master_msk]
guide_assignments_subset.shape

(13166, 157191)

In [16]:
# Any troublesome NaNs?
guide_assignments_subset.isna().sum().sum()

0

In [28]:
# Convert to sparse matrix
guide_assignments_sparse = csr_matrix(guide_assignments_subset.drop(columns=["gene_ids"]).values)
guide_assignments_sparse.shape

(13166, 157190)

# Write 10X matrix format

In [30]:
os.makedirs(os.path.join(path_out, "guide_assignment_matrix"), exist_ok=True)

In [31]:
# Write features.tsv.gz
features = guide_assignments_subset["gene_ids"]
features = features.map(id_map).values
print(f"Number of features: {len(features)}")
print(f"First 5 features: {features[:5]}")
pd.Index(features).to_frame().to_csv(f"{path_out}/guide_assignment_matrix/features.tsv.gz", sep="\t", header=False, index=False, compression="gzip")

Number of features: 13166
First 5 features: ['CD81_strong' 'CD81_weak' 'CD151_strong' 'CD151_weak' 'CD55']


In [32]:
# Barcodes
barcodes = guide_assignments_subset.drop(columns=["gene_ids"]).columns
print(f"Number of barcodes: {len(barcodes)}")
print(f"First 5 barcodes: {barcodes[:5]}")
barcodes.to_frame().to_csv(f"{path_out}/guide_assignment_matrix/barcodes.tsv.gz", sep="\t", header=False, index=False, compression="gzip")

Number of barcodes: 157190
First 5 barcodes: Index(['AAACCCAAGACTACCT-1', 'AAACCCAAGCATCAGG-1', 'AAACCCAAGCCTGTCG-1',
       'AAACCCAAGGGAGATA-1', 'AAACCCAAGTGTTCAC-1'],
      dtype='object')


In [33]:
len(features), len(barcodes)

(13166, 157190)

In [35]:
# Write mtx.mtx, features.tsv.gz, barcodes.tsv.gz
mmwrite(f"{path_out}/guide_assignment_matrix/matrix.mtx", guide_assignments_sparse.T)

# Make AnnData

In [36]:
# Reload the matrix to check
mtx = mmread(f"{path_out}/guide_assignment_matrix/matrix.mtx").tocsr()
mtx.shape

(157190, 13166)

In [37]:
# Reload the features and barcodes
barcodes = pd.read_csv(
    f"{path_out}/guide_assignment_matrix/barcodes.tsv.gz", sep="\t", header=None, index_col=0
)
barcodes.index.name = "barcode"
features = pd.read_csv(
    f"{path_out}/guide_assignment_matrix/features.tsv.gz", sep="\t", header=None, index_col=0
)
features.index.name = "feature"

In [38]:
# Construct AnnData object
adata = ad.AnnData(
    X=mtx,
    obs=barcodes,
    var=features,
)

  utils.warn_names_duplicates("var")


In [None]:
# add in guide metadata
adata.var = adata.var.merge(id_master, left_index=True, right_on="protospacer_ID", how="left").set_index("protospacer_ID")
adata

AnnData expects .var.index to contain strings, but got values like:
    [7519.0, 7513.0, 4418.0, 4393.0, 6600.0]

    Inferred to be: floating

  value_idx = self._prep_dim_index(value.index, attr)


AnnData object with n_obs × n_vars = 157190 × 13166
    var: 'protospacer_ID', 'protospacer', 'intended_target_name', 'genomic_element', 'reverse_compliment', 'Huangfu_id', 'Huangfu_name', 'type'

In [42]:
adata.var["type"].value_counts()

type
targeting           13141
positive_control       19
Name: count, dtype: int64

In [43]:
# Write AnnData object
adata.write(f"{path_out}/guide_assignment_matrix.h5ad")

In [None]:
# Write csv for PySpade
adata.to_df().T.to_csv(f"{path_out}/guide_assignment_matrix.csv")

# DONE!

---