# Creating `h5ad` files
This notebook builds h5ad files for both gene UMI count matrices and guide assignment matrices.

# Set-up

In [1]:
# imports
import os
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.io import mmwrite, mmread
import anndata as ad

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [2]:
# paths
path_guide_assignments = "/cellar/users/aklie/data/datasets/tf_perturb_seq/datasets/Hon_WTC11-cardiomyocyte-differentiation_TF-Perturb-seq/Perturbation_information/cell_x_sgrna_matrix.pkl"
path_id_master = "/cellar/users/aklie/data/datasets/tf_perturb_seq/ref/sgRNA_id_master.tsv"

path_out = "/cellar/users/aklie/data/datasets/tf_perturb_seq/datasets/Hon_WTC11-cardiomyocyte-differentiation_TF-Perturb-seq/Perturbation_information"
os.makedirs(path_out, exist_ok=True)

# Load auxiliary data

In [3]:
# Load the id mapping
id_master = pd.read_csv(path_id_master, sep="\t")

In [4]:
id_master["type"].value_counts()

targeting           13147
non_targeting         600
negative_control      598
positive_control       19
Name: type, dtype: int64

# Load guide assignments

In [5]:
# Read the guide assignments for this dataset
guide_assignments = pd.read_pickle(path_guide_assignments)
guide_ids = guide_assignments.index
guide_assignments.shape

(11619, 739192)

In [6]:
# Check how many guides are in the master
in_master_msk = guide_ids.isin(id_master["protospacer_ID"])
in_master_msk.sum()

11619

In [7]:
# Subset the guide assignments to only include those in the master
if in_master_msk.sum() < guide_assignments.shape[0]:
    print(f"Subsetting guide assignments to only include {in_master_msk.sum()} guides in the master.")
    guide_assignments_subset = guide_assignments.loc[in_master_msk]
else:
    print(f"Don't need to subset")
    guide_assignments_subset = guide_assignments
guide_assignments_subset.shape

Don't need to subset


(11619, 739192)

In [8]:
# Any troublesome NaNs?
guide_assignments_subset.isna().sum().sum()

0

In [9]:
# Convert to sparse matrix
guide_assignments_sparse = csr_matrix(guide_assignments_subset.T)
guide_assignments_sparse.shape

(739192, 11619)

# Write 10X matrix format

In [10]:
os.makedirs(os.path.join(path_out, "guide_assignment_matrix"), exist_ok=True)

In [11]:
# Write features.tsv.gz
features = guide_ids
print(f"Number of features: {len(features)}")
print(f"First 5 features: {features[:5]}")
pd.Index(features).to_frame().to_csv(f"{path_out}/guide_assignment_matrix/features.tsv.gz", sep="\t", header=False, index=False, compression="gzip")

Number of features: 11619
First 5 features: Index(['ADNP2_-_77867536.23-P1P2-1', 'ADNP2_-_77867536.23-P1P2-2',
       'ADNP2_-_77867648.23-P1P2-2', 'ADNP_-_49547603.23-P1P2-2',
       'AEBP1_+_44143999.23-P1P2-1'],
      dtype='object')


In [12]:
# Barcodes
barcodes = guide_assignments_subset.columns
print(f"Number of barcodes: {len(barcodes)}")
print(f"First 5 barcodes: {barcodes[:5]}")
barcodes.to_frame().to_csv(f"{path_out}/guide_assignment_matrix/barcodes.tsv.gz", sep="\t", header=False, index=False, compression="gzip")

Number of barcodes: 739192
First 5 barcodes: Index(['AAACCTGAGAAACCGC-1', 'AAACCTGAGAGAGCTC-1', 'AAACCTGAGAGGGATA-1',
       'AAACCTGAGCAAATCA-1', 'AAACCTGAGCAATCTC-1'],
      dtype='object')


In [13]:
len(features), len(barcodes)

(11619, 739192)

In [14]:
guide_assignments_sparse.shape

(739192, 11619)

In [15]:
# Write mtx.mtx, features.tsv.gz, barcodes.tsv.gz
mmwrite(f"{path_out}/guide_assignment_matrix/matrix.mtx", guide_assignments_sparse)

# Make AnnData

In [16]:
# Reload the matrix to check
mtx = mmread(f"{path_out}/guide_assignment_matrix/matrix.mtx").tocsr()
mtx.shape

(739192, 11619)

In [17]:
# Reload the features and barcodes
barcodes = pd.read_csv(
    f"{path_out}/guide_assignment_matrix/barcodes.tsv.gz", sep="\t", header=None, index_col=0
)
barcodes.index.name = "barcode"
features = pd.read_csv(
    f"{path_out}/guide_assignment_matrix/features.tsv.gz", sep="\t", header=None, index_col=0
)
features.index.name = "feature"

In [18]:
len(features), len(barcodes)

(11619, 739192)

In [19]:
# Construct AnnData object
adata = ad.AnnData(
    X=mtx,
    obs=barcodes,
    var=features,
)

  adata = ad.AnnData(


In [None]:
# add in guide metadata
adata.var = adata.var.merge(id_master, left_index=True, right_on="protospacer_ID", how="left").set_index("protospacer_ID")
adata

AnnData expects .var.index to contain strings, but got values like:
    [7182, 7883, 10960, 4714, 4955]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


AnnData object with n_obs × n_vars = 739192 × 11619
    var: 'protospacer_ID', 'protospacer', 'intended_target_name', 'genomic_element', 'reverse_compliment', 'Huangfu_id', 'Huangfu_name', 'type'

In [26]:
# Write AnnData object
adata.write(f"{path_out}/guide_assignment_matrix.h5ad")

In [27]:
# Write csv for PySpade
adata.to_df().T.to_csv(f"{path_out}/guide_assignment_matrix.csv")

# DONE!

---