In [95]:
import anndata
import scanpy as sc
from pathlib import Path
import pandas as pd

In [96]:
# Directory holding the raw count table files (matrix, barcodes, features).
# NOTE: both paths are absolute and machine-specific; adjust them before running elsewhere.
dataset_path = Path("/Users/dallmann/bio/datasets/liver-cell-atlas/human/rawData_human/countTable_human")
# Directory that receives the converted h5ad files.
output_path = Path("/Users/dallmann/bio/datasets/liver-cell-atlas/human/processed")

# exist_ok=True makes this cell safe to re-run without a separate existence check.
output_path.mkdir(parents=True, exist_ok=True)

# Names of the input files.
MTX_FILE_NAME = "matrix.mtx.gz"
BARCODES_FILE_NAME = "barcodes.tsv.gz"
FEATURES_FILE_NAME = "features.tsv.gz"
ANNOT_FILE_NAME = "annot_humanAll.csv"

# Names of the h5ad files written below output_path.
H5AD_RAW_FILE_NAME = "liver-cell-atlas_humanAll.h5ad"
H5AD_LABELED_FILE_NAME = "liver-cell-atlas_humanAll_labeled.h5ad"

# Convert data
We want to load the data from mtx as an AnnData object and convert it to h5ad since reading the mtx file every time takes a long time.


In [97]:
def convert_and_load_mtx(input_file_path: Path, output_file_path: Path, transpose: bool = True, overwrite: bool = False) -> anndata.AnnData:
    """Load an mtx file as AnnData and cache the result as an h5ad file.

    If ``output_file_path`` already exists and ``overwrite`` is False, the
    cached h5ad file is read and returned and the mtx file is not touched.

    Args:
        input_file_path: Path of the mtx file, passed to ``sc.read_mtx``.
        output_file_path: Path of the h5ad cache file to read or (re)write.
        transpose: Transpose the loaded matrix; use this when the file is
            stored as a genes x cells matrix.
        overwrite: Convert again and rewrite the cache even if it exists.

    Returns:
        The AnnData object, either read from the cache or freshly converted.
    """
    if output_file_path.exists() and not overwrite:
        print(f"File {output_file_path} already exists. Skipping conversion.")
        return sc.read_h5ad(output_file_path)

    # Create the target directory before the slow mtx parse so that a missing
    # directory cannot make the final write fail after all the work is done.
    output_file_path.parent.mkdir(parents=True, exist_ok=True)

    data = sc.read_mtx(input_file_path)
    # transpose data if it is given as genes x cells matrix
    if transpose:
        data = data.T

    # write the data to disk as h5ad file, effectively caching it for later processing
    data.write_h5ad(output_file_path)
    return data


In [98]:
# Load the count matrix; with overwrite=False an existing h5ad cache file is reused instead of re-reading the mtx file.
data = convert_and_load_mtx(dataset_path / MTX_FILE_NAME, output_path / H5AD_RAW_FILE_NAME, overwrite=False)

File /Users/dallmann/bio/datasets/liver-cell-atlas/human/processed/liver-cell-atlas_humanAll.h5ad already exists. Skipping conversion.


# Load barcodes and features
We want to load the barcodes and features from the original files and add them to the AnnData object. The barcodes will be set as the index of the observations (`obs`), and the features/gene names will be set as the index of the variables (`var`) as well as a column 'gene_name' in the variables for further processing with CoExp.

In [99]:
def add_barcodes_and_features(data: anndata.AnnData, barcodes_file_path: Path, features_file_path: Path, inplace: bool = True) -> anndata.AnnData:
    """Replace ``obs`` and ``var`` of an AnnData object with barcodes and gene names read from disk.

    Both files are read as header-less, tab-separated tables. The barcodes
    become the index of ``obs``; the gene names become the index of ``var``
    and are additionally kept as the column ``gene_name``.

    Args:
        data: AnnData object to annotate.
        barcodes_file_path: Path of the barcodes file.
        features_file_path: Path of the features (gene names) file.
        inplace: If True, ``data`` itself is modified; otherwise a copy is
            modified and ``data`` is left untouched.

    Returns:
        The annotated AnnData object (``data`` itself when ``inplace`` is True).

    Raises:
        ValueError: If the barcodes or the gene names contain duplicates
            (enforced by ``verify_integrity=True``).
    """
    target = data if inplace else data.copy()

    # Observations: a single unnamed column of barcodes, used purely as the index.
    barcode_frame = pd.read_csv(barcodes_file_path, sep="\t", header=None, names=["barcodes"])
    target.obs = barcode_frame.set_index(keys="barcodes", drop=True, verify_integrity=True)

    # Variables: drop=False keeps 'gene_name' as a regular column next to the index.
    gene_frame = pd.read_csv(features_file_path, sep="\t", header=None, names=["gene_name"])
    target.var = gene_frame.set_index("gene_name", drop=False, verify_integrity=True)

    return target

In [100]:
# Attach barcodes and gene names; with inplace=True the existing `data` object is modified and returned.
data = add_barcodes_and_features(data, dataset_path / BARCODES_FILE_NAME, dataset_path / FEATURES_FILE_NAME, inplace=True)

# Integrate annotations
The dataset contains annotations for a subset of the cells (~ 167k) in a separate file. We want to integrate these annotations into the AnnData object and in the same step select only the cells that have annotations.

In [101]:
def integrate_annotations(data: anndata.AnnData, annot_file_path: Path, inplace: bool = True) -> anndata.AnnData:
    """Select the annotated cells and attach their labels as ``obs['cell_type']``.

    The annotation CSV is read, the cells listed in its index are selected
    from ``data`` (in the order given by the annotation file), and the
    ``annot`` column is stored as the single ``obs`` column ``cell_type``.

    Args:
        data: AnnData object whose ``obs`` index contains the annotated cells.
        annot_file_path: Path of the comma-separated annotation file. It must
            contain a column named ``annot``.
        inplace: Kept for backward compatibility. The result is always a new
            AnnData object holding only the labeled cells, and ``data`` is not
            modified for either value.

    Returns:
        A new AnnData object restricted to the labeled cells.
    """
    # The column at position 6 is used as the index -- presumably the cell
    # barcode; TODO confirm against the annotation file layout.
    labels = pd.read_csv(annot_file_path, sep=",", index_col=6)

    # Subset first and copy only the subset. Indexing yields a view, so the
    # explicit copy avoids duplicating the full matrix and avoids the implicit
    # copy (and warning) that assigning `.obs` on a view would trigger.
    # The selection also reorders the cells to match the order of the labels.
    labeled_data = data[labels.index].copy()

    labeled_data.obs = labels[["annot"]].rename(columns={"annot": "cell_type"})

    return labeled_data
    

In [102]:
# The annotation file is resolved two directory levels above the count table directory.
labeled_data = integrate_annotations(data, dataset_path.parent.parent / ANNOT_FILE_NAME, inplace=True)

In [106]:
# Persist the labeled subset as an h5ad file in the output directory.
labeled_data.write_h5ad(output_path / H5AD_LABELED_FILE_NAME)

In [109]:
# Show the summary of the labeled AnnData object (dimensions and obs/var columns).
labeled_data

AnnData object with n_obs × n_vars = 167598 × 32738
    obs: 'cell_type'
    var: 'gene_name'