This script prepares the Naftali lung data and the nuc-seq data for label transfer at https://beta.fastgenomics.org/analyses/scarches (Transfer labels from Human Lung Cell Atlas).

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import scipy as spy
from anndata import AnnData,concat

Read the processed data

In [2]:
# Step-1 outputs to load: three sources (yale, car, nuc), each split into
# smoker / non-smoker. The order here must match the unpacking order below.
_processed_paths = (
    "/ocean/projects/hmcmutc/qsong/projects/lung_aging/data/step1_yale_smoker_processed.h5ad",
    "/ocean/projects/hmcmutc/qsong/projects/lung_aging/data/step1_yale_nonsmoker_processed.h5ad",
    "/ocean/projects/hmcmutc/qsong/projects/lung_aging/data/step1_car_smoker_processed.h5ad",
    "/ocean/projects/hmcmutc/qsong/projects/lung_aging/data/step1_car_nonsmoker_processed.h5ad",
    "/ocean/projects/hmcmutc/qsong/projects/lung_aging/data/step1_nuc_smoker_processed.h5ad",
    "/ocean/projects/hmcmutc/qsong/projects/lung_aging/data/step1_nuc_nonsmoker_processed.h5ad",
)

# Read every file in sequence and bind each result to its own name.
(
    anndata_yale_smoker,
    anndata_yale_nonsmoker,
    anndata_car_smoker,
    anndata_car_nonsmoker,
    anndata_nuc_smoker,
    anndata_nuc_nonsmoker,
) = map(sc.read_h5ad, _processed_paths)

Keep only the HLCA mapping genes and use raw counts

In [4]:
# Gene symbols listed in the HLCA scArches gene-order file.
mapping_genes = pd.read_csv("data/HLCA_scarches_gene_order.csv")["gene_symbol"]


def _raw_restricted_to(adata, genes):
    """Build a new AnnData from ``adata.raw`` limited to the raw genes that also appear in ``genes``."""
    # np.intersect1d returns the sorted, de-duplicated overlap, so the resulting
    # gene order is alphabetical rather than the order of the CSV.
    # NOTE(review): confirm the label-transfer tool does not rely on the CSV gene order.
    shared_genes = np.intersect1d(adata.raw.var_names, genes)
    return adata.raw[:, shared_genes].to_adata()


# Each name is rebound to the raw-derived, gene-restricted object.
# After this cell the objects no longer carry the `.raw` that was read here,
# so the cell is meant to run once per kernel session.
anndata_yale_smoker = _raw_restricted_to(anndata_yale_smoker, mapping_genes)
anndata_yale_nonsmoker = _raw_restricted_to(anndata_yale_nonsmoker, mapping_genes)

anndata_car_smoker = _raw_restricted_to(anndata_car_smoker, mapping_genes)
anndata_car_nonsmoker = _raw_restricted_to(anndata_car_nonsmoker, mapping_genes)

anndata_nuc_smoker = _raw_restricted_to(anndata_nuc_smoker, mapping_genes)
anndata_nuc_nonsmoker = _raw_restricted_to(anndata_nuc_nonsmoker, mapping_genes)

  return anndata.AnnData(
  return anndata.AnnData(


Keep only "dataset" as metadata to reduce file size

In [5]:
# Drop every per-cell annotation except the "dataset" column.
for _adata in (
    anndata_yale_smoker,
    anndata_yale_nonsmoker,
    anndata_car_smoker,
    anndata_car_nonsmoker,
    anndata_nuc_smoker,
    anndata_nuc_nonsmoker,
):
    _adata.obs = _adata.obs.loc[:, ["dataset"]]

# The two nuc-seq objects get a fixed dataset label, replacing whatever the column held.
for _adata in (anndata_nuc_smoker, anndata_nuc_nonsmoker):
    _adata.obs['dataset'] = "John"

In [6]:
def _as_int32_counts(matrix, label):
    """Cast a count matrix to int32, refusing to do so if the cast would change any value.

    ``astype(np.int32)`` truncates fractional values without warning, so a matrix
    that is not made of whole numbers (for example normalized or log-transformed
    expression) would be silently corrupted. This helper checks first.

    Parameters
    ----------
    matrix
        Dense array or scipy sparse matrix taken from ``AnnData.X``.
    label
        Dataset name, used only in the error message.

    Returns
    -------
    ``matrix.astype(np.int32)``

    Raises
    ------
    ValueError
        If any stored value is not a whole number (NaN also fails the check).
    """
    # For sparse input only the explicitly stored entries need checking;
    # the implicit zeros are whole numbers already.
    stored = matrix.data if spy.sparse.issparse(matrix) else np.asarray(matrix)
    already_integer = np.issubdtype(stored.dtype, np.integer)
    if not already_integer and not np.all(np.mod(stored, 1) == 0):
        raise ValueError(
            f"{label}: X contains non-integer values, so it does not look like raw counts; "
            "casting to int32 would silently truncate them."
        )
    return matrix.astype(np.int32)


# Store the counts as int32, after verifying that the cast is lossless.
anndata_yale_smoker.X = _as_int32_counts(anndata_yale_smoker.X, "yale_smoker")
anndata_yale_nonsmoker.X = _as_int32_counts(anndata_yale_nonsmoker.X, "yale_nonsmoker")

anndata_car_smoker.X = _as_int32_counts(anndata_car_smoker.X, "car_smoker")
anndata_car_nonsmoker.X = _as_int32_counts(anndata_car_nonsmoker.X, "car_nonsmoker")

anndata_nuc_smoker.X = _as_int32_counts(anndata_nuc_smoker.X, "nuc_smoker")
anndata_nuc_nonsmoker.X = _as_int32_counts(anndata_nuc_nonsmoker.X, "nuc_nonsmoker")

Combine datasets

In [7]:
# Merge the yale and car objects, one merged object per smoking status,
# using concat's default settings. The nuc-seq objects are kept separate.
_smoker_parts = [anndata_yale_smoker, anndata_car_smoker]
_nonsmoker_parts = [anndata_yale_nonsmoker, anndata_car_nonsmoker]

combined_smoker = concat(_smoker_parts)
combined_nonsmoker = concat(_nonsmoker_parts)

In [8]:
# Output file -> object to store there. Files are written in the order listed:
# yale alone, yale+car combined, then nuc-seq.
_scarches_outputs = {
    "data/step2_yale_smoker_scArchesInput.h5ad": anndata_yale_smoker,
    "data/step2_yale_nonsmoker_scArchesInput.h5ad": anndata_yale_nonsmoker,
    "data/step2_combined_smoker_scArchesInput.h5ad": combined_smoker,
    "data/step2_combined_nonsmoker_scArchesInput.h5ad": combined_nonsmoker,
    "data/step2_nuc_smoker_scArchesInput.h5ad": anndata_nuc_smoker,
    "data/step2_nuc_nonsmoker_scArchesInput.h5ad": anndata_nuc_nonsmoker,
}

for _out_path, _adata in _scarches_outputs.items():
    _adata.write_h5ad(_out_path)