In [1]:
import squidpy as sq
import scanpy as sc
import scipy as scp
import numpy as np 
import pandas as pd

In [2]:
def metadata_filter(adata, obs_or_var, col_name, value):
    '''
    Subset an AnnData object to the entries whose metadata column equals a value.

    Parameters:
        adata (AnnData): Annotated data object.
        obs_or_var (str): 'obs' to filter observations (rows) by a column of adata.obs,
            'var' to filter variables (columns) by a column of adata.var.
        col_name (str): Name of the metadata column to filter by.
        value: Value that entries must equal in the specified column to be kept.

    Returns:
        AnnData: Subset of the original AnnData object containing only the matching
            observations (for 'obs') or only the matching variables (for 'var').

    Raises:
        ValueError: If obs_or_var is neither 'obs' nor 'var'.
        KeyError: If col_name is not a column of the selected metadata table.
    '''
    if obs_or_var not in ('obs', 'var'):
        raise ValueError("Parameter 'obs_or_var' must be either 'obs' or 'var'.")

    if obs_or_var == 'obs':
        # Observation metadata has one entry per row, so the mask selects rows.
        return adata[adata.obs[col_name] == value, :]

    # Variable metadata has one entry per column, so the mask must be applied to the
    # second axis; using it as a row selector would filter the wrong dimension.
    return adata[:, adata.var[col_name] == value]

In [3]:
# Input paths, relative to the notebook's working directory.
hlca = 'hlca/hlcaFull.h5ad'
hlcadroplet = 'droplet_normal_lung_blood_scanpy.20200205.RC4.h5ad'

# Fully loads a previously saved subset; the file must already exist on disk.
# NOTE(review): presumably the export produced by an earlier run of this notebook - confirm the path.
sclcCells = sc.read_h5ad('hlca/sclcCells.h5ad')

# `hlca` must be a path here, not a loaded object: backed read-only mode opens the
# file and leaves the expression matrix on disk instead of loading it into memory.
adata = sc.read_h5ad(hlca, backed='r')

In [5]:
# Split the dataset by the 'disease' column of the observation metadata.

# Squamous cell lung carcinoma cells
sclc_cells = metadata_filter(
    adata,
    obs_or_var='obs',
    col_name='disease',
    value='squamous cell lung carcinoma',
)

# Normal cells
normal_cells = metadata_filter(
    adata,
    obs_or_var='obs',
    col_name='disease',
    value='normal',
)

In [6]:
# Display the summary of the carcinoma subset (dimensions and available annotations).
sclc_cells

View of AnnData object with n_obs × n_vars = 20631 × 56295 backed at 'hlca/hlcaFull.h5ad'
    obs: 'suspension_type', 'donor_id', 'is_primary_data', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', "3'_or_5'", 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'cause_of_death', 'core_or_extension', 'dataset', 'fresh_or_frozen', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'original_ann_level_1', 'original_ann_level_2', 'original_ann_level_3', 'original_ann_level_4', 'original_ann_level_5', 'original_ann_nonharmonized', 'reannotation_type', 'sample', 'scanvi_label', 'sequencing_platform', 'smoking_status', 'study', 'subject_

In [7]:
# Display the summary of the normal subset (dimensions and available annotations).
normal_cells

View of AnnData object with n_obs × n_vars = 1305099 × 56295 backed at 'hlca/hlcaFull.h5ad'
    obs: 'suspension_type', 'donor_id', 'is_primary_data', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', "3'_or_5'", 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'cause_of_death', 'core_or_extension', 'dataset', 'fresh_or_frozen', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'original_ann_level_1', 'original_ann_level_2', 'original_ann_level_3', 'original_ann_level_4', 'original_ann_level_5', 'original_ann_nonharmonized', 'reannotation_type', 'sample', 'scanvi_label', 'sequencing_platform', 'smoking_status', 'study', 'subjec

In [12]:
# Copy squamous cell lung carcinoma cells to file.
# `to_memory` only loads the data; its `copy` argument is a boolean flag, not an
# output path, so the file has to be written explicitly with `write_h5ad`.
sclc_cells_mem = sclc_cells.to_memory()
sclc_cells_mem.write_h5ad('sclcCells.h5ad')
sclc_cells_mem

AnnData object with n_obs × n_vars = 20631 × 56295
    obs: 'suspension_type', 'donor_id', 'is_primary_data', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', "3'_or_5'", 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'cause_of_death', 'core_or_extension', 'dataset', 'fresh_or_frozen', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'original_ann_level_1', 'original_ann_level_2', 'original_ann_level_3', 'original_ann_level_4', 'original_ann_level_5', 'original_ann_nonharmonized', 'reannotation_type', 'sample', 'scanvi_label', 'sequencing_platform', 'smoking_status', 'study', 'subject_type', 'tissue_coarse_unharmonized', 't

In [28]:
# Copy normal cells to file; the object returned (and displayed) is backed by the new file.
normal_cells.copy(filename='normal.h5ad')

AnnData object with n_obs × n_vars = 1305099 × 56295 backed at 'normal.h5ad'
    obs: 'suspension_type', 'donor_id', 'is_primary_data', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', "3'_or_5'", 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'cause_of_death', 'core_or_extension', 'dataset', 'fresh_or_frozen', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'original_ann_level_1', 'original_ann_level_2', 'original_ann_level_3', 'original_ann_level_4', 'original_ann_level_5', 'original_ann_nonharmonized', 'reannotation_type', 'sample', 'scanvi_label', 'sequencing_platform', 'smoking_status', 'study', 'subject_type', 'tissu