In [1]:
import squidpy as sq
import scanpy as sc
import scipy as scp
import numpy as np 
import pandas as pd

In [2]:
def metadata_filter(adata, obs_or_var, col_name, value):
    '''
    Function to quickly filter and subset the dataset based on metadata.
    
    Parameters:
        adata (AnnData): Annotated data object.
        obs_or_var (str): 'obs' for filtering based on observations metadata, 'var' for filtering based on variables metadata.
        col_name (str): Name of the metadata column to filter by.
        value: Value to filter for in the specified column.
    
    Returns:
        AnnData: Subset of the original AnnData object based on the specified metadata filter.
    '''
    if obs_or_var not in ['obs', 'var']:
        raise ValueError("Parameter 'obs_or_var' must be either 'obs' or 'var'.")

    if obs_or_var == 'obs':
        return adata[adata.obs[col_name] == value, :]
    else:
        return adata[adata.var[col_name] == value, :]

In [5]:
adata = sc.read_h5ad('data/hlcaFull.h5ad', backed='r') # read in backed mode

In [6]:
sclc_cells = metadata_filter(adata, 'obs', 'disease', 'squamous cell lung carcinoma') # squamous cell lung carcinoma cells
normal_cells = metadata_filter(adata, 'obs', 'disease', 'normal') # normal cells

In [12]:
# set index names (ENSEMBL IDs) to gene names for better readability
sclc_cells.var.index = sclc_cells.var.feature_name
normal_cells.var.index = normal_cells.var.feature_name 

In [18]:
normal_cells.obs.age_range.unique

<bound method Series.unique of CGATGTAAGTTACGGG_SC10                          nan
cc05p_CATGCCTGTGTGCCTG_carraro_csmc            nan
ATTCTACCAAGGTTCT_HD68                          nan
D062_TGACCCTTCAAACCCA-sub_wang_sub_batch3      nan
P1_2_TGCTGCTAGCTCCTCT                          nan
                                              ... 
P3_4_GCTTGAACACGACGAA                          nan
TTGTGGATCGTTCCTG_5-PX5-sub_mould               nan
TCAGGATCAAGACGTG_F02526                        nan
CAACCTCTCATGTAGC-WSSS8015042-0_meyer_unpubl    nan
022C-b_GGATGTTTCCAAGTAC_adams                  nan
Name: age_range, Length: 1305099, dtype: category
Categories (7, object): ['30-35', '40-45', '50-55', '55-60', '60-65', '65-70', 'nan']>

In [None]:
abv55 = ['55-60', '60-65', '65-70'] # filter above 55 years of age
normalCellsabv55 = normal_cells[normal_cells.obs['age_range'].isin(abv55)] # normal cells above 55
normalCellsabv55.write_h5ad('data/normalCellsabv55.h5ad')

In [28]:
sclc_cells.var.index = sclc_cells.var.feature_name # set gene names
sclc_cells.write_h5ad('data/sclcCellsNew.h5ad') # Copy squamous cell lung carcinoma cells to file

In [37]:
bdata = sc.read_h5ad('data/normalCellsabv55.h5ad')
bdata.var.index = bdata.var.feature_name

In [39]:
bdata.write_h5ad('data/normalCellsabv551.h5ad')