## Import

In [1]:
import os
import scanpy as sc
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

## Load

In [2]:
# Single-cell reference data: read the CRC atlas AnnData object from its .h5ad file.
# NOTE(review): the path is relative to the notebook's working directory — confirm it
# resolves on your machine before running.
sc_adata = sc.read_h5ad("../../../../../datasets/Marteau_2024_CRC/final_crc_atlas-adata.h5ad")

In [None]:
# Spatial data: read the spatial AnnData object from its .h5ad file
# (the file name suggests Xenium data — confirm).
spatial_adata = sc.read_h5ad('../input data/crca_xenium.h5ad')

## scRNAseq data

In [3]:
# Display the AnnData summary (dimensions and available annotation fields).
sc_adata

AnnData object with n_obs × n_vars = 4264929 × 28476
    obs: 'dataset', 'medical_condition', 'cancer_type', 'sample_id', 'sample_type', 'tumor_source', 'replicate', 'sample_tissue', 'anatomic_region', 'anatomic_location', 'tumor_stage', 'tumor_stage_TNM', 'tumor_stage_TNM_T', 'tumor_stage_TNM_N', 'tumor_stage_TNM_M', 'tumor_size', 'tumor_dimensions', 'tumor_grade', 'histological_type', 'microsatellite_status', 'mismatch_repair_deficiency_status', 'MLH1_promoter_methylation_status', 'MLH1_status', 'KRAS_status', 'BRAF_status', 'APC_status', 'TP53_status', 'PIK3CA_status', 'SMAD4_status', 'NRAS_status', 'MSH6_status', 'FBXW7_status', 'NOTCH1_status', 'MSH2_status', 'PMS2_status', 'POLE_status', 'ERBB2_status', 'STK11_status', 'HER2_status', 'CTNNB1_status', 'BRAS_status', 'patient_id', 'sex', 'age', 'treatment_status_before_resection', 'treatment_drug', 'treatment_response', 'RECIST', 'platform', 'platform_fine', 'cellranger_version', 'reference_genome', 'matrix_type', 'enrichment_cell_

### Uniform scRNA annotation for imputation

In [3]:
# Index the genes by the 'GeneSymbol' column of `.var`.
sc_adata.var.index = sc_adata.var['GeneSymbol']

In [4]:
# Cast the gene names to plain strings, then make them unique
# (presumably several entries can share one gene symbol — TODO confirm).
sc_adata.var_names = sc_adata.var_names.astype(str)
sc_adata.var_names_make_unique()

#### Remove Hepatocyte

In [5]:
#sc_adata.obs['atlas_cell_type_coarse'].unique().tolist()

In [6]:
# Drop every cell whose coarse atlas annotation is 'Hepatocyte'; keep a real copy
# rather than a view so later cells can modify `.obs`.
sc_adata = sc_adata[
    sc_adata.obs['atlas_cell_type_coarse'].ne('Hepatocyte')
].copy()

#### Merge labels

In [7]:
#sc_adata.obs['atlas_cell_type_middle'].unique().tolist()

In [8]:
# Start the imputation label from the coarse atlas annotation, stored as plain strings
# (presumably so that new label values can be assigned in the following cells — confirm).
sc_adata.obs['cell_type_imputation'] = sc_adata.obs['atlas_cell_type_coarse'].astype(str)

In [9]:
# Cell types that take their imputation label directly from the 'middle'
# annotation level (overriding the coarse label set previously).
for middle_label in (
    'Fibroblast',
    'Endothelial cell',
    'Macrophage',
    'Dendritic cell',
    'Neutrophil',
    'Mast cell',
):
    is_label = sc_adata.obs['atlas_cell_type_middle'] == middle_label
    sc_adata.obs.loc[is_label, 'cell_type_imputation'] = middle_label

In [10]:
# Fold related populations (identified at the 'middle' level) into a broader label.
middle_to_imputation = {
    'Pericyte': 'Fibroblast',
    'Monocyte': 'Macrophage',
    'Eosinophil': 'Neutrophil',
}
for middle_label, merged_label in middle_to_imputation.items():
    sc_adata.obs.loc[
        sc_adata.obs['atlas_cell_type_middle'] == middle_label,
        'cell_type_imputation',
    ] = merged_label

# NK cells and ILCs are identified at the coarse level and merged into 'T cell'.
# These assignments run last, exactly as in the original cell order.
for coarse_label in ('NK cell', 'ILC'):
    sc_adata.obs.loc[
        sc_adata.obs['atlas_cell_type_coarse'] == coarse_label,
        'cell_type_imputation',
    ] = 'T cell'

In [11]:
#sc_adata.obs['cell_type_imputation'].unique().tolist()

### Subset dataset

#### Keep only tumoral samples

In [12]:
# Keep only cells whose sample type is a primary tumor or a polyp.
tumor_sc_adata = sc_adata[
    sc_adata.obs['sample_type'].isin(('primary tumor', 'polyp'))
].copy()

#### Keep only 10x samples

In [None]:
#tumor_sc_adata.obs['platform'].value_counts()

In [14]:
# Keep only cells profiled on the two 10x platforms ('10x 3p' and '10x 5p').
tumor_sc_adata_10x = tumor_sc_adata[
    tumor_sc_adata.obs['platform'].isin(('10x 3p', '10x 5p'))
].copy()

In [17]:
# Dimensions (cells, genes) of the tumor-only, 10x-only subset.
tumor_sc_adata_10x.shape

(1392731, 28476)

In [15]:
#tumor_sc_adata_10x.obs['dataset'].unique().tolist()

In [16]:
#tumor_sc_adata_10x.obs['atlas_cell_type_middle'].value_counts()

## Spatial data

### Uniform spatial annotation

In [3]:
# Use the existing 'celltype' annotation as the starting imputation label
# (copied so the original column is left untouched).
spatial_adata.obs['cell_type_imputation'] = spatial_adata.obs['celltype'].copy()

### Remove smooth muscle cells from the spatial data

In [5]:
# Second spatial object without the cells annotated as 'Smooth muscle cell';
# `.copy()` makes it independent of `spatial_adata`.
spatial_smc_filtered_adata = spatial_adata[
    spatial_adata.obs['celltype'].ne('Smooth muscle cell')
].copy()

## Filter for shared genes

In [10]:
# True only if every gene of the spatial panel is also present in the scRNA-seq data.
set(spatial_adata.var_names) <= set(sc_adata.var_names)

False

In [12]:
# Genes of the spatial panel that are absent from the scRNA-seq data.
missing = set(spatial_adata.var_names).difference(sc_adata.var_names)
print(missing)

{'CCL3L1'}


In [13]:
# Not needed: genes missing from the scRNA-seq data are already handled by the stAI model.
#spatial_adata = spatial_adata[:, ~spatial_adata.var_names.isin(missing)].copy()

## Normalize annotation labels (lowercase, underscores) on the scRNA-seq and spatial objects

In [19]:
# Normalise the scRNA-seq imputation labels: lower-case, with spaces turned into
# underscores (e.g. 'T cell' -> 't_cell').
tumor_sc_adata_10x.obs['cell_type_imputation'] = (
    tumor_sc_adata_10x.obs['cell_type_imputation']
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

In [9]:
# Normalise the imputation labels of every spatial object that is saved below:
# spaces -> underscores, then lower-case (same convention as the scRNA-seq labels).
# `spatial_smc_filtered_adata` was copied from `spatial_adata` before this step, so it
# must be normalised explicitly; otherwise it is written to disk with the original
# labels (e.g. 'T cell'), which do not match the normalised scRNA-seq labels.
# The operation is idempotent, so re-running this cell is safe.
for _spatial_obj in (spatial_adata, spatial_smc_filtered_adata):
    _spatial_obj.obs['cell_type_imputation'] = (
        _spatial_obj.obs['cell_type_imputation']
        .str.replace(' ', '_', regex=False)
        .str.lower()
    )

## Save

In [20]:
# Rename the 'GeneSymbol' column of `.var` to 'gene_symbol'.
# NOTE(review): presumably this avoids a clash between that column and the identically
# named gene index (set from 'GeneSymbol' earlier) when writing to .h5ad — confirm.
tumor_sc_adata_10x.var = tumor_sc_adata_10x.var.rename(columns={"GeneSymbol": "gene_symbol"})

In [21]:
# Save the tumor-only, 10x-only scRNA-seq object with its imputation labels.
tumor_sc_adata_10x.write('../input data/sc_imputation_to_be_corrected.h5ad')

In [6]:
# Save the spatial object from which 'Smooth muscle cell' cells were removed.
spatial_smc_filtered_adata.write('../input data/spatial_smc_filtered_imputation.h5ad')

In [14]:
# Save the full spatial object (all cell types) with its imputation labels.
spatial_adata.write('../input data/spatial_imputation.h5ad')

In [15]:
#sc_adata = sc.read_h5ad('../input data/sc_imputation.h5ad')

In [16]:
#spatial_adata = sc.read_h5ad('../input data/spatial_imputation.h5ad')

In [20]:
# Dimensions (cells, genes) of the full spatial object.
spatial_adata.shape

(3706544, 380)