# **Slide-seq lung: data processing**

In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import warnings
# Silence all Python warnings from this point on.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# emitted by the libraries imported above are hidden as well.
warnings.filterwarnings("ignore")

# Project directory layout.
# The base directory can be overridden with the SLIDE_SEQ_LUNG_BASE_PATH
# environment variable so the notebook can run on another machine; when the
# variable is unset, the original location is used.
BASE_PATH = os.environ.get(
    "SLIDE_SEQ_LUNG_BASE_PATH",
    "/home/projects/nyosef/oier/Harreman_files/Slide_seq_lung",
)
# Directory holding the AnnData (.h5ad) files of this project.
ADATA_PATH = os.path.join(BASE_PATH, 'h5ads')
# Sub-directory with the raw per-sample .h5ad files read by the loading cell.
RAW_ADATA_PATH = os.path.join(ADATA_PATH, 'raw')
# Additional data directory (not referenced by the cells shown in this notebook section).
DATA_PATH = os.path.join(BASE_PATH, 'data')

## Load dataset

The data were obtained from the CELLxGENE collection at https://cellxgene.cziscience.com/collections/02b01703-bf1b-48de-b99a-23bef8cccc81.

In [2]:
# Load every raw per-sample .h5ad file and prepare it for concatenation.
# The result is a dict mapping sample name -> AnnData with three extra layers:
# 'counts', 'normalized' and 'log_norm'.
adatas = {}
# os.listdir() returns entries in arbitrary order; sorting makes the sample
# order (and hence the cell order after concatenation) reproducible.
for data in sorted(os.listdir(RAW_ADATA_PATH)):
    # Skip anything that is not an .h5ad file (hidden files, checkpoints, ...),
    # which read_h5ad would otherwise fail on.
    if not data.endswith('.h5ad'):
        continue

    # The sample name is the file name up to its first dot.
    sample = data.split('.')[0]

    adata = ad.read_h5ad(os.path.join(RAW_ADATA_PATH, data))
    # Suffix each observation name with the sample name so that names from
    # different samples cannot collide once the samples are concatenated.
    adata.obs_names = [ind + '_' + sample for ind in adata.obs_names]
    adata.obs['sample'] = sample
    # Keep the original var index in a column (presumably gene IDs -- confirm
    # against the raw files), then index the variables by 'feature_name'.
    adata.var['gene_ids'] = adata.var_names
    adata.var_names = adata.var['feature_name']

    # Store an independent copy of X so later changes to X cannot silently
    # alter the 'counts' layer (plain assignment would alias the same matrix).
    adata.layers['counts'] = adata.X.copy()
    # inplace=False leaves adata.X untouched and returns the normalized matrix
    # under the "X" key of the result.
    norm = sc.pp.normalize_total(adata, target_sum=1e4, inplace=False)
    adata.layers["normalized"] = norm["X"]
    # copy=True keeps the 'normalized' layer unmodified by the log transform.
    adata.layers["log_norm"] = sc.pp.log1p(norm["X"], copy=True)

    adatas[sample] = adata

In [3]:
# Stack all per-sample objects into one AnnData.
# join="inner" is the anndata default, written out here for clarity: only
# variables present in every sample are kept.
# Note: this rebinds `adata`, which until now held the last sample loaded.
adata = ad.concat(list(adatas.values()), join="inner")

In [4]:
# Show the summary (dimensions, obs columns, obsm keys, layers) of the concatenated object.
adata

AnnData object with n_obs × n_vars = 200394 × 21387
    obs: 'n_genes', 'n_UMIs', 'log10_n_UMIs', 'log10_n_genes', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'Cell_Type', 'cell_type_ontology_term_id', 'donor_id', 'is_primary_data', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'sample'
    obsm: 'spatial'
    layers: 'counts', 'normalized', 'log_norm'

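We save the concatenated AnnData object to `ADATA_PATH` (the write call below is currently commented out; uncomment it to regenerate the file).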

In [5]:
# adata.write(os.path.join(ADATA_PATH, 'Slide_seq_lung_adata.h5ad'))