In [None]:
import scanpy as sc
from pathlib import Path
import pandas as pd

# Directory that is searched for the "*.txt.gz" count files.
base_dir = "data/GSE201575"
# Path of the sample metadata CSV (handed to pd.read_csv in the metadata cell).
meta = "meta.csv"

### Read count matrix


In [2]:
# Collect the count files in a deterministic (sorted) order.
path_list = sorted(Path(base_dir).glob("*.txt.gz"))
if not path_list:
    # Fail here with a clear message instead of letting sc.concat choke on an
    # empty list further down (typically caused by a wrong base_dir).
    raise FileNotFoundError(f"No '*.txt.gz' count files found in {base_dir!r}")

adata_list = []
for count_path in path_list:
    # The table is transposed after reading, so the rows of the file end up as
    # variables and its columns as observations
    # (presumably genes x sample on disk - confirm against the raw files).
    sample_adata = sc.read_text(
        count_path, delimiter="\t", first_column_names=True
    ).T

    # Path.stem only strips the final ".gz"; keep the part before the first
    # "." and use the second "_"-separated token of it as the observation name.
    # The one-element lists below require exactly one observation per file.
    base_name = count_path.stem.split(".")[0]
    sample_adata.obs.index = [base_name.split("_")[1]]
    sample_adata.obs["filename"] = [count_path.stem]
    adata_list.append(sample_adata)

# Stack all per-file objects along the observation axis (anndata's default
# join="inner" keeps only the variables shared by every file).
adata = sc.concat(adata_list)

### Read metadata


In [3]:
# Load metadata
# `meta` starts out as the CSV path and is rebound to the loaded table below.
# The isinstance guard keeps this cell re-runnable: on a second execution
# `meta` is already a DataFrame and must not be passed to pd.read_csv again.
# (Restart the kernel to pick up changes made to the CSV on disk.)
if not isinstance(meta, pd.DataFrame):
    meta = pd.read_csv(meta)

# Keep only the rows without any missing value.
meta = meta.dropna()
meta

Unnamed: 0,set,sample_id,sample,cell,condition,sample_rna
36,S1,nanoSPLITs_C10_211005J_1cell_A2,C10_A2,C10,A2,43C10
37,S1,nanoSPLITs_C10_211005J_1cell_A3,C10_A3,C10,A3,38C10
38,S1,nanoSPLITs_C10_211005J_1cell_A4,C10_A4,C10,A4,44C10
39,S1,nanoSPLITs_C10_211005J_1cell_A5,C10_A5,C10,A5,39C10
40,S1,nanoSPLITs_C10_211005J_1cell_A6,C10_A6,C10,A6,45C10
...,...,...,...,...,...,...
101,S1,nanoSPLITs_SVEC_211007J_1cell_D2,SVEC_D2,SVEC,D2,7SVEC
102,S1,nanoSPLITs_SVEC_211007J_1cell_D3,SVEC_D3,SVEC,D3,2SVEC
103,S1,nanoSPLITs_SVEC_211007J_1cell_D6,SVEC_D6,SVEC,D6,9SVEC
104,S1,nanoSPLITs_SVEC_211007J_1cell_D8,SVEC_D8,SVEC,D8,10SVEC


### Add metadata and keep only samples with complete metadata


In [4]:
# Remember the observation names: pd.merge returns a frame with a fresh
# RangeIndex, which would otherwise replace them.
obs_names = adata.obs.index.copy()
adata.obs["sample_rna"] = obs_names

# Left join so every observation keeps its row (and its order); observations
# without a metadata match get NaN in the metadata columns.
# validate="many_to_one" raises if `sample_rna` is duplicated in `meta`, which
# would otherwise multiply rows and break the assignment to adata.obs.
obs_annotated = pd.merge(
    adata.obs, meta, on="sample_rna", how="left", validate="many_to_one"
)
obs_annotated.index = obs_names
adata.obs = obs_annotated

# Keep only observations whose obs row is fully populated, i.e. those that
# found a metadata match. A boolean mask avoids selecting by index labels.
has_metadata = adata.obs.notna().all(axis=1).to_numpy()
adata = adata[has_metadata].copy()

# From here on observations are identified by their metadata sample_id.
adata.obs.index = adata.obs["sample_id"].values
adata

  return dispatch(args[0].__class__)(*args, **kw)


AnnData object with n_obs × n_vars = 70 × 40207
    obs: 'filename', 'sample_rna', 'set', 'sample_id', 'sample', 'cell', 'condition'

### Basic filtering and normalization


In [5]:
# Stash a copy of the current X matrix in a "counts" layer before X is
# normalised and log-transformed below.
adata.layers["counts"] = adata.X.copy()

# Filtering thresholds: genes must be detected in at least this many cells,
# cells must have at least this many detected genes.
min_cells_per_gene = 3
min_genes_per_cell = 200

# Genes are filtered first, then cells.
sc.pp.filter_genes(adata, min_cells=min_cells_per_gene, inplace=True)
sc.pp.filter_cells(adata, min_genes=min_genes_per_cell, inplace=True)

# Total-count normalisation with scanpy's default settings, followed by log1p.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

### Saving AnnData object


In [6]:
# Write the processed object to an .h5ad file in the working directory.
output_h5ad = "01_GSE201575.h5ad"
adata.write_h5ad(output_h5ad)