In [None]:
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix
import scanpy as sc
import numpy as np
import pandas as pd

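Set the variable below to the directory where you downloaded the data from NCBI GEO [GSE145926](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE145926). File names are appended to this string directly, so include the trailing path separator (for example `'/path/to/liao/'`).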

In [None]:
# Location of the downloaded GSE145926 files. File names are appended to this
# string as-is, so a non-empty value must end with the path separator.
liao_data_directory = ""

Load the expression matrix and the cell metadata, including the sub-cluster annotations, so that we can remove doublets.

In [3]:
# Read the expression matrix and transpose it before keeping it as adata_liao.
# NOTE(review): the other input files carry a `liao20_` prefix - confirm that
# this file name is the intended one.
expression_matrix_path = liao_data_directory + '_exprmatrix.tsv.gz'
adata_liao = sc.read(expression_matrix_path).T

In [None]:
# Load the cell metadata table and the two sub-type annotation tables
# (T cell and myeloid, going by the file names).
# Paths are built from liao_data_directory, the directory variable defined in
# the configuration cell; the previously used name `data_directory` is not
# defined anywhere in this notebook and raised a NameError on a fresh kernel.
liao_cellmeta = pd.read_csv(liao_data_directory + 'liao20_meta.tsv', sep='\t', index_col=0)
liao_cellmeta.index.name = None  # drop the index label inherited from the first column's header

liao_tcell_meta = pd.read_csv(liao_data_directory + 'liao20_tcell_meta.txt', sep='\t', index_col=0)
liao_tcell_meta.index.name = None

liao_myeloid_meta = pd.read_csv(liao_data_directory + 'liao20_myeloid_meta.txt', sep='\t', index_col=0)
liao_myeloid_meta.index.name = None

Add the cell metadata and the cell sub-type labels.

In [None]:
# Use the loaded cell metadata table as the per-observation annotations.
# NOTE(review): assumes liao_cellmeta has one row per cell of adata_liao, in
# the same order as the expression matrix - TODO confirm.
adata_liao.obs = liao_cellmeta

In [None]:
# Convert the label-like metadata columns to pandas categoricals.
# Columns are addressed with [] rather than attribute access on purpose:
# `adata_liao.obs.sample` resolves to the DataFrame.sample *method*, not the
# 'sample' column, so the attribute form never converted that column and
# instead shadowed the method on the DataFrame instance.
categorical_columns = [
    'sample',
    'sample_new',
    'group',
    'disease',
    'hasnCoV',
    'cluster',
    'celltype',
]
for column in categorical_columns:
    adata_liao.obs[column] = adata_liao.obs[column].astype('category')

In [None]:
# Attach the sub-type labels to the per-cell annotations.
# Both sub-type tables keep their label in a 'celltype' column; rename it so
# that it does not clash with the 'celltype' column already used in adata_liao.obs.
for subtype_table in (liao_tcell_meta, liao_myeloid_meta):
    subtype_table.rename(columns={'celltype': 'celltype_sub'}, inplace=True)

# Keep only the renamed label column of each table.
liao_tcell_meta_copy = liao_tcell_meta[['celltype_sub']]
liao_myeloid_meta_copy = liao_myeloid_meta[['celltype_sub']]

# Left-join the T cell labels on the cell index, then overwrite celltype_sub
# in place with the non-missing labels from the myeloid table.
adata_liao.obs = adata_liao.obs.merge(
    liao_tcell_meta_copy,
    how='left',
    left_index=True,
    right_index=True,
)
adata_liao.obs.update(liao_myeloid_meta_copy)

In [None]:
# Load the gene metadata table, indexed by its first column.
# The path is built from liao_data_directory, the directory variable defined in
# the configuration cell; the previously used name `data_directory` is not
# defined anywhere in this notebook and raised a NameError on a fresh kernel.
# NOTE(review): the file has a .tsv extension but is parsed with sep=',' -
# confirm the delimiter against the actual file.
liao_genemeta = pd.read_csv(liao_data_directory + 'liao20_gene_meta.tsv', sep=',', index_col=0)

In [None]:
# NOTE(review): assumes liao_genemeta has one row per variable of adata_liao,
# in the same order as the expression matrix - TODO confirm.
adata_liao.var = liao_genemeta # Use the gene metadata table as the per-variable annotations

In [None]:
# Store the expression matrix in CSR format and keep an independent copy of it
# in the 'counts' layer before the values are transformed below.
adata_liao.X = csr_matrix(adata_liao.X.copy())
adata_liao.layers['counts'] = adata_liao.X.copy()

# Normalise every cell to a total of 1e4, then log1p-transform.
sc.pp.normalize_total(adata_liao, target_sum=1e4)
sc.pp.log1p(adata_liao)

In [None]:
# Drop every cell whose sub-type label is 'Doublet'; cells without a sub-type
# label compare unequal to 'Doublet' and are therefore kept.
is_doublet = adata_liao.obs['celltype_sub'] == 'Doublet'
adata_liao_sub = adata_liao[~is_doublet].copy()

In [None]:
# Write the doublet-filtered object next to the input files, gzip-compressed.
output_path = liao_data_directory + 'liao20_sub.h5ad'
adata_liao_sub.write(output_path, compression='gzip')