In [1]:
import pandas as pd
import scanpy as sc
import anndata as ad

import numpy as np
from scipy.io import mmwrite

import datatable as dt

from sklearn.preprocessing import OneHotEncoder

## Firstly, loading the H5AD data file



The H5AD file can be downloaded from [figshare](https://doi.org/10.6084/m9.figshare.24151701). This dataset comprises gene expression counts obtained from individuals exposed to diverse infections. It was derived from the dataset provided by the COvid-19 Multi-omics Blood ATlas (COMBAT) Consortium; the original dataset is available on [Zenodo](https://doi.org/10.5281/zenodo.6120249).

In [2]:
# Read the AnnData object from the H5AD file in the working directory.
adata_gex = sc.read_h5ad(filename='infection_endotype_data.h5ad')

In [7]:
# Replace X with a copy of the stored 'raw' layer. The .copy() means later
# in-place changes to X do not alter layers['raw'].
# NOTE(review): 'raw' presumably holds the unnormalised counts — confirm.
adata_gex.X = adata_gex.layers['raw'].copy()

## Secondly, normalizing gene expression and finding highly variable genes

In [12]:
# normalize_total and log1p rewrite adata_gex.X in place, so rebuild X from the
# untouched 'raw' layer first. This keeps the cell idempotent: re-running it
# never normalises values that were already normalised by a previous run.
adata_gex.X = adata_gex.layers['raw'].copy()

# Scale every cell to a total of 1e6, then apply log(1 + x) to X.
sc.pp.normalize_total(adata_gex, target_sum=1e6)
sc.pp.log1p(adata_gex)

# Flag the top 4000 highly variable genes. The selection reads the 'raw' layer
# rather than the log-normalised X (the seurat_v3 flavor is documented as
# expecting count data); the result is stored in adata_gex.var.highly_variable.
sc.pp.highly_variable_genes(adata_gex, layer='raw', flavor='seurat_v3', n_top_genes=4000)

## Thirdly, saving data and metadata

In [13]:
# Write the per-observation metadata table (index included) to CSV.
adata_gex.obs.to_csv(path_or_buf='infection_endotype_meta.csv')

In [14]:
# Export the matrix currently held in X, restricted to the genes flagged as
# highly variable, as a dense CSV written through datatable.
hvg_mask = adata_gex.var.highly_variable
hvg_expression = adata_gex.X[:, hvg_mask]
df = pd.DataFrame(hvg_expression.todense())
dt.Frame(df).to_csv('infection_endotype.csv')

In [15]:
# Export the 'raw' layer, restricted to the genes flagged as highly variable,
# as a dense CSV written through datatable.
hvg_mask = adata_gex.var.highly_variable
hvg_counts = adata_gex.layers['raw'][:, hvg_mask]
df = pd.DataFrame(hvg_counts.todense())
dt.Frame(df).to_csv('infection_endotype_counts.csv')

In [16]:
# Save the observation names and the names of the highly variable genes, one
# name per row. index=False (the documented boolean) keeps the pandas index out
# of the files; the previous index=None only worked because None is falsy.
adata_gex.obs_names.to_frame().to_csv('infection_endotype_cell.csv', index=False)
adata_gex.var_names[adata_gex.var.highly_variable].to_frame().to_csv('infection_endotype_gene.csv', index=False)

In [21]:
# One-hot encode the 'Severity' column and save it with one column per category.
# OneHotEncoder needs a 2-D input, hence the reshape to a single column.
severity_values = adata_gex.obs['Severity'].to_numpy().reshape(-1, 1)

# The `sparse=` constructor argument was renamed to `sparse_output` in
# scikit-learn 1.2 and later removed, so neither spelling works everywhere.
# Using the default (sparse) output and densifying it runs on every version.
enc = OneHotEncoder()
severity_onehot = enc.fit_transform(severity_values).toarray()

# categories_ holds one array per encoded feature; only one feature here, so
# take element 0 to get a flat header instead of a one-level MultiIndex.
# The to_csv result (None) is intentionally not assigned to anything.
pd.DataFrame(severity_onehot, columns=enc.categories_[0]).to_csv('infection_endotype_phenotype.txt', index=False)

In [22]:
# One-hot encode the covariate columns listed in `factors` and save them with
# headers of the form "<factor>_<category>".
factors = ['scRNASeq_sample_ID', 'Pool_ID']
factor_values = adata_gex.obs[factors].to_numpy()

# The `sparse=` constructor argument was renamed to `sparse_output` in
# scikit-learn 1.2 and later removed, so neither spelling works everywhere.
# Using the default (sparse) output and densifying it runs on every version.
enc = OneHotEncoder()
factor_onehot = enc.fit_transform(factor_values).toarray()

# categories_ is ordered like the input columns, so pairing it with `factors`
# yields the header names in the same order as the encoded output columns.
colnames = [
    factor + "_" + str(category)
    for factor, categories in zip(factors, enc.categories_)
    for category in categories
]

pd.DataFrame(factor_onehot, columns=colnames).to_csv('infection_endotype_uwv.txt', index=False)