# Loading and filtering features from a single-cell dataset

In [2]:
import pandas as pd        
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

In [3]:
# Directory holding the Tabula Sapiens immune dataset files.
path_data = "/shared/projects/microbiome_translocation/data/Tabula_sapiens_immune_all/"

# Load the complete dataset; the cells below subset it.
h5ad_path = path_data + "immune_all_cell.h5ad"
adata = sc.read(h5ad_path)

In [4]:
# Restrict the dataset to the cells whose publication tissue is "Blood".
is_blood = adata.obs.tissue_in_publication.isin(["Blood"])
blood_adata = adata[is_blood, :]

# Expression table: one row per cell, one column per gene.
df = blood_adata.to_df()

# Lookup lists used for relabelling: gene id -> gene name, cell id -> cell type.
gene_id = np.asarray(blood_adata.var.index).tolist()
gene_name = np.asarray(blood_adata.var.feature_name).tolist()
ID = blood_adata.obs.index.tolist()
types = np.asarray(blood_adata.obs.cell_type).tolist()

gene_id_to_name = dict(zip(gene_id, gene_name))
cell_id_to_type = dict(zip(ID, types))
df = df.rename(columns=gene_id_to_name, index=cell_id_to_type)
df

ensemblid,DDX11L1,WASH7P,MIR6859-1,MIR1302-2HG,MIR1302-2,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,...,MT-ND4,MT-TH,MT-TS2,MT-TL2,MT-ND5,MT-ND6,MT-TE,MT-CYB,MT-TT,MT-TP
erythrocyte,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.396265,0.0,0.0,0.0,0.000000,1.518963,0.0,1.794237,0.0,0.000000
erythrocyte,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.500448,0.0,0.0,0.0,0.558787,0.000000,0.0,0.847475,0.0,0.000000
"CD4-positive, alpha-beta memory T cell",0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.300591,0.0,0.0,0.0,3.070026,0.000000,0.0,3.761100,0.0,0.000000
"CD8-positive, alpha-beta cytokine secreting effector T cell",0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.610880,0.0,0.0,0.0,3.174738,2.748144,0.0,4.052015,0.0,1.957541
classical monocyte,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.128788,0.0,0.0,0.0,2.992075,0.000000,0.0,3.277448,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"CD4-positive, alpha-beta T cell",0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.468975,0.0,0.0,0.0,2.999030,0.000000,0.0,1.174247,0.0,0.000000
"CD4-positive, alpha-beta T cell",0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.049129,0.0,0.0,0.0,1.941740,1.271020,0.0,3.194520,0.0,0.000000
"CD4-positive, alpha-beta T cell",0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.143907,0.0,0.0,0.0,2.293076,0.000000,0.0,3.186714,0.0,7.515769
"CD4-positive, alpha-beta T cell",0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.991743,0.0,0.0,0.0,3.062743,2.814439,0.0,4.208903,0.0,7.788391


In [5]:
# Number of gene columns in the expression table.
df.shape[1]

58604

In [6]:
# Number of cell rows in the expression table.
df.shape[0]

50115

**58604 features and 50115 cells**

In [7]:
from collections import Counter

# Count cells per cell type (the row labels are cell types after the rename above).
cells_per_type = Counter(df.index)
cells_per_type

Counter({'erythrocyte': 10484,
         'CD4-positive, alpha-beta memory T cell': 949,
         'CD8-positive, alpha-beta cytokine secreting effector T cell': 1493,
         'classical monocyte': 7211,
         'neutrophil': 8456,
         'naive B cell': 2239,
         'mature NK T cell': 2948,
         'memory B cell': 854,
         'type I NK T cell': 518,
         'CD141-positive myeloid dendritic cell': 15,
         'CD8-positive, alpha-beta T cell': 1322,
         'plasma cell': 488,
         'T cell': 31,
         'platelet': 239,
         'naive thymus-derived CD4-positive, alpha-beta T cell': 412,
         'non-classical monocyte': 8,
         'hematopoietic stem cell': 70,
         'plasmacytoid dendritic cell': 11,
         'basophil': 29,
         'CD4-positive, alpha-beta T cell': 2863,
         'monocyte': 8972,
         'plasmablast': 10,
         'common myeloid progenitor': 3,
         'macrophage': 488,
         'granulocyte': 2})

In [22]:
# Keep only the genes whose expression summed over all cells is strictly positive.
gene_totals = df.sum(axis=0)
df_no_zero = df.loc[:, gene_totals > 0]
df_no_zero.shape[1]

52220

**52220 features with at least one cell expressing the corresponding gene**

In [17]:
# Export the filtered matrix (note: tab-separated despite the .csv extension).
expressed_matrix_path = path_data + "50000cell_immune_expressed.csv"
df_no_zero.to_csv(expressed_matrix_path, sep="\t")

In [20]:
# One cell-type label per line, in the row order of the expression table.
# `with` guarantees the handle is closed even if a write fails.
with open(path_data + '50k_cell_labels.txt', 'w') as label_file:
    label_file.writelines(f"{label}\n" for label in df.index.to_list())

# One gene name per line.
# The original cell iterated over `mat.columns`, but `mat` is never defined in
# this notebook, so it raised NameError on a fresh kernel. `df_no_zero` is the
# matrix exported to disk just before this cell, so its columns are used here.
# NOTE(review): confirm `df_no_zero` is the intended gene set.
with open(path_data + '50k_cells_geneset/50k_cell_genes.txt', 'w') as gene_file:
    gene_file.writelines(f"{gene}\n" for gene in df_no_zero.columns.to_list())

# Cross-validation data: Liu et al., Cell 2021

In [1]:
import pandas as pd        
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from collections import Counter

def adata_to_expr_df(adata, rename_cell=True, rename_gene=True):
    """Return the expression table of ``adata`` as a DataFrame, optionally relabelled.

    Parameters
    ----------
    adata
        Object exposing ``to_df()``, ``var`` and ``obs`` (as the AnnData
        objects read with ``sc.read`` in this notebook do).
    rename_cell : bool, default True
        If true, row labels (``adata.obs.index``) are replaced by the
        corresponding ``adata.obs.cell_type`` values.
    rename_gene : bool, default True
        If true, column labels (``adata.var.index``) are replaced by the
        corresponding ``adata.var.feature_name`` values.

    Returns
    -------
    pandas.DataFrame
        The table returned by ``adata.to_df()``, with the requested renames applied.

    Notes
    -----
    Annotation columns are only read when the matching flag is set, so an
    object without ``cell_type`` (or ``feature_name``) can still be converted
    with ``rename_cell=False`` (or ``rename_gene=False``).
    """
    df = adata.to_df()

    # Build only the mappings that were asked for.
    rename_kwargs = {}
    if rename_gene:
        gene_id = np.asarray(adata.var.index).tolist()
        gene_name = np.asarray(adata.var.feature_name).tolist()
        rename_kwargs["columns"] = dict(zip(gene_id, gene_name))
    if rename_cell:
        ID = adata.obs.index.tolist()
        types = np.asarray(adata.obs.cell_type).tolist()
        rename_kwargs["index"] = dict(zip(ID, types))

    # DataFrame.rename() requires at least one mapping, so skip it when both flags are off.
    if rename_kwargs:
        df = df.rename(**rename_kwargs)

    return df

In [2]:
# Root data directory for the cross-validation dataset.
path_data = "/shared/projects/microbiome_translocation/data/"

# Load the "adaptative" file, then keep only the cells whose `disease` annotation is "normal".
adaptative = sc.read(path_data + "scRNAseq/Liu_2021_cell/adaptative.h5ad")
adaptative_is_normal = adaptative.obs.disease.isin(["normal"])
adaptative = adaptative[adaptative_is_normal, :]

# Expression table with gene names as columns and cell types as row labels.
adaptative_df = adata_to_expr_df(adaptative)
adaptative_df

feature_id,TXNDC2,INKA2,LAMB3,RP11-598F7.6,NDUFA9,PAGE2,SLC4A1AP,OR1D5,LINC02349,RP11-446H18.6,...,DIP2B,LINC02754,CPSF3,GPR17,IFITM2,NETO1,PRR23D2,ADGB,RP1-266L20.2,TRPM3
naive B cell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
memory B cell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,1.172221,0.0,0.0,0.0,0.0,0.0
memory B cell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,1.327402,0.0,0.0,0.0,0.0,0.0
memory B cell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,1.347603,0.0,0.0,0.0,0.0,0.0
naive B cell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"CD4-positive, alpha-beta memory T cell",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.928863,0.0,1.963822,0.0,0.0,0.0,0.0,0.0
"CD4-positive, alpha-beta memory T cell",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,1.175153,0.0,0.0,0.0,0.0,0.0
"CD4-positive, alpha-beta memory T cell",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,2.912975,0.0,0.0,0.0,0.0,0.0
memory B cell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.124465,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [3]:
# Load the "innate" file, then keep only the cells whose `disease` annotation is "normal".
innate = sc.read(path_data + "scRNAseq/Liu_2021_cell/innate.h5ad")
innate_is_normal = innate.obs.disease.isin(["normal"])
innate = innate[innate_is_normal, :]

# Expression table with gene names as columns and cell types as row labels.
innate_df = adata_to_expr_df(innate)
innate_df

feature_id,TXNDC2,INKA2,LAMB3,RP11-598F7.6,NDUFA9,PAGE2,SLC4A1AP,OR1D5,LINC02349,RP11-446H18.6,...,DIP2B,LINC02754,CPSF3,GPR17,IFITM2,NETO1,PRR23D2,ADGB,RP1-266L20.2,TRPM3
"CD16-positive, CD56-dim natural killer cell, human",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,2.743088,0.0,0.0,0.0,0.0,0.0
"CD16-positive, CD56-dim natural killer cell, human",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,2.371941,0.0,0.0,0.0,0.0,0.0
classical monocyte,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,2.623779,0.0,0.0,0.0,0.0,0.0
classical monocyte,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,1.886934,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
classical monocyte,0.0,0.0,0.0,0.0,0.970632,0.0,0.970632,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,1.453772,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"CD16-positive, CD56-dim natural killer cell, human",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,3.070716,0.0,0.0,0.0,0.0,0.0
classical monocyte,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,3.267242,0.0,0.0,0.0,0.0,0.0
classical monocyte,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.838333,0.0,1.596864,0.0,0.0,0.0,0.0,0.0
"CD16-positive, CD56-dim natural killer cell, human",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,1.148602,0.0,2.465460,0.0,0.0,0.0,0.0,0.0


In [6]:
# Stack the two tables row-wise: innate cells first, then adaptative cells.
merged_data = pd.concat((innate_df, adaptative_df), axis=0)
merged_data

feature_id,TXNDC2,INKA2,LAMB3,RP11-598F7.6,NDUFA9,PAGE2,SLC4A1AP,OR1D5,LINC02349,RP11-446H18.6,...,DIP2B,LINC02754,CPSF3,GPR17,IFITM2,NETO1,PRR23D2,ADGB,RP1-266L20.2,TRPM3
"CD16-positive, CD56-dim natural killer cell, human",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,2.743088,0.0,0.0,0.0,0.0,0.0
"CD16-positive, CD56-dim natural killer cell, human",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,2.371941,0.0,0.0,0.0,0.0,0.0
classical monocyte,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,2.623779,0.0,0.0,0.0,0.0,0.0
classical monocyte,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,1.886934,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
classical monocyte,0.0,0.0,0.0,0.0,0.970632,0.0,0.970632,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,1.453772,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"CD4-positive, alpha-beta memory T cell",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.928863,0.0,1.963822,0.0,0.0,0.0,0.0,0.0
"CD4-positive, alpha-beta memory T cell",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,1.175153,0.0,0.0,0.0,0.0,0.0
"CD4-positive, alpha-beta memory T cell",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,2.912975,0.0,0.0,0.0,0.0,0.0
memory B cell,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,1.124465,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [8]:
# Count cells per cell type in the merged table.
merged_type_counts = Counter(merged_data.index)
merged_type_counts

Counter({'CD16-positive, CD56-dim natural killer cell, human': 6805,
         'classical monocyte': 13464,
         'non-classical monocyte': 2892,
         'natural killer cell': 1241,
         'conventional dendritic cell': 355,
         'plasmacytoid dendritic cell': 354,
         'platelet': 391,
         'CD16-negative, CD56-bright natural killer cell, human': 261,
         'granulocyte': 140,
         'intermediate monocyte': 62,
         'naive B cell': 3909,
         'memory B cell': 6102,
         'CD8-positive, alpha-beta memory T cell': 10758,
         'regulatory T cell': 621,
         'mucosal invariant T cell': 2520,
         'plasmablast': 249,
         'naive thymus-derived CD8-positive, alpha-beta T cell': 4172,
         'CD4-positive, alpha-beta memory T cell': 26710,
         'naive thymus-derived CD4-positive, alpha-beta T cell': 8020,
         'memory T cell': 341,
         'gamma-delta T cell': 3181,
         'double negative thymocyte': 1496,
         'double-pos

In [9]:
def replace_label(lab_list, pattern_list, replace_list, replace_other = "other_cell"):
    """Relabel the entries of ``lab_list`` in place by substring matching.

    Each pattern in ``pattern_list`` is tested with ``in`` against the label's
    original value. On a match the label becomes the entry of ``replace_list``
    at the same position as the pattern. All patterns are always tested, so
    when several match, the LAST matching pattern determines the result.

    A label matching none of the patterns is set to ``replace_other``, unless
    ``replace_other`` is ``None``, in which case it is left unchanged. With an
    empty ``pattern_list`` nothing is modified.

    Parameters
    ----------
    lab_list : list
        Labels to rewrite; this list is modified in place.
    pattern_list : list
        Substrings to look for.
    replace_list : list
        Replacement labels, indexed like ``pattern_list``.
    replace_other : optional
        Label given to entries matching no pattern (default ``"other_cell"``).

    Returns
    -------
    list
        The same ``lab_list`` object, after modification.
    """
    n_patterns = len(pattern_list)
    use_fallback = replace_other != None

    for pos in range(len(lab_list)):
        # Matching is always done against the value the label had on entry.
        original = lab_list[pos]
        misses = 0

        for k in range(n_patterns):
            if pattern_list[k] in original:
                lab_list[pos] = replace_list[k]
            else:
                misses += 1
                # Only reachable on the last pattern, when every pattern missed.
                if use_fallback and misses == n_patterns:
                    lab_list[pos] = replace_other

    return lab_list

In [53]:
# Coarse labels kept for the analysis; position i corresponds to patterns[i] below.
label_list = ["CD4","CD8","Monocyte","memory_B_cell","naive_B_cell","NK_cell"]
patterns = ["CD4","CD8","monocyte","memory B cell","naive B cell","natural"]

# Collapse the detailed cell types into the coarse labels;
# cells that do not interest us (no pattern matches) are labelled 'other'.
indexes = merged_data.index.to_list()
new_index = replace_label(
    indexes,
    pattern_list=patterns,
    replace_list=label_list,
    replace_other='other',
)
Counter(new_index)

Counter({'NK_cell': 8307,
         'Monocyte': 16418,
         'other': 9728,
         'naive_B_cell': 3909,
         'memory_B_cell': 6102,
         'CD8': 14930,
         'CD4': 34730})

In [62]:
# True for every cell whose coarse label is one of the kept classes, False otherwise.
bool_list = [item in label_list for item in new_index]

Counter(bool_list)

Counter({True: 84396, False: 9728})

In [60]:
# Replace the row labels of the merged table with the relabelled list `new_index`
# (this modifies `merged_data` in place).
merged_data.index = new_index

In [61]:
# Keep only the cells whose (relabelled) row label is one of the classes in `label_list`.
# The mask is derived from the current index rather than the positional `bool_list`,
# which selects the same rows but lets this cell be re-run safely: a second run with
# `bool_list` fails because its length no longer matches the already-filtered table.
merged_data = merged_data.loc[merged_data.index.isin(label_list)]
merged_data

feature_id,TXNDC2,INKA2,LAMB3,RP11-598F7.6,NDUFA9,PAGE2,SLC4A1AP,OR1D5,LINC02349,RP11-446H18.6,...,DIP2B,LINC02754,CPSF3,GPR17,IFITM2,NETO1,PRR23D2,ADGB,RP1-266L20.2,TRPM3
NK_cell,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,2.743088,0.0,0.0,0.0,0.0,0.0
NK_cell,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,2.371941,0.0,0.0,0.0,0.0,0.0
Monocyte,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,2.623779,0.0,0.0,0.0,0.0,0.0
Monocyte,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,1.886934,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Monocyte,0.0,0.0,0.0,0.0,0.970632,0.0,0.970632,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,1.453772,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CD4,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.928863,0.0,1.963822,0.0,0.0,0.0,0.0,0.0
CD4,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,1.175153,0.0,0.0,0.0,0.0,0.0
CD4,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,2.912975,0.0,0.0,0.0,0.0,0.0
memory_B_cell,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,1.124465,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [64]:
# Keep only the genes whose expression summed over the remaining cells is strictly positive.
merged_gene_totals = merged_data.sum(axis=0)
merged_data_no_zero = merged_data.loc[:, merged_gene_totals > 0]
merged_data_no_zero

feature_id,TXNDC2,INKA2,LAMB3,NDUFA9,SLC4A1AP,RP11-446H18.6,C6orf136,BCL2L10,SETBP1-DT,FAM156B,...,LINC02418,DIP2B,LINC02754,CPSF3,GPR17,IFITM2,NETO1,ADGB,RP1-266L20.2,TRPM3
NK_cell,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,2.743088,0.0,0.0,0.0,0.0
NK_cell,0.0,0.0,0.0,0.000000,0.000000,0.0,1.232427,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,2.371941,0.0,0.0,0.0,0.0
Monocyte,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,2.623779,0.0,0.0,0.0,0.0
Monocyte,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.0,1.886934,0.0,0.000000,0.0,0.0,0.0,0.0
Monocyte,0.0,0.0,0.0,0.970632,0.970632,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,1.453772,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CD4,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.928863,0.0,1.963822,0.0,0.0,0.0,0.0
CD4,0.0,0.0,0.0,0.000000,0.000000,0.0,1.175153,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,1.175153,0.0,0.0,0.0,0.0
CD4,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,2.912975,0.0,0.0,0.0,0.0
memory_B_cell,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,1.124465,...,0.0,1.124465,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0


In [66]:
# Output directory for the Liu et al. 2021 exports.
path_data = "/shared/projects/microbiome_translocation/data/scRNAseq/Liu_2021_cell/"

# Filtered expression matrix (note: tab-separated despite the .csv extension).
merged_data_no_zero.to_csv(path_data+"Liu2021_Cell.csv", sep='\t')

# One row label per line, in the row order of the exported matrix.
# `with` guarantees each handle is closed even if a write fails.
with open(path_data+'Liu2021_index.txt','w') as index_file:
    index_file.writelines(f"{cells}\n" for cells in merged_data_no_zero.index.to_list())

# One gene name per line, in the column order of the exported matrix.
with open(path_data+'Liu2021_genes.txt','w') as genes_file:
    genes_file.writelines(f"{genes}\n" for genes in merged_data_no_zero.columns.to_list())