In [1]:
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import celltypist
import gc
import anndata
from celltypist import models
import h5py
import scipy.sparse as scs
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from adjustText import adjust_text
import scanpy.external as sce
from NMFproj import *

In [2]:
# See 02_NMF_CD4_T_cell_Projection/00_Download_NMFprojection_Matrix.ipynb
# for instructions on how to download the files read below.
fixed_W = pd.read_csv("NMF.W.CD4T.csv").rename(columns={'Unnamed: 0': ''})
# Promote the first column (renamed to '' just above) to the row index.
fixed_W = fixed_W.set_index(fixed_W.columns[0])

In [3]:
def read_mat(h5_con):
    """Build a SciPy CSC matrix from the ``matrix`` group of an open HDF5 handle.

    Parameters
    ----------
    h5_con : mapping-like (e.g. an open ``h5py.File``)
        Must expose ``['matrix']`` with the entries ``data``, ``indices``,
        ``indptr`` and ``shape``.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix with the dimensions stored under ``shape``.
    """
    group = h5_con['matrix']
    counts = group['data'][:]          # count values
    row_indices = group['indices'][:]  # row index of every stored value
    col_pointers = group['indptr'][:]  # start offset of every column
    dims = tuple(group['shape'][:])    # matrix dimensions
    return scs.csc_matrix((counts, row_indices, col_pointers), shape=dims)


def read_obs(h5con):
    """Collect per-cell metadata from the ``matrix`` group of an open HDF5 handle.

    Parameters
    ----------
    h5con : mapping-like (e.g. an open ``h5py.File``)
        Must expose ``['matrix']['barcodes']`` and ``['matrix']['observations']``.

    Returns
    -------
    pandas.DataFrame
        A ``barcodes`` column followed by one column per entry found under
        ``observations``. Byte-string values are decoded as UTF-8.
    """
    matrix_group = h5con['matrix']

    # Start the table with the decoded cell barcodes.
    barcodes = [x.decode('UTF-8') for x in matrix_group['barcodes'][:]]
    obs_df = pd.DataFrame({'barcodes': barcodes})

    observations = matrix_group['observations']
    for col in observations.keys():
        values = observations[col][:]
        # Only the first element is inspected to detect byte storage; guard the
        # peek so that an empty column (file with zero cells) does not raise.
        if len(values) > 0 and isinstance(values[0], (bytes, bytearray)):
            values = [x.decode('UTF-8') for x in values]
        obs_df[col] = values

    return obs_df
def read_h5_anndata(h5_file):
    """Construct an AnnData object from one .h5 file.

    Parameters
    ----------
    h5_file : str or path-like
        Path of the HDF5 file to read.

    Returns
    -------
    anndata.AnnData
        The stored matrix is transposed so that rows line up with the barcode
        table returned by ``read_obs``; ``var_names`` are the feature names
        stored under ``matrix/features/name``, made unique.
    """
    # The context manager closes the file handle even if reading fails.
    # read_mat / read_obs and the gene list below copy everything into memory
    # with [:], so nothing refers to the file once the block is left.
    with h5py.File(h5_file, mode='r') as h5_con:
        # extract the expression matrix
        mat = read_mat(h5_con)
        # extract gene names
        genes = [x.decode('UTF-8') for x in h5_con['matrix']['features']['name'][:]]
        # extract metadata
        obs_df = read_obs(h5_con)

    # construct anndata
    adata = anndata.AnnData(mat.T, obs=obs_df)
    # make sure the gene names are aligned with the matrix columns
    adata.var_names = genes
    adata.var_names_make_unique()
    return adata
def get_last_pattern(inputstr):
    """Return the trailing run of non-``/`` characters of ``inputstr``.

    Parameters
    ----------
    inputstr : str
        Typically a slash-separated path.

    Returns
    -------
    str
        The text after the last ``/`` (the whole string if it contains no
        ``/``), or ``""`` when the string is empty or ends with ``/``.
    """
    # Imported locally: the notebook's import cell does not import `re`, so the
    # function otherwise depends on the star import happening to provide it.
    import re

    pattern = r"[^/]+(?=$)"
    match = re.search(pattern, inputstr)
    return match.group(0) if match else ""
def process_file(file_name):
    """Load one sample and attach doublet scores, AIFI labels and QC metrics.

    Parameters
    ----------
    file_name : str
        Path of the sample's .h5 file, passed to ``read_h5_anndata``.

    Returns
    -------
    anndata.AnnData
        The sample with ``obs`` extended by the columns of
        ``Doublet_Scores/<sample>.csv`` and by ``AIFI_L1`` / ``AIFI_L2`` /
        ``AIFI_L3`` from ``Labels/<sample>_L{1,2,3}_predicted_labels.csv``,
        indexed by barcode, plus the metrics added by
        ``sc.pp.calculate_qc_metrics`` (with ``mito`` as a QC variable).

    Notes
    -----
    Prints four booleans (doublet scores, L1, L2, L3) telling whether each
    table lists the barcodes in exactly the same order as ``adata.obs``.
    """
    adata = read_h5_anndata(file_name)

    # The sample id is taken from the first cell. .iloc[0] is explicitly
    # positional, so it does not depend on the labels of the obs index.
    sample_id = adata.obs['pbmc_sample_id'].iloc[0]

    doublet_scores = pd.read_csv('Doublet_Scores/' + sample_id + '.csv', index_col=0)

    # One label table per annotation level; after index_col=0 each file is
    # expected to have exactly two columns left (barcode, predicted label).
    label_tables = []
    for level in ('L1', 'L2', 'L3'):
        labels = pd.read_csv('Labels/' + sample_id + '_' + level + '_predicted_labels.csv',
                             index_col=0)
        labels.columns = ['barcodes', 'AIFI_' + level]
        label_tables.append(labels)

    annotations = [doublet_scores] + label_tables

    # Report (not enforce) whether every table is ordered like adata.obs.
    expected_barcodes = adata.obs['barcodes'].tolist()
    print(*[table['barcodes'].tolist() == expected_barcodes for table in annotations])

    # Left merges keep every cell of adata.obs and attach annotations by barcode.
    for table in annotations:
        adata.obs = pd.merge(adata.obs, table, on='barcodes', how='left')
    adata.obs.index = adata.obs['barcodes']

    # Flag mitochondrial genes by their "MT-" prefix and compute QC metrics.
    adata.var["mito"] = adata.var_names.str.startswith("MT-")
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mito"], inplace=True)

    return adata

In [4]:
# Sample sheet; its `combined_sample_id` column gives the base name of each .h5 file.
meta_data = pd.read_csv('meta_data_GEO.csv')
file_list = [
    "GSE214546_Data/" + sample_id + '.h5'
    for sample_id in meta_data['combined_sample_id']
]


# Combine RNA data

In [5]:
%%time
file_names = file_list
# Load all samples concurrently. executor.map yields results in the order of
# file_names, so h5_list (and everything concatenated from it) has the same
# order on every run, independent of which worker finishes first.
with ThreadPoolExecutor(max_workers=16) as executor:
    results = tqdm(executor.map(process_file, file_names), total=len(file_names))
    # Exceptions raised inside process_file propagate here, as before.
    h5_list = [adata for adata in results if adata is not None]

  0% 0/16 [00:00<?, ?it/s]

True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True


100% 16/16 [02:03<00:00,  7.73s/it]

CPU times: user 1min 5s, sys: 19.1 s, total: 1min 24s
Wall time: 2min 3s





In [6]:
# Stack the per-sample AnnData objects collected in h5_list into a single object.
# NOTE(review): this relies on anndata.concat's default handling of variables
# that differ between samples — confirm the default matches the intent.
combined= anndata.concat(h5_list)

In [7]:
# Cell-level QC: keep cells with < 15 % mitochondrial counts, fewer than 2500
# detected genes (n_genes_by_counts) and more than 200 genes (n_genes).
# NOTE(review): n_genes is not created by any call visible in this notebook;
# presumably it comes from the .h5 observations — verify.
qc_pass = (
    (combined.obs["pct_counts_mito"] < 15)
    & (combined.obs["n_genes_by_counts"] < 2500)
    & (combined.obs["n_genes"] > 200)
)
combined = combined[qc_pass]

# Drop predicted doublets; the mask is computed on the already filtered object.
is_singlet = combined.obs['predicted_doublet'] == False
combined = combined[is_singlet]

# Keep CD4 T cells only

In [8]:
# AIFI_L3 labels of the cell types kept for the NMF projection.
selected_celltype = [
    'Core naive CD4 T cell',
    'CM CD4 T cell',
    'GZMB- CD27+ EM CD4 T cell',
    'CD4 MAIT',
    'KLRB1+ memory CD4 Treg',
    'KLRF1- GZMB+ CD27- memory CD4 T cell',
    'Naive CD4 Treg',
    'GZMB- CD27- EM CD4 T cell',
    'ISG+ naive CD4 T cell',
    'Memory CD4 Treg',
    'ISG+ memory CD4 T cell',
    'GZMK+ memory CD4 Treg',
    'SOX4+ naive CD4 T cell',
]
# .copy() makes the subset a standalone object instead of a view of `combined`.
# The following cells assign .raw and normalise it in place, which should act
# on real data rather than on a view that has to be copied implicitly.
combined_CD4_T = combined[combined.obs['AIFI_L3'].isin(selected_celltype)].copy()

In [9]:
# Keep the current state of the object under .raw before the normalisation and
# log transform applied in the next cell.
# NOTE(review): verify that .raw does not share its X array with the object;
# if it does, assign combined_CD4_T.copy() instead.
combined_CD4_T.raw=combined_CD4_T

In [10]:
# Scale every cell to a total of 1e4, then apply log1p. The return values are
# not captured, so both steps rely on modifying combined_CD4_T in place.
sc.pp.normalize_total(combined_CD4_T, target_sum=1e4)
sc.pp.log1p(combined_CD4_T)

In [11]:
%%time
# Dense copy of the normalised expression values. toarray() returns a plain
# ndarray (todense() returns the discouraged np.matrix); an X that is already
# dense is accepted as well.
expr = combined_CD4_T.X
expr = expr.toarray() if scs.issparse(expr) else np.asarray(expr)

# NMFproj receives a table with genes as rows and cells as columns, hence the
# transpose of the cells x genes matrix.
input_df = pd.DataFrame(
    expr.T,
    index=combined_CD4_T.var_names,
    columns=combined_CD4_T.obs['barcodes'].tolist(),
)

X_norm, X_trunc, df_H, fixed_W_trunc = NMFproj(input_df, fixed_W, return_truncated=True, normalized=True)

# Replace the generic component ids by descriptive labels.
index_mapping = {
    'NMF_0': 'NMF0_Cytotoxic',
    'NMF_1': 'NMF1_Treg',
    'NMF_2': 'NMF2_Th17',
    'NMF_3': 'NMF3_Naive',
    'NMF_4': 'NMF4_Act',
    'NMF_5': 'NMF5_Th2',
    'NMF_6': 'NMF6_Tfh',
    'NMF_7': 'NMF7_IFN',
    'NMF_8': 'NMF8_Cent_Mem',
    'NMF_9': 'NMF9_Thymic_Emi',
    'NMF_10': 'NMF10_Tissue',
    'NMF_11': 'NMF11_Th1'
}

df_H.index = df_H.index.map(index_mapping)
# Transpose so that every row is a cell and every column an NMF component.
df_H = df_H.T
# Positional assignment: relies on df_H rows being in the order of
# combined_CD4_T.obs (same order as the input_df columns built above).
df_H['AIFI_L3'] = combined_CD4_T.obs['AIFI_L3'].tolist()

CPU times: user 1min 27s, sys: 1min 21s, total: 2min 48s
Wall time: 1min 31s


In [12]:
# Sanity check: the obs index and the df_H index must agree in content and order.
# The == compares two Python lists, so it yields one boolean for the whole
# comparison (not one per cell); np.unique only wraps it in an array for display.
np.unique(combined_CD4_T.obs.index.tolist()==df_H.index.tolist())

array([ True])

In [13]:
# Attach each cell's sample id. The assignment is positional (via .tolist()),
# so it relies on df_H rows being in the same order as combined_CD4_T.obs.
df_H['pbmc_sample_id']=combined_CD4_T.obs['pbmc_sample_id'].tolist()

In [14]:
# Write the per-cell NMF scores together with the AIFI_L3 and pbmc_sample_id
# columns to a CSV file in the working directory.
df_H.to_csv('NMF_Score_CD4_T.csv')