In [3]:
#install package from this github https://github.com/yyoshiaki/NMFprojection
from NMFproj import *

In [4]:
import pandas as pd 
import numpy as np 
import scanpy as sc
import matplotlib.pyplot as plt
import concurrent.futures
import warnings
from datetime import date
import os
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed,ProcessPoolExecutor
from tqdm import tqdm
import anndata
import gc

warnings.filterwarnings("ignore")
sc.settings.n_jobs = 60

In [5]:
# Read NMF projection matrix
fixed_W=pd.read_csv("NMF.W.CD4T.csv")
fixed_W = fixed_W.rename(columns={'Unnamed: 0': ''})
fixed_W.set_index(fixed_W.columns[0], inplace=True)

In [13]:
# only select CD4 T cell to perform projection
selected_celltype =['Core naive CD4 T cell',
 'CM CD4 T cell',
 'GZMB- CD27+ EM CD4 T cell',
 'CD4 MAIT',
 'KLRB1+ memory CD4 Treg',
 'KLRF1- GZMB+ CD27- memory CD4 T cell',
 'Naive CD4 Treg',
 'GZMB- CD27- EM CD4 T cell',
 'ISG+ naive CD4 T cell',
 'Memory CD4 Treg',
 'ISG+ memory CD4 T cell',
 'GZMK+ memory CD4 Treg',
 'SOX4+ naive CD4 T cell']

# BRI

In [4]:
meta_data=pd.read_csv("/home/jupyter/BRI_Figures_Final_V2/Dataset/scRNA_meta_data-2024-05-09.csv")

In [5]:
file_names= ['/home/jupyter/BRI_Figures_Final_V2/Dataset/scRNA/BRI/h5ad/sample_h5ad/'+x+".h5ad" for x in meta_data['pbmc_sample_id'].tolist()]


In [6]:
def process_file(i, selected_celltype, fixed_W):
    adata = sc.read_h5ad(i)
    
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    
    adata_subset = adata[adata.obs['AIFI_L3'].isin(selected_celltype)]
    
    input_df = pd.DataFrame(adata_subset.X.todense().T)
    input_df.index = adata_subset.var_names
    input_df.columns = adata_subset.obs['barcodes'].tolist()
    
    X_norm, X_trunc, df_H, fixed_W_trunc = NMFproj(input_df, fixed_W, return_truncated=True, normalized=True)
    index_mapping = {
        'NMF_0': 'NMF0_Cytotoxic',
        'NMF_1': 'NMF1_Treg',
        'NMF_2': 'NMF2_Th17',
        'NMF_3': 'NMF3_Naive',
        'NMF_4': 'NMF4_Act',
        'NMF_5': 'NMF5_Th2',
        'NMF_6': 'NMF6_Tfh',
        'NMF_7': 'NMF7_IFN',
        'NMF_8': 'NMF8_Cent_Mem',
        'NMF_9': 'NMF9_Thymic_Emi',
        'NMF_10': 'NMF10_Tissue',
        'NMF_11': 'NMF11_Th1'
    }
    
    df_H.index = df_H.index.map(index_mapping)
    output_filename = "NMF_Score_BRI/" + adata.obs['specimen.specimenGuid'][0] + ".csv"
    df_H=df_H.T
    df_H['AIFI_L3']=adata_subset.obs['AIFI_L3'].tolist()
    df_H.to_csv(output_filename)

In [15]:
os.mkdir('NMF_Score_BRI')

FileExistsError: [Errno 17] File exists: 'NMF_Score_BRI'

In [16]:
# Parallel processing
with concurrent.futures.ProcessPoolExecutor(max_workers=60) as executor:
    futures = [executor.submit(process_file, i, selected_celltype, fixed_W) for i in file_names]
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(file_names)):
        future.result()

100% 868/868 [05:29<00:00,  2.63it/s]


# SF4

In [17]:
meta_data=pd.read_csv("/home/jupyter/BRI_Figures_Final_V1/Dataset/SF4_meta_data-2024-05-05.csv")

In [18]:
file_names= ['/home/jupyter/BRI_Figures_Final_V1/Dataset/scRNA/SF4/h5ad/'+x+".h5ad" for x in meta_data['pbmc_sample_id'].tolist()]

In [19]:
def process_file(i, selected_celltype, fixed_W):
    adata = sc.read_h5ad(i)
    
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    
    adata_subset = adata[adata.obs['celltypist_l3'].isin(selected_celltype)]
    
    input_df = pd.DataFrame(adata_subset.X.todense().T)
    input_df.index = adata_subset.var_names
    input_df.columns = adata_subset.obs['barcodes'].tolist()
    
    X_norm, X_trunc, df_H, fixed_W_trunc = NMFproj(input_df, fixed_W, return_truncated=True, normalized=True)
    index_mapping = {
        'NMF_0': 'NMF0_Cytotoxic',
        'NMF_1': 'NMF1_Treg',
        'NMF_2': 'NMF2_Th17',
        'NMF_3': 'NMF3_Naive',
        'NMF_4': 'NMF4_Act',
        'NMF_5': 'NMF5_Th2',
        'NMF_6': 'NMF6_Tfh',
        'NMF_7': 'NMF7_IFN',
        'NMF_8': 'NMF8_Cent_Mem',
        'NMF_9': 'NMF9_Thymic_Emi',
        'NMF_10': 'NMF10_Tissue',
        'NMF_11': 'NMF11_Th1'
    }
    
    df_H.index = df_H.index.map(index_mapping)
    output_filename = "NMF_Score_SF4/" + adata.obs['sampleID'][0] + ".csv"
    df_H=df_H.T
    df_H['AIFI_L3']=adata_subset.obs['celltypist_l3'].tolist()
    df_H.to_csv(output_filename)

In [20]:
os.mkdir('NMF_Score_SF4')

In [21]:
# Parallel processing
with concurrent.futures.ProcessPoolExecutor(max_workers=60) as executor:
    futures = [executor.submit(process_file, i, selected_celltype, fixed_W) for i in file_names]
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(file_names)):
        future.result()

100% 235/235 [05:08<00:00,  1.31s/it]


# Write Out Top 20 genes for NMF factor

In [6]:
#write out top genes for each factor
index_mapping = {
        'NMF_0': 'NMF0_Cytotoxic',
        'NMF_1': 'NMF1_Treg',
        'NMF_2': 'NMF2_Th17',
        'NMF_3': 'NMF3_Naive',
        'NMF_4': 'NMF4_Act',
        'NMF_5': 'NMF5_Th2',
        'NMF_6': 'NMF6_Tfh',
        'NMF_7': 'NMF7_IFN',
        'NMF_8': 'NMF8_Cent_Mem',
        'NMF_9': 'NMF9_Thymic_Emi',
        'NMF_10': 'NMF10_Tissue',
        'NMF_11': 'NMF11_Th1'
    }
    
fixed_W.columns = fixed_W.columns.map(index_mapping)

In [7]:
n = 20

top_genes_dict = {}

for group_col in fixed_W.columns:
    sorted_df = fixed_W[[group_col]].dropna().sort_values(by=group_col, ascending=False)
    
    top_n_indices = sorted_df.head(n).index.tolist()
    
    top_genes_dict[group_col] = top_n_indices

In [8]:
import json

with open('NMF_dict_Top20.json', 'w') as jsonfile:
    json.dump(top_genes_dict, jsonfile)

In [4]:
import session_info


session_info.show()