# import packages

In [None]:
import sys
import os
import psutil
import gc
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
import random
import numpy as np
import pandas as pd
import scipy as sp
from scipy.sparse import csr_matrix
from scipy import io
from scipy.stats import zscore
import anndata as ad
import scanpy as sc
import h5py
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sea
from pylab import rcParams
from signatureanalyzer.utils import postprocess_msigs, get_nlogs_from_output, select_markers
from os import listdir
from os.path import isfile, join
import copy as cp
import warnings

In [None]:
# Toggle as needed: this silences every warning raised after this point.
# Comment the line out when debugging so that warnings become visible again.
warnings.filterwarnings("ignore")

In [None]:
# Set this to your source_data directory.
# The value is interpolated/concatenated into every input file path used
# further down, so it must point at the folder containing the
# 'Single_Cell' and 'RNA' sub-directories referenced below.
source_data_path = ".../Source Data/"

# load data and cancer subtypes

In [None]:
# Marker genes per cancer subtype (insertion order is the display order).
cancer_subtypes = {
    'DeDifferentiated': ['TFF1', 'FGA', 'CPS1'],
    'Adeno': ['NKX2-1', 'SFTA3', 'SFTPC'],
    'Squamous': ['KRT5', 'KRT6B', 'TP63'],
    "Large cell neuroendocrine": ['NCAM1', 'CHGA', 'CHGB'],
}

# The same genes, flattened into a single group in the order given above.
cancer_subtypes_all = {
    'Cancer subtypes': [gene
                        for subtype_genes in cancer_subtypes.values()
                        for gene in subtype_genes]
}


In [None]:
# Load the single-cell AnnData object used by every cell below.
# `ad.read` is only a deprecated alias of `ad.read_h5ad`; calling the
# explicit reader keeps the notebook working on anndata releases where the
# alias is no longer available.
adata = ad.read_h5ad('%s/Single_Cell/Wu_integrated_leiden_ann_adata.h5ad.gzip'%(source_data_path))

# cell type umap

In [None]:
# Square canvas and a fixed seed for Python's `random` module, then the
# same UMAP drawn twice: once coloured by Leiden cluster (labels on the
# plot) and once by the 'leiden_annotation_min' annotation.
rcParams['figure.figsize'] = (6,6)
random.seed(123)
for umap_kwargs in (
    dict(color='leiden', legend_loc='on data', save="_Wu_leiden_number.svg"),
    dict(color='leiden_annotation_min', save="_Wu_leiden_cellType.svg"),
):
    sc.pl.umap(adata, **umap_kwargs)

# clustermap markers

In [None]:
# Read just the 'Pathway' and 'hgnc_symbol' columns of the marker table;
# 'Pathway' becomes the row index.
target_file = '/Single_Cell/Clustermap_Markers_SU2C - Clustermap_Markers_SU2C.csv'
clustermap_markers_df = pd.read_csv(
    source_data_path + target_file,
    index_col='Pathway',
    usecols=['Pathway', 'hgnc_symbol' ],
)

In [None]:
# Map every pathway of the marker table to those of its genes that are
# variables of `adata`. Pathways without any such gene are left out.
result_raw = {}
for path in np.unique(clustermap_markers_df.index):
    # A list selector (`[path]`) always yields a Series. A bare label
    # returns a scalar string when the pathway has a single row, and
    # `.to_list()` would then fail.
    sub_su2c_gene_list = clustermap_markers_df.loc[[path], 'hgnc_symbol'].to_list()
    su2c_gene_list_inPaper_raw = [gene for gene in sub_su2c_gene_list
                                  if gene in adata.var.index]
    if su2c_gene_list_inPaper_raw:
        result_raw[path] = su2c_gene_list_inPaper_raw

# Sorted, de-duplicated list of every gene symbol in the marker table.
su2c_gene_list_raw = np.unique(clustermap_markers_df.loc[:, 'hgnc_symbol'].to_list())

In [None]:
# `result` is a second name for the same dict object as `result_raw`
# (no copy is made), so changes made through either name affect both.
result = result_raw

In [None]:
def _pathway_genes(*pathways):
    """Return the concatenated `result` gene lists of the given pathways.

    A pathway that is not a key of `result` (for example one dropped
    because none of its genes are in `adata`) contributes no genes instead
    of raising KeyError.
    """
    genes = []
    for pathway in pathways:
        genes.extend(result.get(pathway, []))
    return genes


# Two marker groups, each the sorted, de-duplicated union (np.unique) of
# pathway gene lists and a few individually named entries.
su2c_gene_markers = {
    'Wound Healing': list(np.unique(
        _pathway_genes('hMono3', 'hN3', 'Macrophages/Monocytes', 'A2AR') +
        ['PGC', 'PDLIM3'] +
        _pathway_genes('EMT', 'TGF-B') +
        ['ELN', 'MFAP4', 'TBX5', 'GRIA1', 'DES'] +
        _pathway_genes('NR4A1') +
        ['LGR5', 'BCHE'])),

    'Immune Activation/Exhaustion': list(np.unique(
        ['MT1G', 'PDL1_TPS', 'AZGP1', 'TI-1', 'TME-2'] +
        _pathway_genes('IFNG Signature') +
        ['CD274', 'PRPF40A', 'CXCL11', 'PSMB9'] +
        _pathway_genes('MHC Class I') +
        ['UBD'] +
        _pathway_genes('T Cell Inflamed') +
        ['IGHV3-48', 'TRBV9'])),
}



In [None]:
# Drop, in place, every marker that is not a variable of `adata`.
for key, marker_genes in su2c_gene_markers.items():
    marker_genes[:] = [gene for gene in marker_genes if gene in adata.var.index]

# metagenes

In [None]:
# Expression matrix of `adata` as a DataFrame.
df = adata.to_df()
metaGenes = list(result.keys())

# One metagene column per pathway, named 'MG_<pathway>'.
mg_names = ['MG_%s'%(mGene) for mGene in metaGenes]

# Each metagene is the per-row mean over the pathway's genes. All
# metagenes are computed first and attached with a single join, because
# joining inside the loop copies the full expression frame once per
# pathway.
df_sub = pd.DataFrame(
    {'MG_%s'%(mGene): df.loc[:, result[mGene]].mean(axis = 1) for mGene in metaGenes},
    index=df.index,
)
df = df.join(df_sub)


In [None]:
# Wrap the metagene columns in their own AnnData and append them to the
# raw data as additional variables (axis=1).
df_sub_adata = ad.AnnData(df_sub)
metaGenes_adata = ad.concat(
    [adata, df_sub_adata],
    axis=1,
    join='inner',
    merge='first',
    uns_merge='first',
)


# meta genes TI-1 and TME-2 start here


In [None]:
# Gene-weight tables, each ordered from the highest to the lowest loading
# of the signature used below ("M-2_loading" and "TI-1_loading").
TME = pd.read_csv(
    "%s/Single_Cell/SU2C-MARK_Supplementary_Table_25_RNA_SU2C-MARK_M_Gene_Weights_Up.txt"%(source_data_path),
    sep="\t",
).sort_values("M-2_loading", ascending=False)

TI = pd.read_csv(
    "%s/Single_Cell/SU2C-MARK_Supplementary_Table_30_RNA_TCGA-LCNE_TI_Weights_Up.txt"%(source_data_path),
    sep="\t",
).sort_values("TI-1_loading", ascending=False)

### TI 1 (4 TCGA based signatures) results G3


In [None]:
# Columns of `adata` whose name occurs in the TI table's 'Hugo_Symbol' column.
common_TI_1_adata = adata[:, adata.var.index.isin(TI["Hugo_Symbol"])]

# Rows of the TI table whose gene symbol is a variable of `adata`,
# ordered by descending "TI-1_loading", and their gene symbols.
ti_row_in_adata = TI['Hugo_Symbol'].isin(adata.var.index).to_list()
common_TI_1 = TI.loc[ti_row_in_adata, :].sort_values("TI-1_loading", ascending=False)
common_TI_1_genes = common_TI_1["Hugo_Symbol"]

In [None]:
# DataFrame of `adata` restricted to the genes listed in `common_TI_1_genes`.
weighted_TI_df = adata[:, common_TI_1_genes].to_df()


In [None]:
# Rows of the TI table whose gene symbol is a variable of `adata`.
common_TI_1 = TI.loc[TI['Hugo_Symbol'].isin(adata.var.index).to_list(), :]

# Metagene: per-row mean over those genes of the first `i` TI rows that are
# variables of `adata` (so fewer than `i` genes may contribute).
i=10
top_i_TI_1 = TI.iloc[:i, :]
top_is_in_adata_ti = top_i_TI_1['Hugo_Symbol'].isin(adata.var.index).to_list()
top_i_TI_1genes = top_i_TI_1["Hugo_Symbol"].loc[top_is_in_adata_ti].to_list()
ti_metagene_values = weighted_TI_df.loc[:, top_i_TI_1genes].mean(axis=1)
weighted_TI_df["MG_TI_Top_%s"%(i)] = ti_metagene_values

In [None]:
# Wrap the TI DataFrame (genes plus the metagene column) in an AnnData and
# give it the obs table of `common_TI_1_adata`.
weighted_TI_adata_data = ad.AnnData(weighted_TI_df)
weighted_TI_adata_data.obs = common_TI_1_adata.obs

### TME 2 (3 SU2C based signatures) results G1


In [None]:
# Rows of the TME table whose gene symbol is a variable of `adata`,
# ordered by descending "M-2_loading".
tme_row_in_adata = TME['Hugo_Symbol'].isin(adata.var.index).to_list()
common_TME_2 = TME.loc[tme_row_in_adata, :].sort_values("M-2_loading", ascending=False)

# Their gene symbols, also as a sorted, de-duplicated array.
common_TME_2_genes = common_TME_2["Hugo_Symbol"]
common_TME_2_genes_unique = np.unique(common_TME_2_genes)

# Columns of `adata` named by one of those symbols.
common_TME_2_adata = adata[:, adata.var.index.isin(common_TME_2["Hugo_Symbol"])]

In [None]:
# DataFrame of `adata` restricted to the de-duplicated TME gene symbols.
weighted_TME_df = adata[:, common_TME_2_genes_unique].to_df()

In [None]:
# Metagene: per-row mean over those genes of the first `i` TME rows that
# are variables of `adata` (so fewer than `i` genes may contribute).
i = 10
top_i_TME_2 = TME.iloc[:i, :]
top_is_in_adata_tme = top_i_TME_2['Hugo_Symbol'].isin(adata.var.index).to_list()
top_i_TME_2genes = top_i_TME_2["Hugo_Symbol"].loc[top_is_in_adata_tme].to_list()
tme_metagene_values = weighted_TME_df.loc[:, top_i_TME_2genes].mean(axis=1)
weighted_TME_df["MG_TME_Top_%s"%(i)] = tme_metagene_values

In [None]:
# Wrap the TME DataFrame (genes plus the metagene column) in an AnnData and
# give it the obs table of `common_TME_2_adata`.
weighted_TME_adata_data = ad.AnnData(weighted_TME_df)
weighted_TME_adata_data.obs =  common_TME_2_adata.obs

## select best TI and TME meta gene

In [None]:
# merge TI and TME adata data for later selection of best version of meta gene calculation
# (axis=1: the variables of the two objects are placed side by side)
immune_adata = ad.concat([weighted_TI_adata_data, weighted_TME_adata_data], axis=1)
# make variable names unique in case both inputs contribute the same name
immune_adata.var_names_make_unique()
# attach the obs table carried by the TME object
immune_adata.obs =  weighted_TME_adata_data.obs


In [None]:
# Keep only the two top-10 metagene columns, wrapped in a new AnnData.
imm_ann = ad.AnnData(immune_adata.to_df().loc[:,['MG_TME_Top_10', 'MG_TI_Top_10']])


## write metagene adata

In [None]:
# Append the two immune metagenes as additional variables (axis=1).
madata = ad.concat([metaGenes_adata, imm_ann], axis=1, join='inner', merge = 'first', uns_merge='first')

# Label every cell as '<leiden_annotation_min>-<leiden>'. This is built in
# one vectorised step: a per-cell `.loc` assignment loop is orders of
# magnitude slower and depends on pandas upcasting the new column.
madata.obs['leiden_annotation_pair'] = (
    madata.obs['leiden_annotation_min'].astype(str)
    + '-'
    + madata.obs['leiden'].astype(str)
)
    

## updated marker lists

In [None]:
# Variables selected for the metagene dot plot. Entries prefixed with 'MG_'
# are metagene columns; the other entries are plain variable names.
ordered_su2c_gene_markers_list = [
    'MG_hMono3', 'MG_hN3', 'MG_Macrophages/Monocytes', 'MG_A2AR',
    'NHSL2',
    'MG_EMT', 'MG_TGF-B',
    'SIPA1L2',
    'MG_NR4A1',
    'AUTS2', 'TCF7L1',
] + [
    'PRPF40A',
    'MG_T Cell Inflamed',
    'PSMB9',
    'MG_MHC Class I',
    'PSME1', 'PSME2',
    'MG_TME_Top_10', 'MG_TI_Top_10',
]


# final figure plots 

### subset to only cancer

In [None]:
# Annotation/cluster labels that are kept as cancer cells.
cancer_annPair = [
    'Cancer-Adeno-10', 'Cancer-Adeno-21',
    'Cancer-Adeno/Squamous-1', 'Cancer-Adeno/Squamous-22',
    'Cancer-DeDifferentiated-12',
    'Cancer-Squamous-0', 'Cancer-Squamous-11', 'Cancer-Squamous-16',
    'Cancer-Squamous-23', 'Cancer-Squamous-3', 'Cancer-Squamous-6',
    'Cancer-Squamous-7', 'Cancer-Squamous-8',
]

# Inspection helper: the distinct labels present (the value is not stored).
np.unique(madata.obs["leiden_annotation_pair"])

# Rows of `madata` whose label is one of the cancer labels.
in_cancer_cluster = madata.obs["leiden_annotation_pair"].isin(cancer_annPair)
cancer_adata = madata[in_cancer_cluster]

### cancer subtype

In [None]:
# Dot plot of the subtype marker genes (grouped by the keys of
# `cancer_subtypes`) across the cancer cells' 'leiden_annotation_pair' labels.
sc.pl.dotplot(adata = cancer_adata, var_names=cancer_subtypes, 
              groupby='leiden_annotation_pair', save= "_cancer_subtypes.svg")


## limma response- and non-response-associated genes

In [None]:
# Limma on pre data: load the result table (first column as index) and
# keep the 100 rows with the smallest "P.Value".
revised_limma = pd.read_csv(
    '%s/RNA/SU2C-MARK_Harmonized_Limma_All_v1.txt'%(source_data_path),
    sep="\t",
    index_col=0,
)
revised_limma_top_genes = revised_limma.sort_values("P.Value", ascending=True).head(100)

In [None]:
# `limma_results` is bound to the same DataFrame object as `revised_limma`
# (no copy is made), so the in-place re-indexing and the added column below
# are visible through both names. Re-running this cell on the already
# re-indexed frame will fail because 'hgnc_symbol' is no longer a column.
limma_results= revised_limma
limma_results.set_index('hgnc_symbol', drop=True, inplace=True)
# "-log10p" holds the negative base-10 logarithm of 'P.Value'.
limma_results["-log10p"] = -np.log10(limma_results['P.Value'])


In [None]:
# Ten smallest-"P.Value" index labels among rows with positive logFC ...
genelist_responce_topP = (
    limma_results.loc[limma_results['logFC'] > 0]
    .sort_values("P.Value", ascending=True)
    .head(10)
    .index.to_list()
)

# ... and among rows with negative logFC.
genelist_resist_topP = (
    limma_results.loc[limma_results['logFC'] < 0]
    .sort_values("P.Value", ascending=True)
    .head(10)
    .index.to_list()
)



In [None]:
# Positive-logFC genes are stored under "Response", negative-logFC genes
# under "Non-response".
limma_sup_markers = dict(zip(
    ("Response", "Non-response"),
    (genelist_responce_topP, genelist_resist_topP),
))

In [None]:
# One dot plot per limma gene group across the 'leiden_annotation_pair'
# labels: (group key, plot title, output file suffix).
for limma_group, plot_title, save_suffix in (
    ("Non-response", "Leiden expression of non-response genes", "_non_response_limma_updated.svg"),
    ("Response", "Leiden expression of response genes", "_response_limma_updated.svg"),
):
    sc.pl.dotplot(adata=madata, var_names=limma_sup_markers[limma_group],
                  groupby='leiden_annotation_pair', title=plot_title,
                  var_group_rotation=0, save=save_suffix)

## Update column names for meta gene dotplot figure 

In [None]:
# Display labels for the metagene dot plot, grouped into its two column
# groups. Labels containing the `\it{...}` mathtext command are written
# as raw strings: in a normal string `\i` is an invalid escape sequence
# (a SyntaxWarning on Python 3.12+). The runtime text is unchanged.
renamed_ordered_su2c_gene_markers = {
    'Wound Healing': [
        'hMono3 (Zilionas et al.)',
        'hN3 (Zilionas et al.)',
        'Macrophages/Monocytes (Sade-Feldman et al.)',
        'A2AR (Willingham et al.)',
        r'$\it{NHSL2}$ Expression',
        'EMT (Hedegaard et al.)',
        'TGF-B (Mariathasan et al.)',
        r'$\it{SIPA1L2}$ Expression',
        'NR4A1 (Chen et al.)',
        r'$\it{AUTS2}$ Expression',
        r'$\it{TCF7L1}$ Expression',
    ],
    'Immune Activation/Exhaustion': [
        r'$\it{PRPF40A}$ Expression',
        'T cell-inflamed (Ayers et al.)',
        r'$\it{PSMB9}$ Expression',
        'MHC Class I (Senbabaoglu et al.)',
        r'$\it{PSME1}$ Expression',
        r'$\it{PSME2}$ Expression',
        'De-differentiated (TI-1)',
        'Immune Activated (M-2)',
    ],
}


In [None]:
# Column name -> display label used when renaming the dot-plot columns.
# Labels containing the `\it{...}` mathtext command are written as raw
# strings: in a normal string `\i` is an invalid escape sequence
# (a SyntaxWarning on Python 3.12+). The runtime text is unchanged.
tempdict = {
    'MG_hMono3': 'hMono3 (Zilionas et al.)',
    'MG_hN3': 'hN3 (Zilionas et al.)',
    'MG_Macrophages/Monocytes': 'Macrophages/Monocytes (Sade-Feldman et al.)',
    'MG_A2AR': 'A2AR (Willingham et al.)',
    'NHSL2': r'$\it{NHSL2}$ Expression',
    'MG_EMT': 'EMT (Hedegaard et al.)',
    'MG_TGF-B': 'TGF-B (Mariathasan et al.)',
    'SIPA1L2': r'$\it{SIPA1L2}$ Expression',
    'MG_NR4A1': 'NR4A1 (Chen et al.)',
    'AUTS2': r'$\it{AUTS2}$ Expression',
    'TCF7L1': r'$\it{TCF7L1}$ Expression',

    'PRPF40A': r'$\it{PRPF40A}$ Expression',
    'MG_T Cell Inflamed': 'T cell-inflamed (Ayers et al.)',
    'PSMB9': r'$\it{PSMB9}$ Expression',
    'MG_MHC Class I': 'MHC Class I (Senbabaoglu et al.)',
    'PSME1': r'$\it{PSME1}$ Expression',
    'PSME2': r'$\it{PSME2}$ Expression',
    'MG_TME_Top_10': 'Immune Activated (M-2)',
    'MG_TI_Top_10': 'De-differentiated (TI-1)',
}

In [None]:
# Restrict `madata` to the dot-plot variables, relabel the columns with the
# display names from `tempdict`, and rebuild an AnnData that carries the
# 'leiden_annotation_pair' labels for grouping.
sub_madata = madata[:, ordered_su2c_gene_markers_list]
mdf = sub_madata.to_df().rename(columns=tempdict)
renamed_madata = ad.AnnData(mdf)
renamed_madata.obs['leiden_annotation_pair'] = madata.obs['leiden_annotation_pair']

In [None]:
# Metagene dot plot: columns are the display labels, grouped by the keys of
# `renamed_ordered_su2c_gene_markers`; rows are the
# 'leiden_annotation_pair' labels.
sc.pl.dotplot(adata=renamed_madata, var_names=renamed_ordered_su2c_gene_markers, groupby='leiden_annotation_pair',
              var_group_rotation=0, save= "SU2C_fig_4c_update_top10.svg")