# Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.metrics import confusion_matrix

In [None]:
# Toggle as needed
import warnings
warnings.filterwarnings("ignore")

# Read Data

In [None]:
# Set this to your source_data directory.
# The value below is a placeholder and must be edited before running.
# Keep the trailing slash: the following cells build file paths by plain
# string concatenation.
source_data_path = ".../Source Data/"

In [None]:
# All TCGA input tables are read from the "TCGA/" subfolder of the source data.
source_data_path_tcga = f"{source_data_path}TCGA/"

In [None]:
# Tab-separated NMF table. Later cells use its Tumor_Sample_Barcode, B_cluster
# and B1_norm..B4_norm columns.
tcga_bnmf_harm = pd.read_csv(
    f"{source_data_path_tcga}TCGA_Histology_NMF_Harmonized.txt",
    sep="\t",
)

In [None]:
# Tab-separated clinical table. Only its first two columns are used later
# (assumed to be Tumor_Sample_Barcode and Study ID -- TODO confirm column order).
tcga_clinical = pd.read_csv(
    f"{source_data_path_tcga}TCGA_Clinical_Harmonized.txt",
    sep="\t",
)

In [None]:
# Tab-separated tumor mutational burden table (expected to supply the log_TMB
# column used in the TMB comparison section -- verify against the file).
tcga_tmb_harm = pd.read_csv(
    f"{source_data_path_tcga}TCGA_Harmonized_TMB.txt",
    sep="\t",
)

In [None]:
# Inner join of cluster assignments with TMB values. No key is given, so pandas
# joins on every column name the two tables share
# (presumably only Tumor_Sample_Barcode -- verify).
tcga_bnmf_tmb_merge = pd.merge(tcga_bnmf_harm, tcga_tmb_harm)

In [None]:
# Tab-separated expression table with one "<GENE>_RNA" column per gene; the
# figure labels below describe the values as log(TPM + 1).
tcga_rna_harm = pd.read_csv(
    f"{source_data_path_tcga}TCGA_RNA_Harmonized_log.txt",
    sep="\t",
)

In [None]:
# Attach the first two clinical columns to every NMF sample. The right join
# keeps all rows of tcga_bnmf_harm; samples without a clinical record end up
# with NaN in the clinical columns.
tcga_clinical_bnmf_harm_merge = pd.merge(
    tcga_clinical.iloc[:, :2],
    tcga_bnmf_harm,
    how='right',
    on='Tumor_Sample_Barcode',
)

In [None]:
# Samples with no clinical match (NaN Study ID) are labelled 'LCNE'; the two
# TCGA study identifiers are shortened to 'LUAD' / 'LUSC'.
tcga_clinical_bnmf_harm_merge['Study ID'] = (
    tcga_clinical_bnmf_harm_merge['Study ID']
    .fillna('LCNE')
    .replace({'luad_tcga': 'LUAD', 'lusc_tcga': 'LUSC'})
)

# Analysis

## TI Cluster Heatmap

In [None]:
# Short alias for the merged clinical + NMF table. This is the same object,
# not a copy, so changes made through either name are visible through both.
df = tcga_clinical_bnmf_harm_merge

In [None]:
def _sort_cluster_by_own_weight(frame, cluster):
    """Return the rows of `frame` assigned to `cluster`, strongest first.

    Rows are selected where ``B_cluster == cluster`` and ordered by that
    cluster's own normalised weight column (``B<cluster>_norm``), descending.
    """
    members = frame[frame['B_cluster'] == cluster]
    return members.sort_values(by=f'B{cluster}_norm', ascending=False)


# One sorted block per cluster. The four per-cluster names are kept so that any
# cell relying on them still works.
(tcga_bnmf_harm_sorted_T1,
 tcga_bnmf_harm_sorted_T2,
 tcga_bnmf_harm_sorted_T3,
 tcga_bnmf_harm_sorted_T4) = [
    _sort_cluster_by_own_weight(df, cluster) for cluster in (1, 2, 3, 4)
]

# Stack the blocks in cluster order; this fixes the column order of the heatmap.
tcga_bnmf_harm_sorted = pd.concat([tcga_bnmf_harm_sorted_T1,
                                   tcga_bnmf_harm_sorted_T2,
                                   tcga_bnmf_harm_sorted_T3,
                                   tcga_bnmf_harm_sorted_T4])

In [None]:
# Histology label -> matplotlib single-letter colour code.
row_color_dict = dict(LUAD='r', LUSC='g', LCNE='b')

In [None]:
# Per-sample colour strip: each Study ID value is swapped for its colour code.
# Values that are not keys of row_color_dict are left unchanged.
row_colors = tcga_bnmf_harm_sorted['Study ID'].replace(row_color_dict)

In [None]:
# Blank the Series name so the colour strip is drawn without a text label.
row_colors = row_colors.rename('')

In [None]:
# Legend handles for the histology colour strip: one coloured patch per label.
legend_TN = [
    mpatches.Patch(color=patch_color, label=histology)
    for histology, patch_color in zip(('LUAD', 'LUSC', 'LCNE'), ('r', 'g', 'b'))
]

In [None]:
# Heatmap of the four normalised cluster weights (rows) across samples
# (columns), with a histology colour strip on top. Clustering is disabled on
# both axes, so the sample order comes from tcga_bnmf_harm_sorted.
#
# sns.clustermap always creates its own figure, so a preceding
# plt.figure(figsize=...) call only produces an extra empty figure and its
# size is never applied. To resize this plot, pass figsize= to clustermap.
g = sns.clustermap(
    tcga_bnmf_harm_sorted[['B1_norm','B2_norm','B3_norm','B4_norm']].T,
    vmin=0, vmax=1, cmap='Blues',
    yticklabels=False, xticklabels=False,
    row_cluster=False, col_cluster=False,
    col_colors=row_colors,
    cbar_kws={'ticks':None,'label':None},
)

# Histology legend, anchored just outside the top-right corner of the heatmap.
l2 = g.ax_heatmap.legend(loc='center left',bbox_to_anchor=(1.01,1.1),handles = legend_TN,frameon=True,\
                        prop={'size':14})
l2.set_title(title='Histology',prop={'size':14,'weight':"bold"})

## TI Cluster Confusion Matrix

In [None]:
# Cross-tabulate TI cluster (rows) against histology (columns).
#
# confusion_matrix needs a single label set for both inputs, so cluster labels
# and histology labels are combined. Passing the labels explicitly and selecting
# the sub-table by name replaces the earlier reliance on implicit sort order and
# hard-coded positions, which would silently mislabel columns if a label were
# missing or an unexpected Study ID appeared.
cluster_labels = ['1', '2', '3', '4']
histology_labels = ['LUAD', 'LUSC', 'LCNE']
cm_labels = sorted(cluster_labels + histology_labels)

cm = confusion_matrix(
    # Cast to int first so float-typed cluster ids (1.0) still become '1'.
    tcga_bnmf_harm_sorted['B_cluster'].astype(int).astype(str),
    tcga_bnmf_harm_sorted['Study ID'],
    labels=cm_labels,
)

# Keep only the cluster-by-histology quadrant of the square matrix.
df_cm = pd.DataFrame(cm, index=cm_labels, columns=cm_labels).loc[cluster_labels, histology_labels]

df_cm.columns=['LUAD','LUSC','LCNE']
df_cm.index=['De-differentiated\n(TI-1)','Adeno\n(TI-2)','Squamous\n(TI-3)','LCNE\n(TI-4)']

In [None]:
# Annotated count heatmap of TI cluster (rows) versus histology (columns).
histology_ticks = ['LUAD','LUSC','LCNE']
cluster_ticks = ['De-differentiated (TI-1)','Adeno (TI-2)','Squamous (TI-3)','LCNE (TI-4)']

g = sns.heatmap(data=df_cm, cmap="OrRd", annot=True, annot_kws={"size":18}, fmt='g')

# Single-line tick text replaces the labels stored on df_cm, whose row names
# contain line breaks.
g.set_xticklabels(histology_ticks, fontsize=18)
g.set_yticklabels(cluster_ticks, fontsize=18)
g.tick_params(left=False, bottom=False)

# The colorbar is reached through the first collection drawn on the axes.
colorbar_axis = g.collections[0].colorbar.ax
colorbar_axis.tick_params(labelsize=16)

## TI Cluster Lineage Scatterplot

In [None]:
# Keep the sample identifier plus four expression columns.
tcga_rna_harm_slim = tcga_rna_harm.loc[
    :, ['Tumor_Sample_Barcode', 'NKX2-1_RNA', 'SOX2_RNA', 'NAPSA_RNA', 'TP63_RNA']
]

In [None]:
# Inner join: only samples present in both the NMF and the RNA table remain.
tcga_merge_bnmf_rna = pd.merge(
    tcga_bnmf_harm,
    tcga_rna_harm_slim,
    on='Tumor_Sample_Barcode',
)

In [None]:
# Keep samples assigned to clusters 1-4 and order them by cluster number.
in_ti_cluster = tcga_merge_bnmf_rna['B_cluster'].isin([1,2,3,4])
tcga_merge_bnmf_rna = tcga_merge_bnmf_rna.loc[in_ti_cluster].sort_values(by='B_cluster')

In [None]:
# String label per sample: the letter 'T' followed by the cluster value.
tcga_merge_bnmf_rna['T cluster'] = tcga_merge_bnmf_rna['B_cluster'].map('T{}'.format)

In [None]:
# Scatter of NAPSA versus TP63 expression, coloured by TI cluster.

# Hue levels in order of first appearance. They are passed explicitly so the
# legend order is known and can be mapped to display names by lookup.
hue_levels = list(dict.fromkeys(tcga_merge_bnmf_rna['T cluster']))

g = sns.scatterplot(data = tcga_merge_bnmf_rna,x = 'NAPSA_RNA',y='TP63_RNA',
                    hue='T cluster',hue_order=hue_levels)

# Only resize the automatic tick labels. Hard-coded tick text is not tied to the
# real tick positions, so it can mislabel the axes, and recent matplotlib
# raises if the number of labels differs from the number of ticks.
g.tick_params(axis='both', labelsize=14)

plt.xlabel('NAPSA log(tpm + 1)',fontsize=16)
plt.ylabel('TP63 log(tpm + 1)',fontsize=16)
l2 = g.legend(loc='center left',bbox_to_anchor=(1.05,0.8),frameon=True,\
                        prop={'size':12})
l2.set_title(title='Tumor-Intrinsic (TI) Cluster',prop={'size':12,'weight':"bold"})

# Display names are paired with hue levels in order (assumes the levels appear
# in ascending cluster order). Relabelling is by lookup, so any extra legend
# entry that is not a hue level is left untouched.
new_labels = ['De-differentiated (TI-1)','Adeno (TI-2)','Squamous (TI-3)','LCNE (TI-4)']
display_names = dict(zip(hue_levels, new_labels))
for t in l2.texts:
    t.set_text(display_names.get(t.get_text(), t.get_text()))

## TI Cluster Markers

In [None]:
# Twelve expression columns, listed three per line to mirror the rows of the
# 4 x 3 marker figure. Each column name is the gene symbol plus '_RNA'.
marker_list = [
    f'{gene}_RNA'
    for gene in (
        'NKX2-1', 'SFTA3', 'SFTPC',
        'TFF1', 'FGA', 'CPS1',
        'KRT5', 'KRT6B', 'TP63',
        'CHGA', 'CHGB', 'NCAM1',
    )
]

In [None]:
# Re-slice the expression table: sample identifier plus the marker columns.
# Note that this rebinds tcga_rna_harm_slim from the lineage section.
tcga_rna_harm_slim = tcga_rna_harm.loc[:, ['Tumor_Sample_Barcode', *marker_list]]

In [None]:
# Inner join of cluster assignments with marker expression. This rebinds
# tcga_merge_bnmf_rna and, unlike the lineage section, applies no cluster filter.
tcga_merge_bnmf_rna = pd.merge(
    tcga_bnmf_harm,
    tcga_rna_harm_slim,
    on='Tumor_Sample_Barcode',
)

In [None]:
# Gene symbols for axis labels: the column names with '_RNA' removed.
marker_list_clean = [
    column_name.replace('_RNA', '')
    for column_name in marker_list
]

In [None]:
# One panel per marker gene: an outlined violin per cluster with the individual
# samples overlaid as a swarm. Panels fill the 4 x 3 grid row by row.
fig,axes = plt.subplots(4,3, figsize=(24,14))

# Assumes the x axis has exactly four cluster categories in ascending order.
xlabels = ['TI-1','TI-2','TI-3','TI-4']

for axis, marker, gene in zip(axes.flat, marker_list, marker_list_clean):
    ax = sns.violinplot(data = tcga_merge_bnmf_rna,
             x='B_cluster',y=marker,linewidth=2,inner=None,ax=axis)

    # Turn each filled violin into an outline: move the fill colour to the
    # edge, then make the face fully transparent.
    for violin in ax.collections:
        violin.set_edgecolor(violin.get_facecolor())
        violin.set_facecolor((0,0,0,0))

    sns.swarmplot(data = tcga_merge_bnmf_rna,
                  x='B_cluster',y=marker,s=2,ax=ax)

    ax.set_ylabel(gene+'\nln(TPM+1)',font='Arial',weight='bold',fontsize=18,
                  rotation=90)
    ax.set_xticklabels(xlabels,font='Arial',weight='bold',fontsize=18)
    # Style the automatically generated y tick labels rather than overwriting
    # their text with raw float tick values, which freezes the labels.
    plt.setp(ax.get_yticklabels(),font='Arial',weight='bold',fontsize=18)
    ax.set_xlabel(None)

plt.subplots_adjust(wspace=0.25,hspace=0.25)

## TMB Comparison of TI Clusters

In [None]:
# Convert from log(total mutations) to log(total mutations/MB).
# ln(exp(x) / 33) equals x - ln(33), so the scaling is done as one vectorised
# subtraction with no exp/log round-trip.
# 33 is the divisor used by the original analysis (presumably the sequenced
# territory in Mb -- verify).
TMB_DENOMINATOR = 33
tcga_bnmf_tmb_merge['log_TMB_scaled'] = tcga_bnmf_tmb_merge['log_TMB'] - np.log(TMB_DENOMINATOR)

In [None]:
# Swarm plot of scaled ln(TMB) per TI cluster, drawn on an explicitly created
# Axes instead of going through the pyplot "current axes".
ax = plt.figure(figsize=(7,4)).add_subplot(111)
h = sns.swarmplot(data=tcga_bnmf_tmb_merge, x='B_cluster', y='log_TMB_scaled', size=2.5, ax=ax)

# Limit the y axis to roughly six tick intervals.
ax.locator_params(axis='y', nbins=6)

# Two-line display names, one per cluster category on the x axis.
h.set_xticklabels(['De-differentiated\n(TI-1)','Adeno\n(TI-2)','Squamous\n(TI-3)','LCNE\n(TI-4)'])

plt.setp(ax.get_yticklabels(), fontsize=12, weight='bold')
plt.setp(ax.get_xticklabels(), fontsize=10, weight='bold')

ax.set_ylabel('ln(TMB)', labelpad=10, font='Arial', fontsize=14, weight='bold')
ax.set_xlabel('')