In [1]:
import scanpy as sc
import pandas as pd
import glob

In [2]:
adata_SS2_processed = sc.read_h5ad('SS2_processed.h5ad')
adata_MULTI_processed = sc.read_h5ad('MULTI_processed.h5ad')

# SS2 genes

In [3]:
SS2_one_vs_rest_result_files = [f for f in glob.glob("DEGs/SS2/1_VS_rest/*.csv")]

In [4]:
SS2_one_vs_rest_result_files

['DEGs/SS2/1_VS_rest/H5097.csv',
 'DEGs/SS2/1_VS_rest/H4272.csv',
 'DEGs/SS2/1_VS_rest/HCI005.csv',
 'DEGs/SS2/1_VS_rest/HCI010.csv',
 'DEGs/SS2/1_VS_rest/HCI009.csv',
 'DEGs/SS2/1_VS_rest/J2036.csv',
 'DEGs/SS2/1_VS_rest/HCI011.csv',
 'DEGs/SS2/1_VS_rest/H5471.csv',
 'DEGs/SS2/1_VS_rest/H3204.csv',
 'DEGs/SS2/1_VS_rest/J53353.csv',
 'DEGs/SS2/1_VS_rest/J55454.csv',
 'DEGs/SS2/1_VS_rest/HCI001.csv']

In [5]:
SS2_final_1_vs_rest_genes = pd.DataFrame()
for i in SS2_one_vs_rest_result_files:
    temp_df = pd.DataFrame()
    EMT_ID = i.split('.')[0].split('1_VS_rest/')[1]
    df = pd.read_csv(i,index_col = 0)
    df = df[df['p_val'] < 0.05]
    up_genes = df[df['avg_log2FC'] > 0.5].sort_values(by='avg_log2FC', ascending=False).index.tolist()
    
    temp_df[EMT_ID] = up_genes
   
    SS2_final_1_vs_rest_genes = pd.concat([SS2_final_1_vs_rest_genes,temp_df], ignore_index=False, axis=1)

In [6]:
SS2_final_1_vs_rest_genes

Unnamed: 0,H5097,H4272,HCI005,HCI010,HCI009,J2036,HCI011,H5471,H3204,J53353,J55454,HCI001
0,ECM1,KRT14,SLC9A3R1,NR4A1,MAFB,LALBA,AGR2,S100A2,C2orf40,COL9A3,FDCSP,KRT16
1,COX6C,RARRES1,KRT8,CEBPD,CEACAM6,MUCL1,TFF1,TMSB4X,WIF1,MSLN,RARRES1,KLK7
2,APOD,LINC00472,MTCO2P12,CXCL2,AGR2,AARD,CRIP2,MGP,KRT15,SDR16C5,B2M,KLK6
3,SERPINI1,MMP7,KRT8P3,COMP,SCGB2A2,TPM2,CRIP1,KRT15,CTSF,SCRG1,S100B,KRT17
4,MAGEA8,TYMP,MTCO3P12,TRIB1,TSPYL2,SCGB3A1,PLPP5,PDK3,KRT23,CCN2,KRT81,KLK5
...,...,...,...,...,...,...,...,...,...,...,...,...
1145,,,,GALR2,,,,,,,,
1146,,,,NAT10,,,,,,,,
1147,,,,SUN3,,,,,,,,
1148,,,,ABCC3,,,,,,,,


In [7]:
SS2_final_1_vs_rest_genes.to_csv('SS2_1_vs_rest_genes_list.csv')

In [8]:
low_met_tumors = ['J55454', 'H5471', 'HCI005', 'H3204', 'H4272']
intermediate_met_tumors = ['HCI009', 'HCI011', 'HCI001']
high_met_tumors = ['H5097', 'J2036', 'J53353', 'HCI010']

In [9]:
def one_vs_rest_list(tumors_list):    
    genes_list = []
    for i in tumors_list:
        genes = [x for x in SS2_final_1_vs_rest_genes[i].tolist() if str(x) != 'nan']
        
        genes_list = genes_list + genes
        
    return genes_list

In [10]:
low_genes = one_vs_rest_list(low_met_tumors)
intermediate_genes = one_vs_rest_list(intermediate_met_tumors)
high_genes = one_vs_rest_list(high_met_tumors)

In [11]:
SS2_low_overlap = list(set([x for x in low_genes if low_genes.count(x) >= 2]))
len(SS2_low_overlap)

458

In [12]:
SS2_intermediate_overlap = list(set([x for x in intermediate_genes if intermediate_genes.count(x) >= 2]))
len(SS2_intermediate_overlap)

186

In [13]:
SS2_high_overlap = list(set([x for x in high_genes if high_genes.count(x) >= 2]))
len(SS2_high_overlap)

561

In [14]:
final_1_vs_rest_share_genes = pd.concat([pd.DataFrame({'low_overlap':SS2_low_overlap}),pd.DataFrame({'moderate_overlap':SS2_intermediate_overlap}),pd.DataFrame({'high_overlap':SS2_high_overlap})], ignore_index=False, axis=1)

In [15]:
final_1_vs_rest_share_genes.to_csv('SS2_1_vs_rest_share_genes_list_shared_at_least_2_tumors_in_a_group.csv')

# MULTI genes

In [16]:
MULTI_one_vs_rest_result_files = [f for f in glob.glob("DEGs/10X/1_VS_rest/*.csv")]

In [17]:
MULTI_one_vs_rest_result_files

['DEGs/10X/1_VS_rest/H5097.csv',
 'DEGs/10X/1_VS_rest/H4272.csv',
 'DEGs/10X/1_VS_rest/HCI005.csv',
 'DEGs/10X/1_VS_rest/HCI010.csv',
 'DEGs/10X/1_VS_rest/J2036.csv',
 'DEGs/10X/1_VS_rest/HCI002.csv',
 'DEGs/10X/1_VS_rest/HCI011.csv',
 'DEGs/10X/1_VS_rest/J53353.csv',
 'DEGs/10X/1_VS_rest/J55454.csv',
 'DEGs/10X/1_VS_rest/HCI001.csv']

In [18]:
MULTI_final_1_vs_rest_genes = pd.DataFrame()
for i in MULTI_one_vs_rest_result_files:
    temp_df = pd.DataFrame()
    EMT_ID = i.split('.')[0].split('1_VS_rest/')[1]
    df = pd.read_csv(i,index_col = 0)
    df = df[df['p_val'] < 0.05]
    up_genes = df[df['avg_log2FC'] > 0.5].sort_values(by='avg_log2FC', ascending=False).index.tolist()
    
    temp_df[EMT_ID] = up_genes
   
    MULTI_final_1_vs_rest_genes = pd.concat([MULTI_final_1_vs_rest_genes,temp_df], ignore_index=False, axis=1)

In [19]:
MULTI_final_1_vs_rest_genes

Unnamed: 0,H5097,H4272,HCI005,HCI010,J2036,HCI002,HCI011,J53353,J55454,HCI001
0,APOD,KRT14,SCGB1D2,CFD,LALBA,AC022081.1,TFF1,SNORC,FDCSP,LCN2
1,COX6C,RARRES1,SCGB2A2,SNHG25,MUCL1,GADD45G,CRIP1,SCRG1,RARRES1,AZGP1
2,GSTM3,MMP7,PRSS23,ELANE,SCGB3A1,OLFM4,AGR2,COL9A3,HLA-B,PERP
3,SH3BGRL,C5orf46,TFF3,NDUFA4L2,ZG16B,APOE,NPW,H19,KRT81,CKB
4,COX6A1,WFDC2,RAMP1,ELN,AARD,APOC1,SLC39A6,S100A4,SAA1,LGALS1
...,...,...,...,...,...,...,...,...,...,...
631,,SYNJ2BP,,,,,,,,
632,,MXD4,,,,,,,,
633,,PPIE,,,,,,,,
634,,CCDC144CP,,,,,,,,


In [20]:
MULTI_final_1_vs_rest_genes.to_csv('MULTI_1_vs_rest_genes_list.csv')

In [21]:
low_met_tumors = ['HCI002', 'J55454', 'HCI005', 'H4272']
intermediate_met_tumors = ['HCI011', 'HCI001']
high_met_tumors = ['H5097', 'J2036', 'J53353', 'HCI010']

In [22]:
def one_vs_rest_list(tumors_list):    
    genes_list = []
    for i in tumors_list:
        genes = [x for x in MULTI_final_1_vs_rest_genes[i].tolist() if str(x) != 'nan']
        
        genes_list = genes_list + genes
        
    return genes_list

In [23]:
low_genes = one_vs_rest_list(low_met_tumors)
intermediate_genes = one_vs_rest_list(intermediate_met_tumors)
high_genes = one_vs_rest_list(high_met_tumors)

In [24]:
MULTI_low_overlap = list(set([x for x in low_genes if low_genes.count(x) >= 2]))
len(MULTI_low_overlap)

335

In [25]:
MULTI_intermediate_overlap = list(set([x for x in intermediate_genes if intermediate_genes.count(x) >= 2]))
len(MULTI_intermediate_overlap)

81

In [26]:
MULTI_high_overlap = list(set([x for x in high_genes if high_genes.count(x) >= 2]))
len(MULTI_high_overlap)

283

In [27]:
final_1_vs_rest_share_genes = pd.concat([pd.DataFrame({'low_overlap':MULTI_low_overlap}),pd.DataFrame({'moderate_overlap':MULTI_intermediate_overlap}),pd.DataFrame({'high_overlap':MULTI_high_overlap})], ignore_index=False, axis=1)

In [28]:
final_1_vs_rest_share_genes.to_csv('MULTI_1_vs_rest_share_genes_list_shared_at_least_2_tumors_in_a_group.csv')

# find overlap genes between SS2 and MULTI-seq

In [29]:
final_low_overlap = list(set(SS2_low_overlap)&set(MULTI_low_overlap))
len(final_low_overlap)

98

In [30]:
final_intermediate_overlap = list(set(SS2_intermediate_overlap)&set(MULTI_intermediate_overlap))
len(final_intermediate_overlap)

22

In [31]:
final_high_overlap = list(set(SS2_high_overlap[:])&set(MULTI_high_overlap[:]))
len(final_high_overlap)

140

In [32]:
final_1_vs_rest_genes = pd.DataFrame()
temp_df_low = pd.DataFrame()
temp_df_int = pd.DataFrame()
temp_df_high = pd.DataFrame()
temp_df_low['low_overlap'] = final_low_overlap
temp_df_int['moderate_overlap'] = final_intermediate_overlap
temp_df_high['high_overlap'] = final_high_overlap
final_1_vs_rest_genes = pd.concat([final_1_vs_rest_genes,temp_df_low,temp_df_int,temp_df_high], ignore_index=False, axis=1)
final_1_vs_rest_genes

Unnamed: 0,low_overlap,moderate_overlap,high_overlap
0,EFNA5,ATP6V1G1,TIMM13
1,ELF3,TSTA3,MRPL12
2,CREG1,HSPD1,P4HB
3,SDF2,H2AFY,ENO1
4,NDUFB10,HNRNPA2B1,SLPI
...,...,...,...
135,,,CCDC88A
136,,,GPAA1
137,,,BLVRB
138,,,ZNF706


In [33]:
final_1_vs_rest_genes.to_csv('SS2_MULTI_1_vs_others_shared_genes.csv')

# making files for combined SS2 and MULTI dataset heatmap

In [34]:
adata_SS2_processed_tumor = adata_SS2_processed[adata_SS2_processed.obs['sort'] == 'Tumor'].copy()
adata_MULTI_processed_tumor = adata_MULTI_processed[adata_MULTI_processed.obs['sort'] == 'Tumor'].copy()

  res = method(*args, **kwargs)


In [35]:
SS2_MULTI_combined_df = pd.concat([adata_SS2_processed_tumor.to_df(), adata_MULTI_processed_tumor.to_df()])
SS2_MULTI_combined_df

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,CMC4,LINC00685-1,SLC25A6-1,ASMTL-1,MT-TQ,MT-TA,MT-TN,MT-TS1,MT-TK,MT-TH
A10_D088223_B106234_S10.homo.gencode.v30.ERCC.chrM,1.065904,-0.023565,-0.800092,-0.261282,-0.102227,-0.071613,-0.082078,-0.687182,-0.312428,-0.355880,...,,,,,,,,,,
A10_D088224_B106233_S10.homo.gencode.v30.ERCC.chrM,1.357900,0.000334,1.199165,-0.253723,-0.029310,-0.069788,-0.079685,-0.609155,-0.288949,1.947217,...,,,,,,,,,,
A11_B008880_B106232_S11.homo.gencode.v30.ERCC.chrM,-0.481161,-0.014066,2.950004,-0.202641,-0.805975,-0.005187,0.030367,-0.297675,-0.109880,-0.157250,...,,,,,,,,,,
A11_D088219_B106235_S11.homo.gencode.v30.ERCC.chrM,0.293514,0.008894,-0.042285,-0.253152,0.024951,-0.071657,-0.083110,1.140038,-0.287961,-0.362513,...,,,,,,,,,,
A11_D088224_B106233_S11.homo.gencode.v30.ERCC.chrM,2.078142,-0.020274,-0.740420,-0.260395,-0.090159,-0.071543,-0.082057,-0.685659,-0.309729,-0.356329,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACATACAGCT-8-7,0.141609,,0.162739,-0.310303,-0.054377,-0.039414,-0.059823,-0.447930,-0.236607,-0.303904,...,-0.092762,-0.059811,-0.134008,-0.036522,-0.071257,-0.059495,-0.090417,-0.040835,-0.185218,-0.053688
TTTGTCAGTTCCAACA-8-7,1.012131,,-0.928261,-0.302758,-0.094735,-0.038211,-0.053749,0.792208,-0.243430,-0.300124,...,-0.103576,-0.057624,-0.129742,-0.034396,-0.064061,-0.056568,-0.088048,-0.044344,-0.176309,-0.049293
TTTGTCAGTTCGTCTC-8-7,1.455134,,0.370014,-0.298983,-0.181208,-0.037944,-0.048420,0.992081,-0.234559,-0.291755,...,-0.105671,-0.054812,-0.123551,-0.038893,-0.058836,-0.054521,-0.087227,-0.043096,-0.163954,-0.040859
TTTGTCATCCTAGTGA-8-7,2.877068,,-0.947572,-0.313720,-0.044434,-0.040001,-0.062285,-0.442049,-0.231974,-0.304801,...,-0.087449,-0.060586,-0.135430,-0.038184,-0.074311,-0.060746,-0.091536,-0.038869,-0.188260,-0.054895


In [36]:
final_list = []
for i in final_low_overlap + final_intermediate_overlap + final_high_overlap:
    if i not in final_list:
        final_list.append(i)

In [37]:
SS2_MULTI_combined_df = SS2_MULTI_combined_df[final_list]
SS2_MULTI_combined_df

Unnamed: 0,EFNA5,ELF3,CREG1,SDF2,NDUFB10,CRYAB,ZCRB1,CD59,LAMP2,RPS27L,...,PRDX2,CCDC124,IFITM3,S100A4,YWHAZ,DCTPP1,CCDC88A,GPAA1,ZNF706,APOD
A10_D088223_B106234_S10.homo.gencode.v30.ERCC.chrM,-0.429295,-1.015788,-0.644690,-0.741781,0.431602,-0.782602,-0.870021,0.783825,-0.947678,-0.541053,...,0.852446,0.751804,1.533548,2.266842,0.365914,-0.776359,-0.411786,1.633001,0.446491,3.131921
A10_D088224_B106233_S10.homo.gencode.v30.ERCC.chrM,1.079903,1.762500,-0.421472,-0.121354,-0.538002,1.565611,1.065092,0.899542,1.584400,0.548601,...,-0.212156,0.180458,-0.615497,-0.559090,0.502062,0.289254,2.481177,-0.885219,0.320179,-0.526046
A11_B008880_B106232_S11.homo.gencode.v30.ERCC.chrM,-0.494940,0.974294,-0.466979,-0.226631,-1.125769,-0.736537,2.751403,-0.718001,-0.495585,-0.788861,...,1.150664,-0.772167,0.636578,-0.275337,-0.225724,-0.673121,-0.671874,-1.147932,0.702279,-0.204302
A11_D088219_B106235_S11.homo.gencode.v30.ERCC.chrM,-0.249642,0.924806,-0.282410,-0.781806,-0.222392,1.457646,1.089743,0.025965,-0.345778,-0.251663,...,-0.239229,0.021492,0.795786,2.247108,2.228669,0.427904,2.955042,1.270807,1.038622,-0.550729
A11_D088224_B106233_S11.homo.gencode.v30.ERCC.chrM,3.645844,0.135886,2.388566,0.138979,-0.471434,0.577188,0.989367,0.983468,3.040715,0.958570,...,-1.584067,0.527691,-0.646666,-0.611022,0.467848,-0.766412,1.957295,-0.710952,-1.192958,-0.495155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACATACAGCT-8-7,-0.402116,-0.605538,-0.554103,-0.681700,-0.385514,-0.940808,-0.938142,0.016225,0.363739,-0.868988,...,0.895862,0.386103,1.167248,-0.166827,0.524296,0.563537,-0.456764,-0.441102,-0.000008,-0.587381
TTTGTCAGTTCCAACA-8-7,-0.411978,0.075408,-0.550984,0.210087,0.306985,-0.982795,-0.301268,0.306322,-0.165639,-0.134562,...,0.735030,0.016649,0.325981,1.423135,0.565518,-0.710060,0.554474,-1.014332,0.444494,-0.556286
TTTGTCAGTTCGTCTC-8-7,-0.412786,-0.136348,-0.543263,-0.656324,0.439593,-0.585722,-0.201011,-0.000685,-0.729168,-0.194696,...,0.874465,1.457964,0.497687,1.030451,0.686376,-0.045012,0.992405,0.862388,0.369716,-0.281732
TTTGTCATCCTAGTGA-8-7,-0.397132,-0.802775,-0.554740,-0.683688,-0.547951,-0.922510,0.472243,-0.990508,-0.227507,-0.484089,...,0.632502,1.753664,0.288843,1.080337,0.008534,-0.569109,2.303072,0.123610,0.419141,0.501744


In [38]:
SS2_MULTI_combined_df = SS2_MULTI_combined_df.T

In [39]:
SS2_MULTI_combined_df

Unnamed: 0,A10_D088223_B106234_S10.homo.gencode.v30.ERCC.chrM,A10_D088224_B106233_S10.homo.gencode.v30.ERCC.chrM,A11_B008880_B106232_S11.homo.gencode.v30.ERCC.chrM,A11_D088219_B106235_S11.homo.gencode.v30.ERCC.chrM,A11_D088224_B106233_S11.homo.gencode.v30.ERCC.chrM,A12_B008880_B106232_S12.homo.gencode.v30.ERCC.chrM,A12_D088219_B106235_S12.homo.gencode.v30.ERCC.chrM,A12_D088226_B106239_S12.homo.gencode.v30.ERCC.chrM,A13_D088219_B106235_S13.homo.gencode.v30.ERCC.chrM,A13_D088223_B106234_S13.homo.gencode.v30.ERCC.chrM,...,TTTGTCAAGGCAGTCA-8-7,TTTGTCAAGTAGATGT-8-7,TTTGTCACACGAAGCA-8-7,TTTGTCACACTCAGGC-8-7,TTTGTCACAGTATAAG-8-7,TTTGTCACATACAGCT-8-7,TTTGTCAGTTCCAACA-8-7,TTTGTCAGTTCGTCTC-8-7,TTTGTCATCCTAGTGA-8-7,TTTGTCATCTACTATC-8-7
EFNA5,-0.429295,1.079903,-0.494940,-0.249642,3.645844,-0.471494,-0.034913,-0.435691,1.025311,-0.431061,...,-0.396124,-0.431721,-0.412435,-0.410779,-0.511595,-0.402116,-0.411978,-0.412786,-0.397132,-0.518262
ELF3,-1.015788,1.762500,0.974294,0.924806,0.135886,1.032003,-0.237841,-1.005115,0.777302,0.701310,...,-0.468388,0.509888,-0.456137,-0.361244,-0.930588,-0.605538,0.075408,-0.136348,-0.802775,0.936952
CREG1,-0.644690,-0.421472,-0.466979,-0.282410,2.388566,-0.520138,-0.261187,-0.631327,0.014524,-0.643929,...,0.929974,-0.541866,-0.550674,-0.546129,-0.499097,-0.554103,-0.550984,-0.543263,-0.554740,1.396984
SDF2,-0.741781,-0.121354,-0.226631,-0.781806,0.138979,-0.353144,-0.219445,-0.705159,0.439575,0.728119,...,-0.680137,-0.651056,-0.672883,-0.662940,-0.546640,-0.681700,0.210087,-0.656324,-0.683688,-0.524901
NDUFB10,0.431602,-0.538002,-1.125769,-0.222392,-0.471434,1.710709,-0.032939,-1.255919,-0.870899,0.198103,...,0.975759,-1.559831,-0.154665,-1.550376,1.360332,-0.385514,0.306985,0.439593,-0.547951,1.394196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DCTPP1,-0.776359,0.289254,-0.673121,0.427904,-0.766412,-0.535438,0.209485,-0.825593,0.313723,0.400640,...,-0.561171,-0.901044,0.131900,-0.703107,1.200908,0.563537,-0.710060,-0.045012,-0.569109,-0.565778
CCDC88A,-0.411786,2.481177,-0.671874,2.955042,1.957295,-0.438611,4.005063,-0.490932,2.539005,-0.215712,...,-0.439659,-0.530144,1.301818,-0.475395,-0.721811,-0.456764,0.554474,0.992405,2.303072,-0.734030
GPAA1,1.633001,-0.885219,-1.147932,1.270807,-0.710952,-0.979225,1.534949,-0.922056,0.276274,1.030053,...,-0.923280,0.491873,-0.373266,2.000878,0.091499,-0.441102,-1.014332,0.862388,0.123610,0.973844
ZNF706,0.446491,0.320179,0.702279,1.038622,-1.192958,0.393493,1.104558,-1.241553,0.349358,0.326610,...,0.291851,-0.114989,-0.806522,0.875040,1.881553,-0.000008,0.444494,0.369716,0.419141,0.958077


In [40]:
SS2_MULTI_combined_df.to_csv('SS2_MULTI_tumor_only_shared_genes_gc.csv')

In [41]:
SS2_metadata = adata_SS2_processed_tumor.obs.copy()
SS2_metadata['method'] = 'SS2'
SS2_metadata

Unnamed: 0,plate_ID,well,cell_id,Tumor_ID,Animal_ID,receptor_status,ER_receptor,intrinsic_BC_type,metastatic_potential_rank,metastatic_potential_group,...,S_score,G2M_score,phase,louvain,leiden,E_score,M_score,EMP_score,EMP_stage,method
A10_D088223_B106234_S10.homo.gencode.v30.ERCC.chrM,D088223,A10,A10_D088223,J53353,A2781,TNBC,ER-,basal,12,high,...,-0.095455,-0.156764,G1,8,7,-0.256206,-0.005513,0.250694,Mesenchymal-like,SS2
A10_D088224_B106233_S10.homo.gencode.v30.ERCC.chrM,D088224,A10,A10_D088224,H4272,876,TNBC,ER-,basal,6,low,...,-0.151092,-0.181816,G1,9,9,-0.066053,0.036384,0.102436,EMP Intermediate,SS2
A11_B008880_B106232_S11.homo.gencode.v30.ERCC.chrM,B008880,A11,A11_B008880,HCI011,928,ER+PR+,ER+,luminal B,8,moderate,...,-0.009401,0.914188,G2M,3,3,-0.285115,-0.364366,-0.079251,EMP Intermediate,SS2
A11_D088219_B106235_S11.homo.gencode.v30.ERCC.chrM,D088219,A11,A11_D088219,HCI010,D0098,TNBC,ER-,basal,13,high,...,-0.175175,-0.232306,G1,2,2,-0.131939,0.443152,0.575091,Mesenchymal-like,SS2
A11_D088224_B106233_S11.homo.gencode.v30.ERCC.chrM,D088224,A11,A11_D088224,H4272,876,TNBC,ER-,basal,6,low,...,-0.103416,-0.163204,G1,9,9,-0.079068,-0.224259,-0.145191,EMP Intermediate,SS2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P2_B008648_B008695.homo.gencode.v30.ERCC.chrM,B008648,P2,P2_B008648,H5471,2401,TNBC,ER-,basal,3,low,...,-0.031380,0.401153,G2M,4,4,-0.493750,-0.275921,0.217830,Mesenchymal-like,SS2
P4_B008648_B008695.homo.gencode.v30.ERCC.chrM,B008648,P4,P4_B008648,H5471,2401,TNBC,ER-,basal,3,low,...,-0.171220,-0.233650,G1,1,1,-0.040506,-0.407374,-0.366868,Epithelial-like,SS2
P5_B008648_B008695.homo.gencode.v30.ERCC.chrM,B008648,P5,P5_B008648,H5471,2401,TNBC,ER-,basal,3,low,...,-0.084996,-0.064025,G1,1,1,-0.362897,-0.169785,0.193111,EMP Intermediate,SS2
P6_B008648_B008695.homo.gencode.v30.ERCC.chrM,B008648,P6,P6_B008648,H5471,2401,TNBC,ER-,basal,3,low,...,-0.094404,-0.055287,G1,1,1,-0.161300,-0.217604,-0.056304,EMP Intermediate,SS2


In [42]:
MULTI_metadata = adata_MULTI_processed_tumor.obs.copy()
MULTI_metadata['method'] = 'MULTI'
MULTI_metadata

Unnamed: 0,n_counts,n_genes,percent_mito,MULTI_barcode,MULTI_tumor_ID,MULTI_sort,Tumor_ID,run_id,sequencing_batch,sort,...,S_score,G2M_score,phase,louvain,leiden,E_score,M_score,EMP_score,EMP_stage,method
AAACCTGAGCCACTAT-1-0,3081.045410,1204,28.874271,Sample4,H4272,tumor,H4272,PDX1A,1,Tumor,...,-0.009338,-0.086057,G1,0,0,-0.230684,-0.175436,0.055248,EMP Intermediate,MULTI
AAACCTGTCCTTTCGG-1-0,8258.421875,3054,3.121930,Sample2,HCI001,tumor,HCI001,PDX1A,1,Tumor,...,-0.011026,-0.040799,G1,2,2,-0.005601,-0.152973,-0.147371,EMP Intermediate,MULTI
AAACCTGTCTTTACAC-1-0,5620.009766,2297,3.691891,Sample4,H4272,tumor,H4272,PDX1A,1,Tumor,...,0.362262,-0.017178,S,0,0,-0.241318,-0.198665,0.042653,EMP Intermediate,MULTI
AAACGGGCACCACGTG-1-0,4077.374268,1175,26.499096,Sample4,H4272,tumor,H4272,PDX1A,1,Tumor,...,0.031187,-0.066564,S,0,0,-0.215355,-0.187617,0.027738,EMP Intermediate,MULTI
AAACGGGCACCTCGTT-1-0,3337.482910,1231,14.729706,Sample2,HCI001,tumor,HCI001,PDX1A,1,Tumor,...,-0.036659,-0.073135,G1,2,2,-0.237576,-0.190947,0.046629,EMP Intermediate,MULTI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACATACAGCT-8-7,25223.130859,3714,3.514345,Negative,Negative,Negative,J53353,PDX3B,3,Tumor,...,-0.056953,-0.140692,G1,1,1,-0.405807,0.125637,0.531445,Mesenchymal-like,MULTI
TTTGTCAGTTCCAACA-8-7,19600.021484,3681,5.502220,Negative,Negative,Negative,J53353,PDX3B,3,Tumor,...,-0.025819,-0.137633,G1,1,1,-0.405343,0.086573,0.491916,Mesenchymal-like,MULTI
TTTGTCAGTTCGTCTC-8-7,50359.718750,5625,4.933639,Negative,Negative,Negative,J53353,PDX3B,3,Tumor,...,-0.013910,-0.043573,G1,1,1,-0.414896,0.026527,0.441423,Mesenchymal-like,MULTI
TTTGTCATCCTAGTGA-8-7,20192.769531,3937,1.519440,Negative,Negative,Negative,J53353,PDX3B,3,Tumor,...,-0.071511,-0.130449,G1,5,5,-0.276916,0.299085,0.576001,Mesenchymal-like,MULTI


In [43]:
final_metadata = pd.concat([SS2_metadata, MULTI_metadata])
final_metadata

Unnamed: 0,plate_ID,well,cell_id,Tumor_ID,Animal_ID,receptor_status,ER_receptor,intrinsic_BC_type,metastatic_potential_rank,metastatic_potential_group,...,E_score,M_score,EMP_score,EMP_stage,method,MULTI_barcode,MULTI_tumor_ID,MULTI_sort,run_id,ranking_ID
A10_D088223_B106234_S10.homo.gencode.v30.ERCC.chrM,D088223,A10,A10_D088223,J53353,A2781,TNBC,ER-,basal,12,high,...,-0.256206,-0.005513,0.250694,Mesenchymal-like,SS2,,,,,
A10_D088224_B106233_S10.homo.gencode.v30.ERCC.chrM,D088224,A10,A10_D088224,H4272,876,TNBC,ER-,basal,6,low,...,-0.066053,0.036384,0.102436,EMP Intermediate,SS2,,,,,
A11_B008880_B106232_S11.homo.gencode.v30.ERCC.chrM,B008880,A11,A11_B008880,HCI011,928,ER+PR+,ER+,luminal B,8,moderate,...,-0.285115,-0.364366,-0.079251,EMP Intermediate,SS2,,,,,
A11_D088219_B106235_S11.homo.gencode.v30.ERCC.chrM,D088219,A11,A11_D088219,HCI010,D0098,TNBC,ER-,basal,13,high,...,-0.131939,0.443152,0.575091,Mesenchymal-like,SS2,,,,,
A11_D088224_B106233_S11.homo.gencode.v30.ERCC.chrM,D088224,A11,A11_D088224,H4272,876,TNBC,ER-,basal,6,low,...,-0.079068,-0.224259,-0.145191,EMP Intermediate,SS2,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACATACAGCT-8-7,,,,J53353,,TNBC,ER-,basal,12,high,...,-0.405807,0.125637,0.531445,Mesenchymal-like,MULTI,Negative,Negative,Negative,PDX3B,12_J53353
TTTGTCAGTTCCAACA-8-7,,,,J53353,,TNBC,ER-,basal,12,high,...,-0.405343,0.086573,0.491916,Mesenchymal-like,MULTI,Negative,Negative,Negative,PDX3B,12_J53353
TTTGTCAGTTCGTCTC-8-7,,,,J53353,,TNBC,ER-,basal,12,high,...,-0.414896,0.026527,0.441423,Mesenchymal-like,MULTI,Negative,Negative,Negative,PDX3B,12_J53353
TTTGTCATCCTAGTGA-8-7,,,,J53353,,TNBC,ER-,basal,12,high,...,-0.276916,0.299085,0.576001,Mesenchymal-like,MULTI,Negative,Negative,Negative,PDX3B,12_J53353


In [44]:
final_metadata.to_csv('SS2_MULTI_tumor_only_shared_genes_metadata.csv')

In [45]:
gene_metadata = pd.DataFrame()
gene_metadata['gene'] = final_list
gene_metadata = gene_metadata.set_index('gene')
for i in gene_metadata.index:
    if i in final_low_overlap:
        gene_metadata.loc[i, 'group'] = 'low'
    elif i in final_intermediate_overlap:
        gene_metadata.loc[i, 'group'] = 'moderate'
    elif i in final_high_overlap:
        gene_metadata.loc[i, 'group'] = 'high'

In [46]:
gene_metadata

Unnamed: 0_level_0,group
gene,Unnamed: 1_level_1
EFNA5,low
ELF3,low
CREG1,low
SDF2,low
NDUFB10,low
...,...
DCTPP1,high
CCDC88A,high
GPAA1,high
ZNF706,high


In [47]:
gene_metadata.to_csv('SS2_MULTI_tumor_only_shared_genes_gene_metadata.csv')