In [1]:
# importing python modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import glob

%matplotlib inline
sc.logging.print_header()

scanpy==1.8.1 anndata==0.7.5 umap==0.5.1 numpy==1.21.0 scipy==1.6.2 pandas==1.2.3 scikit-learn==0.23.2 statsmodels==0.12.2 python-igraph==0.9.1 louvain==0.7.0 leidenalg==0.8.2 pynndescent==0.5.2


In [2]:
adata_SS2_processed = sc.read_h5ad('SS2_processed.h5ad')
adata_MULTI_processed = sc.read_h5ad('MULTI_processed.h5ad')

# SS2 genes

In [3]:
SS2_EMP_files = [f for f in glob.glob("DEGs/SS2/EMP/*.csv")]

In [4]:
SS2_EMP_files

['DEGs/SS2/EMP/MAST_global_Epithelial-like.csv',
 'DEGs/SS2/EMP/MAST_global_Mesenchymal-like.csv',
 'DEGs/SS2/EMP/MAST_global_EMP Intermediate.csv']

In [5]:
SS2_EMP_genes = pd.DataFrame()
for i in SS2_EMP_files:
    temp_df = pd.DataFrame()
    EMT_ID = i.split('.')[0].split('MAST_global_')[1]
    df = pd.read_csv(i,index_col = 0)
    df = df[df['p_val'] < 0.05]
    up_genes = df[df['avg_log2FC'] > 0.5].sort_values(by='avg_log2FC', ascending=False).index.tolist()
    
    temp_df[EMT_ID] = up_genes
   
    SS2_EMP_genes = pd.concat([SS2_EMP_genes,temp_df], ignore_index=False, axis=1)

In [6]:
SS2_EMP_genes

Unnamed: 0,Epithelial-like,Mesenchymal-like,EMP Intermediate
0,KRT8,COL9A3,KRT15
1,SLC9A3R1,C2orf40,SCGB3A1
2,KRT18,BGN,S100A2
3,LGALS3BP,MSLN,AZGP1
4,INTS1,COMP,ACTA1
...,...,...,...
303,,WNK1,
304,,TUBA1B,
305,,H2AFJ,
306,,AZIN1,


In [7]:
SS2_EMP_genes.to_csv('SS2_EMP_genes_list.csv')

In [8]:
SS2_E_genes = [x for x in SS2_EMP_genes['Epithelial-like'].tolist() if str(x) != 'nan']
SS2_H_genes = [x for x in SS2_EMP_genes['EMP Intermediate'].tolist() if str(x) != 'nan']
SS2_M_genes = [x for x in SS2_EMP_genes['Mesenchymal-like'].tolist() if str(x) != 'nan']

# MULTI genes

In [9]:
MULTI_EMP_files = [f for f in glob.glob("DEGs/MULTI/EMP/*.csv")]

In [10]:
MULTI_EMP_files

['DEGs/MULTI/EMP/MAST_global_Epithelial-like.csv',
 'DEGs/MULTI/EMP/MAST_global_Mesenchymal-like.csv',
 'DEGs/MULTI/EMP/MAST_global_EMP Intermediate.csv']

In [11]:
MULTI_EMP_genes = pd.DataFrame()
for i in MULTI_EMP_files:
    temp_df = pd.DataFrame()
    EMT_ID = i.split('.')[0].split('MAST_global_')[1]
    df = pd.read_csv(i,index_col = 0)
    df = df[df['p_val'] < 0.05]
    up_genes = df[df['avg_log2FC'] > 0.5].sort_values(by='avg_log2FC', ascending=False).index.tolist()
    
    temp_df[EMT_ID] = up_genes
   
    MULTI_EMP_genes = pd.concat([MULTI_EMP_genes,temp_df], ignore_index=False, axis=1)

In [12]:
MULTI_EMP_genes

Unnamed: 0,Epithelial-like,Mesenchymal-like,EMP Intermediate
0,TFF1,SNORC,FDCSP
1,TFF3,SCRG1,WFDC2
2,AGR2,COL9A3,RARRES1
3,SCGB2A2,H19,LINC00472
4,SLC39A6,S100A4,C5orf46
...,...,...,...
284,,PLAG1,
285,,CSPG4,
286,,TMEM208,
287,,MGP,


In [13]:
MULTI_EMP_genes.to_csv('MULTI_EMP_genes_list.csv')

In [14]:
MULTI_E_genes = [x for x in MULTI_EMP_genes['Epithelial-like'].tolist() if str(x) != 'nan']
MULTI_H_genes = [x for x in MULTI_EMP_genes['EMP Intermediate'].tolist() if str(x) != 'nan']
MULTI_M_genes = [x for x in MULTI_EMP_genes['Mesenchymal-like'].tolist() if str(x) != 'nan']

# find overlap genes between SS2 and MULTI-seq

In [15]:
final_E_overlap = list(set(SS2_E_genes)&set(MULTI_E_genes))
len(final_E_overlap)

62

In [16]:
final_H_overlap = list(set(SS2_H_genes)&set(MULTI_H_genes))
len(final_H_overlap)

5

In [17]:
final_M_overlap = list(set(SS2_M_genes[:])&set(MULTI_M_genes[:]))
len(final_M_overlap)

132

In [18]:
final_overlap_EMP_genes = pd.DataFrame()
temp_df_E = pd.DataFrame()
temp_df_H = pd.DataFrame()
temp_df_M = pd.DataFrame()
temp_df_E['low_overlap'] = final_E_overlap
temp_df_H['intermediate_overlap'] = final_H_overlap
temp_df_M['high_overlap'] = final_M_overlap
final_overlap_EMP_genes = pd.concat([final_overlap_EMP_genes,temp_df_E,temp_df_H,temp_df_M], ignore_index=False, axis=1)
final_overlap_EMP_genes

Unnamed: 0,low_overlap,intermediate_overlap,high_overlap
0,NSD3,CRYAB,RPL23A
1,NAT1,KRT15,PLOD1
2,ARRDC1,CALML5,CITED4
3,EFHD1,CD24,RPL28
4,SCCPDH,S100A2,HAPLN1
...,...,...,...
127,,,COL11A2
128,,,NR4A2
129,,,RPS12
130,,,CYBA


In [19]:
final_overlap_EMP_genes.to_csv('SS2_MULTI_overlap_EMP_genes_list.csv')

# SS2 EMP Heatmap

In [20]:
SS2_EMP_genes_list = SS2_E_genes + SS2_H_genes + SS2_M_genes

adata_SS2_processed_E = adata_SS2_processed[adata_SS2_processed.obs['EMP_stage'] == 'Epithelial-like']
SS2_E_metadata_df = adata_SS2_processed_E.obs.sort_values(by='EMP_score',ascending=True)

adata_SS2_processed_H = adata_SS2_processed[adata_SS2_processed.obs['EMP_stage'] == 'EMP Intermediate']
SS2_H_metadata_df = adata_SS2_processed_H.obs.sort_values(by='EMP_score',ascending=True)

adata_SS2_processed_M = adata_SS2_processed[adata_SS2_processed.obs['EMP_stage'] == 'Mesenchymal-like']
SS2_M_metadata_df = adata_SS2_processed_M.obs.sort_values(by='EMP_score',ascending=True)


  res = method(*args, **kwargs)


In [21]:
len(set(SS2_EMP_genes_list))

568

In [22]:
final_SS2_EMP_genes_list = []
for i in SS2_EMP_genes_list:
    if i not in final_SS2_EMP_genes_list:
        final_SS2_EMP_genes_list.append(i)

In [23]:
SS2_E_gc_df = adata_SS2_processed_E.to_df()[final_SS2_EMP_genes_list]
SS2_E_gc_df = SS2_E_gc_df.T[SS2_E_metadata_df.index].T

SS2_H_gc_df = adata_SS2_processed_H.to_df()[final_SS2_EMP_genes_list]
SS2_H_gc_df = SS2_H_gc_df.T[SS2_H_metadata_df.index].T

SS2_M_gc_df = adata_SS2_processed_M.to_df()[final_SS2_EMP_genes_list]
SS2_M_gc_df = SS2_M_gc_df.T[SS2_M_metadata_df.index].T

final_SS2_gc_df = pd.concat([SS2_E_gc_df,SS2_H_gc_df,SS2_M_gc_df])
final_SS2_gc_df = final_SS2_gc_df.T

final_SS2_gc_df.to_csv('SS2_global_EMT_gc.csv')

In [24]:
SS2_final_metadata_df = pd.concat([SS2_E_metadata_df,SS2_H_metadata_df,SS2_M_metadata_df])
SS2_final_metadata_df.to_csv('SS2_global_EMT_metadata.csv')

In [25]:
SS2_gene_metadata = pd.DataFrame()
SS2_gene_metadata['gene'] = final_SS2_EMP_genes_list
SS2_gene_metadata = SS2_gene_metadata.set_index('gene')
for i in SS2_gene_metadata.index:
    if i in SS2_E_genes:
        SS2_gene_metadata.loc[i, 'group'] = 'Epithelial-like'
    elif i in SS2_H_genes:
        SS2_gene_metadata.loc[i, 'group'] = 'EMP Intermediate'
    elif i in SS2_M_genes:
        SS2_gene_metadata.loc[i, 'group'] = 'Mesenchymal-like'
        
SS2_gene_metadata.to_csv('SS2_global_EMT_metadata_gene.csv')

# MULTI EMP Heatmap

In [26]:
MULTI_EMP_genes_list = MULTI_E_genes + MULTI_H_genes + MULTI_M_genes

adata_MULTI_processed_E = adata_MULTI_processed[adata_MULTI_processed.obs['EMP_stage'] == 'Epithelial-like']
MULTI_E_metadata_df = adata_MULTI_processed_E.obs.sort_values(by='EMP_score',ascending=True)

adata_MULTI_processed_H = adata_MULTI_processed[adata_MULTI_processed.obs['EMP_stage'] == 'EMP Intermediate']
MULTI_H_metadata_df = adata_MULTI_processed_H.obs.sort_values(by='EMP_score',ascending=True)

adata_MULTI_processed_M = adata_MULTI_processed[adata_MULTI_processed.obs['EMP_stage'] == 'Mesenchymal-like']
MULTI_M_metadata_df = adata_MULTI_processed_M.obs.sort_values(by='EMP_score',ascending=True)


In [27]:
len(set(MULTI_EMP_genes_list))

622

In [28]:
final_MULTI_EMP_genes_list = []
for i in MULTI_EMP_genes_list:
    if i not in final_MULTI_EMP_genes_list:
        final_MULTI_EMP_genes_list.append(i)

In [29]:
MULTI_E_gc_df = adata_MULTI_processed_E.to_df()[final_MULTI_EMP_genes_list]
MULTI_E_gc_df = MULTI_E_gc_df.T[MULTI_E_metadata_df.index].T

MULTI_H_gc_df = adata_MULTI_processed_H.to_df()[final_MULTI_EMP_genes_list]
MULTI_H_gc_df = MULTI_H_gc_df.T[MULTI_H_metadata_df.index].T

MULTI_M_gc_df = adata_MULTI_processed_M.to_df()[final_MULTI_EMP_genes_list]
MULTI_M_gc_df = MULTI_M_gc_df.T[MULTI_M_metadata_df.index].T

final_MULTI_gc_df = pd.concat([MULTI_E_gc_df,MULTI_H_gc_df,MULTI_M_gc_df])
final_MULTI_gc_df = final_MULTI_gc_df.T

final_MULTI_gc_df.to_csv('MULTI_global_EMT_gc.csv')

In [30]:
MULTI_final_metadata_df = pd.concat([MULTI_E_metadata_df,MULTI_H_metadata_df,MULTI_M_metadata_df])
MULTI_final_metadata_df.to_csv('MULTI_global_EMT_metadata.csv')

In [31]:
MULTI_gene_metadata = pd.DataFrame()
MULTI_gene_metadata['gene'] = final_MULTI_EMP_genes_list
MULTI_gene_metadata = MULTI_gene_metadata.set_index('gene')
for i in MULTI_gene_metadata.index:
    if i in MULTI_E_genes:
        MULTI_gene_metadata.loc[i, 'group'] = 'Epithelial-like'
    elif i in MULTI_H_genes:
        MULTI_gene_metadata.loc[i, 'group'] = 'EMP Intermediate'
    elif i in MULTI_M_genes:
        MULTI_gene_metadata.loc[i, 'group'] = 'Mesenchymal-like'
        
MULTI_gene_metadata.to_csv('MULTI_global_EMT_metadata_gene.csv')