In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

#### Default Paths

In [2]:
diff_expression_path = Path('./dados_proteomica_com_foldchange.xlsx')
cancer_sheet = 'Cancer-Rat'
preeclam_sheet = 'Preeclampsia-Rat'

string_cancer_path = Path('./String_cancer.csv')
string_preeclam_path = Path('./String_pre_eclampsia.csv')

kegg_cancer_path = Path('./KEGG_cancer.csv')
kegg_preeclam_path = Path('./KEGG_pre_eclampsia.xlsx')
kegg_common_path = Path('./KEGG_compartilhadas.xlsx')

output_path = Path('./output')
output_path.mkdir(exist_ok=True)

#### Load expression data

In [4]:
cancer_exp_cols = ['Query_protein', 'Gene_name', 'FC']
rename_cancer_exp_cols = {'Query_protein':'protein', 'Gene_name':'gene', 'FC':'fc'}
cancer_expression_data = pd.read_excel(diff_expression_path, sheet_name=cancer_sheet, usecols=cancer_exp_cols).rename(columns=rename_cancer_exp_cols)
cancer_expression_data['gene'] = cancer_expression_data['gene'].str.strip()
cancer_expression_data['regulated'] = np.where(cancer_expression_data['fc'] > 0, 'up', 'down')


preeclam_exp_cols = ['protein_firstname', 'Gene_name_correct', 'FC']
rename_preeclam_exp_cols = {'protein_firstname':'protein', 'Gene_name_correct':'gene', 'FC':'fc'}
preeclam_expression_data = pd.read_excel(diff_expression_path, sheet_name=preeclam_sheet, usecols=preeclam_exp_cols).rename(columns=rename_preeclam_exp_cols)
preeclam_expression_data['gene'] = preeclam_expression_data['gene'].str.strip()
preeclam_expression_data['regulated'] = np.where(preeclam_expression_data['fc'] > 0, 'up', 'down')


In [6]:
len(cancer_expression_data['gene'])

177

In [9]:
len(preeclam_expression_data['gene'])

326

In [47]:
# Drop rows missing gene or fold change

print('## Cancer NAs:\n', cancer_expression_data.isna().sum())
print()
print('## Pre-eclam NAs:\n', preeclam_expression_data.isna().sum())

cancer_expression_data.dropna(how='any', subset=['gene', 'fc'], inplace=True)
preeclam_expression_data.dropna(how='any', subset=['gene', 'fc'], inplace=True)

## Cancer NAs:
 protein      0
gene         0
fc           1
regulated    0
dtype: int64

## Pre-eclam NAs:
 protein       0
gene         10
fc            0
regulated     0
dtype: int64


In [48]:
# Cancer - Drop duplicated genes (?) (keep the first occurrence)
display(cancer_expression_data['gene'].value_counts()[cancer_expression_data['gene'].value_counts() > 1])
display(cancer_expression_data[cancer_expression_data['gene'] == 'Tuba1c'])
cancer_expression_data.drop_duplicates(subset='gene', keep='first', inplace=True)

Tuba1c    2
Name: gene, dtype: int64

Unnamed: 0,protein,gene,fc,regulated
0,Q6AYZ1,Tuba1c,-0.44037,down
160,A0A0H2UHM7,Tuba1c,-0.355256,down


In [49]:
# Pre-eclampsia - Drop duplicated genes (?) (keep the first occurrence)
display(preeclam_expression_data['gene'].value_counts()[preeclam_expression_data['gene'].value_counts() > 1])
display(preeclam_expression_data[(preeclam_expression_data['gene'] == 'Gbe1') | (preeclam_expression_data['gene'] == 'Gpx3')])
preeclam_expression_data.drop_duplicates(subset='gene', keep='first', inplace=True)

Gbe1    2
Gpx3    2
Name: gene, dtype: int64

Unnamed: 0,protein,gene,fc,regulated
208,A0A0G2JTB2,Gbe1,0.533101,up
209,A0A096MJY6,Gbe1,0.537608,up
317,A0A0G2K531,Gpx3,-0.320739,down
318,A0A0G2K8W9,Gpx3,-0.290763,down


In [52]:
preeclam_expression_data[preeclam_expression_data['gene'].str.contains(".", regex=False)]
# preeclam_expression_data[preeclam_expression_data['gene'] == '.']


Unnamed: 0,protein,gene,fc,regulated
315,F1LTN6,AABR07060872.1,0.754191,up


##### Merge differential expression data (cancer + pre-eclampsia)

In [53]:
merged_expression_data = cancer_expression_data.merge(preeclam_expression_data, on='gene', suffixes=['_cancer', '_pre_eclampsia'], how='outer')
merged_expression_data['cancer'] = np.where(merged_expression_data['fc_cancer'].notna(), 1, 0)
merged_expression_data['pre_eclampsia'] = np.where(merged_expression_data['fc_pre_eclampsia'].notna(), 1, 0)

merged_expression_data['both'] = np.where((merged_expression_data['cancer'] == 1) & (merged_expression_data['pre_eclampsia'] == 1), 1, 0)

merged_expression_data['present_in'] = np.where((merged_expression_data['cancer'] == 1) & (merged_expression_data['pre_eclampsia'] == 1), 1, 0)


In [54]:
merged_expression_data['present_in'] = np.where(merged_expression_data['cancer'] == 1, np.where(merged_expression_data['pre_eclampsia'] == 1, 'both', 'cancer'), 'pre_eclampsia')

In [56]:
merged_expression_data.head(10)

Unnamed: 0,protein_cancer,gene,fc_cancer,regulated_cancer,protein_pre_eclampsia,fc_pre_eclampsia,regulated_pre_eclampsia,cancer,pre_eclampsia,both,present_in
0,Q6AYZ1,Tuba1c,-0.44037,down,,,,1,0,0,cancer
1,Q9EPH1,A1bg,-0.778146,down,,,,1,0,0,cancer
2,P06238,A2m,-0.711127,down,,,,1,0,0,cancer
3,G3V9J6,Abcb1b,-0.557322,down,,,,1,0,0,cancer
4,P68136,Acta1,-0.284675,down,,,,1,0,0,cancer
5,P62738,Acta2,-0.316151,down,,,,1,0,0,cancer
6,D3ZRN3,Actbl2,-0.322669,down,,,,1,0,0,cancer
7,P68035,Actc1,-0.284966,down,,,,1,0,0,cancer
8,P63259,Actg2,-0.308877,down,,,,1,0,0,cancer
9,Q9Z1P2,Actn1,0.240738,up,Q9Z1P2,0.263762,up,1,1,1,both


In [57]:
# Verify common genes
merged_expression_data[merged_expression_data['fc_cancer'].notna() & merged_expression_data['fc_pre_eclampsia'].notna()]

Unnamed: 0,protein_cancer,gene,fc_cancer,regulated_cancer,protein_pre_eclampsia,fc_pre_eclampsia,regulated_pre_eclampsia,cancer,pre_eclampsia,both,present_in
9,Q9Z1P2,Actn1,0.240738,up,Q9Z1P2,0.263762,up,1,1,1,both
13,P29410,Ak2,-0.890588,down,A0A0G2JSG6,0.287707,up,1,1,1,both
15,P11884,Aldh2,-0.461168,down,F1LN88,-0.441715,down,1,1,1,both
36,B0BNA5,Cotl1,0.821756,up,B0BNA5,-1.921076,down,1,1,1,both
44,Q9JHL4,Dbnl,-1.110252,down,Q9JHL4,-0.311186,down,1,1,1,both
48,Q6TUG0,Dnajb11,-0.428988,down,Q6TUG0,0.442867,up,1,1,1,both
51,Q641Z6,Ehd1,-0.450791,down,Q641Z6,-0.294619,down,1,1,1,both
52,Q8R3Z7,Ehd4,0.576265,up,Q8R3Z7,-0.293523,down,1,1,1,both
58,Q8R4A1,Ero1a,0.306258,up,Q8R4A1,0.773918,up,1,1,1,both
60,G3V843,F2,-0.553859,down,G3V843,0.293442,up,1,1,1,both


In [58]:
# Count number of genes
ct_cancer = sum(merged_expression_data['cancer'] == 1)
ct_preeclam = sum(merged_expression_data['pre_eclampsia'] == 1)
ct_both = sum(merged_expression_data['both'] == 1)
print(f'# Cancer = {ct_cancer}, # Pre-eclampsia = {ct_preeclam}, # Common = {ct_both}')


# Cancer = 175, # Pre-eclampsia = 314, # Common = 22


In [60]:
# Save data
exp_data_fn = 'expression_data'
cancer_expression_data.to_csv(Path(output_path, f'{exp_data_fn}_cancer.csv'), index=False)
preeclam_expression_data.to_csv(Path(output_path, f'{exp_data_fn}_pre_eclampsia.csv'), index=False)
merged_expression_data.to_csv(Path(output_path, f'{exp_data_fn}_merged.csv'), index=False)

#### Load String-db data (PPI)

In [11]:
string_cols = ['#node1', 'node2', 'node1_string_id', 'node2_string_id']
rename_string_cols = {'#node1': 'node1', 'node1_string_id':'node1_id', 'node2_string_id':'node2_id'}

# Note: current string files are separated by semicolon
cancer_ppi_data = pd.read_csv(string_cancer_path, usecols=string_cols, sep=';').rename(columns=rename_string_cols)
preeclam_ppi_data = pd.read_csv(string_preeclam_path, usecols=string_cols, sep=';').rename(columns=rename_string_cols)

In [12]:
# Cancer
display(cancer_ppi_data.head(10))

cancer_genes_in_ppi = set(cancer_ppi_data['node1'].values.tolist() + cancer_ppi_data['node2'].values.tolist())
cancer_all_genes = set(cancer_expression_data['gene'].values.tolist())

print(f'Cancer -> # Total genes = {len(cancer_all_genes)}, # Genes in PPI = {len(cancer_genes_in_ppi)}')
print(f'Genes NOT in ppi ({len(cancer_all_genes - cancer_genes_in_ppi)}):')
print(cancer_all_genes - cancer_genes_in_ppi)


Unnamed: 0,node1,node2,node1_id,node2_id
0,Acta1,Actc1,10116.ENSRNOP00000024084,10116.ENSRNOP00000011773
1,Acta1,Acta2,10116.ENSRNOP00000024084,10116.ENSRNOP00000073101
2,Acta1,Actg2,10116.ENSRNOP00000024084,10116.ENSRNOP00000050322
3,Acta2,Actc1,10116.ENSRNOP00000073101,10116.ENSRNOP00000011773
4,Acta2,Tln1,10116.ENSRNOP00000073101,10116.ENSRNOP00000022401
5,Acta2,Pxn,10116.ENSRNOP00000073101,10116.ENSRNOP00000043928
6,Acta2,Actg2,10116.ENSRNOP00000073101,10116.ENSRNOP00000050322
7,Acta2,Myl6,10116.ENSRNOP00000073101,10116.ENSRNOP00000070200
8,Acta2,Myh14,10116.ENSRNOP00000073101,10116.ENSRNOP00000072636
9,Acta2,Vcl,10116.ENSRNOP00000073101,10116.ENSRNOP00000074748


Cancer -> # Total genes = 176, # Genes in PPI = 104
Genes NOT in ppi (72):
{'Vapa', 'Pfkl', 'Tgm2', 'Ptbp1', 'Atp1b1', 'Ndrg1', 'Hbb-b2', 'Itih2', 'Api5', 'Lgals1', 'Rilpl1', 'Car1', 'Alpp', 'Aimp1', 'A2m', 'Rpia', 'Cyb5b', 'Prdx4', 'Tubb4b', 'Hba', 'Rragb', 'Cdv3', 'A1bg', 'Atxn10', 'Lonp1', 'Gnpda1', 'Ddx17', 'Krt18', 'Prmt8', 'Rab43', 'Nefl', 'En2', 'Cntrl', 'Mat2a', 'Gstm4', 'Hax1', 'Bsg', 'Atic', 'Dbnl', 'Aldh2', 'Mug2', 'Sfxn3', 'Abcb1b', 'Anxa2', 'Vat1', 'Uchl4', 'Psap', 'Ak2', 'Rab33b', 'Spata1', 'Stk26', 'Csrp1', 'Etfb', 'Slc25a10', 'Ddost', 'Rap2c', 'Afm', 'Cotl1', 'Emc2', 'Eno2', 'Ubap2l', 'Dpysl2', 'Hpx', 'Cand1', 'Slc3a2', 'Olfm4', 'Dstn', 'Rbp4', 'Mug1', 'Fkbp14', 'Cpsf6', 'Serpina3n'}


In [13]:
unique_nodes = set(cancer_ppi_data['node1'].values.tolist() + cancer_ppi_data['node2'].values.tolist())
print(f'Cancer -> # of Edges = {len(cancer_ppi_data)}, # of Nodes = {len(unique_nodes)}')

Cancer -> # of Edges = 185, # of Nodes = 104


In [14]:
## Pre-eclampsia
display(preeclam_ppi_data.head(10))

preeclam_genes_in_ppi = set(preeclam_ppi_data['node1'].values.tolist() + preeclam_ppi_data['node2'].values.tolist())
preeclam_all_genes = set(preeclam_expression_data['gene'].values.tolist())
print(f'Pre-eclampsia -> # Total genes = {len(preeclam_all_genes)}, # Genes in PPI = {len(preeclam_genes_in_ppi)}')

print(f'Genes NOT in ppi ({len(preeclam_all_genes - preeclam_genes_in_ppi)}):')
print(preeclam_all_genes - preeclam_genes_in_ppi)

Unnamed: 0,node1,node2,node1_id,node2_id
0,Actn1,Itga6,10116.ENSRNOP00000068851,10116.ENSRNOP00000002075
1,Actn1,Mcam,10116.ENSRNOP00000068851,10116.ENSRNOP00000010464
2,Actn1,Efhd2,10116.ENSRNOP00000068851,10116.ENSRNOP00000018864
3,Actn1,Itga2b,10116.ENSRNOP00000068851,10116.ENSRNOP00000052051
4,Ada,Nt5c3a,10116.ENSRNOP00000014151,10116.ENSRNOP00000071357
5,Ada,Adk,10116.ENSRNOP00000014151,10116.ENSRNOP00000073983
6,Adk,Ak2,10116.ENSRNOP00000073983,10116.ENSRNOP00000000134
7,Adk,Jchain,10116.ENSRNOP00000073983,10116.ENSRNOP00000004866
8,Adk,Upp1,10116.ENSRNOP00000073983,10116.ENSRNOP00000006765
9,Adk,Nt5c3a,10116.ENSRNOP00000073983,10116.ENSRNOP00000071357


Pre-eclampsia -> # Total genes = 315, # Genes in PPI = 153
Genes NOT in ppi (162):
{nan, 'Ankrd13a', 'Tm9sf2', 'C4bpa', 'Ank3', 'Afap1', 'Cmas', 'Tmpo', 'Lzic', 'Lgals1', 'Eps8l2', 'Krt7', 'Aif1l', 'Maged2', 'Fbln1', 'Pitrm1', 'Ctsm', 'Lta4h', 'Septin5', 'Uba6', 'Slc4a2', 'Nln', 'Uggt1', 'Chdh', 'Ceacam11', 'Loxl2', 'Fam136a', 'Cd5l', 'Dbnl', 'Sec11a', 'AABR07060872.1', 'Casp6', 'Lima1', 'Emb', 'Lgals5', 'Phactr4', 'Psg19', 'Matr3', 'Hpx', 'Vcpip1', 'Serpinb9', 'Arsb', 'Lrrc59', 'Hdhd2', 'Prl8a9', 'Kng1', 'Prl4a1', 'Nars1', 'Aqp1', 'Gab1', 'Myo18a', 'Prl6a1', 'Klhl22', 'Tnfaip8', 'Agt', 'Abcb4', 'Slc2a1', 'Plcd1', 'Tes', 'Creg1', 'Naxd', 'Orm1', 'LOC299282', 'Fndc3a', 'Slc27a3', 'Retsat', 'Pdlim4', 'Gnb4', 'St6gal1', 'Prss8', 'Arpp19', 'Bgn', 'Crtap', 'Prrc1', 'Cbr1l1', 'Lamp1', 'Fblim1', 'Abracl', 'Dpp3', 'Vnn1', 'Grn', 'Ddah1', 'Inf2', 'Timp3', 'Xdh', 'Sgpl1', 'Sh3bgrl', 'Etfdh', 'Sars1', 'Cobll1', 'Pgk1', 'Man2a1', 'Nostrin', 'Tspan9', 'Limd1', 'Lad1', 'Mesd', 'Gar1', 'Pon2', 'Snx12

In [16]:
unique_nodes = set(preeclam_ppi_data['node1'].values.tolist() + preeclam_ppi_data['node2'].values.tolist())
print(f'Pre-eclampsia -> # of Edges = {len(preeclam_ppi_data)}, # of Nodes = {len(unique_nodes)}')

Pre-eclampsia -> # of Edges = 315, # of Nodes = 153


In [83]:
###### Verificar como podem existir Genes no STRING que não estão na tabela de expressão diferencial
# preeclam_ppi_data = preeclam_ppi_data[preeclam_ppi_data['node1'].isin(preeclam_all_genes) & preeclam_ppi_data['node2'].isin(preeclam_all_genes)]



# preeclam_genes_in_ppi = set(preeclam_ppi_data['node1'].values.tolist() + preeclam_ppi_data['node2'].values.tolist())
# preeclam_all_genes = set(preeclam_expression_data['gene'].values.tolist())
# print(f'Cancer -> # Total genes = {len(preeclam_all_genes)}, # Genes in PPI = {len(preeclam_genes_in_ppi)}')

# print(f'Genes NOT in ppi ({len(preeclam_all_genes - preeclam_genes_in_ppi)}):')
# print(preeclam_all_genes - preeclam_genes_in_ppi)

In [84]:
# Merge STRING networks and check/remove for duplicates
merged_ppi_data = pd.concat([cancer_ppi_data, preeclam_ppi_data], axis=0, join='outer', ignore_index=True)
display(merged_ppi_data)
print('#Duplicated Edges:')
merged_ppi_data.pivot_table(index = ['node1', 'node2'], aggfunc ='size')[merged_ppi_data.pivot_table(index = ['node1', 'node2'], aggfunc ='size') > 1]

Unnamed: 0,node1,node2,node1_id,node2_id
0,Acta1,Actc1,10116.ENSRNOP00000024084,10116.ENSRNOP00000011773
1,Acta1,Acta2,10116.ENSRNOP00000024084,10116.ENSRNOP00000073101
2,Acta1,Actg2,10116.ENSRNOP00000024084,10116.ENSRNOP00000050322
3,Acta2,Actc1,10116.ENSRNOP00000073101,10116.ENSRNOP00000011773
4,Acta2,Tln1,10116.ENSRNOP00000073101,10116.ENSRNOP00000022401
...,...,...,...,...
495,Stk10,Tfrc,10116.ENSRNOP00000044325,10116.ENSRNOP00000002407
496,Tmed3,Uso1,10116.ENSRNOP00000018603,10116.ENSRNOP00000003277
497,Tmed3,Tmed7,10116.ENSRNOP00000018603,10116.ENSRNOP00000004942
498,Tmed7,Uso1,10116.ENSRNOP00000004942,10116.ENSRNOP00000003277


#Duplicated Edges:


node1    node2 
Dnajb11  Ruvbl2    2
Ehd1     Ehd4      2
Tuba4a   Tubb3     2
dtype: int64

In [85]:
merged_ppi_data.drop_duplicates(keep='first', ignore_index=True, inplace=True)
unique_nodes = set(merged_ppi_data['node1'].values.tolist() + merged_ppi_data['node2'].values.tolist())
print(f'# of Edges = {len(merged_ppi_data)}, # of Nodes = {len(unique_nodes)}')


# of Edges = 497, # of Nodes = 243


In [18]:
ppi_fn = 'ppi_data'
cancer_ppi_data.to_csv(Path(output_path, f'./{ppi_fn}_cancer.csv'), index=False)
preeclam_ppi_data.to_csv(Path(output_path, f'./{ppi_fn}_pre_eclampsia.csv'), index=False)
merged_ppi_data.to_csv(Path(output_path, f'./{ppi_fn}_merged.csv'), index=False)


#### Load and process KEGG enrichment (pathways)

In [35]:
pathway_rename_cols = {
    '#term ID' : 'pathway_id',
    'term description' : 'pathway_name',
    'observed gene count' : 'observed_gene_count',
    'background gene count' : 'background_gene_count',
    'strength' : 'strength',
    'false discovery rate' : 'fdr',
    'matching proteins in your network (labels)' : 'gene_labels_in'
}


In [36]:
## Cancer
cancer_pathway_data = pd.read_csv(kegg_cancer_path, sep=';', usecols=list(pathway_rename_cols.keys())).rename(columns=pathway_rename_cols)
cancer_pathway_data['pathway_name'] = cancer_pathway_data['pathway_name'].str.replace(',', ';')
cancer_pathway_data[['pathway_id', 'pathway_name']]


Unnamed: 0,pathway_id,pathway_name
0,rno04145,Phagosome
1,rno04540,Gap junction
2,rno04810,Regulation of actin cytoskeleton
3,rno04530,Tight junction
4,rno04210,Apoptosis
5,rno04270,Vascular smooth muscle contraction
6,rno04141,Protein processing in endoplasmic reticulum
7,rno04611,Platelet activation
8,rno04961,Endocrine and other factor-regulated calcium r...
9,rno00020,Citrate cycle (TCA cycle)


In [37]:
cancer_pathway_net = cancer_pathway_data[['pathway_id', 'pathway_name', 'gene_labels_in']].copy()
cancer_pathway_net['gene_labels_in'] = cancer_pathway_net['gene_labels_in'].str.split(',')
cancer_pathway_net = cancer_pathway_net.explode(column='gene_labels_in', ignore_index=True)
cancer_pathway_net

Unnamed: 0,pathway_id,pathway_name,gene_labels_in
0,rno04145,Phagosome,Tfrc
1,rno04145,Phagosome,Tuba4a
2,rno04145,Phagosome,Tubb3
3,rno04145,Phagosome,Tubb6
4,rno04145,Phagosome,Ctsl
...,...,...,...
116,rno01230,Biosynthesis of amino acids,Cs
117,rno00010,Glycolysis / Gluconeogenesis,Pfkl
118,rno00010,Glycolysis / Gluconeogenesis,Aldh2
119,rno00010,Glycolysis / Gluconeogenesis,LOC100911625


In [38]:
## Pre-eclampsia
preeclam_pathway_data = pd.read_excel(kegg_preeclam_path, usecols=list(pathway_rename_cols.keys())).rename(columns=pathway_rename_cols)
preeclam_pathway_data['pathway_name'] = preeclam_pathway_data['pathway_name'].str.replace(',', ';')
preeclam_pathway_data[['pathway_id', 'pathway_name']]

Unnamed: 0,pathway_id,pathway_name
0,rno04141,Protein processing in endoplasmic reticulum
1,rno01100,Metabolic pathways
2,rno00010,Glycolysis / Gluconeogenesis
3,rno03010,Ribosome
4,rno04066,HIF-1 signaling pathway
5,rno03060,Protein export
6,rno01230,Biosynthesis of amino acids
7,rno01200,Carbon metabolism
8,rno04145,Phagosome
9,rno04142,Lysosome


In [39]:
preeclam_pathway_net = preeclam_pathway_data[['pathway_id', 'pathway_name', 'gene_labels_in']].copy()
preeclam_pathway_net['gene_labels_in'] = preeclam_pathway_net['gene_labels_in'].str.split(',')
preeclam_pathway_net = preeclam_pathway_net.explode(column='gene_labels_in', ignore_index=True)
preeclam_pathway_net

Unnamed: 0,pathway_id,pathway_name,gene_labels_in
0,rno04141,Protein processing in endoplasmic reticulum,Hsph1
1,rno04141,Protein processing in endoplasmic reticulum,Erp29
2,rno04141,Protein processing in endoplasmic reticulum,Dnajb11
3,rno04141,Protein processing in endoplasmic reticulum,Sec31a
4,rno04141,Protein processing in endoplasmic reticulum,Calr
...,...,...,...
140,rno04922,Glucagon signaling pathway,Slc2a1
141,rno04216,Ferroptosis,Tfrc
142,rno04216,Ferroptosis,Slc40a1
143,rno04216,Ferroptosis,Hmox1


In [40]:
print(f"Cancer -> # of pathways = {len(cancer_pathway_net['pathway_name'].unique())}, "
      + f"# of genes = {len(cancer_pathway_net['gene_labels_in'].unique())}, # of edges = {len(cancer_pathway_net)}")

print(f"Pre-eclampsia -> # of pathways = {len(preeclam_pathway_net['pathway_name'].unique())}, "
      + f"# of genes = {len(preeclam_pathway_net['gene_labels_in'].unique())}, # of edges = {len(preeclam_pathway_net)}")

Cancer -> # of pathways = 16, # of genes = 67, # of edges = 121
Pre-eclampsia -> # of pathways = 12, # of genes = 103, # of edges = 145


In [41]:
# Merge both pathway nets and check/remove for duplicates

merged_pathway_net = pd.concat([cancer_pathway_net, preeclam_pathway_net], axis=0, join='outer', ignore_index=True)
display(merged_pathway_net)
print()
merged_pathway_net.pivot_table(index = ['pathway_name', 'gene_labels_in'], aggfunc ='size')[merged_pathway_net.pivot_table(index = ['pathway_name', 'gene_labels_in'], aggfunc ='size') > 1]

Unnamed: 0,pathway_id,pathway_name,gene_labels_in
0,rno04145,Phagosome,Tfrc
1,rno04145,Phagosome,Tuba4a
2,rno04145,Phagosome,Tubb3
3,rno04145,Phagosome,Tubb6
4,rno04145,Phagosome,Ctsl
...,...,...,...
261,rno04922,Glucagon signaling pathway,Slc2a1
262,rno04216,Ferroptosis,Tfrc
263,rno04216,Ferroptosis,Slc40a1
264,rno04216,Ferroptosis,Hmox1





pathway_name                                 gene_labels_in
Glycolysis / Gluconeogenesis                 Aldh2             2
                                             Ldhb              2
Phagosome                                    Tfrc              2
                                             Tuba4a            2
                                             Tubb3             2
Protein processing in endoplasmic reticulum  Dnajb11           2
                                             Ero1a             2
                                             Lman2             2
dtype: int64

In [42]:
merged_pathway_net.drop_duplicates(keep='first', ignore_index=True, inplace=True)

In [43]:
print(f"Merged net -> # of pathways = {len(merged_pathway_net['pathway_name'].unique())}, "
      + f"# of genes = {len(merged_pathway_net['gene_labels_in'].unique())}, # of edges = {len(merged_pathway_net)}")

Merged net -> # of pathways = 22, # of genes = 162, # of edges = 258


In [44]:
pathway_net_fn = 'kegg_net'
cancer_pathway_net.to_csv(Path(output_path, f'./{pathway_net_fn}_cancer.csv'), index=False)
preeclam_pathway_net.to_csv(Path(output_path, f'./{pathway_net_fn}_pre_eclampsia.csv'), index=False)
merged_pathway_net.to_csv(Path(output_path, f'./{pathway_net_fn}_merged.csv'), index=False)

#### KEGG compartilhadas

In [17]:
# comp_pathway_data = pd.read_excel(kegg_common_path)

In [18]:
# KEGG compartilhadas
pathway_rename_cols = {
    '#term ID' : 'pathway_id',
    'term description' : 'pathway_name',
    'observed gene count' : 'observed_gene_count',
    'background gene count' : 'background_gene_count',
    'strength' : 'strength',
    'false discovery rate' : 'fdr',
    'matching proteins in your network (labels)' : 'gene_labels_in',
    'Unnamed: 8' : 'disease'
}

comp_pathway_data = pd.read_excel(kegg_common_path, usecols=list(pathway_rename_cols.keys())).rename(columns=pathway_rename_cols)
comp_pathway_data['pathway_name'] = comp_pathway_data['pathway_name'].str.replace(',', ';')
comp_pathway_data[['pathway_id', 'pathway_name', 'gene_labels_in', 'disease']]


Unnamed: 0,pathway_id,pathway_name,gene_labels_in,disease
0,rno04141,Protein processing in endoplasmic reticulum,"Hsph1,Erp29,Dnajb11,Sec31a,Calr,Pdia4,Ero1a,Ck...",pre_eclampsia
1,rno04141,Protein processing in endoplasmic reticulum,"Dnajb11,Ero1a,Nsfl1c,Lman2,Ubqln4,P4hb,Ddost,R...",Cancer
2,rno01230,Biosynthesis of amino acids,"Shmt2,Idh1,Got1,Phgdh,Aldoc,Pgk1",pre_eclampsia
3,rno01230,Biosynthesis of amino acids,"Pfkl,Rpia,Mat2a,Cs",Cancer
4,rno01200,Carbon metabolism,"Hk2,Shmt2,Idh1,Got1,Phgdh,Aldoc,Pgk1",pre_eclampsia
5,rno01200,Carbon metabolism,"Pfkl,Rpia,Sdha,Cs,Suclg2,Ogdh",Cancer
6,rno03010,Ribosome,"Rpl24,Rpl32,Rpl22,Rpl23a,Rps13,Rpl30,Rpl14",Cancer
7,rno03010,Ribosome,"Rpl19,Rpl34,Rpl35,Rpl13,Rpl18a,Rps25,Rps15,Rpl...",pre_eclampsia
8,rno04145,Phagosome,"Tfrc,Tuba4a,Tubb3,Tubb6,Ctsl,Tuba3b,Tuba8,C3,T...",Cancer
9,rno04145,Phagosome,"Tfrc,Calr,Tuba4a,Sec61a1,Tubb3,Lamp1,RT1-CE1,C...",pre_eclampsia


In [19]:
comp_pathway_net = comp_pathway_data[['pathway_id', 'pathway_name', 'gene_labels_in', 'disease']].copy()
comp_pathway_net['gene_labels_in'] = comp_pathway_net['gene_labels_in'].str.split(',')
comp_pathway_net = comp_pathway_net.explode(column='gene_labels_in', ignore_index=True)
comp_pathway_net

Unnamed: 0,pathway_id,pathway_name,gene_labels_in,disease
0,rno04141,Protein processing in endoplasmic reticulum,Hsph1,pre_eclampsia
1,rno04141,Protein processing in endoplasmic reticulum,Erp29,pre_eclampsia
2,rno04141,Protein processing in endoplasmic reticulum,Dnajb11,pre_eclampsia
3,rno04141,Protein processing in endoplasmic reticulum,Sec31a,pre_eclampsia
4,rno04141,Protein processing in endoplasmic reticulum,Calr,pre_eclampsia
...,...,...,...,...
97,rno00010,Glycolysis / Gluconeogenesis,Hk2,pre_eclampsia
98,rno00010,Glycolysis / Gluconeogenesis,Pgm1,pre_eclampsia
99,rno00010,Glycolysis / Gluconeogenesis,Ldhb,pre_eclampsia
100,rno00010,Glycolysis / Gluconeogenesis,Aldoc,pre_eclampsia


In [20]:
print(f"Shared -> # of pathways = {len(comp_pathway_net['pathway_name'].unique())}, "
      + f"# of genes = {len(comp_pathway_net['gene_labels_in'].unique())}, # of edges = {len(comp_pathway_net)}")

Shared -> # of pathways = 6, # of genes = 79, # of edges = 102


In [33]:
comp_pathway_net_cancer = comp_pathway_net[comp_pathway_net['disease'] == 'Cancer']
comp_pathway_net_preeclam = comp_pathway_net[comp_pathway_net['disease'] == 'pre_eclampsia']

print(f"Shared Cancer -> # of pathways = {len(comp_pathway_net_cancer['pathway_name'].unique())}, "
      + f"# of genes = {len(comp_pathway_net_cancer['gene_labels_in'].unique())}, # of edges = {len(comp_pathway_net_cancer)}")

print(f"Shared Pre-eclampsia -> # of pathways = {len(comp_pathway_net_preeclam['pathway_name'].unique())}, "
      + f"# of genes = {len(comp_pathway_net_preeclam['gene_labels_in'].unique())}, # of edges = {len(comp_pathway_net_preeclam)}")

Shared Cancer -> # of pathways = 6, # of genes = 38, # of edges = 42
Shared Pre-eclampsia -> # of pathways = 6, # of genes = 49, # of edges = 60


In [22]:
pathway_net_fn = 'kegg_net'
# comp_pathway_net_cancer.to_csv(Path(output_path, f'./{pathway_net_fn}_compartilhadas_cancer.csv'), index=False)
# comp_pathway_net_preeclam.to_csv(Path(output_path, f'./{pathway_net_fn}_compartilhadas_preeclam.csv'), index=False)

comp_pathway_net.to_csv(Path(output_path, f'./{pathway_net_fn}_compartilhadas.csv'), index=False)
