In [1]:
import numpy as np
import pandas as pd
import os
import requests
import json
import xarray as xr
import scipy.stats as stats

from pathlib import Path
from scipy.stats import mannwhitneyu
from statsmodels.stats import multitest


# arrange metadata

In [2]:
# load the genome and ortholog metadata tables

# input files (relative paths)
filepath_genome_metadata = '../../data/metadata/genome-metadata.csv'
filepath_ortholog_metadata = '../../data/metadata/ortholog-metadata.csv'

# read both tables
genome_df = pd.read_csv(filepath_genome_metadata)
ortho_df = pd.read_csv(filepath_ortholog_metadata)

# relabel the genome metadata for the purposes of this analysis
# 1. fold uncultured marine viruses into the 'Virus' group and give every virus a placeholder clade
is_virus = genome_df['Group'].isin(['Uncultured-marine-virus', 'Virus'])
genome_df.loc[is_virus, 'Group'] = 'Virus'
genome_df.loc[is_virus, 'Clade'] = 'N/A Virus'
# 2. genomes flagged in the 'Virocell' column get their own group and placeholder clade
#    (applied last, so it overrides any label set above)
is_virocell = genome_df['Virocell']
genome_df.loc[is_virocell, 'Group'] = 'Virocell'
genome_df.loc[is_virocell, 'Clade'] = 'N/A Virocell'

genome_df


Unnamed: 0,GenomeID,GenomeName,Type,Group,Clade,Virocell,Completeness
0,2716884698,AG-316-L16,SAG,Prochlorococcus,AMZ-II,False,20.69
1,2716884700,AG-316-N23,SAG,Prochlorococcus,AMZ-II,False,20.69
2,2716884701,AG-316-P23,SAG,Prochlorococcus,AMZ-II,False,12.07
3,2716884699,AG-316-L21,SAG,Prochlorococcus,AMZ-II,False,10.34
4,2716884642,AG-316-A05,SAG,Prochlorococcus,AMZ-II,False,6.90
...,...,...,...,...,...,...,...
765,651703106,S-ShM2,ISOLATE,Virus,N/A Virus,False,0.00
766,651703107,Syn19,ISOLATE,Virus,N/A Virus,False,0.00
767,641201056,Syn5,ISOLATE,Virus,N/A Virus,False,0.00
768,2595698410,metaG-MbCM1,ISOLATE,Virus,N/A Virus,False,0.00


In [3]:
# annotate each ortholog record with the id, group, and clade of its source genome

# one GenomeName-indexed lookup table serves all three columns
genome_lookup = genome_df.set_index('GenomeName')
for column in ['GenomeID', 'Group', 'Clade']:
    ortho_df[column] = ortho_df['GenomeName'].map(genome_lookup[column])

ortho_df


Unnamed: 0,MappingName,CyCOGID,GenomeName,GeneID,Annotation,GenomeID,Group,Clade
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit,2606217514,Synechococcus,5.1A-III
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit,2681812859,Prochlorococcus,LLI
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit,2716884419,Prochlorococcus,HLII
3,scB245a_521A19_2655604637,60000001,scB245a_521A19,2655604637,membrane protease FtsH catalytic subunit,2654587735,Prochlorococcus,HLII
4,GFB01_2638208352,60000001,GFB01,2638208352,membrane protease FtsH catalytic subunit,2636415834,Synechococcus,5.2
...,...,...,...,...,...,...,...,...
964917,AG-363-C02_2667889608,60040295,AG-363-C02,2667889608,hypothetical protein,2667527365,Prochlorococcus,LLII.LLIII
964918,AG-363-C02_2667889615,60040295,AG-363-C02,2667889615,hypothetical protein,2667527365,Prochlorococcus,LLII.LLIII
964919,AG-363-C02_2667890048,60040295,AG-363-C02,2667890048,hypothetical protein,2667527365,Prochlorococcus,LLII.LLIII
964920,AG-363-C02_2667890054,60040295,AG-363-C02,2667890054,hypothetical protein,2667527365,Prochlorococcus,LLII.LLIII


In [4]:
# start ko mapping file

def _count_refs_by(ref_df, column):
    """Return a CyCOGID x `column` table of reference-gene counts.

    Rows are CyCOG ids, columns are the distinct values of `column`, and
    combinations that never occur are filled with 0. Rows of `ref_df` with a
    missing CyCOGID or a missing `column` value are not counted.
    """
    return ref_df.groupby(['CyCOGID', column]).size().unstack(fill_value=0)


# total number of reference genes in each CyCOG
ko_map_df = ortho_df.groupby('CyCOGID').GeneID.count().reset_index().rename(columns={'GeneID': 'TotalRefs'})
# add in genus counts
ko_map_df = ko_map_df.join(_count_refs_by(ortho_df, 'Group'), on='CyCOGID', how='left')
# add in clade counts (only of select clades we deal with in this study)
clades = ['HLI', 'HLII', 'LLI', '5.1B-I', '5.1B-CRD1', '5.1A-II', '5.1A-III', '5.1A-IV', '5.1A-CRD2', '5.1A-UC-A-EnvC']
ko_map_df = ko_map_df.join(
    _count_refs_by(ortho_df[ortho_df.Clade.isin(clades)], 'Clade'),
    on='CyCOGID',
    how='left'
).fillna(0)    # CyCOGs with no member in a joined table come out of the left join as NaN
# convert dataframe back to ints (the NaNs introduced by the joins upcast the counts to float)
ko_map_df = ko_map_df.astype(int)
# add back CyCOG annotation
# keep exactly one annotation per CyCOG: a lookup Series with a repeated CyCOGID in its
# index makes .map raise, which would happen if any CyCOG carried two annotation strings
cycog_annotations = ortho_df[['CyCOGID', 'Annotation']].drop_duplicates(
    subset='CyCOGID').set_index('CyCOGID')['Annotation']
ko_map_df['DescriptionCyCOG'] = ko_map_df['CyCOGID'].map(cycog_annotations)

ko_map_df


Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,5.1A-UC-A-EnvC,5.1B-CRD1,5.1B-I,HLI,HLII,LLI,DescriptionCyCOG
0,60000001,1376,1211,158,7,0,17,17,8,18,9,37,19,196,532,184,membrane protease FtsH catalytic subunit
1,60000002,1453,1287,157,9,0,10,19,9,16,9,40,18,198,552,198,ATP-dependent Clp protease ATP-binding subunit...
2,60000003,1387,1222,156,9,0,12,16,9,21,8,41,14,203,523,192,ATP-dependent Clp protease proteolytic subunit...
3,60000004,1631,1369,255,7,0,22,24,16,27,26,62,29,173,542,265,hypothetical protein
4,60000005,990,877,108,5,0,8,11,5,14,7,25,13,128,383,131,chaperonin GroEL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40290,60040291,4,4,0,0,0,0,0,0,0,0,0,0,0,0,4,hypothetical protein
40291,60040292,4,4,0,0,0,0,0,0,0,0,0,0,0,0,4,Tryptophan-rich Synechocystis species C-termin...
40292,60040293,5,0,5,0,0,0,0,0,0,0,0,0,0,0,0,Putative transposase
40293,60040294,6,0,6,0,0,0,0,0,0,0,0,0,0,0,0,sulfate transport system substrate-binding pro...


# import kegg annotations from genomes

In [5]:
# verify that every genome referenced in the ortholog table has a data directory

data_path = Path('../../data/refseqs/genomes/')

# pair each genome id with its expected directory and keep the ones that do not exist
missing = [
    (g_id, data_path / str(g_id))
    for g_id in ortho_df['GenomeID'].unique()
    if not (data_path / str(g_id)).is_dir()
]
for g_id, genome_dir in missing:
    print(f'{g_id} directory not found: {genome_dir}')

count = len(missing)
print(f'{count} total missing directories')


0 total missing directories


In [6]:
# import all kegg annotations as a df

# read each genome's KO table once and concatenate a single time; concatenating inside
# the loop re-copies the accumulated frame on every iteration (quadratic in total rows)
ko_tables = [
    pd.read_csv(data_path / f'{g_id}/{g_id}.ko.tab.txt', sep='\t')
    for g_id in ortho_df['GenomeID'].unique()
]
# pd.concat raises on an empty list, so fall back to an empty frame when there are no genomes
ko_df = pd.concat(ko_tables, ignore_index=True) if ko_tables else pd.DataFrame()

ko_df


Unnamed: 0,gene_oid,gene_length,percent_identity,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
0,2607658051,465,99.8,1,465,1,465,0.000000e+00,906.4,KO:K02313,chromosomal replication initiator protein,,Yes
1,2607658053,410,100.0,1,410,11,420,0.000000e+00,856.7,KO:K00799,glutathione S-transferase [EC:2.5.1.18],EC:2.5.1.18,Yes
2,2607658055,455,100.0,1,455,1,455,0.000000e+00,895.6,KO:K00383,glutathione reductase (NADPH) [EC:1.8.1.7],EC:1.8.1.7,Yes
3,2607658058,198,100.0,1,198,1,198,0.000000e+00,409.1,KO:K02276,cytochrome c oxidase subunit III [EC:1.9.3.1],EC:1.9.3.1,Yes
4,2607658059,562,100.0,1,562,1,562,0.000000e+00,1173.7,KO:K02274,cytochrome c oxidase subunit I [EC:1.9.3.1],EC:1.9.3.1,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
492987,2610145142,243,35.7,19,239,48,257,4.300000e-27,123.6,KO:K03791,putative chitinase,,Yes
492988,2610145144,309,37.5,4,292,17,311,0.000000e+00,198.4,KO:K03086,RNA polymerase primary sigma factor,,Yes
492989,2610145149,109,41.2,1,95,1,97,1.500000e-13,77.4,KO:K03111,single-strand DNA-binding protein,,Yes
492990,2610145152,261,40.5,5,254,10,269,2.000000e-43,177.9,KO:K10906,exodeoxyribonuclease VIII [EC:3.1.11.-],EC:3.1.11.-,Yes


# Deduplicate gene annotations

**Problem:** Some genes (specific nucleotide sequence from particular reference genome) have more than one KO annotation hit. 
- Based on the counts below, this comes out to 25,218 (23227 + 1973 + 15 + 2 + 1) out of 492,992 annotated genes, or ~5.12%. 
- Most of these are annotations that actually have the same KO number, but just a different EC number or something. Only 3,375 genes (0.68%) have multiple distinct KO annotations
- Of these, qualitatively it seems like the annotations are usually quite similar to one another (e.g. same pathway, or different subunits of the same protein complex)

**Implemented solution:** Since the KO annotations are based on an HMM search, first select the annotation with the lowest E-value. If the E-values are exactly the same, keep one of the tied annotations arbitrarily (in practice, whichever is listed first in the annotation table).


In [7]:
# how many genes have more than one ko number assigned?
# (first count the annotation rows per gene, then tally how many genes have each row count)

hits_per_gene = ko_df['gene_oid'].value_counts()
print(hits_per_gene.value_counts())


count
1    440542
2     23227
3      1973
4        15
6         2
5         1
Name: count, dtype: int64


In [8]:
# deeper look: examine genes with more than one annotation

# genes with more than one annotation row
counts = ko_df['gene_oid'].value_counts()
multi_hit_genes = counts[counts > 1].index

# number of distinct KO ids among each of those genes' annotations
ko_counts = ko_df[ko_df['gene_oid'].isin(multi_hit_genes)].groupby('gene_oid')['ko_id'].nunique()
print(ko_counts.value_counts())    # most have the same ko id, just different EC numbers or something

# show the annotation rows of genes whose hits disagree on the KO id
discordant_genes = ko_counts[ko_counts > 1].index
ko_df[ko_df['gene_oid'].isin(discordant_genes)]


ko_id
1    21836
2     3333
3       48
4        1
Name: count, dtype: int64


Unnamed: 0,gene_oid,gene_length,percent_identity,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
1549,2681971726,240,93.75,1,240,1,240,0.000000e+00,513.0,KO:K06182,23S rRNA pseudouridine2604 synthase [EC:5.4.99...,EC:5.4.99.21,Yes
1550,2681971726,240,93.75,1,240,1,240,0.000000e+00,513.0,KO:K06183,16S rRNA pseudouridine516 synthase [EC:5.4.99.19],EC:5.4.99.19,Yes
1561,2681971798,467,89.08,1,467,1,467,0.000000e+00,960.0,KO:K04094,methylenetetrahydrofolate--tRNA-(uracil-5-)-me...,EC:2.1.1.74,Yes
1562,2681971798,467,90.15,1,467,1,467,0.000000e+00,969.0,KO:K03495,tRNA uridine 5-carboxymethylaminomethyl modifi...,,Yes
1620,2681971886,348,80.77,1,338,1,338,0.000000e+00,680.0,KO:K08919,chlorophyll a/b binding light-harvesting prote...,,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
492225,2717343036,149,41.86,15,140,4,132,6.500000e-31,126.0,KO:K00980,glycerol-3-phosphate cytidylyltransferase [EC:...,EC:2.7.7.39,Yes
492226,2717343036,149,44.78,15,145,4,137,3.000000e-36,141.0,KO:K03272,D-beta-D-heptose 7-phosphate kinase / D-beta-D...,EC:2.7.1.167,Yes
492227,2717343036,149,44.78,15,145,4,137,3.000000e-36,141.0,KO:K03272,D-beta-D-heptose 7-phosphate kinase / D-beta-D...,EC:2.7.7.70,Yes
492269,2717343185,112,100.00,1,112,1,112,1.400000e-73,243.0,KO:K04751,nitrogen regulatory protein P-II 1,,Yes


In [9]:
# deduplicate genes with more than one annotation

# for each gene_oid's set of annotations, keep the single row with the lowest e-value
# NOTE: ties are not broken randomly -- idxmin returns the first row label holding the minimum,
# so when several rows share the lowest e-value (same KO but different EC numbers, or
# different KOs with the same e-value) the row listed first in ko_df is the one propagated
# NOTE(review): many hits share an e-value of 0 and bit_score is not consulted to separate
# them -- confirm that first-listed is the intended tie-break

ko_df = ko_df.loc[ko_df.groupby('gene_oid')['evalue'].idxmin(), :].reset_index(drop=True)
ko_df


Unnamed: 0,gene_oid,gene_length,percent_identity,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
0,638291477,531,48.40,4,522,3,549,0.000000e+00,461.5,KO:K17680,twinkle protein [EC:3.6.4.12],EC:3.6.4.12,Yes
1,638291481,243,43.10,5,240,3,242,1.500000e-37,158.3,KO:K02335,DNA polymerase I [EC:2.7.7.7],EC:2.7.7.7,Yes
2,638291512,235,61.80,26,231,1,207,0.000000e+00,253.1,KO:K03465,thymidylate synthase (FAD) [EC:2.1.1.148],EC:2.1.1.148,Yes
3,638310966,295,49.20,2,294,32,309,0.000000e+00,283.1,KO:K06223,DNA adenine methylase [EC:2.1.1.72],EC:2.1.1.72,Yes
4,638311025,200,32.50,49,200,48,186,1.100000e-12,75.5,KO:K07336,PKHD-type hydroxylase [EC:1.14.11.-],EC:1.14.11.-,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
465755,2721491637,397,65.72,27,379,39,383,0.000000e+00,514.0,KO:K02010,iron(III) transport system ATP-binding protein...,EC:3.6.3.30,Yes
465756,2721491638,266,63.67,10,264,5,250,3.500000e-107,354.0,KO:K01069,hydroxyacylglutathione hydrolase [EC:3.1.2.6],EC:3.1.2.6,Yes
465757,2721491639,219,87.26,2,213,1,212,9.000000e-129,411.0,KO:K00765,ATP phosphoribosyltransferase [EC:2.4.2.17],EC:2.4.2.17,Yes
465758,2721491640,599,81.64,1,599,1,599,0.000000e+00,1060.0,KO:K06147,"ATP-binding cassette, subfamily B, bacterial",,Yes


# Deduplicate CyCOG annotations

**Problem:** Some CyCOGs (Clusters of Orthologous Genes) consist of genes with discordant KO annotations
- Out of 40,295 orthologous gene families in CyCOGs v6, 36,693 have no KO annotation, meaning just 3,602 (8.94%) contain at least one gene with a KO annotation.
- In most of the annotated CyCOGs, all gene members had a concordant KO annotation. In 128 (3.55% of annotated CyCOGs), the KO annotations were discordant among gene members.
- Of these, qualitatively it seems like the discordant annotations are usually quite similar to one another (e.g. same pathway, or different subunits of the same protein complex)

**Implemented solution:** For each CyCOG, select the KO annotation shared by the largest number of members as the representative CyCOG annotation.


In [10]:
# attach the (deduplicated) kegg annotation of each gene to the reference gene set

# left join keeps every reference gene, annotated or not
annot_df = ortho_df.merge(ko_df, how='left', left_on='GeneID', right_on='gene_oid')

# how many unique ko annotations per ortholog group?
kos_per_cycog = annot_df.groupby('CyCOGID')['ko_id'].nunique()
print(kos_per_cycog.value_counts())

# drop the sequences without a ko annotation
annot_df = annot_df.dropna(subset=['ko_id'])

annot_df


ko_id
0     36693
1      3474
2        94
3        14
4        13
7         2
9         2
5         1
8         1
10        1
Name: count, dtype: int64


Unnamed: 0,MappingName,CyCOGID,GenomeName,GeneID,Annotation,GenomeID,Group,Clade,gene_oid,gene_length,...,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit,2606217514,Synechococcus,5.1A-III,2.607658e+09,637.0,...,1.0,637.0,1.0,637.0,0.0,1240.7,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit,2681812859,Prochlorococcus,LLI,2.681971e+09,640.0,...,1.0,640.0,1.0,640.0,0.0,1370.0,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit,2716884419,Prochlorococcus,HLII,2.717339e+09,584.0,...,1.0,584.0,1.0,584.0,0.0,1260.0,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
3,scB245a_521A19_2655604637,60000001,scB245a_521A19,2655604637,membrane protease FtsH catalytic subunit,2654587735,Prochlorococcus,HLII,2.655605e+09,584.0,...,1.0,584.0,1.0,584.0,0.0,1250.0,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
4,GFB01_2638208352,60000001,GFB01,2638208352,membrane protease FtsH catalytic subunit,2636415834,Synechococcus,5.2,2.638208e+09,646.0,...,1.0,643.0,1.0,643.0,0.0,1134.4,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964878,KORDI-49_2507312281,60040286,KORDI-49,2507312281,hypothetical protein,2507262011,Synechococcus,5.1A-WPC1,2.507312e+09,1889.0,...,1.0,1889.0,1.0,1889.0,0.0,3815.0,KO:K07004,,,Yes
964910,GFB01_2638207649,60040294,GFB01,2638207649,sulfate transport system substrate-binding pro...,2636415834,Synechococcus,5.2,2.638208e+09,289.0,...,7.0,280.0,6.0,279.0,0.0,389.4,KO:K02048,sulfate transport system substrate-binding pro...,,Yes
964911,GFB01_2638207687,60040294,GFB01,2638207687,sulfate transport system substrate-binding pro...,2636415834,Synechococcus,5.2,2.638208e+09,338.0,...,7.0,338.0,6.0,338.0,0.0,449.5,KO:K02048,sulfate transport system substrate-binding pro...,,Yes
964912,GFB01_2638207688,60040294,GFB01,2638207688,sulfate transport system substrate-binding pro...,2636415834,Synechococcus,5.2,2.638208e+09,348.0,...,10.0,348.0,19.0,363.0,0.0,502.3,KO:K02048,sulfate transport system substrate-binding pro...,,Yes


In [11]:
# look at non-unique ko mappings

# CyCOGs whose members carry more than one distinct KO id
counts = annot_df.groupby('CyCOGID').ko_id.nunique()
discordant_cycogs = counts[counts > 1].index

# for those CyCOGs, tally the members carrying each (ko_id, ko_name) pair
collisions_df = (
    annot_df[annot_df['CyCOGID'].isin(discordant_cycogs)]
    .groupby('CyCOGID')[['ko_id', 'ko_name']]
    .value_counts()
    .reset_index()
)

# show full KO names (the option is reset in a later cell)
pd.set_option('max_colwidth', None)
collisions_df


Unnamed: 0,CyCOGID,ko_id,ko_name,count
0,60000002,KO:K03696,ATP-dependent Clp protease ATP-binding subunit ClpC,443
1,60000002,KO:K03695,ATP-dependent Clp protease ATP-binding subunit ClpB,417
2,60000006,KO:K11329,"two-component system, OmpR family, response regulator RpaB",452
3,60000006,KO:K07659,"two-component system, OmpR family, phosphate regulon response regulator OmpR",73
4,60000006,KO:K07657,"two-component system, OmpR family, phosphate regulon response regulator PhoB",34
...,...,...,...,...
317,60014461,KO:K05879,"dihydroxyacetone kinase, C-terminal domain [EC:2.7.1.-]",1
318,60014461,KO:K05878,"dihydroxyacetone kinase, N-terminal domain [EC:2.7.1.-]",1
319,60014461,KO:K00863,dihydroxyacetone kinase [EC:2.7.1.29],1
320,60040231,KO:K11951,bicarbonate transport system permease protein,1


In [12]:
# deduplicate the discordant ortholog-ko mapping by simple majority vote

# count the members carrying each KO within every CyCOG, then keep the most common KO per CyCOG
# (idxmax returns the first maximum, so a tie goes to whichever KO value_counts lists first)
ko_count_df = annot_df.groupby('CyCOGID').ko_id.value_counts().rename('count').reset_index()
ko_count_df = ko_count_df.loc[ko_count_df.groupby('CyCOGID')['count'].idxmax(), :]

# add in KO annotation
# the description is the KO name with any trailing ' [EC:...]' suffix removed; the pattern is a
# raw string because '\[' is not a valid escape sequence in an ordinary string literal
name_map = annot_df[['ko_id', 'ko_name']].drop_duplicates().assign(
    DescriptionKO=lambda names: names['ko_name'].str.split(r' \[EC:').str[0]
)
name_map = name_map.drop_duplicates(subset='ko_id')    # pick one description from duplicates
ko_count_df = pd.merge(left=ko_count_df, right=name_map[['ko_id', 'DescriptionKO']], on='ko_id', how='left')

# remove prefix from KOID (the first three characters, e.g. 'KO:K03798' -> 'K03798')
ko_count_df['KOID'] = ko_count_df['ko_id'].str[3:]
# rename count column and drop old ko_id column
ko_count_df = ko_count_df[['CyCOGID', 'KOID', 'DescriptionKO', 'count']].rename(
    columns={'count': 'NRefsKO'}).set_index('CyCOGID')
# calculate refs with KO mapping other than the one chosen
# (annotated members of the CyCOG minus the members carrying the chosen KO; aligned on CyCOGID)
ko_count_df['NRefsOtherKO'] = annot_df.groupby('CyCOGID').GeneID.count() - ko_count_df['NRefsKO']

# add ko_count_df into ko_map_df (CyCOGs without any KO annotation get NaN in the new columns)
ko_map_df = pd.merge(ko_map_df, ko_count_df, on='CyCOGID', how='left')

pd.reset_option('max_colwidth')
ko_map_df


Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,...,5.1B-CRD1,5.1B-I,HLI,HLII,LLI,DescriptionCyCOG,KOID,DescriptionKO,NRefsKO,NRefsOtherKO
0,60000001,1376,1211,158,7,0,17,17,8,18,...,37,19,196,532,184,membrane protease FtsH catalytic subunit,K03798,cell division protease FtsH,1291.0,0.0
1,60000002,1453,1287,157,9,0,10,19,9,16,...,40,18,198,552,198,ATP-dependent Clp protease ATP-binding subunit...,K03696,ATP-dependent Clp protease ATP-binding subunit...,443.0,417.0
2,60000003,1387,1222,156,9,0,12,16,9,21,...,41,14,203,523,192,ATP-dependent Clp protease proteolytic subunit...,K01358,"ATP-dependent Clp protease, protease subunit",1360.0,0.0
3,60000004,1631,1369,255,7,0,22,24,16,27,...,62,29,173,542,265,hypothetical protein,,,,
4,60000005,990,877,108,5,0,8,11,5,14,...,25,13,128,383,131,chaperonin GroEL,K04077,chaperonin GroEL,935.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40290,60040291,4,4,0,0,0,0,0,0,0,...,0,0,0,0,4,hypothetical protein,,,,
40291,60040292,4,4,0,0,0,0,0,0,0,...,0,0,0,0,4,Tryptophan-rich Synechocystis species C-termin...,,,,
40292,60040293,5,0,5,0,0,0,0,0,0,...,0,0,0,0,0,Putative transposase,,,,
40293,60040294,6,0,6,0,0,0,0,0,0,...,0,0,0,0,0,sulfate transport system substrate-binding pro...,K02048,sulfate transport system substrate-binding pro...,4.0,0.0


In [13]:
# many different CyCOG orthologies are annotated with the same KO group -- look at some of these

# number of CyCOGs mapped to each KO, restricted to KOs shared by more than one CyCOG
ko_map_counts = ko_map_df['KOID'].value_counts()
shared_kos = ko_map_counts[ko_map_counts > 1]

print('There are {} of {} KOs with a non-unique CyCOGID-KOID mapping:'.format(
    len(shared_kos), ko_map_df['KOID'].nunique()))
print(shared_kos.head(20))

# example: the first CyCOGs mapped to K06147
ko_map_df[ko_map_df['KOID'].eq('K06147')].head(10)


There are 667 of 1751 KOs with a non-unique CyCOGID-KOID mapping:
KOID
K06147    62
K01784    41
K01953    37
K01154    28
K03090    24
K07257    23
K01652    20
K00615    18
K00067    18
K01790    18
K02500    17
K00059    17
K00604    17
K00058    16
K03086    16
K02501    16
K00161    15
K05577    15
K01710    14
K00558    14
Name: count, dtype: int64


Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,...,5.1B-CRD1,5.1B-I,HLI,HLII,LLI,DescriptionCyCOG,KOID,DescriptionKO,NRefsKO,NRefsOtherKO
577,60000578,467,408,57,2,0,8,6,3,4,...,13,7,68,179,66,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",440.0,0.0
732,60000733,461,412,46,3,0,2,4,2,6,...,12,6,73,174,60,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",433.0,0.0
768,60000769,463,411,51,1,0,5,5,2,6,...,12,6,71,172,53,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",428.0,0.0
1174,60001175,423,367,54,2,0,4,7,3,4,...,13,7,60,159,53,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",412.0,0.0
1222,60001223,513,441,71,1,0,5,5,4,8,...,20,8,57,169,72,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",414.0,8.0
1489,60001490,405,269,135,1,0,13,12,7,11,...,39,20,12,59,67,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",339.0,0.0
3322,60003323,33,25,8,0,0,0,2,0,1,...,0,1,0,0,2,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",27.0,0.0
4865,60004866,12,2,10,0,0,4,0,2,1,...,0,0,0,0,0,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",12.0,0.0
6618,60006619,6,6,0,0,0,0,0,0,0,...,0,0,4,2,0,"ABC-type multidrug transport system, ATPase an...",K06147,"ATP-binding cassette, subfamily B, bacterial",2.0,0.0
7547,60007548,7,0,7,0,0,0,1,0,0,...,0,1,0,0,0,"ATP-binding cassette, subfamily B/ATP-binding ...",K06147,"ATP-binding cassette, subfamily B, bacterial",3.0,2.0


# Get KEGG pathways

In [14]:
# get full reference brite hierarchy of kegg orthologies 

url = 'https://rest.kegg.jp/get/br:ko00001/json'
# time out instead of hanging the kernel on an unresponsive server
file = requests.get(url, allow_redirects=True, timeout=60)
# fail here on an HTTP error rather than on a JSON parsing error when the response is parsed
file.raise_for_status()

file


<Response [200]>

In [15]:
# parse the brite hierarchy into lookup dictionaries

pathway_dict = {}     # pathway id -> list of member KO ids
pathway_names = {}    # pathway id -> pathway name with its first six characters removed
ko_dict = {}          # KO id -> list of ids of the pathways it appears in
ko_names = {}         # KO id -> remainder of the KO entry name after the id

# systems specific to eukaryotes, multicellular organisms, and humans
# NOTE: this list is not applied anywhere in this cell -- no system is skipped in the loop below
excluded_systems = ['09144 Cellular community - eukaryotes', '09150 Organismal Systems', '09160 Human Diseases']

# walk the hierarchy: supersystem -> system -> pathway -> ko
brite = json.loads(file.content)
for supersystem in brite['children']:
    print(f'{supersystem["name"]}')
    for system in supersystem['children']:
        print(f'\t{system["name"]}')
        for pathway in system['children']:
            # the pathway id is 'ko' followed by the first space-delimited token of the name
            path_id = f'ko{pathway["name"].split(" ")[0]}'
            path_members = []
            # pathways without children are printed with 0 members but not recorded
            if 'children' in pathway:
                for ko in pathway['children']:
                    # entry name is '<ko id> <description>'; split at the first space
                    ko_id, _, ko_description = ko['name'].partition(' ')
                    ko_names[ko_id] = ko_description.strip(' ')
                    path_members.append(ko_id)
                    # record this pathway under the ko
                    ko_dict.setdefault(ko_id, []).append(path_id)
                pathway_dict[path_id] = path_members
                pathway_names[path_id] = pathway["name"][6:]
            print(f'\t\t{path_id}: {pathway["name"]} ({len(path_members)})')
      

09100 Metabolism
	09101 Carbohydrate metabolism
		ko00010: 00010 Glycolysis / Gluconeogenesis [PATH:ko00010] (107)
		ko00020: 00020 Citrate cycle (TCA cycle) [PATH:ko00020] (68)
		ko00030: 00030 Pentose phosphate pathway [PATH:ko00030] (89)
		ko00040: 00040 Pentose and glucuronate interconversions [PATH:ko00040] (90)
		ko00051: 00051 Fructose and mannose metabolism [PATH:ko00051] (113)
		ko00052: 00052 Galactose metabolism [PATH:ko00052] (78)
		ko00053: 00053 Ascorbate and aldarate metabolism [PATH:ko00053] (62)
		ko00500: 00500 Starch and sucrose metabolism [PATH:ko00500] (106)
		ko00520: 00520 Amino sugar and nucleotide sugar metabolism [PATH:ko00520] (156)
		ko00620: 00620 Pyruvate metabolism [PATH:ko00620] (134)
		ko00630: 00630 Glyoxylate and dicarboxylate metabolism [PATH:ko00630] (104)
		ko00640: 00640 Propanoate metabolism [PATH:ko00640] (97)
		ko00650: 00650 Butanoate metabolism [PATH:ko00650] (114)
		ko00660: 00660 C5-Branched dibasic acid metabolism [PATH:ko00660] (29)
		ko0

In [16]:
# Some KO descriptions from the most recent KEGG download do not match the descriptions
# of the old annotation. A rough screen shows 243/1752; look at these in more detail

name_df = ko_map_df[['KOID', 'DescriptionKO']].drop_duplicates().reset_index(drop=True)

# current KEGG name of each KO (NaN when the KO is absent from the parsed hierarchy)
names = [ko_names.get(koid, np.nan) for koid in name_df['KOID']]
# is the previous KO description a substring of the current KEGG name?
# (both sides are compared as strings, so a missing value is compared as 'nan')
substrings = [str(old) in str(new) for old, new in zip(name_df['DescriptionKO'], names)]
# add to dataframe
name_df['KEGGNames'] = names
name_df['SubstringMatch'] = substrings

# to screen all those that don't have a match, display name_df[~name_df['SubstringMatch']]
# after setting pd.set_option('display.max_rows', None) and pd.set_option('max_colwidth', None)

# manual inspection of some incongruent (updated?) KOs:
problem_kos = ['K02500', 'K02428', 'K02259', 'K01594', 'K00870', 'K02501', 'K05808', 'K03606', 'K01144', 
               'K03186', 'K01234', 'K03651', 'K07011', 'K03082', 'K01003', 'K05663', 'K03152']
name_df[name_df['KOID'].isin(problem_kos)]


Unnamed: 0,KOID,DescriptionKO,KEGGNames,SubstringMatch
11,K02500,cyclase,hisF; imidazole glycerol-phosphate synthase su...,False
16,K02428,XTP/dITP diphosphohydrolase,,False
162,K02259,cytochrome c oxidase assembly protein subunit 15,"COX15, ctaA; heme a synthase [EC:1.17.99.9]",False
348,K01594,sulfinoalanine decarboxylase,,False
356,K00870,protein kinase,,False
624,K02501,glutamine amidotransferase,hisH; imidazole glycerol-phosphate synthase su...,False
646,K05808,putative sigma-54 modulation protein,hpf; ribosome hibernation promoting factor,False
832,K03606,putative colanic acid biosysnthesis UDP-glucos...,wcaJ; undecaprenyl-phosphate glucose phosphotr...,False
969,K01144,exodeoxyribonuclease V,,False
978,K03186,4-hydroxy-3-polyprenylbenzoate decarboxylase,"ubiX, bsdB, PAD1; flavin prenyltransferase [EC...",False


In [17]:
# build a CyCOG x pathway membership table for the ko-annotated CyCOGs

pathway_df = ko_map_df.dropna(subset=['KOID']).reset_index(drop=True)

# one indicator column per pathway, in order of first appearance
data_dict = {}
for koid in pathway_df['KOID'].unique():
    # 1 for every CyCOG annotated with this ko, 0 otherwise
    has_ko = pathway_df['KOID'].eq(koid).astype(int)
    # kos absent from the parsed hierarchy contribute to no pathway
    for pathway in ko_dict.get(koid, []):
        data_dict[pathway] = data_dict[pathway] + has_ko if pathway in data_dict else has_ko

pathway_df = pd.merge(pathway_df, pd.DataFrame(data_dict), left_index=True, right_index=True)
pathway_df


Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,...,ko04148,ko04911,ko04725,ko04360,ko04139,ko00542,ko03040,ko03041,ko03250,ko00062
0,60000001,1376,1211,158,7,0,17,17,8,18,...,0,0,0,0,0,0,0,0,0,0
1,60000002,1453,1287,157,9,0,10,19,9,16,...,0,0,0,0,0,0,0,0,0,0
2,60000003,1387,1222,156,9,0,12,16,9,21,...,0,0,0,0,0,0,0,0,0,0
3,60000005,990,877,108,5,0,8,11,5,14,...,0,0,0,0,0,0,0,0,0,0
4,60000006,1244,922,317,5,0,24,28,13,30,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3597,60040280,3,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3598,60040281,3,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3599,60040282,3,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3600,60040286,4,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# report per-pathway statistics and collect the CyCOGs associated with each pathway

cycog_pathway_dict = {}

# column sums of the membership table, largest first
cycogs_per_pathway = pd.DataFrame(data_dict).sum().sort_values(ascending=False)
for path, cycog_count in cycogs_per_pathway.items():
    in_pathway = pathway_df[path].gt(0)
    ko_count = pathway_df.loc[in_pathway, 'KOID'].nunique()
    total_kos = len(pathway_dict[path])
    print(f'{path}: {pathway_names[path]}\n\t{cycog_count} CyCOGs representing {ko_count}/{total_kos} unique KOs')
    # list of cycogs associated with this pathway
    cycog_pathway_dict[path] = pathway_df.loc[in_pathway, 'CyCOGID'].to_list()
    

ko02000: Transporters [BR:ko02000]
	434 CyCOGs representing 173/1992 unique KOs
ko00541: O-Antigen nucleotide sugar biosynthesis [PATH:ko00541]
	199 CyCOGs representing 34/99 unique KOs
ko00520: Amino sugar and nucleotide sugar metabolism [PATH:ko00520]
	195 CyCOGs representing 57/156 unique KOs
ko99980: Enzymes with EC numbers
	159 CyCOGs representing 81/1477 unique KOs
ko02048: Prokaryotic defense system [BR:ko02048]
	150 CyCOGs representing 49/192 unique KOs
ko03400: DNA repair and recombination proteins [BR:ko03400]
	149 CyCOGs representing 75/484 unique KOs
ko01002: Peptidases and inhibitors [BR:ko01002]
	142 CyCOGs representing 49/1020 unique KOs
ko99997: Function unknown
	128 CyCOGs representing 78/355 unique KOs
ko00194: Photosynthesis proteins [BR:ko00194]
	126 CyCOGs representing 77/138 unique KOs
ko02010: ABC transporters [PATH:ko02010]
	121 CyCOGs representing 65/515 unique KOs
ko03016: Transfer RNA biogenesis [BR:ko03016]
	114 CyCOGs representing 70/269 unique KOs
ko02020:

# Calculate Enrichment

In [19]:
# analysis parameters and helper

# significance level passed to the fdr correction
alpha = 0.01
# starts out as an empty frame
enrichments_df = pd.DataFrame()

def adjust_pvals(pvals):
    """Return the second element of `multitest.fdrcorrection(pvals, alpha=alpha)`.

    Per the statsmodels documentation that element holds the corrected p-values.
    `alpha` is read from the notebook namespace at call time.
    """
    _, corrected = multitest.fdrcorrection(pvals, alpha=alpha)
    return corrected

# read in cluster data
data_dir = Path('../../data')
pro_ds = xr.open_dataset(data_dir / '5-models/pro-aligned-models.nc')
syn_ds = xr.open_dataset(data_dir / '5-models/syn-aligned-models.nc')
out_dir = data_dir / '7-interpretation'
if not out_dir.is_dir():
    out_dir.mkdir(parents=True)

# save pathway_df to out_dir
pathway_df.to_csv(out_dir / 'cycog-kegg-pathways.csv', index=False)

# analyze only "robust" clusters
pro_clusters = [
    'pro1', 'pro2', 'pro3', 'pro4', 'pro5', 'pro6', 'pro7', 'pro8', 'pro9', 'pro10', 'pro11', 'pro12', 'pro13', 'pro14', 'pro15'
]
syn_clusters = [
    'syn1', 'syn2', 'syn3', 'syn4', 'syn5', 'syn6', 'syn7', 'syn8', 'syn9', 'syn10', 'syn11', 'syn12', 'syn13', 'syn14', 'syn15'
]


In [20]:
# pull out median bootstrap weights

def _median_gene_weights(ds, prefix, clusters):
    """Return median GeneWeight values as a DataFrame restricted to `clusters`.

    Takes the median of `ds` over the 'Bootstrap' and 'Replicate' dimensions,
    converts the transposed GeneWeight variable to pandas, prepends `prefix`
    to every column label, and keeps only the columns named in `clusters`.
    """
    weights = ds.median(dim=['Bootstrap', 'Replicate']).GeneWeight.T.to_pandas()
    weights = weights.rename(columns=lambda component: f"{prefix}{component}")
    # select only robust components
    return weights.loc[:, clusters]

# pull out pro gene weight data
pro_gene_df = _median_gene_weights(pro_ds, 'pro', pro_clusters)

# pull out syn gene weight data
syn_gene_df = _median_gene_weights(syn_ds, 'syn', syn_clusters)

syn_gene_df
    

Component,syn1,syn2,syn3,syn4,syn5,syn6,syn7,syn8,syn9,syn10,syn11,syn12,syn13,syn14,syn15
Ortholog,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
60000001,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.086151,0.101842,0.0,0.000000,0.0,0.0,0.019375
60000002,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.024214,0.028422,0.0,0.000000,0.0,0.0,0.000000
60000003,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
60000004,0.0,0.0,0.01846,0.0,0.0,0.000000,0.0,0.122837,0.113767,0.000000,0.0,0.000000,0.0,0.0,0.000000
60000005,0.0,0.0,0.01363,0.0,0.0,0.133094,0.0,0.000000,0.000000,0.000000,0.0,0.020101,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60040234,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
60040235,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
60040237,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000
60040245,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000


In [21]:
# run Mann-Whitney U test for every pathway against every pro cluster

input_df = pro_gene_df

p_vals = []
for pathway, pathway_cycogs in cycog_pathway_dict.items():
    # split the gene weights into pathway members (x) and every other CyCOG (y)
    in_pathway = input_df.index.isin(pathway_cycogs)
    x = input_df.loc[in_pathway, :]
    y = input_df.loc[~in_pathway, :]
    # one-sided test per column: are pathway members weighted higher than the rest?
    result = mannwhitneyu(x, y, alternative='greater', axis=0, method='asymptotic')
    p_vals.append(result.pvalue)

# one row per pathway, one column per cluster
pro_enrich_df = pd.DataFrame(p_vals, index=cycog_pathway_dict.keys(), columns=input_df.columns)
# drop the pathways that are all NAs
pro_enrich_df = pro_enrich_df.dropna(axis=0, how='all')
# adjust p-values (apply works column by column, so each cluster is corrected separately)
pro_enrich_df = pro_enrich_df.apply(adjust_pvals, raw=True)

pro_enrich_df


Component,pro1,pro2,pro3,pro4,pro5,pro6,pro7,pro8,pro9,pro10,pro11,pro12,pro13,pro14,pro15
ko02000,1.647110e-09,7.750578e-01,0.699927,0.662499,0.729813,0.515945,0.183869,0.682017,0.838730,5.735125e-01,0.638149,0.624109,5.131840e-35,0.672571,0.752482
ko00541,8.039476e-01,1.823045e-15,0.650087,0.674639,0.672264,0.693228,0.611078,0.686552,0.751441,2.451500e-06,0.605543,0.596344,5.940847e-01,0.630172,0.791404
ko00520,7.523739e-01,5.126096e-13,0.650087,0.683834,0.672535,0.695023,0.611078,0.682017,0.758635,2.929605e-09,0.605543,0.596344,5.940847e-01,0.630172,0.533747
ko99980,6.165838e-01,6.366184e-01,0.421697,0.687156,0.675396,0.689349,0.611078,0.694216,0.707240,7.265002e-01,0.606547,0.596344,5.940847e-01,0.631892,0.817576
ko02048,7.721193e-01,1.416235e-01,0.650087,0.216412,0.669121,0.689349,0.611078,0.685477,0.388912,7.014157e-01,0.605543,0.596344,5.940847e-01,0.630172,0.771807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ko04914,7.523739e-01,7.750578e-01,0.650087,0.662499,0.668479,0.689349,0.611078,0.682017,0.707240,6.960685e-01,0.605543,0.596344,5.940847e-01,0.630172,0.752482
ko01052,7.523739e-01,7.750578e-01,0.650087,0.662499,0.668479,0.689349,0.611078,0.682017,0.707240,6.960685e-01,0.605543,0.596344,5.940847e-01,0.630172,0.752482
ko00522,7.523739e-01,7.750578e-01,0.650087,0.662499,0.668479,0.689349,0.611078,0.682017,0.707240,6.960685e-01,0.605543,0.596344,5.940847e-01,0.630172,0.752482
ko04919,7.523739e-01,7.750578e-01,0.650087,0.662499,0.668479,0.689349,0.611078,0.682017,0.707240,6.960685e-01,0.605543,0.596344,5.940847e-01,0.630172,0.752482


In [22]:
# list the most enriched pathways for each cluster

for cluster in pro_enrich_df.columns:
    # adjusted p-values below alpha for this cluster, smallest first
    cluster_pvals = pro_enrich_df[cluster]
    pathways = cluster_pvals[cluster_pvals.lt(alpha)].sort_values(ascending=True)
    print(f'\n{cluster} ({len(pathways)} enriched pathways)')
    for path, pval in pathways.items():
        description = pathway_names[path]
        # record the hit in the shared enrichments table (row = pathway, column = cluster)
        enrichments_df.loc[path, 'description'] = description
        enrichments_df.loc[path, cluster] = pval
        print(f'\t{path} (p={pval:.2e}): {description}')



pro1 (20 enriched pathways)
	ko00405 (p=2.99e-15): Phenazine biosynthesis [PATH:ko00405]
	ko01501 (p=2.99e-15): beta-Lactam resistance [PATH:ko01501]
	ko03400 (p=9.53e-11): DNA repair and recombination proteins [BR:ko03400]
	ko02000 (p=1.65e-09): Transporters [BR:ko02000]
	ko00480 (p=2.23e-09): Glutathione metabolism [PATH:ko00480]
	ko03036 (p=2.98e-09): Chromosome and associated proteins [BR:ko03036]
	ko02024 (p=1.17e-08): Quorum sensing [PATH:ko02024]
	ko00330 (p=3.13e-08): Arginine and proline metabolism [PATH:ko00330]
	ko04918 (p=3.41e-08): Thyroid hormone synthesis [PATH:ko04918]
	ko00332 (p=3.41e-08): Carbapenem biosynthesis [PATH:ko00332]
	ko02025 (p=5.74e-08): Biofilm formation - Pseudomonas aeruginosa [PATH:ko02025]
	ko00550 (p=6.53e-08): Peptidoglycan biosynthesis [PATH:ko00550]
	ko01011 (p=1.21e-05): Peptidoglycan biosynthesis and degradation proteins [BR:ko01011]
	ko03440 (p=5.10e-05): Homologous recombination [PATH:ko03440]
	ko03009 (p=6.17e-05): Ribosome biogenesis [BR:k

  enrichments_df.loc[path, 'description'] = pathway_names[path]


In [23]:
# run Mann-Whitney U test for every pathway against every syn cluster

input_df = syn_gene_df

p_vals = []
for pathway, pathway_cycogs in cycog_pathway_dict.items():
    # split the gene weights into pathway members (x) and every other CyCOG (y)
    in_pathway = input_df.index.isin(pathway_cycogs)
    x = input_df.loc[in_pathway, :]
    y = input_df.loc[~in_pathway, :]
    # one-sided test per column: are pathway members weighted higher than the rest?
    result = mannwhitneyu(x, y, alternative='greater', axis=0, method='asymptotic')
    p_vals.append(result.pvalue)

# one row per pathway, one column per cluster
syn_enrich_df = pd.DataFrame(p_vals, index=cycog_pathway_dict.keys(), columns=input_df.columns)
# drop the pathways that are all NAs
syn_enrich_df = syn_enrich_df.dropna(axis=0, how='all')
# adjust p-values (apply works column by column, so each cluster is corrected separately)
syn_enrich_df = syn_enrich_df.apply(adjust_pvals, raw=True)

syn_enrich_df


Component,syn1,syn2,syn3,syn4,syn5,syn6,syn7,syn8,syn9,syn10,syn11,syn12,syn13,syn14,syn15
ko02000,0.437733,0.602373,0.712480,0.858173,0.682664,0.844508,0.335143,0.770031,0.719463,0.294993,0.264937,0.728342,1.055902e-20,0.763955,0.902215
ko00541,0.770566,0.564693,0.730836,0.691847,0.682664,0.787587,0.813653,0.792794,0.741651,0.793942,0.003851,0.616201,6.672566e-01,0.141504,0.724368
ko00520,0.800623,0.564693,0.762662,0.691821,0.682664,0.748251,0.856636,0.770031,0.779277,0.824997,0.039758,0.630644,6.804044e-01,0.000632,0.095277
ko99980,0.757242,0.564693,0.712480,0.734639,0.682664,0.748251,0.888556,0.869598,0.818724,0.860451,0.775436,0.640010,6.945733e-01,0.763955,0.715164
ko02048,0.770566,0.564693,0.728731,0.691847,0.682664,0.781132,0.813653,0.789082,0.738722,0.793942,0.704074,0.616201,6.672566e-01,0.775262,0.724368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ko04015,0.757242,0.564693,0.712480,0.691821,0.682664,0.748251,0.801872,0.770031,0.719463,0.781453,0.701411,0.616201,6.672566e-01,0.763955,0.715164
ko04371,0.757242,0.564693,0.712480,0.691821,0.682664,0.748251,0.801872,0.770031,0.719463,0.781453,0.701411,0.616201,6.672566e-01,0.763955,0.715164
ko04022,0.757242,0.564693,0.712480,0.691821,0.682664,0.748251,0.801872,0.770031,0.719463,0.781453,0.701411,0.616201,6.672566e-01,0.763955,0.715164
ko04218,0.757242,0.564693,0.712480,0.691821,0.682664,0.748251,0.801872,0.770031,0.719463,0.781453,0.701411,0.616201,6.672566e-01,0.763955,0.715164


In [24]:
# list the most enriched pathways for each cluster

for cluster in syn_enrich_df.columns:
    # adjusted p-values below alpha for this cluster, smallest first
    cluster_pvals = syn_enrich_df[cluster]
    pathways = cluster_pvals[cluster_pvals.lt(alpha)].sort_values(ascending=True)
    print(f'\n{cluster} ({len(pathways)} enriched pathways)')
    for path, pval in pathways.items():
        description = pathway_names[path]
        # record the hit in the shared enrichments table (row = pathway, column = cluster)
        enrichments_df.loc[path, 'description'] = description
        enrichments_df.loc[path, cluster] = pval
        print(f'\t{path} (p={pval:.2E}): {description}')



syn1 (7 enriched pathways)
	ko99974 (p=1.64E-15): Translation
	ko99986 (p=1.28E-07): Glycan metabolism
	ko00511 (p=1.28E-07): Other glycan degradation [PATH:ko00511]
	ko00930 (p=5.37E-05): Caprolactam degradation [PATH:ko00930]
	ko04020 (p=6.60E-05): Calcium signaling pathway [PATH:ko04020]
	ko00500 (p=1.79E-04): Starch and sucrose metabolism [PATH:ko00500]
	ko04910 (p=7.94E-03): Insulin signaling pathway [PATH:ko04910]

syn2 (1 enriched pathways)
	ko01002 (p=2.45E-10): Peptidases and inhibitors [BR:ko01002]

syn3 (23 enriched pathways)
	ko04940 (p=1.01E-22): Type I diabetes mellitus [PATH:ko04940]
	ko99995 (p=3.19E-14): Signaling proteins
	ko01053 (p=2.15E-11): Biosynthesis of siderophore group nonribosomal peptides [PATH:ko01053]
	ko04217 (p=5.08E-09): Necroptosis [PATH:ko04217]
	ko04727 (p=5.17E-09): GABAergic synapse [PATH:ko04727]
	ko05415 (p=1.61E-06): Diabetic cardiomyopathy [PATH:ko05415]
	ko00860 (p=1.71E-06): Porphyrin metabolism [PATH:ko00860]
	ko04714 (p=4.07E-06): Thermog

In [25]:
# reformat and save enrichments dataframe

# Move the pathway ids from the index into a 'KEGG pathway' column, sorted by id.
# The guard keeps this cell idempotent: without it, re-running the cell would
# reset the (by then positional) index again and insert a second, numeric
# 'KEGG pathway' column into both the frame and the saved CSV.
if 'KEGG pathway' not in enrichments_df.columns:
    enrichments_df = enrichments_df.sort_index().reset_index().rename(columns={'index': 'KEGG pathway'})
enrichments_df.to_csv(out_dir / 'kegg-enrichments.csv', index=False)

enrichments_df


Unnamed: 0,KEGG pathway,description,pro1,pro2,pro3,pro4,pro5,pro6,pro7,pro8,...,syn6,syn7,syn8,syn9,syn10,syn11,syn12,syn13,syn14,syn15
0,ko00010,Glycolysis / Gluconeogenesis [PATH:ko00010],,,,,,,1.301916e-05,,...,6.169548e-21,,4.152211e-05,,,,2.930472e-04,,,0.000003
1,ko00020,Citrate cycle (TCA cycle) [PATH:ko00020],,,,,,,,,...,3.774187e-03,,,,,,,,,
2,ko00030,Pentose phosphate pathway [PATH:ko00030],,,,,,,8.475395e-28,4.460089e-08,...,1.435920e-06,3.503354e-09,5.270239e-15,,,,3.428592e-05,,,0.001719
3,ko00040,Pentose and glucuronate interconversions [PATH...,,,,,,,2.496492e-30,4.130385e-09,...,,,,,,,,,,
4,ko00051,Fructose and mannose metabolism [PATH:ko00051],,0.006216,,,,,3.067445e-10,,...,1.640677e-13,,,,,,4.121136e-10,,0.000306,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,ko99992,Structural proteins,,0.005599,,,,,,,...,,,,,,,,,,
165,ko99994,Others,,,,,,,,,...,,,,,0.000001,,,,,
166,ko99995,Signaling proteins,,,,,,,,,...,,8.521656e-13,5.270239e-15,,,0.000957,,,,0.001517
167,ko99996,General function prediction only,,,,,,,,,...,,,,,0.006123,,,,,
