In [1]:
import numpy as np
import pandas as pd
import os
import requests
import json
import xarray as xr
import scipy.stats as stats

from pathlib import Path
from scipy.stats import mannwhitneyu
from statsmodels.stats import multitest


# arrange metadata

In [2]:
# load the genome and ortholog metadata tables

# filepaths
filepath_genome_metadata = '../../data/metadata/genome-metadata.csv'
filepath_ortholog_metadata = '../../data/metadata/ortholog-metadata.csv'

# one table per metadata file
ortho_df = pd.read_csv(filepath_ortholog_metadata)
genome_df = pd.read_csv(filepath_genome_metadata)

# Relabel groups/clades for the purposes of this analysis. The order matters:
# uncultured marine viruses are folded into 'Virus' first so that they also
# receive the placeholder virus clade, and virocell genomes are relabelled last
# so that their labels override whatever was assigned before.
is_uncultured_virus = genome_df['Group'] == 'Uncultured-marine-virus'
genome_df.loc[is_uncultured_virus, 'Group'] = 'Virus'

is_virus = genome_df['Group'] == 'Virus'
genome_df.loc[is_virus, 'Clade'] = 'N/A Virus'

# the 'Virocell' column is used directly as a boolean row mask
is_virocell = genome_df['Virocell']
genome_df.loc[is_virocell, 'Group'] = 'Virocell'
genome_df.loc[is_virocell, 'Clade'] = 'N/A Virocell'

genome_df


Unnamed: 0,GenomeID,GenomeName,Type,Group,Clade,Virocell,Completeness
0,2716884698,AG-316-L16,SAG,Prochlorococcus,AMZ-II,False,20.69
1,2716884700,AG-316-N23,SAG,Prochlorococcus,AMZ-II,False,20.69
2,2716884701,AG-316-P23,SAG,Prochlorococcus,AMZ-II,False,12.07
3,2716884699,AG-316-L21,SAG,Prochlorococcus,AMZ-II,False,10.34
4,2716884642,AG-316-A05,SAG,Prochlorococcus,AMZ-II,False,6.90
...,...,...,...,...,...,...,...
765,651703106,S-ShM2,ISOLATE,Virus,N/A Virus,False,0.00
766,651703107,Syn19,ISOLATE,Virus,N/A Virus,False,0.00
767,641201056,Syn5,ISOLATE,Virus,N/A Virus,False,0.00
768,2595698410,metaG-MbCM1,ISOLATE,Virus,N/A Virus,False,0.00


In [3]:
# add genome id and clade info into ortholog df

# index the genome table by name once, then look up each attribute for every
# ortholog row via its GenomeName
genome_lookup = genome_df.set_index('GenomeName')
for column in ['GenomeID', 'Group', 'Clade']:
    ortho_df[column] = ortho_df['GenomeName'].map(genome_lookup[column])

ortho_df


Unnamed: 0,MappingName,CyCOGID,GenomeName,GeneID,Annotation,GenomeID,Group,Clade
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit,2606217514,Synechococcus,5.1A-III
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit,2681812859,Prochlorococcus,LLI
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit,2716884419,Prochlorococcus,HLII
3,scB245a_521A19_2655604637,60000001,scB245a_521A19,2655604637,membrane protease FtsH catalytic subunit,2654587735,Prochlorococcus,HLII
4,GFB01_2638208352,60000001,GFB01,2638208352,membrane protease FtsH catalytic subunit,2636415834,Synechococcus,5.2
...,...,...,...,...,...,...,...,...
964917,AG-363-C02_2667889608,60040295,AG-363-C02,2667889608,hypothetical protein,2667527365,Prochlorococcus,LLII.LLIII
964918,AG-363-C02_2667889615,60040295,AG-363-C02,2667889615,hypothetical protein,2667527365,Prochlorococcus,LLII.LLIII
964919,AG-363-C02_2667890048,60040295,AG-363-C02,2667890048,hypothetical protein,2667527365,Prochlorococcus,LLII.LLIII
964920,AG-363-C02_2667890054,60040295,AG-363-C02,2667890054,hypothetical protein,2667527365,Prochlorococcus,LLII.LLIII


In [4]:
# start ko mapping file

# one row per CyCOG with the number of reference genes it contains
ko_map_df = ortho_df.groupby('CyCOGID').GeneID.count().reset_index().rename(columns={'GeneID': 'TotalRefs'})

# add in genus counts: CyCOG x Group table of member counts (0 where a group is absent)
group_counts = ortho_df.groupby(['CyCOGID', 'Group']).size().unstack(fill_value=0)
ko_map_df = ko_map_df.join(group_counts, on='CyCOGID', how='left')

# add in clade counts (only of select clades we deal with in this study)
clades = ['HLI', 'HLII', 'LLI', '5.1B-I', '5.1B-CRD1', '5.1A-II', '5.1A-III', '5.1A-IV', '5.1A-CRD2', '5.1A-UC-A-EnvC']
clade_counts = ortho_df[ortho_df.Clade.isin(clades)].groupby(['CyCOGID', 'Clade']).size().unstack(fill_value=0)
# CyCOGs without any member in the selected clades come back as NaN from the
# left join, so fill with 0 before casting
ko_map_df = ko_map_df.join(clade_counts, on='CyCOGID', how='left').fillna(0)

# convert dataframe back to ints (the joins may have introduced floats)
ko_map_df = ko_map_df.astype(int)

# add back CyCOG annotation
# Deduplicate on CyCOGID alone so the lookup index is unique: a CyCOG carrying
# more than one distinct Annotation string would otherwise make .map() raise.
# The first annotation listed for each CyCOG is used.
ko_map_df['DescriptionCyCOG'] = ko_map_df['CyCOGID'].map(
    ortho_df.drop_duplicates(subset='CyCOGID').set_index('CyCOGID')['Annotation']
)

ko_map_df


Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,5.1A-UC-A-EnvC,5.1B-CRD1,5.1B-I,HLI,HLII,LLI,DescriptionCyCOG
0,60000001,1376,1211,158,7,0,17,17,8,18,9,37,19,196,532,184,membrane protease FtsH catalytic subunit
1,60000002,1453,1287,157,9,0,10,19,9,16,9,40,18,198,552,198,ATP-dependent Clp protease ATP-binding subunit...
2,60000003,1387,1222,156,9,0,12,16,9,21,8,41,14,203,523,192,ATP-dependent Clp protease proteolytic subunit...
3,60000004,1631,1369,255,7,0,22,24,16,27,26,62,29,173,542,265,hypothetical protein
4,60000005,990,877,108,5,0,8,11,5,14,7,25,13,128,383,131,chaperonin GroEL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40290,60040291,4,4,0,0,0,0,0,0,0,0,0,0,0,0,4,hypothetical protein
40291,60040292,4,4,0,0,0,0,0,0,0,0,0,0,0,0,4,Tryptophan-rich Synechocystis species C-termin...
40292,60040293,5,0,5,0,0,0,0,0,0,0,0,0,0,0,0,Putative transposase
40293,60040294,6,0,6,0,0,0,0,0,0,0,0,0,0,0,0,sulfate transport system substrate-binding pro...


# import kegg annotations from genomes

In [5]:
# check that there is a directory for each genome

data_path = Path('../../data/refseqs/genomes/')

# report every GenomeID that has no sub-directory under data_path and tally them
count = 0
for g_id in ortho_df['GenomeID'].unique():
    genome_dir = data_path / str(g_id)
    if os.path.isdir(genome_dir):
        continue
    print(f'{g_id} directory not found: {genome_dir}')
    count += 1

print(f'{count} total missing directories')


0 total missing directories


In [6]:
# import all kegg annotations as a df

# Read the tab-separated `<GenomeID>.ko.tab.txt` table from each genome's
# directory. The tables are collected in a list and concatenated once, because
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration (quadratic in the number of genomes).
ko_tables = [
    pd.read_csv(data_path / f'{g_id}/{g_id}.ko.tab.txt', sep='\t')
    for g_id in ortho_df['GenomeID'].unique()
]

# ignore_index=True gives a fresh 0..n-1 index; pd.concat raises on an empty
# list, so fall back to an empty frame when there are no genomes
ko_df = pd.concat(ko_tables, ignore_index=True) if ko_tables else pd.DataFrame()

ko_df


Unnamed: 0,gene_oid,gene_length,percent_identity,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
0,2607658051,465,99.8,1,465,1,465,0.000000e+00,906.4,KO:K02313,chromosomal replication initiator protein,,Yes
1,2607658053,410,100.0,1,410,11,420,0.000000e+00,856.7,KO:K00799,glutathione S-transferase [EC:2.5.1.18],EC:2.5.1.18,Yes
2,2607658055,455,100.0,1,455,1,455,0.000000e+00,895.6,KO:K00383,glutathione reductase (NADPH) [EC:1.8.1.7],EC:1.8.1.7,Yes
3,2607658058,198,100.0,1,198,1,198,0.000000e+00,409.1,KO:K02276,cytochrome c oxidase subunit III [EC:1.9.3.1],EC:1.9.3.1,Yes
4,2607658059,562,100.0,1,562,1,562,0.000000e+00,1173.7,KO:K02274,cytochrome c oxidase subunit I [EC:1.9.3.1],EC:1.9.3.1,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
492987,2610145142,243,35.7,19,239,48,257,4.300000e-27,123.6,KO:K03791,putative chitinase,,Yes
492988,2610145144,309,37.5,4,292,17,311,0.000000e+00,198.4,KO:K03086,RNA polymerase primary sigma factor,,Yes
492989,2610145149,109,41.2,1,95,1,97,1.500000e-13,77.4,KO:K03111,single-strand DNA-binding protein,,Yes
492990,2610145152,261,40.5,5,254,10,269,2.000000e-43,177.9,KO:K10906,exodeoxyribonuclease VIII [EC:3.1.11.-],EC:3.1.11.-,Yes


# Deduplicate gene annotations

**Problem:** Some genes (specific nucleotide sequence from particular reference genome) have more than one KO annotation hit. 
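- Based on the counts below, 25,218 genes (23,227 + 1,973 + 15 + 2 + 1) have more than one annotation row. That is ~5.41% of the 465,760 annotated genes (the annotation table holds 492,992 rows in total, so these genes are ~5.12% of that row count).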
- Most of these carry the same KO number on every row and differ only in another field such as the EC number. Only 3,382 genes (3,333 + 48 + 1 in the counts below; ~0.73% of the 465,760 annotated genes) have multiple distinct KO annotations.
- Of these, qualitatively it seems like the annotations are usually quite similar to one another (e.g. same pathway, or different subunits of the same protein complex)

**Implemented solution:** Since the KO annotations are based on an HMM search, first select the annotation with the lowest E-value. If several annotations for a gene tie on E-value, keep the first one listed (an arbitrary but deterministic choice).


In [7]:
# how many genes have more than one ko annotation row?

# first count the annotation rows per gene, then tabulate how many genes have
# each row count (1 row, 2 rows, ...)
annotations_per_gene = ko_df['gene_oid'].value_counts()
print(annotations_per_gene.value_counts())


count
1    440542
2     23227
3      1973
4        15
6         2
5         1
Name: count, dtype: int64


In [8]:
# deeper look: examine genes with more than one annotation

# annotation rows per gene, and the subset of genes that have several
counts = ko_df['gene_oid'].value_counts()
multi_hit_genes = counts[counts > 1].index
multi_hit_df = ko_df[ko_df['gene_oid'].isin(multi_hit_genes)]

# number of distinct KO ids among each of those genes' annotations
# (most have the same ko id, just different EC numbers or something)
ko_counts = multi_hit_df.groupby('gene_oid')['ko_id'].nunique()
print(ko_counts.value_counts())

# show the annotation rows of genes hit by more than one distinct KO
conflicting_genes = ko_counts[ko_counts > 1].index
ko_df[ko_df['gene_oid'].isin(conflicting_genes)]


ko_id
1    21836
2     3333
3       48
4        1
Name: count, dtype: int64


Unnamed: 0,gene_oid,gene_length,percent_identity,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
1549,2681971726,240,93.75,1,240,1,240,0.000000e+00,513.0,KO:K06182,23S rRNA pseudouridine2604 synthase [EC:5.4.99...,EC:5.4.99.21,Yes
1550,2681971726,240,93.75,1,240,1,240,0.000000e+00,513.0,KO:K06183,16S rRNA pseudouridine516 synthase [EC:5.4.99.19],EC:5.4.99.19,Yes
1561,2681971798,467,89.08,1,467,1,467,0.000000e+00,960.0,KO:K04094,methylenetetrahydrofolate--tRNA-(uracil-5-)-me...,EC:2.1.1.74,Yes
1562,2681971798,467,90.15,1,467,1,467,0.000000e+00,969.0,KO:K03495,tRNA uridine 5-carboxymethylaminomethyl modifi...,,Yes
1620,2681971886,348,80.77,1,338,1,338,0.000000e+00,680.0,KO:K08919,chlorophyll a/b binding light-harvesting prote...,,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
492225,2717343036,149,41.86,15,140,4,132,6.500000e-31,126.0,KO:K00980,glycerol-3-phosphate cytidylyltransferase [EC:...,EC:2.7.7.39,Yes
492226,2717343036,149,44.78,15,145,4,137,3.000000e-36,141.0,KO:K03272,D-beta-D-heptose 7-phosphate kinase / D-beta-D...,EC:2.7.1.167,Yes
492227,2717343036,149,44.78,15,145,4,137,3.000000e-36,141.0,KO:K03272,D-beta-D-heptose 7-phosphate kinase / D-beta-D...,EC:2.7.7.70,Yes
492269,2717343185,112,100.00,1,112,1,112,1.400000e-73,243.0,KO:K04751,nitrogen regulatory protein P-II 1,,Yes


In [9]:
# deduplicate genes with more than one annotation

# For each gene_oid keep the single annotation row with the lowest e-value.
# idxmin returns the first row label at which the minimum occurs, so ties
# (same KO with different EC numbers, or different KOs with the same e-value)
# are resolved by keeping whichever tied row comes first in ko_df. That choice
# is arbitrary but deterministic, not random.
best_row_labels = ko_df.groupby('gene_oid')['evalue'].idxmin()
ko_df = ko_df.loc[best_row_labels].reset_index(drop=True)
ko_df


Unnamed: 0,gene_oid,gene_length,percent_identity,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
0,638291477,531,48.40,4,522,3,549,0.000000e+00,461.5,KO:K17680,twinkle protein [EC:3.6.4.12],EC:3.6.4.12,Yes
1,638291481,243,43.10,5,240,3,242,1.500000e-37,158.3,KO:K02335,DNA polymerase I [EC:2.7.7.7],EC:2.7.7.7,Yes
2,638291512,235,61.80,26,231,1,207,0.000000e+00,253.1,KO:K03465,thymidylate synthase (FAD) [EC:2.1.1.148],EC:2.1.1.148,Yes
3,638310966,295,49.20,2,294,32,309,0.000000e+00,283.1,KO:K06223,DNA adenine methylase [EC:2.1.1.72],EC:2.1.1.72,Yes
4,638311025,200,32.50,49,200,48,186,1.100000e-12,75.5,KO:K07336,PKHD-type hydroxylase [EC:1.14.11.-],EC:1.14.11.-,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
465755,2721491637,397,65.72,27,379,39,383,0.000000e+00,514.0,KO:K02010,iron(III) transport system ATP-binding protein...,EC:3.6.3.30,Yes
465756,2721491638,266,63.67,10,264,5,250,3.500000e-107,354.0,KO:K01069,hydroxyacylglutathione hydrolase [EC:3.1.2.6],EC:3.1.2.6,Yes
465757,2721491639,219,87.26,2,213,1,212,9.000000e-129,411.0,KO:K00765,ATP phosphoribosyltransferase [EC:2.4.2.17],EC:2.4.2.17,Yes
465758,2721491640,599,81.64,1,599,1,599,0.000000e+00,1060.0,KO:K06147,"ATP-binding cassette, subfamily B, bacterial",,Yes


# Deduplicate CyCOG annotations

**Problem:** Some CyCOGs (Clusters of Orthologous Genes) consist of genes with discordant KO annotations
- Out of 40,295 orthologous gene families in CyCOGs v6, 36,693 have no KO annotation, meaning just 3,602 (8.94%) contain at least one gene with a KO annotation.
- In most of the annotated CyCOGs, all gene members had a concordant KO annotation. In 128 (3.55% of annotated CyCOGs), the KO annotations were discordant among gene members.
- Of these, qualitatively it seems like the discordant annotations are usually quite similar to one another (e.g. same pathway, or different subunits of the same protein complex)

**Implemented solution:** For each CyCOG, select the KO annotation shared by the largest number of its members as the representative CyCOG annotation.


In [10]:
# join kegg annotations onto reference gene set

# left join keeps every reference gene; genes without a KO hit get NaN in the
# annotation columns
annot_df = ortho_df.merge(ko_df, left_on='GeneID', right_on='gene_oid', how='left')

# how many unique ko annotations per ortholog group?
kos_per_cycog = annot_df.groupby('CyCOGID')['ko_id'].nunique()
print(kos_per_cycog.value_counts())

# drop the sequences without a ko annotation
annot_df = annot_df.dropna(subset=['ko_id'])

annot_df


ko_id
0     36693
1      3474
2        94
3        14
4        13
7         2
9         2
5         1
8         1
10        1
Name: count, dtype: int64


Unnamed: 0,MappingName,CyCOGID,GenomeName,GeneID,Annotation,GenomeID,Group,Clade,gene_oid,gene_length,...,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit,2606217514,Synechococcus,5.1A-III,2.607658e+09,637.0,...,1.0,637.0,1.0,637.0,0.0,1240.7,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit,2681812859,Prochlorococcus,LLI,2.681971e+09,640.0,...,1.0,640.0,1.0,640.0,0.0,1370.0,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit,2716884419,Prochlorococcus,HLII,2.717339e+09,584.0,...,1.0,584.0,1.0,584.0,0.0,1260.0,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
3,scB245a_521A19_2655604637,60000001,scB245a_521A19,2655604637,membrane protease FtsH catalytic subunit,2654587735,Prochlorococcus,HLII,2.655605e+09,584.0,...,1.0,584.0,1.0,584.0,0.0,1250.0,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
4,GFB01_2638208352,60000001,GFB01,2638208352,membrane protease FtsH catalytic subunit,2636415834,Synechococcus,5.2,2.638208e+09,646.0,...,1.0,643.0,1.0,643.0,0.0,1134.4,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964878,KORDI-49_2507312281,60040286,KORDI-49,2507312281,hypothetical protein,2507262011,Synechococcus,5.1A-WPC1,2.507312e+09,1889.0,...,1.0,1889.0,1.0,1889.0,0.0,3815.0,KO:K07004,,,Yes
964910,GFB01_2638207649,60040294,GFB01,2638207649,sulfate transport system substrate-binding pro...,2636415834,Synechococcus,5.2,2.638208e+09,289.0,...,7.0,280.0,6.0,279.0,0.0,389.4,KO:K02048,sulfate transport system substrate-binding pro...,,Yes
964911,GFB01_2638207687,60040294,GFB01,2638207687,sulfate transport system substrate-binding pro...,2636415834,Synechococcus,5.2,2.638208e+09,338.0,...,7.0,338.0,6.0,338.0,0.0,449.5,KO:K02048,sulfate transport system substrate-binding pro...,,Yes
964912,GFB01_2638207688,60040294,GFB01,2638207688,sulfate transport system substrate-binding pro...,2636415834,Synechococcus,5.2,2.638208e+09,348.0,...,10.0,348.0,19.0,363.0,0.0,502.3,KO:K02048,sulfate transport system substrate-binding pro...,,Yes


In [11]:
# look at non-unique ko mappings

# CyCOGs whose annotated members carry more than one distinct KO id
counts = annot_df.groupby('CyCOGID').ko_id.nunique()
discordant_ids = counts[counts > 1].index
discordant_df = annot_df[annot_df['CyCOGID'].isin(discordant_ids)]

# member count for every (ko_id, ko_name) pair within each discordant CyCOG
collisions_df = discordant_df.groupby('CyCOGID')[['ko_id', 'ko_name']].value_counts().reset_index()

# show the KO names in full; this display option stays set until it is reset
pd.set_option('max_colwidth', None)
collisions_df


Unnamed: 0,CyCOGID,ko_id,ko_name,count
0,60000002,KO:K03696,ATP-dependent Clp protease ATP-binding subunit ClpC,443
1,60000002,KO:K03695,ATP-dependent Clp protease ATP-binding subunit ClpB,417
2,60000006,KO:K11329,"two-component system, OmpR family, response regulator RpaB",452
3,60000006,KO:K07659,"two-component system, OmpR family, phosphate regulon response regulator OmpR",73
4,60000006,KO:K07657,"two-component system, OmpR family, phosphate regulon response regulator PhoB",34
...,...,...,...,...
317,60014461,KO:K05879,"dihydroxyacetone kinase, C-terminal domain [EC:2.7.1.-]",1
318,60014461,KO:K05878,"dihydroxyacetone kinase, N-terminal domain [EC:2.7.1.-]",1
319,60014461,KO:K00863,dihydroxyacetone kinase [EC:2.7.1.29],1
320,60040231,KO:K11951,bicarbonate transport system permease protein,1


In [12]:
# deduplicate the discordant ortholog-ko mapping by simple majority vote

# Count the members carrying each KO within every CyCOG, then keep the row with
# the highest count per CyCOG (idxmax keeps the first row on ties).
# value_counts names its result after the counted column in some pandas
# versions and 'count' in others; the rename normalises the column to 'count'.
ko_count_df = pd.DataFrame(annot_df.groupby('CyCOGID').ko_id.value_counts()).rename(
    columns={'ko_id': 'count'}).reset_index()
ko_count_df = ko_count_df.loc[ko_count_df.groupby('CyCOGID')['count'].idxmax(), :]

# add in KO annotation
name_map = annot_df[['ko_id', 'ko_name']].drop_duplicates()
# strip the trailing ' [EC:...]' suffix from the KO name; the pattern is a raw
# string because '\[' is not a valid escape in a normal string literal
name_map['DescriptionKO'] = name_map['ko_name'].str.split(r' \[EC:').str[0]
name_map = name_map.drop_duplicates(subset='ko_id')    # pick one description from duplicates
ko_count_df = pd.merge(left=ko_count_df, right=name_map[['ko_id', 'DescriptionKO']], on='ko_id', how='left')

# remove prefix from KOID (drops the leading 'KO:')
ko_count_df['KOID'] = ko_count_df['ko_id'].str[3:]
# rename count column and drop old ko_id column
ko_count_df = ko_count_df[['CyCOGID', 'KOID', 'DescriptionKO', 'count']].rename(
    columns={'count': 'NRefsKO'}).set_index('CyCOGID')
# calculate refs with KO mapping other than the one chosen
# (both sides are indexed by CyCOGID, so the subtraction aligns per CyCOG)
ko_count_df['NRefsOtherKO'] = annot_df.groupby('CyCOGID').GeneID.count() - ko_count_df['NRefsKO']

# add ko_count_df into ko_map_df
# Drop any KO columns left over from a previous run of this cell first, so that
# re-running does not produce suffixed duplicates (KOID_x / KOID_y).
ko_map_df = pd.merge(
    ko_map_df.drop(columns=ko_count_df.columns, errors='ignore'),
    ko_count_df,
    on='CyCOGID',
    how='left'
)

pd.reset_option('max_colwidth')
ko_map_df


Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,...,5.1B-CRD1,5.1B-I,HLI,HLII,LLI,DescriptionCyCOG,KOID,DescriptionKO,NRefsKO,NRefsOtherKO
0,60000001,1376,1211,158,7,0,17,17,8,18,...,37,19,196,532,184,membrane protease FtsH catalytic subunit,K03798,cell division protease FtsH,1291.0,0.0
1,60000002,1453,1287,157,9,0,10,19,9,16,...,40,18,198,552,198,ATP-dependent Clp protease ATP-binding subunit...,K03696,ATP-dependent Clp protease ATP-binding subunit...,443.0,417.0
2,60000003,1387,1222,156,9,0,12,16,9,21,...,41,14,203,523,192,ATP-dependent Clp protease proteolytic subunit...,K01358,"ATP-dependent Clp protease, protease subunit",1360.0,0.0
3,60000004,1631,1369,255,7,0,22,24,16,27,...,62,29,173,542,265,hypothetical protein,,,,
4,60000005,990,877,108,5,0,8,11,5,14,...,25,13,128,383,131,chaperonin GroEL,K04077,chaperonin GroEL,935.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40290,60040291,4,4,0,0,0,0,0,0,0,...,0,0,0,0,4,hypothetical protein,,,,
40291,60040292,4,4,0,0,0,0,0,0,0,...,0,0,0,0,4,Tryptophan-rich Synechocystis species C-termin...,,,,
40292,60040293,5,0,5,0,0,0,0,0,0,...,0,0,0,0,0,Putative transposase,,,,
40293,60040294,6,0,6,0,0,0,0,0,0,...,0,0,0,0,0,sulfate transport system substrate-binding pro...,K02048,sulfate transport system substrate-binding pro...,4.0,0.0


In [13]:
# there are many different CyCOG orthologies annotated with the same KO group -- look at some of these

# number of CyCOGs assigned to each KO, and the KOs assigned to more than one
ko_map_counts = ko_map_df['KOID'].value_counts()
shared_kos = ko_map_counts[ko_map_counts > 1]
n_shared = len(shared_kos)
n_kos = ko_map_df['KOID'].nunique()

print('There are {} of {} KOs with a non-unique CyCOGID-KOID mapping:'.format(
    n_shared, n_kos))
print(shared_kos.head(20))

# example: the first CyCOGs mapped to K06147
ko_map_df[ko_map_df['KOID'].eq('K06147')].head(10)


There are 667 of 1751 KOs with a non-unique CyCOGID-KOID mapping:
KOID
K06147    62
K01784    41
K01953    37
K01154    28
K03090    24
K07257    23
K01652    20
K00615    18
K00067    18
K01790    18
K02500    17
K00059    17
K00604    17
K00058    16
K03086    16
K02501    16
K00161    15
K05577    15
K01710    14
K00558    14
Name: count, dtype: int64


Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,...,5.1B-CRD1,5.1B-I,HLI,HLII,LLI,DescriptionCyCOG,KOID,DescriptionKO,NRefsKO,NRefsOtherKO
577,60000578,467,408,57,2,0,8,6,3,4,...,13,7,68,179,66,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",440.0,0.0
732,60000733,461,412,46,3,0,2,4,2,6,...,12,6,73,174,60,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",433.0,0.0
768,60000769,463,411,51,1,0,5,5,2,6,...,12,6,71,172,53,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",428.0,0.0
1174,60001175,423,367,54,2,0,4,7,3,4,...,13,7,60,159,53,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",412.0,0.0
1222,60001223,513,441,71,1,0,5,5,4,8,...,20,8,57,169,72,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",414.0,8.0
1489,60001490,405,269,135,1,0,13,12,7,11,...,39,20,12,59,67,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",339.0,0.0
3322,60003323,33,25,8,0,0,0,2,0,1,...,0,1,0,0,2,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",27.0,0.0
4865,60004866,12,2,10,0,0,4,0,2,1,...,0,0,0,0,0,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",12.0,0.0
6618,60006619,6,6,0,0,0,0,0,0,0,...,0,0,4,2,0,"ABC-type multidrug transport system, ATPase an...",K06147,"ATP-binding cassette, subfamily B, bacterial",2.0,0.0
7547,60007548,7,0,7,0,0,0,1,0,0,...,0,1,0,0,0,"ATP-binding cassette, subfamily B/ATP-binding ...",K06147,"ATP-binding cassette, subfamily B, bacterial",3.0,2.0


# Get KEGG pathways

In [14]:
# get full reference brite hierarchy of kegg orthologies

# KEGG REST endpoint returning the ko00001 BRITE hierarchy as JSON
url = 'https://rest.kegg.jp/get/br:ko00001/json'
# requests has no default timeout, so set one to avoid hanging the kernel on an
# unresponsive server
file = requests.get(url, allow_redirects=True, timeout=60)
# fail here on an HTTP error status rather than when the body is parsed later
file.raise_for_status()

file


<Response [200]>

In [15]:
# parse pathways into dictionary

# lookups filled while walking the hierarchy:
#   pathway_dict:  pathway id -> list of member KO ids
#   pathway_names: pathway id -> pathway name (leading number stripped)
#   ko_dict:       KO id -> list of pathway ids it is listed under
#   ko_names:      KO id -> KO name text
pathway_dict = {}
pathway_names = {}
ko_dict = {}
ko_names = {}

# exclude pathways specific to eukaryotes, multicellular organisms, humans, and other processes not relevant to cyanobacteria
excluded_systems = ['09144 Cellular community - eukaryotes', '09150 Organismal Systems', '09160 Human Diseases']
excluded_pathways = [
    'ko04013', 'ko04341', 'ko04391', 'ko04140', 'ko04137', 'ko04214', 'ko03250', 'ko03262', 'ko03261', 'ko03263', 'ko03264', 
    'ko03265', 'ko03266', 'ko03268', 'ko03267', 'ko04011', 'ko04138', 'ko04139', 'ko04111', 'ko04113', 'ko04114', 'ko00981', 
    'ko03260', 'ko04216', 'ko04217'
]

# walk the nested json: supersystem -> system -> pathway -> ko
brite_root = json.loads(file.content)
for supersystem in brite_root['children']:
    print(f'{supersystem["name"]}')
    for system in supersystem['children']:
        print(f'\t{system["name"]}')
        # skip over excluded systems (pathways specific to eukaryotes, multicellular organisms, and humans)
        if supersystem["name"] in excluded_systems or system["name"] in excluded_systems:
            print('\t\tEXCLUDED')
            continue
        for pathway in system['children']:
            # pathway id is 'ko' + the first space-delimited token of the name
            path_id = f'ko{pathway["name"].split(" ")[0]}'
            if path_id in excluded_pathways:
                print(f'\t\t\tEXCLUDED\t{path_id}: {pathway["name"]}')
                continue
            path_members = []
            # pathways without children are reported with 0 members and are
            # not recorded in pathway_dict / pathway_names
            if 'children' in pathway:
                for ko in pathway['children']:
                    # first token is the KO id, the remainder is its name
                    name_tokens = ko['name'].split(' ')
                    ko_id = name_tokens[0]
                    ko_names[ko_id] = ' '.join(name_tokens[1:]).strip(' ')
                    path_members.append(ko_id)
                    # record this pathway under the ko
                    ko_dict.setdefault(ko_id, []).append(path_id)
                pathway_dict[path_id] = path_members
                # drop the first six characters (the leading number and space)
                pathway_names[path_id] = pathway["name"][6:]
            print(f'\t\t{path_id}: {pathway["name"]} ({len(path_members)})')
      

09100 Metabolism
	09101 Carbohydrate metabolism
		ko00010: 00010 Glycolysis / Gluconeogenesis [PATH:ko00010] (108)
		ko00020: 00020 Citrate cycle (TCA cycle) [PATH:ko00020] (68)
		ko00030: 00030 Pentose phosphate pathway [PATH:ko00030] (89)
		ko00040: 00040 Pentose and glucuronate interconversions [PATH:ko00040] (90)
		ko00051: 00051 Fructose and mannose metabolism [PATH:ko00051] (113)
		ko00052: 00052 Galactose metabolism [PATH:ko00052] (78)
		ko00053: 00053 Ascorbate and aldarate metabolism [PATH:ko00053] (62)
		ko00500: 00500 Starch and sucrose metabolism [PATH:ko00500] (106)
		ko00520: 00520 Amino sugar and nucleotide sugar metabolism [PATH:ko00520] (156)
		ko00620: 00620 Pyruvate metabolism [PATH:ko00620] (134)
		ko00630: 00630 Glyoxylate and dicarboxylate metabolism [PATH:ko00630] (104)
		ko00640: 00640 Propanoate metabolism [PATH:ko00640] (97)
		ko00650: 00650 Butanoate metabolism [PATH:ko00650] (114)
		ko00660: 00660 C5-Branched dibasic acid metabolism [PATH:ko00660] (29)
		ko0

In [16]:
# There are some KO descriptions from the most recent KEGG download that do not match the descriptions 
# of the old annotation. A rough screen shows 243/1752. I want to look at these in more detail

name_df = ko_map_df[['KOID', 'DescriptionKO']].drop_duplicates().reset_index(drop=True)

# current KEGG name for each KO (NaN when the KO is absent from the parsed hierarchy)
names = [ko_names.get(koid, np.nan) for koid in name_df['KOID']]
# check if previous KO Description is a substring of current KEGG names
# (both sides are compared as strings, so missing values compare as 'nan')
substrings = [
    str(old_description) in str(kegg_name)
    for old_description, kegg_name in zip(name_df['DescriptionKO'], names)
]
# add to dataframe
name_df['KEGGNames'] = names
name_df['SubstringMatch'] = substrings

# # fix display options
# pd.set_option('display.max_rows', None)
# pd.set_option('max_colwidth', None)

# # screen all those that don't have a match
# name_df[~name_df['SubstringMatch']]

# manual inspection of some incongruent (updated?) KOs:
problem_kos = ['K02500', 'K02428', 'K02259', 'K01594', 'K00870', 'K02501', 'K05808', 'K03606', 'K01144', 
               'K03186', 'K01234', 'K03651', 'K07011', 'K03082', 'K01003', 'K05663', 'K03152']
name_df[name_df['KOID'].isin(problem_kos)]


Unnamed: 0,KOID,DescriptionKO,KEGGNames,SubstringMatch
11,K02500,cyclase,hisF; imidazole glycerol-phosphate synthase su...,False
16,K02428,XTP/dITP diphosphohydrolase,,False
162,K02259,cytochrome c oxidase assembly protein subunit 15,"COX15, ctaA; heme a synthase [EC:1.17.99.9]",False
348,K01594,sulfinoalanine decarboxylase,,False
356,K00870,protein kinase,,False
624,K02501,glutamine amidotransferase,hisH; imidazole glycerol-phosphate synthase su...,False
646,K05808,putative sigma-54 modulation protein,hpf; ribosome hibernation promoting factor,False
832,K03606,putative colanic acid biosysnthesis UDP-glucos...,wcaJ; undecaprenyl-phosphate glucose phosphotr...,False
969,K01144,exodeoxyribonuclease V,,False
978,K03186,4-hydroxy-3-polyprenylbenzoate decarboxylase,"ubiX, bsdB, PAD1; flavin prenyltransferase [EC...",False


In [17]:
# match up pathway information to pangenome ko list

# only CyCOGs that received a KO; the fresh 0..n-1 index is what the indicator
# columns built below are aligned on
pathway_df = ko_map_df[ko_map_df['KOID'].notna()].reset_index(drop=True)

# pathway id -> integer Series over pathway_df rows, counting the matches of
# each row's KO among the KOs listed under that pathway
data_dict = {}
for koid in pathway_df['KOID'].unique():
    if koid not in ko_dict:
        # KO absent from the parsed hierarchy: no pathway information
        continue
    # 1 for the rows annotated with this KO, 0 elsewhere
    koid_hits = pathway_df['KOID'].eq(koid).astype(int)
    for pathway in ko_dict[koid]:
        if pathway in data_dict:
            data_dict[pathway] = data_dict[pathway] + koid_hits
        else:
            data_dict[pathway] = koid_hits

# append one column per pathway, aligned on the row index
pathway_df = pathway_df.merge(pd.DataFrame(data_dict), left_index=True, right_index=True)
pathway_df


Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,...,ko02030,ko03450,ko99988,ko04012,ko04310,ko04148,ko00542,ko03040,ko03041,ko00062
0,60000001,1376,1211,158,7,0,17,17,8,18,...,0,0,0,0,0,0,0,0,0,0
1,60000002,1453,1287,157,9,0,10,19,9,16,...,0,0,0,0,0,0,0,0,0,0
2,60000003,1387,1222,156,9,0,12,16,9,21,...,0,0,0,0,0,0,0,0,0,0
3,60000005,990,877,108,5,0,8,11,5,14,...,0,0,0,0,0,0,0,0,0,0
4,60000006,1244,922,317,5,0,24,28,13,30,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3597,60040280,3,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3598,60040281,3,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3599,60040282,3,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3600,60040286,4,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# look at pathway statistics and compile dictionary of CyCOGs associated with each pathway

cycog_pathway_dict = {}

# column sums of the indicator table give the CyCOG count of each pathway;
# report pathways from most to least represented
cycogs_per_pathway = pd.DataFrame(data_dict).sum().sort_values(ascending=False)
for path, cycog_count in cycogs_per_pathway.items():
    # rows (CyCOGs) flagged as belonging to this pathway
    in_pathway = pathway_df[path].gt(0)
    ko_count = pathway_df.loc[in_pathway, 'KOID'].nunique()
    total_kos = len(pathway_dict[path])
    print(f'{path}: {pathway_names[path]}\n\t{cycog_count} CyCOGs representing {ko_count}/{total_kos} unique KOs')
    # record the list of cycogs associated with this pathway
    cycog_pathway_dict[path] = pathway_df.loc[in_pathway, 'CyCOGID'].to_list()
    

ko02000: Transporters [BR:ko02000]
	434 CyCOGs representing 173/2009 unique KOs
ko00541: O-Antigen nucleotide sugar biosynthesis [PATH:ko00541]
	199 CyCOGs representing 34/99 unique KOs
ko00520: Amino sugar and nucleotide sugar metabolism [PATH:ko00520]
	195 CyCOGs representing 57/156 unique KOs
ko99980: Enzymes with EC numbers
	159 CyCOGs representing 81/1492 unique KOs
ko02048: Prokaryotic defense system [BR:ko02048]
	150 CyCOGs representing 49/192 unique KOs
ko03400: DNA repair and recombination proteins [BR:ko03400]
	149 CyCOGs representing 75/485 unique KOs
ko01002: Peptidases and inhibitors [BR:ko01002]
	142 CyCOGs representing 49/1021 unique KOs
ko99997: Function unknown
	128 CyCOGs representing 78/355 unique KOs
ko00194: Photosynthesis proteins [BR:ko00194]
	126 CyCOGs representing 77/138 unique KOs
ko02010: ABC transporters [PATH:ko02010]
	121 CyCOGs representing 65/515 unique KOs
ko03016: Transfer RNA biogenesis [BR:ko03016]
	114 CyCOGs representing 70/270 unique KOs
ko02020:

# Calculate Enrichment

In [19]:
# set up parameters and data

# significance level
alpha = 0.01
enrichments_df = pd.DataFrame()


def adjust_pvals(pvals):
    """Return the FDR-corrected p-values for `pvals`.

    Thin wrapper around `multitest.fdrcorrection`, called with the
    notebook-level `alpha`. That call returns a pair; only its second
    element (the corrected p-values) is passed on to the caller.
    """
    rejected, corrected = multitest.fdrcorrection(pvals, alpha=alpha)
    return corrected

# read in cluster data
data_dir = Path('../../data')
pro_ds = xr.open_dataset(data_dir / '5-models/pro-aligned-models.nc')
syn_ds = xr.open_dataset(data_dir / '5-models/syn-aligned-models.nc')
out_dir = data_dir / '7-interpretation'
if not out_dir.is_dir():
    out_dir.mkdir(parents=True)

# save pathway_df to out_dir
pathway_df.to_csv(out_dir / 'cycog-kegg-pathways.csv', index=False)

# analyze only "robust" clusters
pro_clusters = [
    'pro2', 'pro3', 'pro4', 'pro5', 'pro6', 'pro7', 'pro8', 'pro9', 'pro10', 'pro11', 'pro12', 'pro13', 'pro15'
]
syn_clusters = [
    'syn3', 'syn4', 'syn5', 'syn6', 'syn7', 'syn8', 'syn9', 'syn10', 'syn11', 'syn12', 'syn14', 'syn15'
]


In [20]:
# pull out median bootstrap weights

def _median_gene_weights(ds, prefix, clusters):
    """Median GeneWeight over bootstraps and replicates as a DataFrame.

    The median is taken over the 'Bootstrap' and 'Replicate' dimensions of
    `ds`; the transposed GeneWeight table is converted to pandas, every column
    label gets `prefix` prepended, and only the columns listed in `clusters`
    are returned.
    """
    weights = ds.median(dim=['Bootstrap', 'Replicate']).GeneWeight.T.to_pandas()
    weights = weights.rename(columns=lambda component: f"{prefix}{component}")
    # select only robust components
    return weights.loc[:, clusters]

# pro gene weight data
pro_gene_df = _median_gene_weights(pro_ds, 'pro', pro_clusters)

# syn gene weight data
syn_gene_df = _median_gene_weights(syn_ds, 'syn', syn_clusters)

syn_gene_df
    

Component,syn3,syn4,syn5,syn6,syn7,syn8,syn9,syn10,syn11,syn12,syn14,syn15
Ortholog,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
60000001,0.0,0.000000,0.082089,0.000000,0.0,0.0,0.0,0.000000,0.098828,0.0,0.0,0.013749
60000002,0.0,0.000000,0.016648,0.000000,0.0,0.0,0.0,0.000000,0.020618,0.0,0.0,0.000000
60000003,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
60000004,0.0,0.012655,0.109760,0.000000,0.0,0.0,0.0,0.120932,0.000000,0.0,0.0,0.000000
60000005,0.0,0.008381,0.000000,0.133746,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
60040234,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
60040235,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
60040237,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
60040245,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000


In [21]:
# Mann-Whitney U test of every pathway against every pro cluster

input_df = pro_gene_df

# one entry per pathway, holding the p-values for all cluster columns
p_vals = []
for members in cycog_pathway_dict.values():
    in_pathway = input_df.index.isin(members)
    # one-sided test per column: weights of the pathway's CyCOGs vs. all other CyCOGs
    test = mannwhitneyu(
        input_df.loc[in_pathway, :],
        input_df.loc[~in_pathway, :],
        alternative='greater',
        axis=0,
        method='asymptotic',
    )
    p_vals.append(test.pvalue)
pro_enrich_df = pd.DataFrame(p_vals, index=cycog_pathway_dict.keys(), columns=input_df.columns)
# discard pathways whose p-value is missing for every cluster
pro_enrich_df = pro_enrich_df.dropna(how='all')
# adjust p-values, one cluster (column) at a time
pro_enrich_df = pro_enrich_df.apply(adjust_pvals, raw=True)

pro_enrich_df


Component,pro2,pro3,pro4,pro5,pro6,pro7,pro8,pro9,pro10,pro11,pro12,pro13,pro15
ko02000,7.265543e-01,0.676209,0.729813,0.705959,0.858831,0.245964,0.036741,7.202338e-01,0.770913,0.638149,0.781301,0.638149,3.882516e-35
ko00541,1.480304e-19,0.682059,0.672625,0.708082,0.776920,0.620918,0.708831,5.743984e-09,0.708825,0.608933,0.814051,0.608933,5.952523e-01
ko00520,5.389910e-16,0.689857,0.672625,0.653660,0.780829,0.620918,0.024269,1.163481e-11,0.708825,0.608933,0.623084,0.608933,5.952523e-01
ko99980,8.064119e-01,0.692444,0.637236,0.712547,0.740126,0.620918,0.708831,7.468203e-01,0.711580,0.608933,0.143314,0.608933,5.952523e-01
ko02048,3.731795e-01,0.272762,0.672625,0.705959,0.512117,0.620918,0.708831,7.242743e-01,0.706362,0.608933,0.796189,0.608933,5.952523e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ko99987,8.064119e-01,0.676209,0.672625,0.705959,0.740126,0.620918,0.708831,7.202338e-01,0.706362,0.608933,0.781301,0.608933,5.952523e-01
ko00908,8.064119e-01,0.676209,0.672625,0.705959,0.740126,0.620918,0.708831,7.202338e-01,0.706362,0.608933,0.781301,0.608933,5.952523e-01
ko00405,8.064119e-01,0.676209,0.672625,0.705959,0.740126,0.620918,0.708831,7.202338e-01,0.706362,0.608933,0.781301,0.608933,5.952523e-01
ko99981,8.064119e-01,0.676209,0.672625,0.705959,0.740126,0.620918,0.708831,7.202338e-01,0.706362,0.608933,0.781301,0.608933,5.952523e-01


In [22]:
# print each pro cluster's significantly enriched pathways, smallest adjusted p-value first

for cluster in pro_enrich_df.columns:
    adjusted = pro_enrich_df[cluster]
    pathways = adjusted[adjusted.lt(alpha)].sort_values()
    print(f'\n{cluster} ({len(pathways)} enriched pathways)')
    for path, pval in pathways.items():
        description = pathway_names[path]
        # record the hit in the shared enrichments table (adds the row/column on first use)
        enrichments_df.loc[path, 'description'] = description
        enrichments_df.loc[path, cluster] = pval
        print(f'\t{path} (p={pval:.2e}): {description}')



pro2 (12 enriched pathways)
	ko00541 (p=1.48e-19): O-Antigen nucleotide sugar biosynthesis [PATH:ko00541]
	ko00520 (p=5.39e-16): Amino sugar and nucleotide sugar metabolism [PATH:ko00520]
	ko03083 (p=5.89e-11): Polycomb repressive complex [PATH:ko03083]
	ko00514 (p=5.89e-11): Other types of O-glycan biosynthesis [PATH:ko00514]
	ko00053 (p=5.01e-07): Ascorbate and aldarate metabolism [PATH:ko00053]
	ko02025 (p=1.74e-05): Biofilm formation - Pseudomonas aeruginosa [PATH:ko02025]
	ko00051 (p=1.01e-04): Fructose and mannose metabolism [PATH:ko00051]
	ko02035 (p=3.20e-03): Bacterial motility proteins [BR:ko02035]
	ko01002 (p=3.21e-03): Peptidases and inhibitors [BR:ko01002]
	ko02020 (p=6.22e-03): Two-component system [PATH:ko02020]
	ko00525 (p=7.95e-03): Acarbose and validamycin biosynthesis [PATH:ko00525]
	ko00052 (p=8.36e-03): Galactose metabolism [PATH:ko00052]

pro3 (3 enriched pathways)
	ko01002 (p=2.40e-13): Peptidases and inhibitors [BR:ko01002]
	ko04812 (p=2.83e-07): Cytoskeleton p

  enrichments_df.loc[path, 'description'] = pathway_names[path]


In [23]:
# Mann-Whitney U test of every pathway against every syn cluster

input_df = syn_gene_df

# one entry per pathway, holding the p-values for all cluster columns
p_vals = []
for members in cycog_pathway_dict.values():
    in_pathway = input_df.index.isin(members)
    # one-sided test per column: weights of the pathway's CyCOGs vs. all other CyCOGs
    test = mannwhitneyu(
        input_df.loc[in_pathway, :],
        input_df.loc[~in_pathway, :],
        alternative='greater',
        axis=0,
        method='asymptotic',
    )
    p_vals.append(test.pvalue)
syn_enrich_df = pd.DataFrame(p_vals, index=cycog_pathway_dict.keys(), columns=input_df.columns)
# discard pathways whose p-value is missing for every cluster
syn_enrich_df = syn_enrich_df.dropna(how='all')
# adjust p-values, one cluster (column) at a time
syn_enrich_df = syn_enrich_df.apply(adjust_pvals, raw=True)

syn_enrich_df


Component,syn3,syn4,syn5,syn6,syn7,syn8,syn9,syn10,syn11,syn12,syn14,syn15
ko02000,0.861232,0.828794,0.731522,0.802551,0.304173,0.683468,0.136661,0.794936,0.322182,4.979953e-19,0.777672,0.887502
ko00541,0.707923,0.748317,0.734636,0.780005,0.798686,0.683468,0.001809,0.796457,0.785166,6.769732e-01,0.043461,0.723833
ko00520,0.707923,0.774094,0.760583,0.751664,0.838544,0.683468,0.021090,0.794936,0.812018,6.834291e-01,0.000053,0.723833
ko99980,0.739416,0.746719,0.793975,0.707078,0.866271,0.683468,0.764023,0.863956,0.843735,6.867422e-01,0.777672,0.723833
ko02048,0.707923,0.748317,0.734636,0.776043,0.798686,0.683468,0.708374,0.796457,0.785166,6.769732e-01,0.777672,0.723833
...,...,...,...,...,...,...,...,...,...,...,...,...
ko99987,0.707923,0.746719,0.731522,0.751664,0.798686,0.683468,0.708374,0.794936,0.785166,6.769732e-01,0.777672,0.723833
ko00908,0.707923,0.746719,0.731522,0.751664,0.798686,0.683468,0.708374,0.794936,0.785166,6.769732e-01,0.777672,0.723833
ko00405,0.707923,0.746719,0.731522,0.751664,0.798686,0.683468,0.708374,0.794936,0.785166,6.769732e-01,0.777672,0.723833
ko99981,0.707923,0.746719,0.731522,0.751664,0.798686,0.683468,0.708374,0.794936,0.785166,6.769732e-01,0.777672,0.723833


In [24]:
# print each syn cluster's significantly enriched pathways, smallest adjusted p-value first

for cluster in syn_enrich_df.columns:
    adjusted = syn_enrich_df[cluster]
    pathways = adjusted[adjusted.lt(alpha)].sort_values()
    print(f'\n{cluster} ({len(pathways)} enriched pathways)')
    for path, pval in pathways.items():
        description = pathway_names[path]
        # record the hit in the shared enrichments table (adds the row/column on first use)
        enrichments_df.loc[path, 'description'] = description
        enrichments_df.loc[path, cluster] = pval
        print(f'\t{path} (p={pval:.2E}): {description}')



syn3 (8 enriched pathways)
	ko00907 (p=9.19E-20): Pinene, camphor and geraniol degradation [PATH:ko00907]
	ko00790 (p=9.21E-13): Folate biosynthesis [PATH:ko00790]
	ko00230 (p=4.44E-07): Purine metabolism [PATH:ko00230]
	ko00750 (p=1.50E-06): Vitamin B6 metabolism [PATH:ko00750]
	ko03020 (p=1.50E-06): RNA polymerase [PATH:ko03020]
	ko00040 (p=6.24E-05): Pentose and glucuronate interconversions [PATH:ko00040]
	ko00240 (p=4.19E-04): Pyrimidine metabolism [PATH:ko00240]
	ko00740 (p=6.06E-04): Riboflavin metabolism [PATH:ko00740]

syn4 (13 enriched pathways)
	ko99995 (p=2.63E-16): Signaling proteins
	ko01053 (p=3.94E-13): Biosynthesis of siderophore group nonribosomal peptides [PATH:ko01053]
	ko00860 (p=2.11E-07): Porphyrin metabolism [PATH:ko00860]
	ko00500 (p=2.11E-07): Starch and sucrose metabolism [PATH:ko00500]
	ko00190 (p=4.51E-07): Oxidative phosphorylation [PATH:ko00190]
	ko00270 (p=3.72E-06): Cysteine and methionine metabolism [PATH:ko00270]
	ko04147 (p=1.77E-04): Exosome [BR:ko0

In [25]:
# reformat and save enrichments dataframe

# Move the pathway ids from the index into a 'KEGG pathway' column, sorted by id.
# Skipped when that column already exists, so re-running this cell does not add a
# second (integer) 'KEGG pathway' column.
if 'KEGG pathway' not in enrichments_df.columns:
    enrichments_df = enrichments_df.sort_index().rename_axis('KEGG pathway').reset_index()
enrichments_df.to_csv(out_dir / 'kegg-enrichments.csv', index=False)

# number of KOs in each KEGG pathway
pathway_sizes = [len(pathway) for pathway in pathway_dict.values()]
# Number of enriched pathways per cluster. Every column except the two label columns is a
# cluster, so select them by exclusion rather than by slicing from a hard-coded 'pro2'
# (which fails if pro2 has no enriched pathway and therefore no column).
n_enriched = enrichments_df.drop(columns=['KEGG pathway', 'description']).notna().sum()

print('Mean KEGG pathway size: {:.2f}'.format(np.mean(pathway_sizes)))
print('Median KEGG pathway size: {}'.format(np.median(pathway_sizes)))
print('Total KEGG pathways evaluated: {}'.format(len(cycog_pathway_dict)))
print('N unique KEGG pathways enriched: {}'.format(enrichments_df.shape[0]))
print('Min n enriched pathways per cluster: {}'.format(n_enriched.min()))
print('Max n enriched pathways per cluster: {}'.format(n_enriched.max()))
print('Median n enriched pathways per cluster: {}'.format(n_enriched.median()))

enrichments_df


Mean KEGG pathway size: 127.69
Median KEGG pathway size: 63.0
Total KEGG pathways evaluated: 227
N unique KEGG pathways enriched: 113
Min n enriched pathways per cluster: 1
Max n enriched pathways per cluster: 38
Median n enriched pathways per cluster: 12.0


Unnamed: 0,KEGG pathway,description,pro2,pro3,pro4,pro5,pro6,pro7,pro8,pro9,...,syn5,syn6,syn7,syn8,syn9,syn10,syn11,syn12,syn14,syn15
0,ko00010,Glycolysis / Gluconeogenesis [PATH:ko00010],,,,2.712759e-05,,7.432753e-19,,0.002765,...,,1.652167e-24,,2.116629e-03,,3.236497e-03,,,,5.626867e-08
1,ko00020,Citrate cycle (TCA cycle) [PATH:ko00020],,,,,,,,0.000013,...,,1.323021e-03,,,,,,,,
2,ko00030,Pentose phosphate pathway [PATH:ko00030],,,,4.523140e-15,1.767584e-07,6.999372e-24,,,...,,6.819583e-12,2.878378e-11,3.114291e-04,,1.218639e-16,,,,
3,ko00040,Pentose and glucuronate interconversions [PATH...,,,,9.606244e-08,,4.181606e-26,,,...,,,,,,,,,,
4,ko00051,Fructose and mannose metabolism [PATH:ko00051],0.000101,,,,,6.310735e-09,,0.000004,...,,9.285973e-16,,1.405966e-08,,,,,0.00003,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,ko05111,Biofilm formation - Vibrio cholerae [PATH:ko05...,,,,9.606244e-08,,4.181606e-26,,,...,,,,,,,,,,
109,ko99975,Protein processing,,,,,,,,,...,,,,,,,,,,
110,ko99982,Energy metabolism,,,,,,3.667164e-31,1.346873e-07,,...,,,,6.232021e-05,,,,,,
111,ko99994,Others,,,,,,,,,...,,,,,,,1.179248e-07,,,


# Examine Particular Pathways

In [26]:
# look at genes in ko00195: photosynthesis

# rows with a non-zero (truthy) entry in the ko00195 column
in_ko00195 = pathway_df['ko00195'].astype(bool)
# show the columns up to and including 'NRefsOtherKO' (label slices are inclusive)
pathway_df.loc[in_ko00195, :'NRefsOtherKO']


Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,...,5.1B-CRD1,5.1B-I,HLI,HLII,LLI,DescriptionCyCOG,KOID,DescriptionKO,NRefsKO,NRefsOtherKO
18,60000024,816,644,145,3,24,19,17,7,23,...,22,22,95,327,123,ferredoxin,K02639,ferredoxin,810.0,0.0
20,60000032,716,482,155,5,74,7,18,9,14,...,27,19,72,197,74,photosystem II P680 reaction center D1 protein,K02703,photosystem II P680 reaction center D1 protein,632.0,0.0
47,60000063,518,399,77,2,40,5,9,4,7,...,12,11,73,172,65,photosystem II P680 reaction center D2 protein,K02706,photosystem II P680 reaction center D2 protein,497.0,0.0
55,60000071,626,556,68,2,0,10,6,3,5,...,24,5,119,256,61,photosystem II cytochrome b559 subunit beta,K02708,photosystem II cytochrome b559 subunit beta,621.0,0.0
65,60000081,507,452,50,4,1,4,6,3,5,...,9,6,65,194,83,photosystem I P700 chlorophyll a apoprotein A2,K02690,photosystem I P700 chlorophyll a apoprotein A2,474.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2878,60026737,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,ATP synthase F0 subcomplex A subunit,K02108,F-type H+-transporting ATPase subunit a,1.0,0.0
2962,60028393,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,F-type H+-transporting ATPase subunit alpha,K02111,F-type H+-transporting ATPase subunit alpha,1.0,0.0
2986,60028952,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,photosystem I P700 chlorophyll a apoprotein A1,K02689,photosystem I P700 chlorophyll a apoprotein A1,1.0,0.0
3036,60030165,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,ferredoxin--NADP+ reductase,K02641,ferredoxin--NADP+ reductase,1.0,0.0


# Collect Remaining Annotations

- KEGG Orthologs (KOID)
- NCBI Clusters of Orthologous Genes (COGID)
- Pfam (PFAMID)
  

In [27]:
# start the combined annotation table from the CyCOG-to-KO mapping

# deep copy, so columns merged on in later cells never touch ko_map_df itself
cycog_annot_df = ko_map_df.copy(deep=True)

cycog_annot_df

Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,...,5.1B-CRD1,5.1B-I,HLI,HLII,LLI,DescriptionCyCOG,KOID,DescriptionKO,NRefsKO,NRefsOtherKO
0,60000001,1376,1211,158,7,0,17,17,8,18,...,37,19,196,532,184,membrane protease FtsH catalytic subunit,K03798,cell division protease FtsH,1291.0,0.0
1,60000002,1453,1287,157,9,0,10,19,9,16,...,40,18,198,552,198,ATP-dependent Clp protease ATP-binding subunit...,K03696,ATP-dependent Clp protease ATP-binding subunit...,443.0,417.0
2,60000003,1387,1222,156,9,0,12,16,9,21,...,41,14,203,523,192,ATP-dependent Clp protease proteolytic subunit...,K01358,"ATP-dependent Clp protease, protease subunit",1360.0,0.0
3,60000004,1631,1369,255,7,0,22,24,16,27,...,62,29,173,542,265,hypothetical protein,,,,
4,60000005,990,877,108,5,0,8,11,5,14,...,25,13,128,383,131,chaperonin GroEL,K04077,chaperonin GroEL,935.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40290,60040291,4,4,0,0,0,0,0,0,0,...,0,0,0,0,4,hypothetical protein,,,,
40291,60040292,4,4,0,0,0,0,0,0,0,...,0,0,0,0,4,Tryptophan-rich Synechocystis species C-termin...,,,,
40292,60040293,5,0,5,0,0,0,0,0,0,...,0,0,0,0,0,Putative transposase,,,,
40293,60040294,6,0,6,0,0,0,0,0,0,...,0,0,0,0,0,sulfate transport system substrate-binding pro...,K02048,sulfate transport system substrate-binding pro...,4.0,0.0


### NCBI COGs 
- https://www.ncbi.nlm.nih.gov/research/cog/

In [28]:
# collate COG annotations

# Read every genome's COG table, then concatenate once; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
cog_frames = [
    pd.read_csv(data_path / f'{g_id}/{g_id}.cog.tab.txt', sep='\t')
    for g_id in ortho_df['GenomeID'].unique()
]
cog_df = pd.concat(cog_frames, ignore_index=True)

# deduplicate per gene annotations
print('COG annotations per gene:')
print(cog_df['gene_oid'].value_counts().value_counts())
# for each gene keep the hit with the smallest e-value
cog_df = cog_df.loc[cog_df.groupby('gene_oid')['evalue'].idxmin(), :].reset_index(drop=True)

cog_df

COG annotations per gene:
count
1    494737
2       151
Name: count, dtype: int64


Unnamed: 0,gene_oid,gene_length,percent_identity,query_start,query_end,subj_start,subj_end,evalue,bit_score,cog_id,cog_name,cog_length
0,638291479,587,23.93,4,587,26,593,2.000000e-28,121.0,COG0749,DNA polymerase I - 3'-5' exonuclease and polym...,593
1,638291512,235,32.72,29,232,20,235,2.000000e-32,132.0,COG1351,Thymidylate synthase ThyX,273
2,638310966,295,21.26,1,293,2,270,3.000000e-23,102.0,COG0338,Site-specific DNA-adenine methylase,274
3,638311077,487,20.30,93,463,13,403,2.000000e-31,130.0,COG1061,Superfamily II DNA or RNA helicase,442
4,638311081,349,18.57,1,337,1,368,2.000000e-09,57.1,COG0420,DNA repair exonuclease SbcCD nuclease subunit,390
...,...,...,...,...,...,...,...,...,...,...,...,...
494883,2721491640,599,39.05,6,593,1,565,7.000000e-116,411.0,COG1132,"ABC-type multidrug transport system, ATPase an...",567
494884,2721491641,265,27.72,7,241,20,282,1.000000e-14,73.6,COG0596,Pimeloyl-ACP methyl ester carboxylesterase,282
494885,2721491642,168,23.13,40,161,24,157,3.000000e-08,51.9,COG0456,Ribosomal protein S18 acetylase RimI and relat...,177
494886,2721491643,199,39.80,1,195,10,201,1.000000e-31,130.0,COG3222,"Uncharacterized conserved protein, glycosyltra...",211


In [29]:
# deduplicate annotations by CyCOG and add onto CyCOG annotation DataFrame

# attach each reference gene's COG hit (left join: genes without a hit get NaN)
annot_df = pd.merge(ortho_df, cog_df, left_on='GeneID', right_on='gene_oid', how='left')
print('\nN unique COGs per CyCOG')
print(annot_df.groupby('CyCOGID').cog_id.nunique().value_counts())
annot_df = annot_df[annot_df['cog_id'].notna()]

# resolve discordant CyCOG-COG mappings by simple majority vote
# (the rename only has an effect on pandas versions that name the value_counts() result
# after the counted column instead of 'count')
count_df = pd.DataFrame(annot_df.groupby('CyCOGID').cog_id.value_counts()).rename(
    columns={'cog_id': 'count'}).reset_index()
count_df = count_df.loc[count_df.groupby('CyCOGID')['count'].idxmax(), :]

# add in annotation, using the first description encountered for each COG id
name_map = annot_df[['cog_id', 'cog_name']].drop_duplicates(subset='cog_id')
count_df = pd.merge(left=count_df, right=name_map, on='cog_id', how='left')

# rename columns
count_df = count_df[['CyCOGID', 'cog_id', 'cog_name', 'count']].rename(
    columns={'cog_id': 'COGID', 'cog_name': 'DescriptionCOG', 'count': 'NRefsCOG'}).set_index('CyCOGID')
# calculate refs with annotation other than the majority annotation chosen
count_df['NRefsOtherCOG'] = annot_df.groupby('CyCOGID').GeneID.count() - count_df['NRefsCOG']

# Add count_df into cycog_annot_df. Any COG columns left over from an earlier run of this
# cell are dropped first, so re-running it does not create suffixed duplicates
# (COGID_x, COGID_y, ...).
cycog_annot_df = pd.merge(
    cycog_annot_df.drop(columns=count_df.columns, errors='ignore'),
    count_df, on='CyCOGID', how='left')
cycog_annot_df



N unique COGs per CyCOG
cog_id
0     34298
1      5724
2       223
3        37
4         6
5         3
12        1
10        1
9         1
11        1
Name: count, dtype: int64


Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,...,LLI,DescriptionCyCOG,KOID,DescriptionKO,NRefsKO,NRefsOtherKO,COGID,DescriptionCOG,NRefsCOG,NRefsOtherCOG
0,60000001,1376,1211,158,7,0,17,17,8,18,...,184,membrane protease FtsH catalytic subunit,K03798,cell division protease FtsH,1291.0,0.0,COG0465,ATP-dependent Zn proteases,1267.0,0.0
1,60000002,1453,1287,157,9,0,10,19,9,16,...,198,ATP-dependent Clp protease ATP-binding subunit...,K03696,ATP-dependent Clp protease ATP-binding subunit...,443.0,417.0,COG0542,ATP-dependent Clp protease ATP-binding subunit...,1219.0,0.0
2,60000003,1387,1222,156,9,0,12,16,9,21,...,192,ATP-dependent Clp protease proteolytic subunit...,K01358,"ATP-dependent Clp protease, protease subunit",1360.0,0.0,COG0740,"ATP-dependent protease ClpP, protease subunit",1359.0,0.0
3,60000004,1631,1369,255,7,0,22,24,16,27,...,265,hypothetical protein,,,,,,,,
4,60000005,990,877,108,5,0,8,11,5,14,...,131,chaperonin GroEL,K04077,chaperonin GroEL,935.0,0.0,COG0459,Chaperonin GroEL (HSP60 family),916.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40290,60040291,4,4,0,0,0,0,0,0,0,...,4,hypothetical protein,,,,,,,,
40291,60040292,4,4,0,0,0,0,0,0,0,...,4,Tryptophan-rich Synechocystis species C-termin...,,,,,,,,
40292,60040293,5,0,5,0,0,0,0,0,0,...,0,Putative transposase,,,,,,,,
40293,60040294,6,0,6,0,0,0,0,0,0,...,0,sulfate transport system substrate-binding pro...,K02048,sulfate transport system substrate-binding pro...,4.0,0.0,COG1613,"ABC-type sulfate transport system, periplasmic...",4.0,0.0


### Pfam
- https://pfam.xfam.org/

In [30]:
# collate Pfam annotations

# Read every genome's Pfam table and concatenate once at the end. A genome whose file is
# missing is reported and skipped; it must not fall through to the concatenation, or the
# previous genome's table would be appended a second time.
pfam_frames = []
for g_id in ortho_df['GenomeID'].unique():
    try:
        pfam_frames.append(pd.read_csv(data_path / f'{g_id}/{g_id}.pfam.tab.txt', sep='\t'))
    except FileNotFoundError:
        print('No annotation file found for genome {}'.format(g_id))
pfam_df = pd.concat(pfam_frames, ignore_index=True)

# deduplicate per gene annotations
print('Pfam annotations per gene:')
print(pfam_df['gene_oid'].value_counts().value_counts())
# for each gene keep the hit with the smallest e-value
pfam_df = pfam_df.loc[pfam_df.groupby('gene_oid')['evalue'].idxmin(), :].reset_index(drop=True)

pfam_df

Pfam annotations per gene:
count
1      494886
2      125694
3       36802
4       14280
5        3353
6        1487
7         721
8         153
9          87
10         52
11         35
14         17
12         15
13         13
16         12
18          8
17          8
19          8
15          7
21          5
32          4
24          3
25          3
27          3
37          3
34          2
23          2
60          1
40          1
36          1
35          1
29          1
33          1
31          1
20          1
22          1
108         1
Name: count, dtype: int64


Unnamed: 0,gene_oid,gene_length,query_start,query_end,subj_start,subj_end,evalue,bit_score,pfam_id,pfam_name,pfam_length
0,638291465,574,211,538,7,337,4.000000e-66,223.4,pfam00940,RNA_pol,407
1,638291475,66,1,64,14,78,4.100000e-18,65.3,pfam05367,Phage_endo_I,149
2,638291476,54,1,54,79,130,1.900000e-12,46.9,pfam05367,Phage_endo_I,149
3,638291477,531,264,510,5,235,3.400000e-18,65.8,pfam03796,DnaB_C,250
4,638291478,87,6,62,2,54,8.500000e-06,25.8,pfam00462,Glutaredoxin,60
...,...,...,...,...,...,...,...,...,...,...,...
677668,2721491640,599,23,303,2,273,3.600000e-41,141.5,pfam00664,ABC_membrane,274
677669,2721491641,265,11,233,1,211,1.500000e-22,81.1,pfam12697,Abhydrolase_6,212
677670,2721491642,168,41,153,3,116,1.800000e-13,50.6,pfam00583,Acetyltransf_1,117
677671,2721491643,199,34,158,6,122,2.400000e-32,111.2,pfam09837,DUF2064,123


In [31]:
# deduplicate annotations by CyCOG and add onto CyCOG annotation DataFrame

# attach each reference gene's Pfam hit (left join: genes without a hit get NaN)
annot_df = pd.merge(ortho_df, pfam_df, left_on='GeneID', right_on='gene_oid', how='left')
print('\nN unique Pfam annotations per CyCOG')
print(annot_df.groupby('CyCOGID').pfam_id.nunique().value_counts())
annot_df = annot_df[annot_df['pfam_id'].notna()]

# resolve discordant CyCOG-Pfam mappings by simple majority vote
# (the rename only has an effect on pandas versions that name the value_counts() result
# after the counted column instead of 'count')
count_df = pd.DataFrame(annot_df.groupby('CyCOGID').pfam_id.value_counts()).rename(
    columns={'pfam_id': 'count'}).reset_index()
count_df = count_df.loc[count_df.groupby('CyCOGID')['count'].idxmax(), :]

# add in annotation, using the first description encountered for each Pfam id
name_map = annot_df[['pfam_id', 'pfam_name']].drop_duplicates(subset='pfam_id')
count_df = pd.merge(left=count_df, right=name_map, on='pfam_id', how='left')

# rename columns
count_df = count_df[['CyCOGID', 'pfam_id', 'pfam_name', 'count']].rename(
    columns={'pfam_id': 'PfamID', 'pfam_name': 'DescriptionPfam', 'count': 'NRefsPfam'}).set_index('CyCOGID')
# calculate refs with annotation other than the majority annotation chosen
count_df['NRefsOtherPfam'] = annot_df.groupby('CyCOGID').GeneID.count() - count_df['NRefsPfam']

# Add count_df into cycog_annot_df. Any Pfam columns left over from an earlier run of this
# cell are dropped first, so re-running it does not create suffixed duplicates
# (PfamID_x, PfamID_y, ...).
cycog_annot_df = pd.merge(
    cycog_annot_df.drop(columns=count_df.columns, errors='ignore'),
    count_df, on='CyCOGID', how='left')
cycog_annot_df


N unique Pfam annotations per CyCOG
pfam_id
0     29816
1      9161
2       936
3       261
4        78
5        19
6         9
8         3
9         2
24        2
12        2
31        1
18        1
17        1
19        1
16        1
7         1
Name: count, dtype: int64


Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,...,NRefsKO,NRefsOtherKO,COGID,DescriptionCOG,NRefsCOG,NRefsOtherCOG,PfamID,DescriptionPfam,NRefsPfam,NRefsOtherPfam
0,60000001,1376,1211,158,7,0,17,17,8,18,...,1291.0,0.0,COG0465,ATP-dependent Zn proteases,1267.0,0.0,pfam01434,Peptidase_M41,1260.0,109.0
1,60000002,1453,1287,157,9,0,10,19,9,16,...,443.0,417.0,COG0542,ATP-dependent Clp protease ATP-binding subunit...,1219.0,0.0,pfam07724,AAA_2,1331.0,105.0
2,60000003,1387,1222,156,9,0,12,16,9,21,...,1360.0,0.0,COG0740,"ATP-dependent protease ClpP, protease subunit",1359.0,0.0,pfam00574,CLP_protease,1384.0,0.0
3,60000004,1631,1369,255,7,0,22,24,16,27,...,,,,,,,pfam04966,OprB,900.0,125.0
4,60000005,990,877,108,5,0,8,11,5,14,...,935.0,0.0,COG0459,Chaperonin GroEL (HSP60 family),916.0,0.0,pfam00118,Cpn60_TCP1,980.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40290,60040291,4,4,0,0,0,0,0,0,0,...,,,,,,,,,,
40291,60040292,4,4,0,0,0,0,0,0,0,...,,,,,,,pfam07483,W_rich_C,3.0,1.0
40292,60040293,5,0,5,0,0,0,0,0,0,...,,,,,,,pfam14319,Zn_Tnp_IS91,4.0,1.0
40293,60040294,6,0,6,0,0,0,0,0,0,...,4.0,0.0,COG1613,"ABC-type sulfate transport system, periplasmic...",4.0,0.0,pfam13531,SBP_bac_11,4.0,0.0


In [32]:
# write the combined CyCOG annotation table to the output directory

annotation_path = out_dir / 'cycog-annotations.csv'
cycog_annot_df.to_csv(annotation_path, index=False)
