In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import requests
import json
import xarray as xr
import scipy.stats as stats
from scipy.stats import mannwhitneyu
from statsmodels.stats import multitest


In [2]:
# genome metadata

# filepaths
filepath_genome_metadata = '../../../data/mappings/2021-10-07/collated/berube_pro_syn_set/genome_metadata.csv'
filepath_ortholog_metadata = '../../../data/mappings/2021-10-07/collated/berube_pro_syn_set/ortholog_metadata.csv'

# ortholog metadata
ortho_df = pd.read_csv(filepath_ortholog_metadata)

# genome metadata
genome_df = pd.read_csv(filepath_genome_metadata)

# fix typos
genome_df.loc[genome_df['Clade'] == '5.1A-1', 'Clade'] = '5.1A-I'
genome_df.loc[genome_df['Clade'] == 'CDR2', 'Clade'] = 'CRD2'

genome_df


Unnamed: 0,BerubeProportalID,UpdatedProportalID,GenomeName,Genus,Ecotype,Clade,ReferenceType,IsolationLocation,Ecosystem,Latitude,Longitude,Depth(m),GenomeSize(bp),GeneCount,Completeness
0,2716884681,2716884681,AG-311-D23,Prochlorococcus,Low light adapted (LL),LLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,1466304,1796,72.96
1,2716884682,2716884682,AG-311-I02,Prochlorococcus,Low light adapted (LL),LLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,195290,271,11.16
2,2716884683,2716884683,AG-311-I09,Prochlorococcus,High light adapted (HL),HLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,697970,812,47.41
3,2716884684,2716884684,AG-311-J05,Prochlorococcus,High light adapted (HL),HLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,623148,755,34.64
4,2716884685,2716884685,AG-311-J23,Prochlorococcus,Low light adapted (LL),LLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,1427538,1677,77.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676,2623620330,640427149,WH7803,Synechococcus,,5.1B-V,ISOLATE,"Sargasso Sea, North Atlantic Ocean",Epipelagic,33.742300,-67.491300,25.0,2366980,2591,98.91
677,2623620868,638341215,WH7805,Synechococcus,,5.1B-VI,ISOLATE,North Atlantic Ocean,Pelagic,33.750000,-67.500000,,2620367,2937,99.73
678,2606217514,637000314,WH8102,Synechococcus,,5.1A-III,ISOLATE,Tropical Atlantic Ocean,Pelagic,22.495000,-65.600000,,2434428,2588,99.46
679,2606217259,637000214,MED4,Prochlorococcus,High light adapted (HL),HLI,ISOLATE,Mediterranean Sea,Marginal Sea,43.200000,6.866667,5.0,1657990,2069,99.46


In [3]:
# add genome id and clade info into ortholog df

# clades we care about
clades = ['HLI', 'HLII', 'LLI', '5.1A-I', '5.1A-II', '5.1A-III', '5.1A-IV', '5.1B-VII', 'CRD2']

ortho_df['GenomeID'] = ortho_df['GenomeName'].map(genome_df.set_index('GenomeName')['BerubeProportalID'])
ortho_df['Genus'] = ortho_df['GenomeName'].map(genome_df.set_index('GenomeName')['Genus'])
ortho_df['Clade'] = ortho_df['GenomeName'].map(genome_df.set_index('GenomeName')['Clade'])

# subset gene entries to just the clades we care about
ortho_df = ortho_df[ortho_df['Clade'].isin(clades)].reset_index(drop=True)
ortho_df['GenomeID'] = ortho_df['GenomeID'].astype(int)

ortho_df


Unnamed: 0,MappingName,OrthologID,GenomeName,GeneID,Annotation,GenomeID,Genus,Clade
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit,2606217514,Synechococcus,5.1A-III
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit,2681812859,Prochlorococcus,LLI
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit,2716884419,Prochlorococcus,HLII
3,AG-347-J20_2667851457,60000001,AG-347-J20,2667851457,membrane protease FtsH catalytic subunit,2667527340,Prochlorococcus,HLII
4,AG-355-N22_2667798405,60000001,AG-355-N22,2667798405,membrane protease FtsH catalytic subunit,2667527306,Prochlorococcus,HLII
...,...,...,...,...,...,...,...,...
614710,AG-402-P16_2667758863,60040291,AG-402-P16,2667758863,hypothetical protein,2667527281,Prochlorococcus,LLI
614711,MIT0915_2682085462,60040292,MIT0915,2682085462,Tryptophan-rich Synechocystis species C-termin...,2681812901,Prochlorococcus,LLI
614712,MIT0915_2682085486,60040292,MIT0915,2682085486,Tryptophan-rich Synechocystis species C-termin...,2681812901,Prochlorococcus,LLI
614713,MIT0915_2682085472,60040292,MIT0915,2682085472,Tryptophan-rich Synechocystis species C-termin...,2681812901,Prochlorococcus,LLI


In [4]:
# start ko mapping file

ko_map_df = ortho_df.groupby('OrthologID').GeneID.count().reset_index().rename(columns={'GeneID': 'TotalRefs'})
# add in genus and clade counts
for var in ['Genus', 'Clade']:
    ko_map_df = ko_map_df.join(
        pd.DataFrame(ortho_df.groupby('OrthologID')[var].value_counts()).rename(
            columns={var: 'count'}).reset_index().pivot(
            columns=var, index='OrthologID', values='count').fillna(0), 
        on='OrthologID', 
        how='left'
    )
# convert dataframe back to ints
ko_map_df = ko_map_df.astype(int)
# add back CyCOG annotation
ko_map_df['DescriptionCyCOG'] = ko_map_df['OrthologID'].map(
    ortho_df[['OrthologID', 'Annotation']].drop_duplicates().set_index('OrthologID')['Annotation']
)

ko_map_df


Unnamed: 0,OrthologID,TotalRefs,Prochlorococcus,Synechococcus,5.1A-I,5.1A-II,5.1A-III,5.1A-IV,5.1B-VII,CRD2,HLI,HLII,LLI,DescriptionCyCOG
0,60000001,914,807,107,13,17,5,18,37,17,195,440,172,membrane protease FtsH catalytic subunit
1,60000002,946,844,102,12,18,6,16,40,10,196,464,184,ATP-dependent Clp protease ATP-binding subunit...
2,60000003,933,829,104,8,16,6,21,41,12,203,446,180,ATP-dependent Clp protease proteolytic subunit...
3,60000004,1064,899,165,20,23,11,27,62,22,170,477,252,hypothetical protein
4,60000005,642,572,70,9,11,3,14,25,8,126,323,123,chaperonin GroEL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20497,60040266,3,3,0,0,0,0,0,0,0,0,3,0,hypothetical protein
20498,60040287,4,4,0,0,0,0,0,0,0,4,0,0,hypothetical protein
20499,60040289,4,0,4,0,0,0,0,4,0,0,0,0,hypothetical protein
20500,60040291,4,4,0,0,0,0,0,0,0,0,0,4,hypothetical protein


In [5]:
# check that there is a directory for each genome

data_path = Path('../../../data/genomes/jgi/img/proportal')

count = 0
for g_id in ortho_df['GenomeID'].unique():
    if not os.path.isdir(data_path / str(g_id)):
        print(f'{g_id} directory not found: {data_path / str(g_id)}')
        count += 1
        
print(f'{count} total missing directories')


0 total missing directories


In [6]:
# import all kegg annotations as a df

ko_df = pd.DataFrame()
for g_id in ortho_df['GenomeID'].unique():
    df = pd.read_csv(data_path / f'{g_id}/{g_id}.ko.tab.txt', sep='\t')
    if len(ko_df) == 0:
        ko_df = df
    else:
        ko_df = pd.concat([ko_df, df])
        
ko_df = ko_df.reset_index(drop=True)

ko_df


Unnamed: 0,gene_oid,gene_length,percent_identity,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
0,2607658051,465,99.80,1,465,1,465,0.000000e+00,906.4,KO:K02313,chromosomal replication initiator protein,,Yes
1,2607658053,410,100.00,1,410,11,420,0.000000e+00,856.7,KO:K00799,glutathione S-transferase [EC:2.5.1.18],EC:2.5.1.18,Yes
2,2607658055,455,100.00,1,455,1,455,0.000000e+00,895.6,KO:K00383,glutathione reductase (NADPH) [EC:1.8.1.7],EC:1.8.1.7,Yes
3,2607658058,198,100.00,1,198,1,198,0.000000e+00,409.1,KO:K02276,cytochrome c oxidase subunit III [EC:1.9.3.1],EC:1.9.3.1,Yes
4,2607658059,562,100.00,1,562,1,562,0.000000e+00,1173.7,KO:K02274,cytochrome c oxidase subunit I [EC:1.9.3.1],EC:1.9.3.1,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
319268,2717343206,185,94.05,1,185,1,185,1.500000e-122,389.0,KO:K00891,shikimate kinase [EC:2.7.1.71],EC:2.7.1.71,Yes
319269,2717343209,208,94.71,1,208,34,241,0.000000e+00,452.0,KO:K00799,glutathione S-transferase [EC:2.5.1.18],EC:2.5.1.18,Yes
319270,2717343211,433,94.92,1,433,1,433,0.000000e+00,962.0,KO:K00833,adenosylmethionine---8-amino-7-oxononanoate am...,EC:2.6.1.62,Yes
319271,2717343212,222,95.43,1,219,1,219,0.000000e+00,475.0,KO:K01935,dethiobiotin synthetase [EC:6.3.3.3],EC:6.3.3.3,Yes


# Deduplicate gene annotations

**Problem:** Some genes (specific nucleotide sequence from particular reference genome) have more than one KO annotation listed. 
- Based on the counts below, this comes out to 16,400 (15038 + 1352 + 9 + 1) out of 301,499 annotated genes, or ~5.44%. 
- Most of these are annotations that actually have the same KO number, but just a different EC number or something. Only 2,207 genes (0.73%) have multiple distinct KO annotations
- Of these, qualitatively it seems like the annotations are usually quite similar to one another (e.g. same pathway, or different subunits of the same protein complex)

**Proposed solution:** Since the KO annotations are based on an HMM search, first select the annotation with the lowest E-value. Then if the E-values are exactly the same, randomly select one annotaiton or the other


In [7]:
# how many genes have more than one ko number assigned?

print(ko_df['gene_oid'].value_counts().value_counts())


1    285099
2     15038
3      1352
4         9
6         1
Name: gene_oid, dtype: int64


In [8]:
# deeper look: examine genes with more than one annotation

counts = ko_df['gene_oid'].value_counts()
ko_counts = ko_df[ko_df['gene_oid'].isin(counts[counts.gt(1)].index)].groupby('gene_oid')['ko_id'].nunique()
print(ko_counts.value_counts())    # most have the same ko id, just different EC numbers or something

ko_df[ko_df['gene_oid'].isin(ko_counts[ko_counts.gt(1)].index)]


1    14193
2     2203
3        4
Name: ko_id, dtype: int64


Unnamed: 0,gene_oid,gene_length,percent_identity,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
1549,2681971726,240,93.75,1,240,1,240,0.000000e+00,513.0,KO:K06182,23S rRNA pseudouridine2604 synthase [EC:5.4.99...,EC:5.4.99.21,Yes
1550,2681971726,240,93.75,1,240,1,240,0.000000e+00,513.0,KO:K06183,16S rRNA pseudouridine516 synthase [EC:5.4.99.19],EC:5.4.99.19,Yes
1561,2681971798,467,89.08,1,467,1,467,0.000000e+00,960.0,KO:K04094,methylenetetrahydrofolate--tRNA-(uracil-5-)-me...,EC:2.1.1.74,Yes
1562,2681971798,467,90.15,1,467,1,467,0.000000e+00,969.0,KO:K03495,tRNA uridine 5-carboxymethylaminomethyl modifi...,,Yes
1620,2681971886,348,80.77,1,338,1,338,0.000000e+00,680.0,KO:K08919,chlorophyll a/b binding light-harvesting prote...,,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
319209,2717343036,149,41.86,15,140,4,132,6.500000e-31,126.0,KO:K00980,glycerol-3-phosphate cytidylyltransferase [EC:...,EC:2.7.7.39,Yes
319210,2717343036,149,44.78,15,145,4,137,3.000000e-36,141.0,KO:K03272,D-beta-D-heptose 7-phosphate kinase / D-beta-D...,EC:2.7.1.167,Yes
319211,2717343036,149,44.78,15,145,4,137,3.000000e-36,141.0,KO:K03272,D-beta-D-heptose 7-phosphate kinase / D-beta-D...,EC:2.7.7.70,Yes
319253,2717343185,112,100.00,1,112,1,112,1.400000e-73,243.0,KO:K04751,nitrogen regulatory protein P-II 1,,Yes


In [9]:
# deduplicate genes with more than one annotation

# for each gene_oid's set of annotations, select the one with the lowest e-value
# this step will also randomly select one of the annotations to propogate in cases with the same KO 
# but different EC numbers, or different KOs but same e-value

ko_df = ko_df.loc[ko_df.groupby('gene_oid')['evalue'].idxmin(), :].reset_index(drop=True)
ko_df


Unnamed: 0,gene_oid,gene_length,percent_identity,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
0,2507313697,392,100.00,8,392,1,385,0.000000e+00,737.6,KO:K02338,DNA polymerase III subunit beta [EC:2.7.7.7],EC:2.7.7.7,Yes
1,2507313699,768,99.90,1,768,1,768,0.000000e+00,1521.1,KO:K01952,phosphoribosylformylglycinamidine synthase [EC...,EC:6.3.5.3,Yes
2,2507313700,516,100.00,38,516,1,479,0.000000e+00,957.6,KO:K00764,amidophosphoribosyltransferase [EC:2.4.2.14],EC:2.4.2.14,Yes
3,2507313701,822,100.00,1,822,1,822,0.000000e+00,1604.3,KO:K02469,DNA gyrase subunit A [EC:5.99.1.3],EC:5.99.1.3,Yes
4,2507313703,321,100.00,1,321,1,321,0.000000e+00,689.5,KO:K18979,epoxyqueuosine reductase [EC:1.17.99.6],EC:1.17.99.6,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
301494,2717761256,433,98.85,1,433,1,433,0.000000e+00,980.0,KO:K01845,"glutamate-1-semialdehyde 2,1-aminomutase [EC:5...",EC:5.4.3.8,Yes
301495,2717761263,390,98.21,1,390,1,390,0.000000e+00,875.0,KO:K04487,cysteine desulfurase [EC:2.8.1.7],EC:2.8.1.7,Yes
301496,2717761264,251,98.41,1,251,36,286,0.000000e+00,576.0,KO:K01778,diaminopimelate epimerase [EC:5.1.1.7],EC:5.1.1.7,Yes
301497,2717761266,124,95.97,1,124,1,124,1.000000e-88,287.0,KO:K02639,ferredoxin,,Yes


# Deduplicate CyCOG annotations

**Problem:** Some CyCOG (Clusters of Orthologous Genes) consist of genes with discordant KO annotations
- Out of 20,502 CyCOGs represented by the clades we're considering in this study, 18,132 had no KO annotation, meaning just 2,370 (11.56%) contained at least one gene with a KO annotation.
- In most of the annotated CyCOGs, all gene members had a concordant KO annotation. In 93 (3.94% of annotated CyCOGs), the KO annotations were discordant among gene members.
- Of these, qualitatively it seems like the discordant annotations are usually quite similar to one another (e.g. same pathway, or different subunits of the same protein complex)

**Proposed solution:** For each CyCOG, select the annotation applied to the majority of members as the representatitve CyCOG annotation. 


In [10]:
# join kegg annotations onto reference gene set

annot_df = pd.merge(ortho_df, ko_df, left_on='GeneID', right_on='gene_oid', how='left')

# how many unique ko annotations per ortholog group?
print(annot_df.groupby('OrthologID').ko_id.nunique().value_counts())

# drop the sequences without a ko annotation
annot_df = annot_df[annot_df['ko_id'].notna()]

annot_df


0    18132
1     2277
2       68
3        9
4        9
7        3
6        2
5        1
9        1
Name: ko_id, dtype: int64


Unnamed: 0,MappingName,OrthologID,GenomeName,GeneID,Annotation,GenomeID,Genus,Clade,gene_oid,gene_length,...,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit,2606217514,Synechococcus,5.1A-III,2.607658e+09,637.0,...,1.0,637.0,1.0,637.0,0.000000e+00,1240.7,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit,2681812859,Prochlorococcus,LLI,2.681971e+09,640.0,...,1.0,640.0,1.0,640.0,0.000000e+00,1370.0,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit,2716884419,Prochlorococcus,HLII,2.717339e+09,584.0,...,1.0,584.0,1.0,584.0,0.000000e+00,1260.0,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
3,AG-347-J20_2667851457,60000001,AG-347-J20,2667851457,membrane protease FtsH catalytic subunit,2667527340,Prochlorococcus,HLII,2.667851e+09,584.0,...,1.0,584.0,1.0,584.0,0.000000e+00,1220.0,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
4,AG-355-N22_2667798405,60000001,AG-355-N22,2667798405,membrane protease FtsH catalytic subunit,2667527306,Prochlorococcus,HLII,2.667798e+09,617.0,...,1.0,617.0,1.0,617.0,0.000000e+00,1360.0,KO:K03798,cell division protease FtsH [EC:3.4.24.-],EC:3.4.24.-,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614458,BL107_2624164316,60039843,BL107,2624164316,"imidazole glycerol phosphate synthase, glutami...",2623620351,Synechococcus,5.1A-IV,2.624164e+09,208.0,...,5.0,208.0,3.0,203.0,5.200000e-42,172.9,KO:K02501,glutamine amidotransferase [EC:2.4.2.-],EC:2.4.2.-,Yes
614508,AG-686-B21_2717567036,60039986,AG-686-B21,2717567036,CRISPR associated protein,2716884631,Synechococcus,5.1A-IV,2.717567e+09,202.0,...,1.0,197.0,1.0,204.0,8.600000e-32,132.0,KO:K19126,CRISPR system Cascade subunit CasE,,Yes
614549,AG-679-C18_2717598776,60040047,AG-679-C18,2717598776,Formyl transferase,2716884671,Synechococcus,5.1A-I,2.717599e+09,293.0,...,6.0,244.0,5.0,245.0,1.300000e-48,187.0,KO:K00604,methionyl-tRNA formyltransferase [EC:2.1.2.9],EC:2.1.2.9,Yes
614562,KORDI-52_2507314833,60040062,KORDI-52,2507314833,Glutathione S-transferase,2507262012,Synechococcus,5.1A-II,2.507315e+09,94.0,...,1.0,94.0,1.0,94.0,0.000000e+00,189.9,KO:K00799,glutathione S-transferase [EC:2.5.1.18],EC:2.5.1.18,Yes


In [11]:
# look at non-unique ko mappings

counts = annot_df.groupby('OrthologID').ko_id.nunique()
collisions_df = annot_df[annot_df['OrthologID'].isin(counts[counts.gt(1)].index)].groupby('OrthologID')[
    ['ko_id', 'ko_name']].value_counts().reset_index()

pd.set_option('max_colwidth', None)
collisions_df


Unnamed: 0,OrthologID,ko_id,ko_name,0
0,60000002,KO:K03696,ATP-dependent Clp protease ATP-binding subunit ClpC,295
1,60000002,KO:K03695,ATP-dependent Clp protease ATP-binding subunit ClpB,276
2,60000006,KO:K11329,"two-component system, OmpR family, response regulator RpaB",300
3,60000006,KO:K07659,"two-component system, OmpR family, phosphate regulon response regulator OmpR",30
4,60000006,KO:K07657,"two-component system, OmpR family, phosphate regulon response regulator PhoB",24
...,...,...,...,...
233,60007640,KO:K08678,UDP-glucuronate decarboxylase [EC:4.1.1.35],1
234,60008448,KO:K05921,"5-oxopent-3-ene-1,2,5-tricarboxylate decarboxylase / 2-hydroxyhepta-2,4-diene-1,7-dioate isomerase [EC:4.1.1.68 5.3.3.-]",1
235,60008448,KO:K01557,acylpyruvate hydrolase [EC:3.7.1.5],1
236,60010388,KO:K03790,ribosomal-protein-alanine N-acetyltransferase [EC:2.3.1.128],1


In [12]:
# deduplicate the discordant ortholog-ko mapping by simple majority vote

ko_count_df = pd.DataFrame(annot_df.groupby('OrthologID').ko_id.value_counts()).rename(
    columns={'ko_id': 'count'}).reset_index()
ko_count_df = ko_count_df.loc[ko_count_df.groupby('OrthologID')['count'].idxmax(), :]

# add in KO annotation
name_map = annot_df[['ko_id', 'ko_name']].drop_duplicates()
name_map['DescriptionKO'] = name_map['ko_name'].str.split(' \[EC:').str[0]
name_map = name_map.loc[name_map['ko_id'].drop_duplicates().index]    # pick one description from duplicates
ko_count_df = pd.merge(left=ko_count_df, right=name_map[['ko_id', 'DescriptionKO']], on='ko_id', how='left')

# remove prefix from KOID
ko_count_df['KOID'] = ko_count_df['ko_id'].str[3:]
# rename count column and drop old ko_id column
ko_count_df = ko_count_df[['OrthologID', 'KOID', 'DescriptionKO', 'count']].rename(
    columns={'count': 'NRefsKO'}).set_index('OrthologID')
# calculate refs with KO mapping other than the one chosen
ko_count_df['NRefsOtherKO'] = annot_df.groupby('OrthologID').GeneID.count() - ko_count_df['NRefsKO']

# add ko_count_df into ko_map_df
ko_map_df = pd.merge(ko_map_df, ko_count_df, on='OrthologID', how='left')

pd.reset_option('max_colwidth')
ko_map_df


Unnamed: 0,OrthologID,TotalRefs,Prochlorococcus,Synechococcus,5.1A-I,5.1A-II,5.1A-III,5.1A-IV,5.1B-VII,CRD2,HLI,HLII,LLI,DescriptionCyCOG,KOID,DescriptionKO,NRefsKO,NRefsOtherKO
0,60000001,914,807,107,13,17,5,18,37,17,195,440,172,membrane protease FtsH catalytic subunit,K03798,cell division protease FtsH,859.0,0.0
1,60000002,946,844,102,12,18,6,16,40,10,196,464,184,ATP-dependent Clp protease ATP-binding subunit...,K03696,ATP-dependent Clp protease ATP-binding subunit...,295.0,276.0
2,60000003,933,829,104,8,16,6,21,41,12,203,446,180,ATP-dependent Clp protease proteolytic subunit...,K01358,"ATP-dependent Clp protease, protease subunit",912.0,0.0
3,60000004,1064,899,165,20,23,11,27,62,22,170,477,252,hypothetical protein,,,,
4,60000005,642,572,70,9,11,3,14,25,8,126,323,123,chaperonin GroEL,K04077,chaperonin GroEL,606.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20497,60040266,3,3,0,0,0,0,0,0,0,0,3,0,hypothetical protein,,,,
20498,60040287,4,4,0,0,0,0,0,0,0,4,0,0,hypothetical protein,,,,
20499,60040289,4,0,4,0,0,0,0,4,0,0,0,0,hypothetical protein,,,,
20500,60040291,4,4,0,0,0,0,0,0,0,0,0,4,hypothetical protein,,,,


In [13]:
# there are many different CyCOG orthologies annotated with the same KO group -- look at some of these

ko_map_counts = ko_map_df['KOID'].value_counts()
print('There are {} of {} KOs with a non-unique CyCOGID-KOID mapping:'.format(
    len(ko_map_counts[ko_map_counts.gt(1)]), ko_map_df['KOID'].nunique()))
print(ko_map_counts[ko_map_counts.gt(1)].head(20))

ko_map_df[ko_map_df['KOID'] == 'K06147'].head(10)


There are 405 of 1491 KOs with a non-unique CyCOGID-KOID mapping:
K06147    38
K01784    26
K01953    24
K01154    20
K00058    14
K00615    13
K01652    13
K00604    13
K02500    11
K07257    10
K02501     9
K00721     8
K00100     8
K01406     8
K03427     8
K00558     8
K01735     8
K01711     8
K00540     8
K01153     7
Name: KOID, dtype: int64


Unnamed: 0,OrthologID,TotalRefs,Prochlorococcus,Synechococcus,5.1A-I,5.1A-II,5.1A-III,5.1A-IV,5.1B-VII,CRD2,HLI,HLII,LLI,DescriptionCyCOG,KOID,DescriptionKO,NRefsKO,NRefsOtherKO
577,60000578,322,284,38,5,6,2,4,13,8,68,155,61,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",306.0,0.0
732,60000733,306,277,29,4,4,1,6,12,2,72,150,55,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",290.0,0.0
768,60000769,299,266,33,4,5,1,6,12,5,71,144,51,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",278.0,0.0
1174,60001175,275,240,35,5,7,2,4,13,4,60,132,48,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",267.0,0.0
1222,60001223,298,253,45,6,4,2,8,20,5,57,131,65,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",230.0,6.0
1489,60001490,222,127,95,16,11,5,11,39,13,12,53,62,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",194.0,0.0
3272,60003323,6,2,4,1,2,0,1,0,0,0,0,2,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",5.0,0.0
4454,60004866,6,0,6,0,0,1,1,0,4,0,0,0,"ATP-binding cassette, subfamily B",K06147,"ATP-binding cassette, subfamily B, bacterial",6.0,0.0
5764,60006619,6,6,0,0,0,0,0,0,0,4,2,0,"ABC-type multidrug transport system, ATPase an...",K06147,"ATP-binding cassette, subfamily B, bacterial",2.0,0.0
6391,60007548,1,0,1,1,0,0,0,0,0,0,0,0,"ATP-binding cassette, subfamily B/ATP-binding ...",K06147,"ATP-binding cassette, subfamily B, bacterial",1.0,0.0


# Get KEGG pathways

In [14]:
# get full reference brite hierarchy of kegg orthologies 

url = f'http://rest.kegg.jp/get/br:ko00001/json'
file = requests.get(url, allow_redirects=True)

# url = f'https://rest.kegg.jp/list/ko/'
# file = requests.get(url, allow_redirects=True)

file


<Response [200]>

In [15]:
# parse pathways into dictionary

pathway_dict = {}
pathway_names = {}
ko_dict = {}
ko_names = {}

# iterate through json
for supersystem in json.loads(file.content)['children']:
    print(f'{supersystem["name"]}')
    for system in supersystem['children']:
        print(f'\t{system["name"]}')
        for pathway in system['children']:
            path_id = f'ko{pathway["name"].split(" ")[0]}'
            path_members = []
            if 'children' in pathway.keys():
                # parse ko numbers of each pathway
                for ko in pathway['children']:
                    ko_id = ko['name'].split(' ')[0]
                    ko_names[ko_id] = ' '.join(ko['name'].split(' ')[1:]).strip(' ')
                    path_members.append(ko_id)
                    # add pathway to ko_dict
                    if ko_id in ko_dict.keys():
                        ko_dict[ko_id].append(path_id)
                    else:
                        ko_dict[ko_id] = [path_id]
                pathway_dict[path_id] = path_members
                pathway_names[path_id] = pathway["name"][6:]
            print(f'\t\t{path_id}: {pathway["name"]} ({len(path_members)})')
      

09100 Metabolism
	09101 Carbohydrate metabolism
		ko00010: 00010 Glycolysis / Gluconeogenesis [PATH:ko00010] (107)
		ko00020: 00020 Citrate cycle (TCA cycle) [PATH:ko00020] (68)
		ko00030: 00030 Pentose phosphate pathway [PATH:ko00030] (88)
		ko00040: 00040 Pentose and glucuronate interconversions [PATH:ko00040] (89)
		ko00051: 00051 Fructose and mannose metabolism [PATH:ko00051] (112)
		ko00052: 00052 Galactose metabolism [PATH:ko00052] (78)
		ko00053: 00053 Ascorbate and aldarate metabolism [PATH:ko00053] (62)
		ko00500: 00500 Starch and sucrose metabolism [PATH:ko00500] (106)
		ko00520: 00520 Amino sugar and nucleotide sugar metabolism [PATH:ko00520] (156)
		ko00620: 00620 Pyruvate metabolism [PATH:ko00620] (134)
		ko00630: 00630 Glyoxylate and dicarboxylate metabolism [PATH:ko00630] (104)
		ko00640: 00640 Propanoate metabolism [PATH:ko00640] (97)
		ko00650: 00650 Butanoate metabolism [PATH:ko00650] (114)
		ko00660: 00660 C5-Branched dibasic acid metabolism [PATH:ko00660] (29)
		ko0

In [16]:
# There are some KO descriptions from the most recent KEGG download that do not match the descriptions 
# of the old annotation. A rough screen shows 180/1492. I want to look at these in more detail

name_df = ko_map_df[['KOID', 'DescriptionKO']].drop_duplicates().reset_index(drop=True)

names = []
substrings = []
for i, koid in enumerate(name_df['KOID']):
    # append KEGG names
    if koid in ko_names.keys():
        name = ko_names[koid]
    else:
        name = np.nan
    names.append(name)
    # check if previous KO Description is a substring of current KEGG names
    substrings.append(str(name_df.loc[i, 'DescriptionKO']) in str(name))
# add to dataframe  
name_df['KEGGNames'] = names
name_df['SubstringMatch'] = substrings

# # fix display options
# pd.set_option('display.max_rows', None)
# pd.set_option('max_colwidth', None)

# # scren all those that don't have a match
# name_df[~name_df['SubstringMatch']]

# manual curation of concerning KOs: 17 / 1492 (1.14%)
problem_kos = ['K02500', 'K02428', 'K02259', 'K01594', 'K00870', 'K02501', 'K05808', 'K03606', 'K01144', 
               'K03186', 'K01234', 'K03651', 'K07011', 'K03082', 'K01003', 'K05663', 'K03152']
name_df[name_df['KOID'].isin(problem_kos)]


Unnamed: 0,KOID,DescriptionKO,KEGGNames,SubstringMatch
11,K02500,cyclase,hisF; imidazole glycerol-phosphate synthase su...,False
16,K02428,XTP/dITP diphosphohydrolase,,False
162,K02259,cytochrome c oxidase assembly protein subunit 15,"COX15, ctaA; heme a synthase [EC:1.17.99.9]",False
347,K01594,sulfinoalanine decarboxylase,,False
355,K00870,protein kinase,,False
623,K02501,glutamine amidotransferase,hisH; imidazole glycerol-phosphate synthase su...,False
645,K05808,putative sigma-54 modulation protein,hpf; ribosome hibernation promoting factor,False
832,K03606,putative colanic acid biosysnthesis UDP-glucos...,wcaJ; undecaprenyl-phosphate glucose phosphotr...,False
967,K01144,exodeoxyribonuclease V,,False
976,K03186,4-hydroxy-3-polyprenylbenzoate decarboxylase,"ubiX, bsdB, PAD1; flavin prenyltransferase [EC...",False


In [17]:
# match up pathway information to pangenome ko list

pathway_df = ko_map_df[ko_map_df['KOID'].notna()].reset_index(drop=True)

data_dict = {}
for koid in pathway_df['KOID'].unique():
    if koid in ko_dict.keys():
        pathways = ko_dict[koid]
    else:
        continue
    for pathway in pathways:
        if pathway not in data_dict.keys():
            data_dict[pathway] = pathway_df['KOID'].eq(koid).astype(int)
        else:
            data_dict[pathway] = data_dict[pathway] + pathway_df['KOID'].eq(koid).astype(int)

pathway_df = pd.merge(pathway_df, pd.DataFrame(data_dict), left_index=True, right_index=True)
pathway_df


Unnamed: 0,OrthologID,TotalRefs,Prochlorococcus,Synechococcus,5.1A-I,5.1A-II,5.1A-III,5.1A-IV,5.1B-VII,CRD2,...,ko01054,ko02030,ko03450,ko00542,ko00572,ko03040,ko03041,ko00907,ko99988,ko00062
0,60000001,914,807,107,13,17,5,18,37,17,...,0,0,0,0,0,0,0,0,0,0
1,60000002,946,844,102,12,18,6,16,40,10,...,0,0,0,0,0,0,0,0,0,0
2,60000003,933,829,104,8,16,6,21,41,12,...,0,0,0,0,0,0,0,0,0,0
3,60000005,642,572,70,9,11,3,14,25,8,...,0,0,0,0,0,0,0,0,0,0
4,60000006,760,560,200,29,28,8,30,81,24,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2365,60039820,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2366,60039843,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2367,60039986,2,0,2,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2368,60040047,2,0,2,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# look at pathway statistics and compile dictionary of CyCOGs associated with each pathway

cycog_pathway_dict = {}

for path, cycog_count in pd.DataFrame(data_dict).sum().sort_values(ascending=False).items():
    ko_count = pathway_df.loc[pathway_df[path].gt(0), 'KOID'].nunique()
    total_kos = len(pathway_dict[path])
    print(f'{path}: {pathway_names[path]}\n\t{cycog_count} CyCOGs representing {ko_count}/{total_kos} unique KOs')
    # append list of cycogs associated with each pathway to dictionary
    cycog_pathway_dict[path] = pathway_df.loc[pathway_df[path].gt(0), 'OrthologID'].to_list()
    

ko02000: Transporters [BR:ko02000]
	267 CyCOGs representing 135/1984 unique KOs
ko00520: Amino sugar and nucleotide sugar metabolism [PATH:ko00520]
	118 CyCOGs representing 49/156 unique KOs
ko00541: O-Antigen nucleotide sugar biosynthesis [PATH:ko00541]
	105 CyCOGs representing 27/99 unique KOs
ko03400: DNA repair and recombination proteins [BR:ko03400]
	100 CyCOGs representing 69/483 unique KOs
ko01002: Peptidases and inhibitors [BR:ko01002]
	98 CyCOGs representing 47/1012 unique KOs
ko99980: Enzymes with EC numbers
	96 CyCOGs representing 66/1467 unique KOs
ko00194: Photosynthesis proteins [BR:ko00194]
	91 CyCOGs representing 73/138 unique KOs
ko03016: Transfer RNA biogenesis [BR:ko03016]
	85 CyCOGs representing 68/269 unique KOs
ko02048: Prokaryotic defense system [BR:ko02048]
	84 CyCOGs representing 33/183 unique KOs
ko02010: ABC transporters [PATH:ko02010]
	76 CyCOGs representing 48/515 unique KOs
ko99997: Function unknown
	74 CyCOGs representing 56/355 unique KOs
ko04147: Exosom

# Calculate Enrichment

In [19]:
# get gene lists

data_dir = Path('/Users/blasks/writing/barnacle-manuscript/development/batch-correction/')

pro_ds = xr.open_dataset(data_dir / 'models/pro/bootstrap0/dataset_bootstrap_0.nc')
syn_ds = xr.open_dataset(data_dir / 'models/syn/bootstrap0/dataset_bootstrap_0.nc')

pro_ds.ortholog.to_numpy()


array([60000001, 60000002, 60000003, ..., 60040262, 60040287, 60040292])

In [20]:
# assemble pro cluster weights & bootstraps

pro_weights_df = pd.DataFrame(index=pro_ds.ortholog.to_numpy())
pro_boots_df = pd.DataFrame(index=pro_ds.ortholog.to_numpy())

pro_clusters = [2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 17]
for cluster in pro_clusters:
    df = pd.read_csv(data_dir / f'figures/v2-batch-t=0.01/pro/cluster{cluster}.csv')
    pro_weights_df = pd.merge(pro_weights_df, df.set_index('cycog')['weight'], left_index=True, right_index=True, 
                              how='left').rename(columns={'weight': f'cluster{cluster}'}).fillna(0)
    pro_boots_df = pd.merge(pro_boots_df, df.set_index('cycog')['% bootstraps'], left_index=True, right_index=True, 
                            how='left').rename(columns={'% bootstraps': f'cluster{cluster}'}).fillna(0)

pro_weights_df
    

Unnamed: 0,cluster2,cluster3,cluster4,cluster5,cluster6,cluster8,cluster9,cluster10,cluster11,cluster12,cluster13,cluster14,cluster15,cluster17
60000001,0.0,0.024343,0.000000,0.000152,0.000000,0.0,0.000367,0.000170,0.000311,0.000012,0.000000,0.000998,0.000000,0.166265
60000002,0.0,0.000000,0.000000,0.011042,0.000000,0.0,0.000000,0.000034,0.001395,0.000230,0.000043,0.000054,0.000208,0.036400
60000003,0.0,0.000000,0.000000,0.000312,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60000004,0.0,0.000000,0.009649,0.000000,0.000650,0.0,0.000000,0.000202,0.123848,0.000000,0.000000,0.002409,0.008743,0.186145
60000005,0.0,0.000785,0.000000,0.008032,0.000335,0.0,0.000666,0.000000,0.005554,0.006110,0.000000,0.009096,0.006783,0.035538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60040211,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60040246,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60040262,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60040287,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [21]:
# assemble syn cluster weights & bootstraps

syn_weights_df = pd.DataFrame(index=syn_ds.ortholog.to_numpy())
syn_boots_df = pd.DataFrame(index=syn_ds.ortholog.to_numpy())

syn_clusters = [3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15]
for cluster in syn_clusters:
    df = pd.read_csv(data_dir / f'figures/v2-batch-t=0.01/syn/cluster{cluster}.csv')
    syn_weights_df = pd.merge(syn_weights_df, df.set_index('cycog')['weight'], left_index=True, right_index=True, 
                              how='left').rename(columns={'weight': f'cluster{cluster}'}).fillna(0)
    syn_boots_df = pd.merge(syn_boots_df, df.set_index('cycog')['% bootstraps'], left_index=True, right_index=True, 
                            how='left').rename(columns={'% bootstraps': f'cluster{cluster}'}).fillna(0)

syn_weights_df


Unnamed: 0,cluster3,cluster4,cluster5,cluster6,cluster7,cluster8,cluster9,cluster10,cluster12,cluster14,cluster15
60000001,0.000000,0.000000,0.000000,0.000000,0.042467,0.000083,0.000243,0.001954,0.000014,0.000000,0.000005
60000002,0.000000,0.000053,0.000000,0.000000,0.000198,0.000196,0.000000,0.000000,0.000175,0.000000,0.000738
60000003,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60000004,0.005126,0.000000,0.000134,0.061309,0.001946,0.001781,0.000000,0.037152,0.000033,0.001047,0.000278
60000005,0.004883,0.000000,0.000000,0.000000,0.000537,0.004337,0.073075,0.002559,0.000000,0.000994,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
60040186,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60040187,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60040195,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60040219,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [22]:
# set up for enrichment tests

# significance level
alpha = 0.01

# helper function for fdrcorrection
def adjust_pvals(pvals):
    return multitest.fdrcorrection(pvals, alpha=alpha)[1]


In [193]:
# run Mann-Whitney U test for every pathway against every pro cluster

input_df = pro_weights_df * pro_boots_df    # hadamard product
# only consider weights from genes with at least 50% bootstrap value
input_df[pro_boots_df.lt(0.1)] = 0.0

p_vals = []
for pathway in cycog_pathway_dict.keys():
    x = input_df.loc[input_df.index.isin(cycog_pathway_dict[pathway]), :]
    y = input_df.loc[~input_df.index.isin(cycog_pathway_dict[pathway]), :]
    result = mannwhitneyu(x, y, alternative='greater', axis=0, method='asymptotic')
    p_vals.append(result.pvalue)
pro_enrich_df = pd.DataFrame(p_vals, index=cycog_pathway_dict.keys(), columns=input_df.columns)
# drop the pathways that are all NAs
pro_enrich_df = pro_enrich_df[~pro_enrich_df.isna().all(axis=1)]
# adjust p-values
pro_enrich_df = pro_enrich_df.apply(adjust_pvals, raw=True)

pro_enrich_df


Unnamed: 0,cluster2,cluster3,cluster4,cluster5,cluster6,cluster8,cluster9,cluster10,cluster11,cluster12,cluster13,cluster14,cluster15,cluster17
ko02000,0.835313,0.760732,7.291086e-01,8.563139e-01,0.740575,0.644380,0.270963,4.227831e-01,0.080004,0.773744,6.768287e-01,8.201515e-18,0.269734,0.798598
ko00520,0.000144,0.813058,7.864884e-01,7.727158e-01,0.673747,0.650348,0.722735,1.153815e-19,0.000686,0.830915,6.854261e-01,6.528301e-01,0.736498,0.798598
ko00541,0.000144,0.790040,7.644252e-01,7.655881e-01,0.671283,0.650348,0.746354,3.345370e-19,0.001792,0.816303,6.807174e-01,6.528301e-01,0.736498,0.854237
ko03400,0.796842,0.022383,2.896801e-01,7.289165e-01,0.671283,0.666589,0.798349,5.916675e-01,0.807615,0.025857,7.060478e-01,6.665890e-01,0.034135,0.269435
ko01002,0.050629,0.516032,6.225365e-07,9.811203e-02,0.673747,0.650348,0.765306,2.202279e-01,0.000354,0.445112,1.428679e-08,6.528301e-01,0.736498,0.798598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ko04016,0.835313,0.760044,7.291086e-01,7.289165e-01,0.671283,0.644380,0.722735,7.360047e-01,0.807615,0.751448,6.768287e-01,6.513657e-01,0.736498,0.798598
ko05215,0.835313,0.760044,7.291086e-01,2.809052e-24,0.671283,0.644380,0.722735,7.360047e-01,0.807615,0.751448,6.768287e-01,6.513657e-01,0.736498,0.798598
ko00908,0.000428,0.760044,7.291086e-01,7.289165e-01,0.671283,0.644380,0.722735,7.360047e-01,0.807615,0.751448,6.768287e-01,6.513657e-01,0.736498,0.798598
ko99981,0.835313,0.760044,7.291086e-01,7.289165e-01,0.671283,0.644380,0.722735,7.360047e-01,0.807615,0.751448,6.768287e-01,6.513657e-01,0.736498,0.798598


In [194]:
# list the most enriched pathways for each cluster

for cluster in pro_enrich_df.columns:
    pathways = pro_enrich_df[pro_enrich_df[cluster].lt(alpha)][cluster].sort_values(ascending=True)
    print(f'\n{cluster} ({len(pathways)} enriched pathways)')
    for path, pval in pathways.items():
        print(f'\t{path} (p={pval:.2e}): {pathway_names[path]}')



cluster2 (13 enriched pathways)
	ko00520 (p=1.44e-04): Amino sugar and nucleotide sugar metabolism [PATH:ko00520]
	ko00541 (p=1.44e-04): O-Antigen nucleotide sugar biosynthesis [PATH:ko00541]
	ko01503 (p=1.44e-04): Cationic antimicrobial peptide (CAMP) resistance [PATH:ko01503]
	ko04113 (p=1.44e-04): Meiosis - yeast [PATH:ko04113]
	ko00514 (p=1.44e-04): Other types of O-glycan biosynthesis [PATH:ko00514]
	ko04213 (p=1.44e-04): Longevity regulating pathway - multiple species [PATH:ko04213]
	ko03083 (p=1.44e-04): Polycomb repressive complex [PATH:ko03083]
	ko04964 (p=1.44e-04): Proximal tubule bicarbonate reclamation [PATH:ko04964]
	ko00908 (p=4.28e-04): Zeatin biosynthesis [PATH:ko00908]
	ko02048 (p=9.70e-04): Prokaryotic defense system [BR:ko02048]
	ko00051 (p=1.93e-03): Fructose and mannose metabolism [PATH:ko00051]
	ko05206 (p=3.24e-03): MicroRNAs in cancer [PATH:ko05206]
	ko00053 (p=9.03e-03): Ascorbate and aldarate metabolism [PATH:ko00053]

cluster3 (19 enriched pathways)
	ko0301

In [69]:
# run Mann-Whitney U test for every pathway against every syn cluster

input_df = syn_weights_df * syn_boots_df    # hadamard product
# only consider weights from genes with at least 50% bootstrap value
input_df[syn_boots_df.lt(0.5)] = 0.0

p_vals = []
for pathway in cycog_pathway_dict.keys():
    x = input_df.loc[input_df.index.isin(cycog_pathway_dict[pathway]), :]
    y = input_df.loc[~input_df.index.isin(cycog_pathway_dict[pathway]), :]
    result = mannwhitneyu(x, y, alternative='greater', axis=0, method='asymptotic')
    p_vals.append(result.pvalue)
syn_enrich_df = pd.DataFrame(p_vals, index=cycog_pathway_dict.keys(), columns=input_df.columns)
# drop the pathways that are all NAs
syn_enrich_df = syn_enrich_df[~syn_enrich_df.isna().all(axis=1)]
# adjust p-values
syn_enrich_df = syn_enrich_df.apply(adjust_pvals, raw=True)

syn_enrich_df


Unnamed: 0,cluster3,cluster4,cluster5,cluster6,cluster7,cluster8,cluster9,cluster10,cluster12,cluster14,cluster15
ko02000,0.892682,0.842924,0.632922,0.780057,0.718707,5.231963e-01,0.697858,0.742481,2.159775e-12,0.714547,0.790406
ko00520,0.758675,0.691878,0.044665,0.740036,0.740607,2.458260e-05,0.747963,0.645614,6.197167e-01,0.714547,0.297340
ko00541,0.727504,0.691878,0.632922,0.746098,0.718707,2.254554e-08,0.709078,0.642653,6.197167e-01,0.714547,0.665677
ko03400,0.721854,0.691878,0.648892,0.740036,0.757945,6.752294e-01,0.762834,0.648892,6.205195e-01,0.752893,0.684415
ko01002,0.763655,0.717944,0.646998,0.789479,0.130150,6.724668e-01,0.758503,0.646998,6.197167e-01,0.748703,0.681556
...,...,...,...,...,...,...,...,...,...,...,...
ko05215,0.721854,0.691878,0.632922,0.740036,0.718707,6.498771e-01,0.697858,0.642653,6.197167e-01,0.714547,0.665677
ko00908,0.721854,0.691878,0.632922,0.740036,0.718707,6.498771e-01,0.697858,0.642653,6.197167e-01,0.714547,0.665677
ko05020,0.721854,0.691878,0.632922,0.740036,0.718707,6.498771e-01,0.697858,0.642653,6.197167e-01,0.714547,0.665677
ko99981,0.721854,0.691878,0.632922,0.740036,0.718707,6.498771e-01,0.697858,0.642653,6.197167e-01,0.714547,0.665677


In [70]:
# list the most enriched pathways for each cluster

for cluster in syn_enrich_df.columns:
    pathways = syn_enrich_df[syn_enrich_df[cluster].lt(alpha)][cluster].sort_values(ascending=True)
    print(f'\n{cluster} ({len(pathways)} enriched pathways)')
    for path, pval in pathways.items():
        print(f'\t{path} (p={pval:.2E}): {pathway_names[path]}')



cluster3 (28 enriched pathways)
	ko00860 (p=9.53E-17): Porphyrin metabolism [PATH:ko00860]
	ko00030 (p=9.53E-17): Pentose phosphate pathway [PATH:ko00030]
	ko05415 (p=8.39E-16): Diabetic cardiomyopathy [PATH:ko05415]
	ko99995 (p=1.14E-15): Signaling proteins
	ko00190 (p=1.53E-15): Oxidative phosphorylation [PATH:ko00190]
	ko04714 (p=1.46E-14): Thermogenesis [PATH:ko04714]
	ko05010 (p=1.46E-14): Alzheimer disease [PATH:ko05010]
	ko05130 (p=1.46E-14): Pathogenic Escherichia coli infection [PATH:ko05130]
	ko04940 (p=2.87E-14): Type I diabetes mellitus [PATH:ko04940]
	ko04931 (p=9.83E-10): Insulin resistance [PATH:ko04931]
	ko00270 (p=7.91E-08): Cysteine and methionine metabolism [PATH:ko00270]
	ko00480 (p=2.04E-07): Glutathione metabolism [PATH:ko00480]
	ko04910 (p=2.26E-07): Insulin signaling pathway [PATH:ko04910]
	ko04727 (p=2.88E-07): GABAergic synapse [PATH:ko04727]
	ko00430 (p=2.88E-07): Taurine and hypotaurine metabolism [PATH:ko00430]
	ko04217 (p=5.40E-06): Necroptosis [PATH:ko04

In [80]:
ortho_df

Unnamed: 0,MappingName,OrthologID,GenomeName,GeneID,Annotation,GenomeID,Genus,Clade
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit,2606217514,Synechococcus,5.1A-III
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit,2681812859,Prochlorococcus,LLI
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit,2716884419,Prochlorococcus,HLII
3,AG-347-J20_2667851457,60000001,AG-347-J20,2667851457,membrane protease FtsH catalytic subunit,2667527340,Prochlorococcus,HLII
4,AG-355-N22_2667798405,60000001,AG-355-N22,2667798405,membrane protease FtsH catalytic subunit,2667527306,Prochlorococcus,HLII
...,...,...,...,...,...,...,...,...
614710,AG-402-P16_2667758863,60040291,AG-402-P16,2667758863,hypothetical protein,2667527281,Prochlorococcus,LLI
614711,MIT0915_2682085462,60040292,MIT0915,2682085462,Tryptophan-rich Synechocystis species C-termin...,2681812901,Prochlorococcus,LLI
614712,MIT0915_2682085486,60040292,MIT0915,2682085486,Tryptophan-rich Synechocystis species C-termin...,2681812901,Prochlorococcus,LLI
614713,MIT0915_2682085472,60040292,MIT0915,2682085472,Tryptophan-rich Synechocystis species C-termin...,2681812901,Prochlorococcus,LLI


In [81]:
ortho_df[ortho_df['OrthologID'] == 60002480]

Unnamed: 0,MappingName,OrthologID,GenomeName,GeneID,Annotation,GenomeID,Genus,Clade
544817,AG-418-C17_2717706677,60002480,AG-418-C17,2717706677,Major capsid protein Gp23,2716884767,Prochlorococcus,HLII
544818,AG-341-K05_2717146761,60002480,AG-341-K05,2717146761,Major capsid protein Gp23,2716884261,Prochlorococcus,LLI


In [109]:
ortho_df[ortho_df['OrthologID'] == 60005548]

Unnamed: 0,MappingName,OrthologID,GenomeName,GeneID,Annotation,GenomeID,Genus,Clade
589177,AG-341-K05_2717147204,60005548,AG-341-K05,2717147204,hypothetical protein,2716884261,Prochlorococcus,LLI
589178,AG-402-C22_2717276749,60005548,AG-402-C22,2717276749,hypothetical protein,2716884378,Prochlorococcus,HLII


# Some ad-hoc cluster specific analysis

In [158]:
# import cycog.tsv file published with Berube et al.

cycog_df = pd.read_csv('../../../data/genomes/berube_et_al_2018/cycogs.tsv', sep='\t')
cycog_df['OrthologID'] = cycog_df['cycog_iid'].str.split('_').str[1].astype(int)
cycog_df


Unnamed: 0,cycog_iid,cycog_num_taxa,cycog_num_genes,cycog_num_duplications,cycog_num_pro,cycog_num_syn,cycog_num_phage,cycog_cns_product,cycog_genes,OrthologID
0,CyCOG_60000001,600,1376,776,1218,158,0,membrane protease FtsH catalytic subunit,"WH8102_2607658325,MIT0917_2681971350,AG-424-P1...",60000001
1,CyCOG_60000002,599,1453,854,1296,157,0,ATP-dependent Clp protease ATP-binding subunit...,"AG-459-D04_2717739350,AG-418-O03_2717327314,ME...",60000002
2,CyCOG_60000003,592,1387,795,1231,156,0,ATP-dependent Clp protease proteolytic subunit...,"scB245a_518A17_2649316516,AG-670-L08_271740078...",60000003
3,CyCOG_60000004,581,1631,1050,1376,255,0,hypothetical protein,"MIT0703_2608216699,WH8102_2607660049,AG-686-P0...",60000004
4,CyCOG_60000005,578,990,412,882,108,0,chaperonin GroEL,"AG-311-D23_2717614699,AG-311-K14_2717569190,AG...",60000005
...,...,...,...,...,...,...,...,...,...,...
40290,CyCOG_60040291,1,4,3,4,0,0,hypothetical protein,"AG-402-P16_2667758850,AG-402-P16_2667758866,AG...",60040291
40291,CyCOG_60040292,1,4,3,4,0,0,Tryptophan-rich Synechocystis species C-termin...,"MIT0915_2682085462,MIT0915_2682085486,MIT0915_...",60040292
40292,CyCOG_60040293,1,5,4,0,5,0,Putative transposase,"SynAce01_2721490323,SynAce01_2721491348,SynAce...",60040293
40293,CyCOG_60040294,1,6,5,0,6,0,sulfate transport system substrate-binding pro...,"GFB01_2638207649,GFB01_2638207687,GFB01_263820...",60040294


In [163]:
# import genome assembly file published with Berube et al 2018

assembly_df = pd.read_csv('../../../data/genomes/berube_et_al_2018/genome_assembly_summary_20180718.tsv', sep='\t')

virocells = assembly_df[assembly_df['usage_notes'].isin(
    ['likely virocell', 'likely virus or virocell'])].sag_id.to_list()

assembly_df[(assembly_df['sag_id'].isin(virocells)) & (assembly_df['phylogeny'] == 'Prochlorococcus')]


Unnamed: 0,sag_id,phylogeny,ecotype,clade,img_genome_id,usage_notes,selection_criteria,completeness_software,completeness_score,checkm_completeness,...,single_cell_lysis_approach,wga_amp_approach,wga_amp_protocol,ncbi_biosample_accession,sequencing_coverage,seq_meth,env_biome,env_feature,env_material,env_package
156,AG-341-K05,Prochlorococcus,Low light adapted (LL),LLI,2716884261,likely virocell,wgs amplification,checkm,low,8.62,...,chemical,mda based,MDA,SAMN08886098,3542x,Illumina NextSeq 500,ocean_biome,ocean,water,water
312,AG-363-L17,Prochlorococcus,High light adapted (HL),HLVI,2667527373,likely virocell,wgs amplification,checkm,med,57.2,...,chemical,mda based,WGA-X,SAMN08886249,1414x,Illumina NextSeq 500,ocean_biome,ocean,water,water
470,AG-418-C09,Prochlorococcus,High light adapted (HL),unclassified,2716884766,likely virocell,ITS phylogeny,checkm,high,91.3,...,chemical,mda based,WGA-X,SAMN08886407,1561x,Illumina NextSeq 500,ocean_biome,ocean,water,water
471,AG-418-C17,Prochlorococcus,High light adapted (HL),HLII,2716884767,likely virocell,ITS phylogeny,checkm,med,52.47,...,chemical,mda based,WGA-X,SAMN08886408,1541x,Illumina NextSeq 500,ocean_biome,ocean,water,water


In [190]:
# pull out cluster information

genus = 'pro'
cluster = 13
boot_thold = 0.0

if genus == 'pro':
    weight_df = pro_weights_df
    boot_df = pro_boots_df
elif genus == 'syn':
    weight_df = syn_weights_df
    boot_df = syn_boots_df
    
membership = boot_df[f'cluster{cluster}'][boot_df[f'cluster{cluster}'].gt(boot_thold)].index
print(f'{genus} cluster {cluster}: {len(membership)} cycogs with at least {boot_thold} bootstrap support')

# join weights
cluster_df = pd.merge(weight_df.loc[membership, f'cluster{cluster}'].rename('weight'), 
                      boot_df.loc[membership, f'cluster{cluster}'].rename('bootstrap_support'), 
                      left_index=True, right_index=True, how='inner').sort_values(
    ['bootstrap_support', 'weight'], ascending=False)

# join compiled annotation information
cluster_df = pd.merge(cluster_df, ko_map_df, left_index=True, right_on='OrthologID', how='left').reset_index(drop=True)

# join further cycog information
cluster_df = pd.merge(cluster_df, cycog_df, on='OrthologID', how='left')

# count up virocells
cluster_df['virocells'] = [ortho_df[ortho_df['OrthologID'] == id].GenomeName.isin(virocells).sum() \
                           for id in cluster_df['OrthologID']]

# clean up to make readable
cluster_df = cluster_df[['OrthologID', 'DescriptionCyCOG', 'weight', 'bootstrap_support', 'KOID', 'DescriptionKO', 
    'NRefsKO', 'NRefsOtherKO', 'TotalRefs', 'Prochlorococcus', 'cycog_num_pro', 
    'Synechococcus', 'cycog_num_syn', 'cycog_num_phage', 'virocells', 
    'HLI', 'HLII', 'LLI', '5.1A-I', '5.1A-II', '5.1A-III', '5.1A-IV', '5.1B-VII', 'CRD2', 
                         'cycog_genes']]

# save it
cluster_df.to_csv(f'../data/{genus}-cluster{cluster}.csv', index=False)

# show it
cluster_df[['OrthologID', 'DescriptionCyCOG', 'weight', 'bootstrap_support', 'KOID', 'DescriptionKO', 
    'NRefsKO', 'NRefsOtherKO', 'TotalRefs', 'Prochlorococcus', 'Synechococcus', 'cycog_num_phage', 'virocells', 
    'HLI', 'HLII', 'LLI', 'cycog_genes']].head(15)


pro cluster 13: 37 cycogs with at least 0.0 bootstrap support


Unnamed: 0,OrthologID,DescriptionCyCOG,weight,bootstrap_support,KOID,DescriptionKO,NRefsKO,NRefsOtherKO,TotalRefs,Prochlorococcus,Synechococcus,cycog_num_phage,virocells,HLI,HLII,LLI,cycog_genes
0,60002480,Major capsid protein Gp23,0.653958,1.0,,,,,2,2,0,57,2,0,1,1,"P-SS1_2606304859,metaG-MbCM1_2596482216,P-RSM1..."
1,60002463,hypothetical protein,0.60209,1.0,,,,,2,2,0,59,2,0,1,1,"ACG-2014b_2708712293,S-IOM18_2588440527,ACG-20..."
2,60002465,Phage tail sheath protein,0.214549,0.986667,,,,,2,2,0,60,2,0,1,1,"KBS-M-1A_2610075525,ACG-2014j_2708729865,S-ShM..."
3,60006295,hypothetical protein,0.12045,0.98,,,,,9,8,1,0,0,2,5,1,"MIT0915_2682085824,AG-347-I21_2667842518,AG-68..."
4,60002400,gp32 DNA binding protein like,0.214704,0.936667,,,,,3,3,0,58,2,0,1,2,"ACG-2014b_2708712179,ACG-2014g_2708729533,Syn3..."
5,60002595,hypothetical protein,0.144394,0.793333,,,,,48,48,0,0,1,15,27,6,"AG-311-D23_2717614784,AG-402-L20_2667739119,AG..."
6,60001830,hypothetical protein,0.061353,0.726667,K01406,serralysin,2.0,1.0,176,85,91,0,0,3,59,23,"AG-432-D09_2717345280,MIT1300_2684594342,MIT93..."
7,60001259,Tetratricopeptide repeat-containing protein,0.050744,0.603333,K09667,protein O-GlcNAc transferase,2.0,2.0,1416,1360,56,0,2,73,228,1059,"MIT0912_2682082367,AG-347-K22_2717149191,AG-31..."
8,60001501,ribosomal large subunit pseudouridine synthase F,0.033814,0.496667,K06182,23S rRNA pseudouridine2604 synthase,136.0,83.0,225,225,0,0,0,55,138,32,"scB245a_521M10_2655073977,AG-402-A08_271726698..."
9,60001258,hypothetical protein,0.038843,0.47,,,,,318,318,0,0,0,60,154,104,"AG-424-E20_2717715008,AG-412-J13_2717697506,SS..."


In [172]:
cluster_df.columns

Index(['OrthologID', 'DescriptionCyCOG', 'weight', 'bootstrap_support', 'KOID',
       'DescriptionKO', 'NRefsKO', 'NRefsOtherKO', 'TotalRefs',
       'Prochlorococcus', 'Synechococcus', 'HLI', 'HLII', 'LLI', '5.1A-I',
       '5.1A-II', '5.1A-III', '5.1A-IV', '5.1B-VII', 'CRD2', 'cycog_iid',
       'cycog_num_taxa', 'cycog_num_genes', 'cycog_num_duplications',
       'cycog_num_pro', 'cycog_num_syn', 'cycog_num_phage',
       'cycog_cns_product', 'cycog_genes', 'virocells'],
      dtype='object')

In [148]:
collisions_df[collisions_df['OrthologID'] == 60001883]

Unnamed: 0,OrthologID,ko_id,ko_name,0
159,60001883,KO:K01406,serralysin [EC:3.4.24.40],14
160,60001883,KO:K09691,lipopolysaccharide transport system ATP-bindin...,8
161,60001883,KO:K03932,polyhydroxybutyrate depolymerase,4
162,60001883,KO:K12544,S-layer protein,2


In [129]:
ko_map_df[ko_map_df['OrthologID'] == 60032022]

Unnamed: 0,OrthologID,TotalRefs,Prochlorococcus,Synechococcus,5.1A-I,5.1A-II,5.1A-III,5.1A-IV,5.1B-VII,CRD2,HLI,HLII,LLI,DescriptionCyCOG,KOID,DescriptionKO,NRefsKO,NRefsOtherKO
17222,60032022,1,0,1,0,0,0,1,0,0,0,0,0,DNA-directed RNA polymerase subunit omega,K03060,DNA-directed RNA polymerase subunit omega,1.0,0.0


In [169]:
ortho_df[ortho_df['OrthologID'] == 60001883].GenomeName.isin(virocells).sum()

2

In [124]:
syn_weights_df.loc[syn_weights_df.lt(0).any(axis=1), :]

Unnamed: 0,cluster3,cluster4,cluster5,cluster6,cluster7,cluster8,cluster9,cluster10,cluster12,cluster14,cluster15
60001112,0.0,0.0,0.627396,0.0,0.0,0.000161,0.008293,-5.7e-05,0.000273,0.0,0.000481
60001290,-0.001439,0.0,0.360796,0.0,8.2e-05,0.003067,0.299047,0.009531,0.0,0.0,0.019836
60032022,0.0,0.285829,2.7e-05,0.0,0.001521,-1.7e-05,0.0,0.0,0.0,0.0,-7.2e-05


Unnamed: 0,cycog_iid,cycog_num_taxa,cycog_num_genes,cycog_num_duplications,cycog_num_pro,cycog_num_syn,cycog_num_phage,cycog_cns_product,cycog_genes,OrthologID
0,CyCOG_60000001,600,1376,776,1218,158,0,membrane protease FtsH catalytic subunit,"WH8102_2607658325,MIT0917_2681971350,AG-424-P1...",60000001
1,CyCOG_60000002,599,1453,854,1296,157,0,ATP-dependent Clp protease ATP-binding subunit...,"AG-459-D04_2717739350,AG-418-O03_2717327314,ME...",60000002
2,CyCOG_60000003,592,1387,795,1231,156,0,ATP-dependent Clp protease proteolytic subunit...,"scB245a_518A17_2649316516,AG-670-L08_271740078...",60000003
3,CyCOG_60000004,581,1631,1050,1376,255,0,hypothetical protein,"MIT0703_2608216699,WH8102_2607660049,AG-686-P0...",60000004
4,CyCOG_60000005,578,990,412,882,108,0,chaperonin GroEL,"AG-311-D23_2717614699,AG-311-K14_2717569190,AG...",60000005
...,...,...,...,...,...,...,...,...,...,...
40290,CyCOG_60040291,1,4,3,4,0,0,hypothetical protein,"AG-402-P16_2667758850,AG-402-P16_2667758866,AG...",60040291
40291,CyCOG_60040292,1,4,3,4,0,0,Tryptophan-rich Synechocystis species C-termin...,"MIT0915_2682085462,MIT0915_2682085486,MIT0915_...",60040292
40292,CyCOG_60040293,1,5,4,0,5,0,Putative transposase,"SynAce01_2721490323,SynAce01_2721491348,SynAce...",60040293
40293,CyCOG_60040294,1,6,5,0,6,0,sulfate transport system substrate-binding pro...,"GFB01_2638207649,GFB01_2638207687,GFB01_263820...",60040294


In [152]:
!ls '../../../data/genomes/berube_et_al_2018/cycogs.tsv'

[34mberube_singlecells_imgpipeline[m[m         cycogsgenomes.tsv
cyanobacteria_phylogeny_rootedtree.nwk genome_assembly_summary_20180718.tsv
cyanobacteria_phylogeny_taxa.tsv       heterotroph_phylogeny_taxa.tsv
cycogs.tsv
