## Preamble

### Project Template

In [None]:
%load_ext autoreload

In [None]:
import os as _os

# Move the kernel's working directory to the location given by the PROJECT_ROOT
# environment variable (a KeyError here means the variable is not set), so that
# relative paths resolve from there.
_os.chdir(_os.environ["PROJECT_ROOT"])

# Show the resolved working directory as the cell output.
_os.path.realpath(_os.curdir)

### Imports

In [None]:
import os
import subprocess
import sys
import time
from datetime import datetime
from glob import glob
from itertools import chain, product
from tempfile import mkstemp

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import sfacts as sf
import statsmodels.api as sm
import statsmodels.formula.api as smf
import xarray as xr
from mpl_toolkits.axes_grid1 import make_axes_locatable
from statsmodels.stats.multitest import fdrcorrection
from tqdm import tqdm

import lib.plot
from lib.dissimilarity import load_dmat_as_pickle
from lib.pandas_util import align_indexes, aligned_index, idxwhere, invert_mapping

In [None]:
import lib.thisproject.data

### Set Style

In [None]:
# Use seaborn's "talk" plotting context and render figures at 50 DPI.
sns.set_context("talk")
plt.rc("figure", dpi=50)

In [None]:
# Colour name (matplotlib "tab:" palette) assigned to each genome type label.
genome_type_palette = dict(
    SPGC="tab:green",
    MAG="tab:orange",
    Isolate="tab:blue",
)

## Data Setup

### Metadata

In [None]:
# Copy/pasted from https://www.genome.jp/kegg/docs/module_statistics.html
_kegg_module_page = """
KEGG modules statistics
As of 2023/10/3

M00001	884	93.1	4558	54.3	563	95.7	141	95.9	155	97.5	25	44.6	4442	55.6	116	28.0	Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate
M00002	913	96.1	7334	87.3	569	96.8	142	96.6	155	97.5	47	83.9	7027	88.0	307	74.0	Glycolysis, core module involving three-carbon compounds
M00003	855	90.0	4916	58.5	556	94.6	138	93.9	145	91.2	16	28.6	4749	59.5	167	40.2	Gluconeogenesis, oxaloacetate => fructose-6P
M00004	842	88.6	4183	49.8	545	92.7	141	95.9	135	84.9	21	37.5	4183	52.4	0	0.0	Pentose phosphate pathway (Pentose phosphate cycle)
M00005	938	98.7	8142	97.0	587	99.8	146	99.3	152	95.6	53	94.6	7736	96.9	406	97.8	PRPP biosynthesis, ribose 5P => PRPP
M00006	908	95.6	4735	56.4	572	97.3	143	97.3	152	95.6	41	73.2	4723	59.2	12	2.9	Pentose phosphate pathway, oxidative phase, glucose 6P => ribulose 5P
M00007	870	91.6	6686	79.6	559	95.1	145	98.6	140	88.1	26	46.4	6642	83.2	44	10.6	Pentose phosphate pathway, non-oxidative phase, fructose 6P => ribose 5P
M00008	0	0.0	2188	26.1	0	0.0	0	0.0	0	0.0	0	0.0	2188	27.4	0	0.0	Entner-Doudoroff pathway, glucose-6P => glyceraldehyde-3P + pyruvate
M00009	674	70.9	5431	64.7	463	78.7	78	53.1	129	81.1	4	7.1	5265	66.0	166	40.0	Citrate cycle (TCA cycle, Krebs cycle)
M00010	880	92.6	6759	80.5	547	93.0	141	95.9	152	95.6	40	71.4	6514	81.6	245	59.0	Citrate cycle, first carbon oxidation, oxaloacetate => 2-oxoglutarate
M00011	709	74.6	5703	67.9	493	83.8	82	55.8	129	81.1	5	8.9	5525	69.2	178	42.9	Citrate cycle, second carbon oxidation, 2-oxoglutarate => oxaloacetate
M00012	292	30.7	3064	36.5	4	0.7	131	89.1	150	94.3	7	12.5	3008	37.7	56	13.5	Glyoxylate cycle
M00013	158	16.6	0	0.0	85	14.5	1	0.7	72	45.3	0	0.0	0	0.0	0	0.0	Malonate semialdehyde pathway, propanoyl-CoA => acetyl-CoA
M00014	498	52.4	0	0.0	498	84.7	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Glucuronate pathway (uronate pathway)
M00015	884	93.1	6293	74.9	573	97.4	144	98.0	147	92.5	20	35.7	6197	77.6	96	23.1	Proline biosynthesis, glutamate => proline
M00016	0	0.0	4129	49.2	0	0.0	0	0.0	0	0.0	0	0.0	4127	51.7	2	0.5	Lysine biosynthesis, succinyl-DAP pathway, aspartate => lysine
M00017	106	11.2	2635	31.4	0	0.0	3	2.0	102	64.2	1	1.8	2635	33.0	0	0.0	Methionine biosynthesis, aspartate => homoserine => methionine
M00018	282	29.7	6370	75.9	0	0.0	140	95.2	138	86.8	4	7.1	6117	76.6	253	61.0	Threonine biosynthesis, aspartate => homoserine => threonine
M00019	287	30.2	6329	75.4	0	0.0	141	95.9	146	91.8	0	0.0	6058	75.9	271	65.3	Valine/isoleucine biosynthesis, pyruvate => valine / 2-oxobutanoate => isoleucine
M00020	804	84.6	4447	53.0	540	91.8	144	98.0	111	69.8	9	16.1	4428	55.5	19	4.6	Serine biosynthesis, glycerate-3P => serine
M00021	163	17.2	5851	69.7	0	0.0	147	100.0	1	0.6	15	26.8	5705	71.5	146	35.2	Cysteine biosynthesis, serine => cysteine
M00022	293	30.8	4106	48.9	0	0.0	136	92.5	147	92.5	10	17.9	4098	51.3	8	1.9	Shikimate pathway, phosphoenolpyruvate + erythrose-4P => chorismate
M00023	273	28.7	5306	63.2	0	0.0	137	93.2	136	85.5	0	0.0	4995	62.6	311	74.9	Tryptophan biosynthesis, chorismate => tryptophan
M00024	100	10.5	1914	22.8	0	0.0	1	0.7	99	62.3	0	0.0	1914	24.0	0	0.0	Phenylalanine biosynthesis, chorismate => phenylpyruvate => phenylalanine
M00025	133	14.0	1348	16.1	0	0.0	0	0.0	133	83.6	0	0.0	1348	16.9	0	0.0	Tyrosine biosynthesis, chorismate => HPP => tyrosine
M00026	261	27.5	3960	47.2	2	0.3	130	88.4	126	79.2	3	5.4	3886	48.7	74	17.8	Histidine biosynthesis, PRPP => histidine
M00027	836	88.0	836	10.0	562	95.6	132	89.8	142	89.3	0	0.0	821	10.3	15	3.6	GABA (gamma-Aminobutyrate) shunt
M00028	244	25.7	5434	64.7	0	0.0	144	98.0	97	61.0	3	5.4	5313	66.6	121	29.2	Ornithine biosynthesis, glutamate => ornithine
M00029	346	36.4	0	0.0	341	58.0	0	0.0	0	0.0	5	8.9	0	0.0	0	0.0	Urea cycle
M00030	126	13.3	0	0.0	0	0.0	0	0.0	126	79.2	0	0.0	0	0.0	0	0.0	Lysine biosynthesis, AAA pathway, 2-oxoglutarate => 2-aminoadipate => lysine
M00031	0	0.0	153	1.8	0	0.0	0	0.0	0	0.0	0	0.0	55	0.7	98	23.6	Lysine biosynthesis, mediated by LysW, 2-aminoadipate => lysine
M00032	372	39.2	0	0.0	371	63.1	0	0.0	0	0.0	1	1.8	0	0.0	0	0.0	Lysine degradation, lysine => saccharopine => acetoacetyl-CoA
M00033	0	0.0	929	11.1	0	0.0	0	0.0	0	0.0	0	0.0	914	11.4	15	3.6	Ectoine biosynthesis, aspartate => ectoine
M00034	583	61.4	743	8.8	372	63.3	130	88.4	74	46.5	7	12.5	743	9.3	0	0.0	Methionine salvage pathway
M00035	669	70.4	1000	11.9	571	97.1	2	1.4	87	54.7	9	16.1	977	12.2	23	5.5	Methionine degradation
M00036	684	72.0	869	10.3	466	79.3	132	89.8	78	49.1	8	14.3	869	10.9	0	0.0	Leucine degradation, leucine => acetoacetate + acetyl-CoA
M00037	322	33.9	0	0.0	322	54.8	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Melatonin biosynthesis, animals, tryptophan => serotonin => melatonin
M00038	379	39.9	57	0.7	376	63.9	0	0.0	3	1.9	0	0.0	57	0.7	0	0.0	Tryptophan metabolism, tryptophan => kynurenine => 2-aminomuconate
M00039	124	13.1	0	0.0	0	0.0	124	84.4	0	0.0	0	0.0	0	0.0	0	0.0	Monolignol biosynthesis, phenylalanine/tyrosine => monolignol
M00040	145	15.3	474	5.6	0	0.0	143	97.3	0	0.0	2	3.6	474	5.9	0	0.0	Tyrosine biosynthesis, chorismate => arogenate => tyrosine
M00042	312	32.8	0	0.0	312	53.1	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Catecholamine biosynthesis, tyrosine => dopamine => noradrenaline => adrenaline
M00043	359	37.8	1	0.0	359	61.1	0	0.0	0	0.0	0	0.0	1	0.0	0	0.0	Thyroid hormone biosynthesis, tyrosine => triiodothyronine/thyroxine
M00044	728	76.6	493	5.9	536	91.2	131	89.1	50	31.4	11	19.6	493	6.2	0	0.0	Tyrosine degradation, tyrosine => homogentisate
M00045	416	43.8	3010	35.8	409	69.6	0	0.0	0	0.0	7	12.5	2956	37.0	54	13.0	Histidine degradation, histidine => N-formiminoglutamate => glutamate
M00046	690	72.6	867	10.3	549	93.4	136	92.5	0	0.0	5	8.9	867	10.9	0	0.0	Pyrimidine degradation, uracil => beta-alanine, thymine => 3-aminoisobutanoate
M00047	353	37.2	0	0.0	350	59.5	0	0.0	0	0.0	3	5.4	0	0.0	0	0.0	Creatine pathway
M00048	694	73.1	6780	80.7	456	77.6	125	85.0	112	70.4	1	1.8	6562	82.2	218	52.5	De novo purine biosynthesis, PRPP + glutamine => IMP
M00049	893	94.0	6961	82.9	573	97.4	134	91.2	144	90.6	42	75.0	6578	82.4	383	92.3	Adenine ribonucleotide biosynthesis, IMP => ADP,ATP
M00050	888	93.5	6569	78.2	565	96.1	142	96.6	139	87.4	42	75.0	6569	82.3	0	0.0	Guanine ribonucleotide biosynthesis, IMP => GDP,GTP
M00051	638	67.2	1384	16.5	518	88.1	2	1.4	113	71.1	5	8.9	1103	13.8	281	67.7	De novo pyrimidine biosynthesis, glutamine (+ PRPP) => UMP
M00052	915	96.3	7088	84.4	579	98.5	146	99.3	144	90.6	46	82.1	6699	83.9	389	93.7	Pyrimidine ribonucleotide biosynthesis, UMP => UDP/UTP,CDP/CTP
M00053	916	96.4	5919	70.5	576	98.0	147	100.0	152	95.6	41	73.2	5808	72.8	111	26.7	Deoxyribonucleotide biosynthesis, ADP/GDP/CDP/UDP => dATP/dGTP/dCTP/dUTP
M00055	621	65.4	0	0.0	412	70.1	125	85.0	81	50.9	3	5.4	0	0.0	0	0.0	N-glycan precursor biosynthesis
M00056	112	11.8	0	0.0	112	19.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	O-glycan biosynthesis, mucin type core
M00057	477	50.2	0	0.0	476	81.0	0	0.0	0	0.0	1	1.8	0	0.0	0	0.0	Glycosaminoglycan biosynthesis, linkage tetrasaccharide
M00058	488	51.4	0	0.0	488	83.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Glycosaminoglycan biosynthesis, chondroitin sulfate backbone
M00059	562	59.2	0	0.0	562	95.6	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Glycosaminoglycan biosynthesis, heparan sulfate backbone
M00060	0	0.0	782	9.3	0	0.0	0	0.0	0	0.0	0	0.0	782	9.8	0	0.0	KDO2-lipid A biosynthesis, Raetz pathway, LpxL-LpxM type
M00061	0	0.0	1274	15.2	0	0.0	0	0.0	0	0.0	0	0.0	1274	16.0	0	0.0	D-Glucuronate degradation, D-glucuronate => pyruvate + D-glyceraldehyde 3P
M00063	0	0.0	3255	38.8	0	0.0	0	0.0	0	0.0	0	0.0	3255	40.8	0	0.0	CMP-KDO biosynthesis
M00064	0	0.0	1822	21.7	0	0.0	0	0.0	0	0.0	0	0.0	1822	22.8	0	0.0	ADP-L-glycero-D-manno-heptose biosynthesis
M00065	278	29.3	0	0.0	253	43.0	12	8.2	13	8.2	0	0.0	0	0.0	0	0.0	GPI-anchor biosynthesis, core oligosaccharide
M00066	367	38.6	0	0.0	367	62.4	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Lactosylceramide biosynthesis
M00067	371	39.1	0	0.0	370	62.9	0	0.0	0	0.0	1	1.8	0	0.0	0	0.0	Sulfoglycolipids biosynthesis, ceramide/1-alkyl-2-acylglycerol => sulfatide/seminolipid
M00068	238	25.1	0	0.0	238	40.5	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Glycosphingolipid biosynthesis, globo-series, LacCer => Gb4Cer
M00069	352	37.1	0	0.0	352	59.9	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Glycosphingolipid biosynthesis, ganglio series, LacCer => GT3
M00070	376	39.6	0	0.0	376	63.9	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Glycosphingolipid biosynthesis, lacto-series, LacCer => Lc4Cer
M00071	373	39.3	0	0.0	373	63.4	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Glycosphingolipid biosynthesis, neolacto-series, LacCer => nLc4Cer
M00072	127	13.4	0	0.0	0	0.0	1	0.7	126	79.2	0	0.0	0	0.0	0	0.0	N-glycosylation by oligosaccharyltransferase
M00073	752	79.2	0	0.0	496	84.4	134	91.2	119	74.8	3	5.4	0	0.0	0	0.0	N-glycan precursor trimming
M00074	20	2.1	0	0.0	0	0.0	0	0.0	20	12.6	0	0.0	0	0.0	0	0.0	N-glycan biosynthesis, high-mannose type
M00075	329	34.6	0	0.0	329	56.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	N-glycan biosynthesis, complex type
M00076	385	40.5	0	0.0	385	65.5	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Dermatan sulfate degradation
M00077	375	39.5	0	0.0	375	63.8	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Chondroitin sulfate degradation
M00078	355	37.4	0	0.0	355	60.4	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Heparan sulfate degradation
M00079	401	42.2	0	0.0	401	68.2	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Keratan sulfate degradation
M00081	194	20.4	0	0.0	0	0.0	133	90.5	61	38.4	0	0.0	0	0.0	0	0.0	Pectin degradation
M00082	850	89.5	5577	66.4	542	92.2	143	97.3	146	91.8	19	33.9	5577	69.9	0	0.0	Fatty acid biosynthesis, initiation
M00083	876	92.2	6598	78.6	559	95.1	145	98.6	149	93.7	23	41.1	6598	82.7	0	0.0	Fatty acid biosynthesis, elongation
M00085	587	61.8	0	0.0	575	97.8	0	0.0	4	2.5	8	14.3	0	0.0	0	0.0	Fatty acid elongation in mitochondria
M00086	949	99.9	6356	75.7	588	100.0	147	100.0	159	100.0	55	98.2	6140	76.9	216	52.0	beta-Oxidation, acyl-CoA synthesis
M00087	781	82.2	2600	31.0	585	99.5	143	97.3	33	20.8	20	35.7	2600	32.6	0	0.0	beta-Oxidation
M00088	0	0.0	10	0.1	0	0.0	0	0.0	0	0.0	0	0.0	10	0.1	0	0.0	Ketone body biosynthesis, acetyl-CoA => acetoacetate/3-hydroxybutyrate/acetone
M00089	820	86.3	0	0.0	577	98.1	146	99.3	75	47.2	22	39.3	0	0.0	0	0.0	Triacylglycerol biosynthesis
M00090	541	56.9	0	0.0	517	87.9	0	0.0	24	15.1	0	0.0	0	0.0	0	0.0	Phosphatidylcholine (PC) biosynthesis, choline => PC
M00091	549	57.8	1140	13.6	380	64.6	3	2.0	149	93.7	17	30.4	1104	13.8	36	8.7	Phosphatidylcholine (PC) biosynthesis, PE => PC
M00092	852	89.7	0	0.0	575	97.8	140	95.2	94	59.1	43	76.8	0	0.0	0	0.0	Phosphatidylethanolamine (PE) biosynthesis, ethanolamine => PE
M00093	143	15.1	5356	63.8	0	0.0	0	0.0	143	89.9	0	0.0	5356	67.1	0	0.0	Phosphatidylethanolamine (PE) biosynthesis, PA => PS => PE
M00094	709	74.6	0	0.0	574	97.6	3	2.0	131	82.4	1	1.8	0	0.0	0	0.0	Ceramide biosynthesis
M00095	729	76.7	100	1.2	487	82.8	129	87.8	105	66.0	8	14.3	58	0.7	42	10.1	C5 isoprenoid biosynthesis, mevalonate pathway
M00096	139	14.6	2185	26.0	0	0.0	134	91.2	0	0.0	5	8.9	2185	27.4	0	0.0	C5 isoprenoid biosynthesis, non-mevalonate pathway
M00097	131	13.8	35	0.4	0	0.0	131	89.1	0	0.0	0	0.0	35	0.4	0	0.0	beta-Carotene biosynthesis, GGAP => beta-carotene
M00098	870	91.6	356	4.2	564	95.9	147	100.0	142	89.3	17	30.4	356	4.5	0	0.0	Acylglycerol degradation
M00099	665	70.0	0	0.0	574	97.6	0	0.0	90	56.6	1	1.8	0	0.0	0	0.0	Sphingosine biosynthesis
M00100	830	87.4	0	0.0	528	89.8	138	93.9	144	90.6	20	35.7	0	0.0	0	0.0	Sphingosine degradation
M00101	267	28.1	0	0.0	267	45.4	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Cholesterol biosynthesis, squalene 2,3-epoxide => cholesterol
M00102	107	11.3	0	0.0	0	0.0	0	0.0	107	67.3	0	0.0	0	0.0	0	0.0	Ergocalciferol biosynthesis, squalene 2,3-epoxide => ergosterol/ergocalciferol
M00103	307	32.3	0	0.0	307	52.2	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Cholecalciferol biosynthesis
M00104	30	3.2	0	0.0	30	5.1	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Bile acid biosynthesis, cholesterol => cholate/chenodeoxycholate
M00106	141	14.8	0	0.0	141	24.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Conjugated bile acid biosynthesis, cholate => taurocholate/glycocholate
M00107	339	35.7	0	0.0	339	57.7	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Steroid hormone biosynthesis, cholesterol => pregnenolone => progesterone
M00108	28	2.9	0	0.0	28	4.8	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	C21-Steroid hormone biosynthesis, progesterone => corticosterone/aldosterone
M00109	256	26.9	0	0.0	256	43.5	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	C21-Steroid hormone biosynthesis, progesterone => cortisol/cortisone
M00110	351	36.9	0	0.0	351	59.7	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	C19/C18-Steroid hormone biosynthesis, pregnenolone => androstenedione => estrone
M00112	147	15.5	67	0.8	0	0.0	143	97.3	0	0.0	4	7.1	67	0.8	0	0.0	Tocopherol/tocotorienol biosynthesis, homogentisate + phytyl/geranylgeranyl-PP => tocopherol/tocotorienol
M00113	131	13.8	0	0.0	0	0.0	131	89.1	0	0.0	0	0.0	0	0.0	0	0.0	Jasmonic acid biosynthesis
M00114	135	14.2	0	0.0	0	0.0	135	91.8	0	0.0	0	0.0	0	0.0	0	0.0	Ascorbate biosynthesis, plants, fructose-6P => ascorbate
M00115	133	14.0	4713	56.1	0	0.0	133	90.5	0	0.0	0	0.0	4713	59.0	0	0.0	NAD biosynthesis, aspartate => quinolinate => NAD
M00116	0	0.0	1003	11.9	0	0.0	0	0.0	0	0.0	0	0.0	1003	12.6	0	0.0	Menaquinone biosynthesis, chorismate (+ polyprenyl-PP) => menaquinol
M00117	0	0.0	1505	17.9	0	0.0	0	0.0	0	0.0	0	0.0	1505	18.9	0	0.0	Ubiquinone biosynthesis, prokaryotes, chorismate (+ polyprenyl-PP) => ubiquinol
M00118	780	82.1	3109	37.0	562	95.6	143	97.3	74	46.5	1	1.8	3108	38.9	1	0.2	Glutathione biosynthesis, glutamate => glutathione
M00119	0	0.0	2724	32.4	0	0.0	0	0.0	0	0.0	0	0.0	2724	34.1	0	0.0	Pantothenate biosynthesis, valine/L-aspartate => pantothenate
M00120	651	68.5	6854	81.6	495	84.2	101	68.7	49	30.8	6	10.7	6854	85.9	0	0.0	Coenzyme A biosynthesis, pantothenate => CoA
M00121	140	14.7	3068	36.5	0	0.0	136	92.5	0	0.0	4	7.1	3068	38.4	0	0.0	Heme biosynthesis, plants and bacteria, glutamate => heme
M00122	0	0.0	2323	27.7	0	0.0	0	0.0	0	0.0	0	0.0	2323	29.1	0	0.0	Cobalamin biosynthesis, cobyrinate a,c-diamide => cobalamin
M00123	241	25.4	4382	52.2	0	0.0	143	97.3	93	58.5	5	8.9	4358	54.6	24	5.8	Biotin biosynthesis, pimeloyl-ACP/CoA => biotin
M00124	0	0.0	1093	13.0	0	0.0	0	0.0	0	0.0	0	0.0	1093	13.7	0	0.0	Pyridoxal-P biosynthesis, erythrose-4P => pyridoxal-P
M00125	134	14.1	2329	27.7	0	0.0	134	91.2	0	0.0	0	0.0	2329	29.2	0	0.0	Riboflavin biosynthesis, plants and bacteria, GTP => riboflavin/FMN/FAD
M00126	176	18.5	3272	39.0	0	0.0	116	78.9	60	37.7	0	0.0	3272	41.0	0	0.0	Tetrahydrofolate biosynthesis, GTP => THF
M00127	0	0.0	640	7.6	0	0.0	0	0.0	0	0.0	0	0.0	635	8.0	5	1.2	Thiamine biosynthesis, prokaryotes, AIR (+ DXP/tyrosine) => TMP/TPP
M00128	668	70.3	0	0.0	534	90.8	0	0.0	132	83.0	2	3.6	0	0.0	0	0.0	Ubiquinone biosynthesis, eukaryotes, 4-hydroxybenzoate + polyprenyl-PP => ubiquinol
M00129	212	22.3	0	0.0	212	36.1	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Ascorbate biosynthesis, animals, glucose-1P => ascorbate
M00130	577	60.7	0	0.0	574	97.6	0	0.0	0	0.0	3	5.4	0	0.0	0	0.0	Inositol phosphate metabolism, PI=> PIP2 => Ins(1,4,5)P3 => Ins(1,3,4,5)P4
M00131	373	39.3	0	0.0	373	63.4	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Inositol phosphate metabolism, Ins(1,3,4,5)P4 => Ins(1,3,4)P3 => myo-inositol
M00132	604	63.6	0	0.0	467	79.4	131	89.1	0	0.0	6	10.7	0	0.0	0	0.0	Inositol phosphate metabolism, Ins(1,3,4)P3 => phytate
M00133	224	23.6	1345	16.0	224	38.1	0	0.0	0	0.0	0	0.0	1256	15.7	89	21.4	Polyamine biosynthesis, arginine => agmatine => putrescine => spermidine
M00134	825	86.8	645	7.7	552	93.9	103	70.1	148	93.1	22	39.3	644	8.1	1	0.2	Polyamine biosynthesis, arginine => ornithine => putrescine
M00135	457	48.1	422	5.0	457	77.7	0	0.0	0	0.0	0	0.0	422	5.3	0	0.0	GABA biosynthesis, eukaryotes, putrescine => GABA
M00136	0	0.0	193	2.3	0	0.0	0	0.0	0	0.0	0	0.0	193	2.4	0	0.0	GABA biosynthesis, prokaryotes, putrescine => GABA
M00137	132	13.9	0	0.0	0	0.0	132	89.8	0	0.0	0	0.0	0	0.0	0	0.0	Flavanone biosynthesis, phenylalanine => naringenin
M00138	114	12.0	0	0.0	0	0.0	114	77.6	0	0.0	0	0.0	0	0.0	0	0.0	Flavonoid biosynthesis, naringenin => pelargonidin
M00140	0	0.0	2785	33.2	0	0.0	0	0.0	0	0.0	0	0.0	2732	34.2	53	12.8	C1-unit interconversion, prokaryotes
M00141	737	77.6	37	0.4	575	97.8	1	0.7	151	95.0	10	17.9	37	0.5	0	0.0	C1-unit interconversion, eukaryotes
M00142	410	43.2	0	0.0	337	57.3	53	36.1	19	11.9	1	1.8	0	0.0	0	0.0	NADH:ubiquinone oxidoreductase, mitochondria
M00143	736	77.5	0	0.0	534	90.8	80	54.4	117	73.6	5	8.9	0	0.0	0	0.0	NADH dehydrogenase (ubiquinone) Fe-S protein/flavoprotein complex, mitochondria
M00144	0	0.0	3894	46.4	0	0.0	0	0.0	0	0.0	0	0.0	3894	48.8	0	0.0	NADH:quinone oxidoreductase, prokaryotes
M00145	98	10.3	150	1.8	0	0.0	98	66.7	0	0.0	0	0.0	150	1.9	0	0.0	NAD(P)H:quinone oxidoreductase, chloroplasts and cyanobacteria
M00146	383	40.3	0	0.0	383	65.1	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	NADH dehydrogenase (ubiquinone) 1 alpha subcomplex
M00147	348	36.6	0	0.0	348	59.2	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	NADH dehydrogenase (ubiquinone) 1 beta subcomplex
M00148	740	77.9	0	0.0	511	86.9	83	56.5	141	88.7	5	8.9	0	0.0	0	0.0	Succinate dehydrogenase (ubiquinone)
M00149	0	0.0	4021	47.9	0	0.0	0	0.0	0	0.0	0	0.0	3805	47.7	216	52.0	Succinate dehydrogenase, prokaryotes
M00150	0	0.0	891	10.6	0	0.0	0	0.0	0	0.0	0	0.0	891	11.2	0	0.0	Fumarate reductase, prokaryotes
M00151	451	47.5	4003	47.7	340	57.8	76	51.7	26	16.4	9	16.1	4003	50.1	0	0.0	Cytochrome bc1 complex respiratory unit
M00152	374	39.4	0	0.0	340	57.8	0	0.0	34	21.4	0	0.0	0	0.0	0	0.0	Cytochrome bc1 complex
M00153	0	0.0	1487	17.7	0	0.0	0	0.0	0	0.0	0	0.0	1487	18.6	0	0.0	Cytochrome bd ubiquinol oxidase
M00154	320	33.7	0	0.0	304	51.7	0	0.0	16	10.1	0	0.0	0	0.0	0	0.0	Cytochrome c oxidase
M00155	0	0.0	4345	51.7	0	0.0	0	0.0	0	0.0	0	0.0	4232	53.0	113	27.2	Cytochrome c oxidase, prokaryotes
M00156	0	0.0	2013	24.0	0	0.0	0	0.0	0	0.0	0	0.0	2013	25.2	0	0.0	Cytochrome c oxidase, cbb3-type
M00157	112	11.8	7438	88.6	0	0.0	110	74.8	0	0.0	2	3.6	7420	92.9	18	4.3	F-type ATPase, prokaryotes and chloroplasts
M00158	172	18.1	0	0.0	156	26.5	0	0.0	16	10.1	0	0.0	0	0.0	0	0.0	F-type ATPase, eukaryotes
M00159	0	0.0	728	8.7	0	0.0	0	0.0	0	0.0	0	0.0	394	4.9	334	80.5	V/A-type ATPase, prokaryotes
M00160	462	48.6	0	0.0	462	78.6	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	V-type ATPase, eukaryotes
M00161	110	11.6	146	1.7	0	0.0	109	74.1	0	0.0	1	1.8	146	1.8	0	0.0	Photosystem II
M00162	0	0.0	103	1.2	0	0.0	0	0.0	0	0.0	0	0.0	103	1.3	0	0.0	Cytochrome b6f complex
M00163	114	12.0	147	1.8	0	0.0	112	76.2	0	0.0	2	3.6	147	1.8	0	0.0	Photosystem I
M00165	120	12.6	378	4.5	0	0.0	120	81.6	0	0.0	0	0.0	378	4.7	0	0.0	Reductive pentose phosphate cycle (Calvin cycle)
M00168	155	16.3	3630	43.2	0	0.0	146	99.3	0	0.0	9	16.1	3416	42.8	214	51.6	CAM (Crassulacean acid metabolism), dark
M00169	147	15.5	1175	14.0	0	0.0	142	96.6	0	0.0	5	8.9	1175	14.7	0	0.0	CAM (Crassulacean acid metabolism), light
M00170	138	14.5	0	0.0	0	0.0	134	91.2	0	0.0	4	7.1	0	0.0	0	0.0	C4-dicarboxylic acid cycle, phosphoenolpyruvate carboxykinase type
M00171	133	14.0	0	0.0	0	0.0	133	90.5	0	0.0	0	0.0	0	0.0	0	0.0	C4-dicarboxylic acid cycle, NAD - malic enzyme type
M00172	114	12.0	0	0.0	0	0.0	114	77.6	0	0.0	0	0.0	0	0.0	0	0.0	C4-dicarboxylic acid cycle, NADP - malic enzyme type
M00173	0	0.0	27	0.3	0	0.0	0	0.0	0	0.0	0	0.0	27	0.3	0	0.0	Reductive citrate cycle (Arnon-Buchanan cycle)
M00174	0	0.0	37	0.4	0	0.0	0	0.0	0	0.0	0	0.0	37	0.5	0	0.0	Methane oxidation, methanotroph, methane => formaldehyde
M00175	0	0.0	761	9.1	0	0.0	0	0.0	0	0.0	0	0.0	695	8.7	66	15.9	Nitrogen fixation, nitrogen => ammonia
M00176	276	29.1	2785	33.2	0	0.0	135	91.8	135	84.9	6	10.7	2764	34.6	21	5.1	Assimilatory sulfate reduction, sulfate => H2S
M00307	888	93.5	7850	93.5	570	96.9	146	99.3	135	84.9	37	66.1	7475	93.6	375	90.4	Pyruvate oxidation, pyruvate => acetyl-CoA
M00308	0	0.0	73	0.9	0	0.0	0	0.0	0	0.0	0	0.0	6	0.1	67	16.1	Semi-phosphorylative Entner-Doudoroff pathway, gluconate => glycerate-3P
M00309	0	0.0	51	0.6	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	51	12.3	Non-phosphorylative Entner-Doudoroff pathway, gluconate/galactonate => glycerate
M00338	734	77.3	1185	14.1	580	98.6	2	1.4	140	88.1	12	21.4	1166	14.6	19	4.6	Cysteine biosynthesis, homocysteine + serine => cysteine
M00344	57	6.0	0	0.0	0	0.0	0	0.0	57	35.8	0	0.0	0	0.0	0	0.0	Formaldehyde assimilation, xylulose monophosphate pathway
M00345	0	0.0	552	6.6	0	0.0	0	0.0	0	0.0	0	0.0	548	6.9	4	1.0	Formaldehyde assimilation, ribulose monophosphate pathway
M00346	0	0.0	51	0.6	0	0.0	0	0.0	0	0.0	0	0.0	51	0.6	0	0.0	Formaldehyde assimilation, serine pathway
M00356	0	0.0	47	0.6	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	47	11.3	Methanogenesis, methanol => methane
M00357	0	0.0	76	0.9	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	76	18.3	Methanogenesis, acetate => methane
M00358	0	0.0	54	0.6	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	53	12.8	Coenzyme M biosynthesis
M00363	0	0.0	14	0.2	0	0.0	0	0.0	0	0.0	0	0.0	14	0.2	0	0.0	EHEC pathogenicity signature, Shiga toxin
M00364	150	15.8	3842	45.7	0	0.0	144	98.0	0	0.0	6	10.7	3434	43.0	408	98.3	C10-C20 isoprenoid biosynthesis, bacteria
M00365	0	0.0	1362	16.2	0	0.0	0	0.0	0	0.0	0	0.0	954	12.0	408	98.3	C10-C20 isoprenoid biosynthesis, archaea
M00366	141	14.8	0	0.0	0	0.0	140	95.2	0	0.0	1	1.8	0	0.0	0	0.0	C10-C20 isoprenoid biosynthesis, plants
M00367	690	72.6	0	0.0	534	90.8	0	0.0	141	88.7	15	26.8	0	0.0	0	0.0	C10-C20 isoprenoid biosynthesis, non-plant eukaryotes
M00368	131	13.8	0	0.0	0	0.0	131	89.1	0	0.0	0	0.0	0	0.0	0	0.0	Ethylene biosynthesis, methionine => ethylene
M00369	10	1.1	0	0.0	0	0.0	10	6.8	0	0.0	0	0.0	0	0.0	0	0.0	Cyanogenic glycoside biosynthesis, tyrosine => dhurrin
M00370	9	0.9	0	0.0	0	0.0	9	6.1	0	0.0	0	0.0	0	0.0	0	0.0	Glucosinolate biosynthesis, tryptophan => glucobrassicin
M00371	99	10.4	0	0.0	0	0.0	99	67.3	0	0.0	0	0.0	0	0.0	0	0.0	Castasterone biosynthesis, campesterol => castasterone
M00372	129	13.6	0	0.0	0	0.0	129	87.8	0	0.0	0	0.0	0	0.0	0	0.0	Abscisic acid biosynthesis, beta-carotene => abscisic acid
M00373	0	0.0	129	1.5	0	0.0	0	0.0	0	0.0	0	0.0	129	1.6	0	0.0	Ethylmalonyl pathway
M00374	0	0.0	11	0.1	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	11	2.7	Dicarboxylate-hydroxybutyrate cycle
M00375	0	0.0	41	0.5	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	41	9.9	Hydroxypropionate-hydroxybutylate cycle
M00376	0	0.0	5	0.1	0	0.0	0	0.0	0	0.0	0	0.0	5	0.1	0	0.0	3-Hydroxypropionate bi-cycle
M00377	0	0.0	51	0.6	0	0.0	0	0.0	0	0.0	0	0.0	51	0.6	0	0.0	Reductive acetyl-CoA pathway (Wood-Ljungdahl pathway)
M00378	0	0.0	255	3.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	255	61.4	F420 biosynthesis, archaea
M00415	846	89.1	0	0.0	570	96.9	136	92.5	113	71.1	27	48.2	0	0.0	0	0.0	Fatty acid elongation in endoplasmic reticulum
M00416	0	0.0	468	5.6	0	0.0	0	0.0	0	0.0	0	0.0	468	5.9	0	0.0	Cytochrome aa3-600 menaquinol oxidase
M00417	0	0.0	2006	23.9	0	0.0	0	0.0	0	0.0	0	0.0	2006	25.1	0	0.0	Cytochrome o ubiquinol oxidase
M00418	0	0.0	6	0.1	0	0.0	0	0.0	0	0.0	0	0.0	6	0.1	0	0.0	Toluene degradation, anaerobic, toluene => benzoyl-CoA
M00419	0	0.0	7	0.1	0	0.0	0	0.0	0	0.0	0	0.0	7	0.1	0	0.0	Cymene degradation, p-cymene => p-cumate
M00422	0	0.0	96	1.1	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	95	22.9	Acetyl-CoA pathway, CO2 => acetyl-CoA
M00432	287	30.2	6495	77.3	0	0.0	140	95.2	146	91.8	1	1.8	6219	77.9	276	66.5	Leucine biosynthesis, 2-oxoisovalerate => 2-oxoisocaproate
M00433	141	14.8	41	0.5	0	0.0	0	0.0	141	88.7	0	0.0	41	0.5	0	0.0	Lysine biosynthesis, 2-oxoglutarate => 2-oxoadipate
M00525	0	0.0	459	5.5	0	0.0	0	0.0	0	0.0	0	0.0	459	5.7	0	0.0	Lysine biosynthesis, acetyl-DAP pathway, aspartate => lysine
M00526	0	0.0	539	6.4	0	0.0	0	0.0	0	0.0	0	0.0	536	6.7	3	0.7	Lysine biosynthesis, DAP dehydrogenase pathway, aspartate => lysine
M00527	139	14.6	1315	15.7	0	0.0	138	93.9	0	0.0	1	1.8	1220	15.3	95	22.9	Lysine biosynthesis, DAP aminotransferase pathway, aspartate => lysine
M00528	0	0.0	36	0.4	0	0.0	0	0.0	0	0.0	0	0.0	36	0.5	0	0.0	Nitrification, ammonia => nitrite
M00529	0	0.0	283	3.4	0	0.0	0	0.0	0	0.0	0	0.0	283	3.5	0	0.0	Denitrification, nitrate => nitrogen
M00530	0	0.0	1354	16.1	0	0.0	0	0.0	0	0.0	0	0.0	1354	17.0	0	0.0	Dissimilatory nitrate reduction, nitrate => ammonia
M00531	224	23.6	1453	17.3	0	0.0	142	96.6	77	48.4	5	8.9	1397	17.5	56	13.5	Assimilatory nitrate reduction, nitrate => ammonia
M00532	113	11.9	0	0.0	0	0.0	113	76.9	0	0.0	0	0.0	0	0.0	0	0.0	Photorespiration
M00533	0	0.0	532	6.3	0	0.0	0	0.0	0	0.0	0	0.0	532	6.7	0	0.0	Homoprotocatechuate degradation, homoprotocatechuate => 2-oxohept-3-enedioate
M00534	0	0.0	8	0.1	0	0.0	0	0.0	0	0.0	0	0.0	8	0.1	0	0.0	Naphthalene degradation, naphthalene => salicylate
M00535	0	0.0	422	5.0	0	0.0	0	0.0	0	0.0	0	0.0	259	3.2	163	39.3	Isoleucine biosynthesis, pyruvate => 2-oxobutanoate
M00537	0	0.0	3	0.0	0	0.0	0	0.0	0	0.0	0	0.0	3	0.0	0	0.0	Xylene degradation, xylene => methylbenzoate
M00538	0	0.0	5	0.1	0	0.0	0	0.0	0	0.0	0	0.0	5	0.1	0	0.0	Toluene degradation, toluene => benzoate
M00539	0	0.0	7	0.1	0	0.0	0	0.0	0	0.0	0	0.0	7	0.1	0	0.0	Cumate degradation, p-cumate => 2-oxopent-4-enoate + 2-methylpropanoate
M00540	0	0.0	19	0.2	0	0.0	0	0.0	0	0.0	0	0.0	19	0.2	0	0.0	Benzoate degradation, cyclohexanecarboxylic acid =>pimeloyl-CoA
M00541	0	0.0	27	0.3	0	0.0	0	0.0	0	0.0	0	0.0	27	0.3	0	0.0	Benzoyl-CoA degradation, benzoyl-CoA => 3-hydroxypimeloyl-CoA
M00542	0	0.0	9	0.1	0	0.0	0	0.0	0	0.0	0	0.0	9	0.1	0	0.0	EHEC/EPEC pathogenicity signature, T3SS and effectors
M00543	0	0.0	3	0.0	0	0.0	0	0.0	0	0.0	0	0.0	3	0.0	0	0.0	Biphenyl degradation, biphenyl => 2-oxopent-4-enoate + benzoate
M00544	0	0.0	10	0.1	0	0.0	0	0.0	0	0.0	0	0.0	10	0.1	0	0.0	Carbazole degradation, carbazole => 2-oxopent-4-enoate + anthranilate
M00545	0	0.0	352	4.2	0	0.0	0	0.0	0	0.0	0	0.0	352	4.4	0	0.0	Trans-cinnamate degradation, trans-cinnamate => acetyl-CoA
M00546	172	18.1	388	4.6	171	29.1	0	0.0	0	0.0	1	1.8	388	4.9	0	0.0	Purine degradation, xanthine => urea
M00547	0	0.0	7	0.1	0	0.0	0	0.0	0	0.0	0	0.0	7	0.1	0	0.0	Benzene/toluene degradation, benzene => catechol / toluene => 3-methylcatechol
M00548	0	0.0	166	2.0	0	0.0	0	0.0	0	0.0	0	0.0	166	2.1	0	0.0	Benzene degradation, benzene => catechol
M00549	930	97.9	4646	55.3	584	99.3	145	98.6	155	97.5	46	82.1	4634	58.0	12	2.9	Nucleotide sugar biosynthesis, glucose => UDP-glucose
M00550	0	0.0	607	7.2	0	0.0	0	0.0	0	0.0	0	0.0	607	7.6	0	0.0	Ascorbate degradation, ascorbate => D-xylulose-5P
M00551	0	0.0	555	6.6	0	0.0	0	0.0	0	0.0	0	0.0	555	7.0	0	0.0	Benzoate degradation, benzoate => catechol / methylbenzoate => methylcatechol
M00552	0	0.0	996	11.9	0	0.0	0	0.0	0	0.0	0	0.0	996	12.5	0	0.0	D-galactonate degradation, De Ley-Doudoroff pathway, D-galactonate => glycerate-3P
M00554	648	68.2	3004	35.8	514	87.4	2	1.4	132	83.0	0	0.0	2957	37.0	47	11.3	Nucleotide sugar biosynthesis, galactose => UDP-galactose
M00555	579	60.9	2138	25.5	438	74.5	134	91.2	1	0.6	6	10.7	2138	26.8	0	0.0	Betaine biosynthesis, choline => betaine
M00563	0	0.0	34	0.4	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	34	8.2	Methanogenesis, methylamine/dimethylamine/trimethylamine => methane
M00564	0	0.0	45	0.5	0	0.0	0	0.0	0	0.0	0	0.0	45	0.6	0	0.0	Helicobacter pylori pathogenicity signature, cagA pathogenicity island
M00565	0	0.0	573	6.8	0	0.0	0	0.0	0	0.0	0	0.0	573	7.2	0	0.0	Trehalose biosynthesis, D-glucose 1P => trehalose
M00567	0	0.0	104	1.2	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	104	25.1	Methanogenesis, CO2 => methane
M00568	0	0.0	673	8.0	0	0.0	0	0.0	0	0.0	0	0.0	673	8.4	0	0.0	Catechol ortho-cleavage, catechol => 3-oxoadipate
M00569	0	0.0	317	3.8	0	0.0	0	0.0	0	0.0	0	0.0	317	4.0	0	0.0	Catechol meta-cleavage, catechol => acetyl-CoA / 4-methylcatechol => propanoyl-CoA
M00570	286	30.1	5709	68.0	0	0.0	141	95.9	145	91.2	0	0.0	5564	69.7	145	34.9	Isoleucine biosynthesis, threonine => 2-oxobutanoate => isoleucine
M00572	0	0.0	1481	17.6	0	0.0	0	0.0	0	0.0	0	0.0	1481	18.6	0	0.0	Pimeloyl-ACP biosynthesis, BioC-BioH pathway, malonyl-ACP => pimeloyl-ACP
M00573	0	0.0	62	0.7	0	0.0	0	0.0	0	0.0	0	0.0	62	0.8	0	0.0	Biotin biosynthesis, BioI pathway, long-chain-acyl-ACP => pimeloyl-ACP => biotin
M00574	0	0.0	11	0.1	0	0.0	0	0.0	0	0.0	0	0.0	11	0.1	0	0.0	Pertussis pathogenicity signature, pertussis toxin
M00575	0	0.0	8	0.1	0	0.0	0	0.0	0	0.0	0	0.0	8	0.1	0	0.0	Pertussis pathogenicity signature, T1SS
M00576	0	0.0	2	0.0	0	0.0	0	0.0	0	0.0	0	0.0	2	0.0	0	0.0	ETEC pathogenicity signature, heat-labile and heat-stable enterotoxins
M00577	0	0.0	304	3.6	0	0.0	0	0.0	0	0.0	0	0.0	281	3.5	23	5.5	Biotin biosynthesis, BioW pathway, pimelate => pimeloyl-CoA => biotin
M00579	0	0.0	5364	63.9	0	0.0	0	0.0	0	0.0	0	0.0	5341	66.9	23	5.5	Phosphate acetyltransferase-acetate kinase pathway, acetyl-CoA => acetate
M00580	0	0.0	654	7.8	0	0.0	0	0.0	0	0.0	0	0.0	399	5.0	255	61.4	Pentose phosphate pathway, archaea, fructose 6P => ribose 5P
M00595	0	0.0	465	5.5	0	0.0	0	0.0	0	0.0	0	0.0	465	5.8	0	0.0	Thiosulfate oxidation by SOX complex, thiosulfate => sulfate
M00596	0	0.0	154	1.8	0	0.0	0	0.0	0	0.0	0	0.0	139	1.7	15	3.6	Dissimilatory sulfate reduction, sulfate => H2S
M00597	0	0.0	169	2.0	0	0.0	0	0.0	0	0.0	0	0.0	169	2.1	0	0.0	Anoxygenic photosystem II
M00598	0	0.0	13	0.2	0	0.0	0	0.0	0	0.0	0	0.0	13	0.2	0	0.0	Anoxygenic photosystem I
M00608	0	0.0	106	1.3	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	106	25.5	2-Oxocarboxylic acid chain extension, 2-oxoglutarate => 2-oxoadipate => 2-oxopimelate => 2-oxosuberate
M00609	0	0.0	267	3.2	0	0.0	0	0.0	0	0.0	0	0.0	267	3.3	0	0.0	Cysteine biosynthesis, methionine => cysteine
M00611	112	11.8	142	1.7	0	0.0	112	76.2	0	0.0	0	0.0	142	1.8	0	0.0	Oxygenic photosynthesis in plants and cyanobacteria
M00612	0	0.0	48	0.6	0	0.0	0	0.0	0	0.0	0	0.0	48	0.6	0	0.0	Anoxygenic photosynthesis in purple bacteria
M00613	0	0.0	5	0.1	0	0.0	0	0.0	0	0.0	0	0.0	5	0.1	0	0.0	Anoxygenic photosynthesis in green nonsulfur bacteria
M00614	0	0.0	9	0.1	0	0.0	0	0.0	0	0.0	0	0.0	9	0.1	0	0.0	Anoxygenic photosynthesis in green sulfur bacteria
M00615	220	23.2	1431	17.0	0	0.0	142	96.6	73	45.9	5	8.9	1376	17.2	55	13.3	Nitrate assimilation
M00616	1	0.1	1732	20.6	0	0.0	1	0.7	0	0.0	0	0.0	1732	21.7	0	0.0	Sulfate-sulfur assimilation
M00617	0	0.0	113	1.3	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	113	27.2	Methanogen
M00618	0	0.0	43	0.5	0	0.0	0	0.0	0	0.0	0	0.0	43	0.5	0	0.0	Acetogen
M00620	0	0.0	61	0.7	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	61	14.7	Incomplete reductive citrate cycle, acetyl-CoA => oxoglutarate
M00621	857	90.2	5700	67.9	549	93.4	144	98.0	139	87.4	25	44.6	5530	69.3	170	41.0	Glycine cleavage system
M00622	0	0.0	181	2.2	0	0.0	0	0.0	0	0.0	0	0.0	181	2.3	0	0.0	Nicotinate degradation, nicotinate => fumarate
M00623	0	0.0	59	0.7	0	0.0	0	0.0	0	0.0	0	0.0	59	0.7	0	0.0	Phthalate degradation, phthalate => protocatechuate
M00624	0	0.0	5	0.1	0	0.0	0	0.0	0	0.0	0	0.0	5	0.1	0	0.0	Terephthalate degradation, terephthalate => 3,4-dihydroxybenzoate
M00625	0	0.0	18	0.2	0	0.0	0	0.0	0	0.0	0	0.0	18	0.2	0	0.0	Methicillin resistance
M00627	0	0.0	441	5.3	0	0.0	0	0.0	0	0.0	0	0.0	441	5.5	0	0.0	beta-Lactam resistance, Bla system
M00630	41	4.3	0	0.0	0	0.0	0	0.0	41	25.8	0	0.0	0	0.0	0	0.0	D-Galacturonate degradation (fungi), D-galacturonate => glycerol
M00631	0	0.0	895	10.7	0	0.0	0	0.0	0	0.0	0	0.0	895	11.2	0	0.0	D-Galacturonate degradation (bacteria), D-galacturonate => pyruvate + D-glyceraldehyde 3P
M00632	623	65.6	2660	31.7	499	84.9	1	0.7	123	77.4	0	0.0	2652	33.2	8	1.9	Galactose degradation, Leloir pathway, galactose => alpha-D-glucose-1P
M00633	0	0.0	42	0.5	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	42	10.1	Semi-phosphorylative Entner-Doudoroff pathway, gluconate/galactonate => glycerate-3P
M00636	0	0.0	9	0.1	0	0.0	0	0.0	0	0.0	0	0.0	9	0.1	0	0.0	Phthalate degradation, phthalate => protocatechuate
M00637	0	0.0	213	2.5	0	0.0	0	0.0	0	0.0	0	0.0	213	2.7	0	0.0	Anthranilate degradation, anthranilate => catechol
M00638	0	0.0	62	0.7	0	0.0	0	0.0	0	0.0	0	0.0	62	0.8	0	0.0	Salicylate degradation, salicylate => gentisate
M00639	0	0.0	117	1.4	0	0.0	0	0.0	0	0.0	0	0.0	117	1.5	0	0.0	Multidrug resistance, efflux pump MexCD-OprJ
M00641	0	0.0	187	2.2	0	0.0	0	0.0	0	0.0	0	0.0	187	2.3	0	0.0	Multidrug resistance, efflux pump MexEF-OprN
M00642	0	0.0	415	4.9	0	0.0	0	0.0	0	0.0	0	0.0	415	5.2	0	0.0	Multidrug resistance, efflux pump MexJK-OprM
M00643	0	0.0	107	1.3	0	0.0	0	0.0	0	0.0	0	0.0	107	1.3	0	0.0	Multidrug resistance, efflux pump MexXY-OprM
M00649	0	0.0	175	2.1	0	0.0	0	0.0	0	0.0	0	0.0	175	2.2	0	0.0	Multidrug resistance, efflux pump AdeABC
M00651	0	0.0	8	0.1	0	0.0	0	0.0	0	0.0	0	0.0	8	0.1	0	0.0	Vancomycin resistance, D-Ala-D-Lac type
M00652	0	0.0	33	0.4	0	0.0	0	0.0	0	0.0	0	0.0	33	0.4	0	0.0	Vancomycin resistance, D-Ala-D-Ser type
M00660	0	0.0	27	0.3	0	0.0	0	0.0	0	0.0	0	0.0	27	0.3	0	0.0	Xanthomonas spp. pathogenicity signature, T3SS and effectors
M00661	7	0.7	0	0.0	0	0.0	0	0.0	7	4.4	0	0.0	0	0.0	0	0.0	Paspaline biosynthesis, geranylgeranyl-PP + indoleglycerol phosphate => paspaline
M00664	0	0.0	90	1.1	0	0.0	0	0.0	0	0.0	0	0.0	90	1.1	0	0.0	Nodulation
M00672	6	0.6	0	0.0	0	0.0	0	0.0	6	3.8	0	0.0	0	0.0	0	0.0	Penicillin biosynthesis, aminoadipate + cycteine + valine => penicillin
M00673	0	0.0	5	0.1	0	0.0	0	0.0	0	0.0	0	0.0	5	0.1	0	0.0	Cephamycin C biosynthesis, aminoadipate + cycteine + valine => cephamycin C
M00674	0	0.0	9	0.1	0	0.0	0	0.0	0	0.0	0	0.0	9	0.1	0	0.0	Clavaminate biosynthesis, arginine + glyceraldehyde-3P => clavaminate
M00675	0	0.0	16	0.2	0	0.0	0	0.0	0	0.0	0	0.0	16	0.2	0	0.0	Carbapenem-3-carboxylate biosynthesis, pyrroline-5-carboxylate + malonyl-CoA => carbapenem-3-carboxylate
M00696	0	0.0	247	2.9	0	0.0	0	0.0	0	0.0	0	0.0	247	3.1	0	0.0	Multidrug resistance, efflux pump AcrEF-TolC
M00697	0	0.0	81	1.0	0	0.0	0	0.0	0	0.0	0	0.0	81	1.0	0	0.0	Multidrug resistance, efflux pump MdtEF-TolC
M00698	0	0.0	286	3.4	0	0.0	0	0.0	0	0.0	0	0.0	286	3.6	0	0.0	Multidrug resistance, efflux pump BpeEF-OprC
M00700	0	0.0	235	2.8	0	0.0	0	0.0	0	0.0	0	0.0	235	2.9	0	0.0	Multidrug resistance, efflux pump AbcA
M00702	0	0.0	106	1.3	0	0.0	0	0.0	0	0.0	0	0.0	106	1.3	0	0.0	Multidrug resistance, efflux pump NorB
M00704	0	0.0	67	0.8	0	0.0	0	0.0	0	0.0	0	0.0	67	0.8	0	0.0	Tetracycline resistance, efflux pump Tet38
M00705	0	0.0	200	2.4	0	0.0	0	0.0	0	0.0	0	0.0	200	2.5	0	0.0	Multidrug resistance, efflux pump MepA
M00714	0	0.0	14	0.2	0	0.0	0	0.0	0	0.0	0	0.0	14	0.2	0	0.0	Multidrug resistance, efflux pump QacA
M00718	0	0.0	27	0.3	0	0.0	0	0.0	0	0.0	0	0.0	27	0.3	0	0.0	Multidrug resistance, efflux pump MexAB-OprM
M00725	0	0.0	91	1.1	0	0.0	0	0.0	0	0.0	0	0.0	91	1.1	0	0.0	Cationic antimicrobial peptide (CAMP) resistance, dltABCD operon
M00726	0	0.0	100	1.2	0	0.0	0	0.0	0	0.0	0	0.0	100	1.3	0	0.0	Cationic antimicrobial peptide (CAMP) resistance, lysyl-phosphatidylglycerol (L-PG) synthase MprF
M00730	0	0.0	76	0.9	0	0.0	0	0.0	0	0.0	0	0.0	76	1.0	0	0.0	Cationic antimicrobial peptide (CAMP) resistance, VraFG transporter
M00736	0	0.0	4	0.0	0	0.0	0	0.0	0	0.0	0	0.0	4	0.1	0	0.0	Nocardicin A biosynthesis, L-pHPG + arginine + serine => nocardicin A
M00740	0	0.0	14	0.2	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	14	3.4	Methylaspartate cycle
M00741	417	43.9	1299	15.5	403	68.5	3	2.0	0	0.0	11	19.6	1159	14.5	140	33.7	Propanoyl-CoA metabolism, propanoyl-CoA => succinyl-CoA
M00744	0	0.0	54	0.6	0	0.0	0	0.0	0	0.0	0	0.0	54	0.7	0	0.0	Cationic antimicrobial peptide (CAMP) resistance, protease PgtE
M00745	0	0.0	203	2.4	0	0.0	0	0.0	0	0.0	0	0.0	203	2.5	0	0.0	Imipenem resistance, repression of porin OprD
M00746	0	0.0	43	0.5	0	0.0	0	0.0	0	0.0	0	0.0	43	0.5	0	0.0	Multidrug resistance, repression of porin OmpF
M00761	0	0.0	563	6.7	0	0.0	0	0.0	0	0.0	0	0.0	563	7.1	0	0.0	Undecaprenylphosphate alpha-L-Ara4N biosynthesis, UDP-GlcA => undecaprenyl phosphate alpha-L-Ara4N
M00763	0	0.0	160	1.9	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	160	38.6	Ornithine biosynthesis, mediated by LysW, glutamate => ornithine
M00769	0	0.0	20	0.2	0	0.0	0	0.0	0	0.0	0	0.0	20	0.3	0	0.0	Multidrug resistance, efflux pump MexPQ-OpmE
M00773	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Tylosin biosynthesis, methylmalonyl-CoA + malonyl-CoA => tylactone => tylosin
M00774	0	0.0	1	0.0	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	0	0.0	Erythromycin biosynthesis, propanoyl-CoA + methylmalonyl-CoA => deoxyerythronolide B => erythromycin A/B
M00775	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Oleandomycin biosynthesis, malonyl-CoA + methylmalonyl-CoA => 8,8a-deoxyoleandolide => oleandomycin
M00776	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Pikromycin/methymycin biosynthesis, methylmalonyl-CoA + malonyl-CoA => narbonolide/10-deoxymethynolide => pikromycin/methymycin
M00777	0	0.0	1	0.0	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	0	0.0	Avermectin biosynthesis, 2-methylbutanoyl-CoA/isobutyryl-CoA => 6,8a-Seco-6,8a-deoxy-5-oxoavermectin 1a/1b aglycone => avermectin A1a/B1a/A1b/B1b
M00778	0	0.0	116	1.4	0	0.0	0	0.0	0	0.0	0	0.0	116	1.5	0	0.0	Type II polyketide backbone biosynthesis, acyl-CoA + malonyl-CoA => polyketide
M00779	0	0.0	3	0.0	0	0.0	0	0.0	0	0.0	0	0.0	3	0.0	0	0.0	Dihydrokalafungin biosynthesis, octaketide => dihydrokalafungin
M00780	0	0.0	4	0.0	0	0.0	0	0.0	0	0.0	0	0.0	4	0.1	0	0.0	Tetracycline/oxytetracycline biosynthesis, pretetramide => tetracycline/oxytetracycline
M00781	0	0.0	8	0.1	0	0.0	0	0.0	0	0.0	0	0.0	8	0.1	0	0.0	Nogalavinone/aklavinone biosynthesis, deoxynogalonate/deoxyaklanonate => nogalavinone/aklavinone
M00782	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Mithramycin biosynthesis, 4-demethylpremithramycinone => mithramycin
M00783	0	0.0	1	0.0	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	0	0.0	Tetracenomycin C/8-demethyltetracenomycin C biosynthesis, tetracenomycin F2 => tetracenomycin C/8-demethyltetracenomycin C
M00784	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Elloramycin biosynthesis, 8-demethyltetracenomycin C => elloramycin A
M00785	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Cycloserine biosynthesis, arginine/serine => cycloserine
M00786	1	0.1	0	0.0	0	0.0	0	0.0	1	0.6	0	0.0	0	0.0	0	0.0	Fumitremorgin alkaloid biosynthesis, tryptophan + proline => fumitremorgin C/A
M00787	0	0.0	62	0.7	0	0.0	0	0.0	0	0.0	0	0.0	62	0.8	0	0.0	Bacilysin biosynthesis, prephenate => bacilysin
M00788	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Terpentecin biosynthesis, GGAP => terpentecin
M00789	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Rebeccamycin biosynthesis, tryptophan => rebeccamycin
M00790	0	0.0	47	0.6	0	0.0	0	0.0	0	0.0	0	0.0	47	0.6	0	0.0	Pyrrolnitrin biosynthesis, tryptophan => pyrrolnitrin
M00793	0	0.0	4939	58.8	0	0.0	0	0.0	0	0.0	0	0.0	4757	59.6	182	43.9	dTDP-L-rhamnose biosynthesis
M00794	0	0.0	2	0.0	0	0.0	0	0.0	0	0.0	0	0.0	2	0.0	0	0.0	dTDP-6-deoxy-D-allose biosynthesis
M00795	0	0.0	1	0.0	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	0	0.0	dTDP-beta-L-noviose biosynthesis
M00796	0	0.0	1	0.0	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	0	0.0	dTDP-D-mycaminose biosynthesis
M00797	0	0.0	4	0.0	0	0.0	0	0.0	0	0.0	0	0.0	4	0.1	0	0.0	dTDP-D-desosamine biosynthesis
M00798	0	0.0	5	0.1	0	0.0	0	0.0	0	0.0	0	0.0	5	0.1	0	0.0	dTDP-L-mycarose biosynthesis
M00799	0	0.0	1	0.0	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	0	0.0	dTDP-L-oleandrose biosynthesis
M00800	0	0.0	1	0.0	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	0	0.0	dTDP-L-megosamine biosynthesis
M00801	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	dTDP-L-olivose biosynthesis
M00802	0	0.0	3	0.0	0	0.0	0	0.0	0	0.0	0	0.0	3	0.0	0	0.0	dTDP-D-forosamine biosynthesis
M00803	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	dTDP-D-angolosamine biosynthesis
M00804	0	0.0	5	0.1	0	0.0	0	0.0	0	0.0	0	0.0	5	0.1	0	0.0	Complete nitrification, comammox, ammonia => nitrite => nitrate
M00805	0	0.0	5	0.1	0	0.0	0	0.0	0	0.0	0	0.0	5	0.1	0	0.0	Staurosporine biosynthesis, tryptophan => staurosporine
M00808	0	0.0	14	0.2	0	0.0	0	0.0	0	0.0	0	0.0	14	0.2	0	0.0	Violacein biosynthesis, tryptophan => violacein
M00810	0	0.0	4	0.0	0	0.0	0	0.0	0	0.0	0	0.0	4	0.1	0	0.0	Nicotine degradation, pyridine pathway, nicotine => 2,6-dihydroxypyridine/succinate semialdehyde
M00811	0	0.0	4	0.0	0	0.0	0	0.0	0	0.0	0	0.0	4	0.1	0	0.0	Nicotine degradation, pyrrolidine pathway, nicotine => succinate semialdehyde
M00814	0	0.0	3	0.0	0	0.0	0	0.0	0	0.0	0	0.0	3	0.0	0	0.0	Acarbose biosynthesis, sedoheptulopyranose-7P => acarbose
M00815	0	0.0	2	0.0	0	0.0	0	0.0	0	0.0	0	0.0	2	0.0	0	0.0	Validamycin A biosynthesis, sedoheptulopyranose-7P => validamycin A
M00819	0	0.0	6	0.1	0	0.0	0	0.0	0	0.0	0	0.0	6	0.1	0	0.0	Pentalenolactone biosynthesis, farnesyl-PP => pentalenolactone
M00823	0	0.0	2	0.0	0	0.0	0	0.0	0	0.0	0	0.0	2	0.0	0	0.0	Chlortetracycline biosynthesis, pretetramide => chlortetracycline
M00824	0	0.0	39	0.5	0	0.0	0	0.0	0	0.0	0	0.0	39	0.5	0	0.0	9-membered enediyne core biosynthesis, malonyl-CoA => 3-hydroxyhexadeca-4,6,8,10,12,14-hexaenoyl-ACP => 9-membered enediyne core
M00825	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	10-membered enediyne core biosynthesis, malonyl-CoA => 3-hydroxyhexadeca-4,6,8,10,12,14-hexaenoyl-ACP => 10-membered enediyne core
M00826	0	0.0	2	0.0	0	0.0	0	0.0	0	0.0	0	0.0	2	0.0	0	0.0	C-1027 benzoxazolinate moiety biosynthesis, chorismate => benzoxazolinyl-CoA
M00827	0	0.0	3	0.0	0	0.0	0	0.0	0	0.0	0	0.0	3	0.0	0	0.0	C-1027 beta-amino acid moiety biosynthesis, tyrosine => 3-chloro-4,5-dihydroxy-beta-phenylalanyl-PCP
M00828	0	0.0	3	0.0	0	0.0	0	0.0	0	0.0	0	0.0	3	0.0	0	0.0	Maduropeptin beta-hydroxy acid moiety biosynthesis, tyrosine => 3-(4-hydroxyphenyl)-3-oxopropanoyl-PCP
M00829	0	0.0	6	0.1	0	0.0	0	0.0	0	0.0	0	0.0	6	0.1	0	0.0	3,6-Dimethylsalicylyl-CoA biosynthesis, malonyl-CoA => 6-methylsalicylate => 3,6-dimethylsalicylyl-CoA
M00830	0	0.0	3	0.0	0	0.0	0	0.0	0	0.0	0	0.0	3	0.0	0	0.0	Neocarzinostatin naphthoate moiety biosynthesis, malonyl-CoA => 2-hydroxy-5-methyl-1-naphthoate => 2-hydroxy-7-methoxy-5-methyl-1-naphthoyl-CoA
M00831	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Kedarcidin 2-hydroxynaphthoate moiety biosynthesis, malonyl-CoA => 3,6,8-trihydroxy-2-naphthoate => 3-hydroxy-7,8-dimethoxy-6-isopropoxy-2-naphthoyl-CoA
M00832	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Kedarcidin 2-aza-3-chloro-beta-tyrosine moiety biosynthesis, azatyrosine => 2-aza-3-chloro-beta-tyrosyl-PCP
M00833	0	0.0	1	0.0	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	0	0.0	Calicheamicin biosynthesis, calicheamicinone => calicheamicin
M00834	0	0.0	1	0.0	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	0	0.0	Calicheamicin orsellinate moiety biosynthesis, malonyl-CoA => orsellinate-ACP => 5-iodo-2,3-dimethoxyorsellinate-ACP
M00835	0	0.0	23	0.3	0	0.0	0	0.0	0	0.0	0	0.0	23	0.3	0	0.0	Pyocyanine biosynthesis, chorismate => pyocyanine
M00836	0	0.0	40	0.5	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	40	9.6	Coenzyme F430 biosynthesis, sirohydrochlorin => coenzyme F430
M00837	0	0.0	15	0.2	0	0.0	0	0.0	0	0.0	0	0.0	15	0.2	0	0.0	Prodigiosin biosynthesis, L-proline => prodigiosin
M00838	0	0.0	4	0.0	0	0.0	0	0.0	0	0.0	0	0.0	4	0.1	0	0.0	Undecylprodigiosin biosynthesis, L-proline => undecylprodigiosin
M00840	0	0.0	82	1.0	0	0.0	0	0.0	0	0.0	0	0.0	82	1.0	0	0.0	Tetrahydrofolate biosynthesis, mediated by ribA and trpF, GTP => THF
M00841	13	1.4	0	0.0	0	0.0	0	0.0	0	0.0	13	23.2	0	0.0	0	0.0	Tetrahydrofolate biosynthesis, mediated by PTPS, GTP => THF
M00842	543	57.2	0	0.0	539	91.7	0	0.0	0	0.0	4	7.1	0	0.0	0	0.0	Tetrahydrobiopterin biosynthesis, GTP => BH4
M00843	0	0.0	16	0.2	0	0.0	0	0.0	0	0.0	0	0.0	16	0.2	0	0.0	L-threo-Tetrahydrobiopterin biosynthesis, GTP => L-threo-BH4
M00844	668	70.3	6233	74.2	377	64.1	144	98.0	141	88.7	6	10.7	5890	73.8	343	82.7	Arginine biosynthesis, ornithine => arginine
M00845	0	0.0	130	1.5	0	0.0	0	0.0	0	0.0	0	0.0	130	1.6	0	0.0	Arginine biosynthesis, glutamate => acetylcitrulline => arginine
M00846	0	0.0	2419	28.8	0	0.0	0	0.0	0	0.0	0	0.0	2417	30.3	2	0.5	Siroheme biosynthesis, glutamyl-tRNA => siroheme
M00847	0	0.0	265	3.2	0	0.0	0	0.0	0	0.0	0	0.0	158	2.0	107	25.8	Heme biosynthesis, archaea, siroheme => heme
M00848	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Aurachin biosynthesis, anthranilate => aurachin A
M00849	0	0.0	341	4.1	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	341	82.2	C5 isoprenoid biosynthesis, mevalonate pathway, archaea
M00850	0	0.0	7	0.1	0	0.0	0	0.0	0	0.0	0	0.0	7	0.1	0	0.0	Vibrio cholerae pathogenicity signature, cholera toxins
M00851	0	0.0	135	1.6	0	0.0	0	0.0	0	0.0	0	0.0	135	1.7	0	0.0	Carbapenem resistance
M00852	0	0.0	10	0.1	0	0.0	0	0.0	0	0.0	0	0.0	10	0.1	0	0.0	Vibrio cholerae pathogenicity signature, toxin coregulated pilus
M00853	0	0.0	2	0.0	0	0.0	0	0.0	0	0.0	0	0.0	2	0.0	0	0.0	ETEC pathogenicity signature, colonization factors
M00854	756	79.6	3581	42.6	478	81.3	144	98.0	132	83.0	2	3.6	3564	44.6	17	4.1	Glycogen biosynthesis, glucose-1P => glycogen/starch
M00855	722	76.0	1329	15.8	573	97.4	0	0.0	137	86.2	12	21.4	1328	16.6	1	0.2	Glycogen degradation, glycogen => glucose-6P
M00856	0	0.0	6	0.1	0	0.0	0	0.0	0	0.0	0	0.0	6	0.1	0	0.0	Salmonella enterica pathogenicity signature, typhoid toxin
M00857	0	0.0	7	0.1	0	0.0	0	0.0	0	0.0	0	0.0	7	0.1	0	0.0	Salmonella enterica pathogenicity signature, Vi antigen
M00859	0	0.0	12	0.1	0	0.0	0	0.0	0	0.0	0	0.0	12	0.2	0	0.0	Bacillus anthracis pathogenicity signature, anthrax toxin
M00860	0	0.0	7	0.1	0	0.0	0	0.0	0	0.0	0	0.0	7	0.1	0	0.0	Bacillus anthracis pathogenicity signature, polyglutamic acid capsule biosynthesis
M00861	538	56.6	0	0.0	535	91.0	0	0.0	0	0.0	3	5.4	0	0.0	0	0.0	beta-Oxidation, peroxisome, VLCFA
M00862	239	25.2	0	0.0	239	40.6	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	beta-Oxidation, peroxisome, tri/dihydroxycholestanoyl-CoA => choloyl/chenodeoxycholoyl-CoA
M00866	0	0.0	68	0.8	0	0.0	0	0.0	0	0.0	0	0.0	68	0.9	0	0.0	KDO2-lipid A biosynthesis, Raetz pathway, non-LpxL-LpxM type
M00867	0	0.0	59	0.7	0	0.0	0	0.0	0	0.0	0	0.0	59	0.7	0	0.0	KDO2-lipid A modification pathway
M00868	551	58.0	13	0.2	432	73.5	0	0.0	119	74.8	0	0.0	13	0.2	0	0.0	Heme biosynthesis, animals and fungi, glycine => heme
M00872	300	31.6	0	0.0	299	50.9	0	0.0	0	0.0	1	1.8	0	0.0	0	0.0	O-glycan biosynthesis, mannose type (core M3)
M00873	216	22.7	0	0.0	216	36.7	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Fatty acid biosynthesis in mitochondria, animals
M00874	38	4.0	0	0.0	0	0.0	0	0.0	38	23.9	0	0.0	0	0.0	0	0.0	Fatty acid biosynthesis in mitochondria, fungi
M00875	0	0.0	72	0.9	0	0.0	0	0.0	0	0.0	0	0.0	72	0.9	0	0.0	Staphyloferrin B biosynthesis, L-serine => staphyloferrin B
M00876	0	0.0	107	1.3	0	0.0	0	0.0	0	0.0	0	0.0	107	1.3	0	0.0	Staphyloferrin A biosynthesis, L-ornithine => staphyloferrin A
M00877	0	0.0	70	0.8	0	0.0	0	0.0	0	0.0	0	0.0	70	0.9	0	0.0	Kanosamine biosynthesis, glucose 6-phosphate => kanosamine
M00878	0	0.0	179	2.1	0	0.0	0	0.0	0	0.0	0	0.0	179	2.2	0	0.0	Phenylacetate degradation, phenylaxetate => acetyl-CoA/succinyl-CoA
M00879	0	0.0	861	10.3	0	0.0	0	0.0	0	0.0	0	0.0	861	10.8	0	0.0	Arginine succinyltransferase pathway, arginine => glutamate
M00880	637	67.1	3992	47.5	498	84.7	127	86.4	12	7.5	0	0.0	3645	45.7	347	83.6	Molybdenum cofactor biosynthesis, GTP => molybdenum cofactor
M00881	157	16.5	5456	65.0	0	0.0	141	95.9	0	0.0	16	28.6	5445	68.2	11	2.7	Lipoic acid biosynthesis, plants and bacteria, octanoyl-ACP => dihydrolipoyl-E2/H
M00882	773	81.4	0	0.0	490	83.3	139	94.6	142	89.3	2	3.6	0	0.0	0	0.0	Lipoic acid biosynthesis, eukaryotes, octanoyl-ACP => dihydrolipoyl-H
M00883	472	49.7	463	5.5	472	80.3	0	0.0	0	0.0	0	0.0	463	5.8	0	0.0	Lipoic acid biosynthesis, animals and bacteria, octanoyl-ACP => dihydrolipoyl-H => dihydrolipoyl-E2
M00884	40	4.2	0	0.0	0	0.0	0	0.0	40	25.2	0	0.0	0	0.0	0	0.0	Lipoic acid biosynthesis, octanoyl-CoA => dihydrolipoyl-E2
M00889	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Puromycin biosynthesis, ATP => puromycin
M00890	0	0.0	2	0.0	0	0.0	0	0.0	0	0.0	0	0.0	2	0.0	0	0.0	Roseoflavin biosynthesis, FMN => roseoflavin
M00891	1	0.1	0	0.0	0	0.0	0	0.0	1	0.6	0	0.0	0	0.0	0	0.0	Ditryptophenaline biosynthesis, tryptophan + phenylalanine => ditryptophenaline
M00892	870	91.6	0	0.0	572	97.3	139	94.6	144	90.6	15	26.8	0	0.0	0	0.0	UDP-N-acetyl-D-glucosamine biosynthesis, eukaryotes, glucose => UDP-GlcNAc
M00893	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Lovastatin biosynthesis, malonyl-CoA => lovastatin acid
M00894	1	0.1	0	0.0	0	0.0	1	0.7	0	0.0	0	0.0	0	0.0	0	0.0	Cannabidiol biosynthesis, malonyl-CoA => cannabidiol/dronabinol
M00895	0	0.0	77	0.9	0	0.0	0	0.0	0	0.0	0	0.0	77	1.0	0	0.0	Thiamine biosynthesis, prokaryotes, AIR (+ DXP/glycine) => TMP/TPP
M00896	0	0.0	337	4.0	0	0.0	0	0.0	0	0.0	0	0.0	58	0.7	279	67.2	Thiamine biosynthesis, archaea, AIR (+ NAD+) => TMP/TPP
M00897	112	11.8	0	0.0	0	0.0	112	76.2	0	0.0	0	0.0	0	0.0	0	0.0	Thiamine biosynthesis, plants, AIR (+ NAD+) => TMP/thiamine/TPP
M00898	73	7.7	0	0.0	0	0.0	0	0.0	73	45.9	0	0.0	0	0.0	0	0.0	Thiamine biosynthesis, pyridoxal-5P => TMP/thiamine/TPP
M00899	136	14.3	2730	32.5	0	0.0	135	91.8	0	0.0	1	1.8	2588	32.4	142	34.2	Thiamine salvage pathway, HMP/HET => TMP
M00900	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Crocin biosynthesis, crocetin => crocin
M00901	3	0.3	0	0.0	0	0.0	0	0.0	3	1.9	0	0.0	0	0.0	0	0.0	Fumiquinazoline biosynthesis, tryptophan + alanine + anthranilate => fumiquinazoline
M00902	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Podophyllotoxin biosynthesis, coniferyl alcohol => podophyllotoxin
M00903	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Fosfomycin biosynthesis, phosphoenolpyruvate => fosfomycin
M00904	0	0.0	4	0.0	0	0.0	0	0.0	0	0.0	0	0.0	4	0.1	0	0.0	Dapdiamides biosynthesis, L-2,3-diaminopropanoate => dapdiamide A/B/C
M00905	0	0.0	4	0.0	0	0.0	0	0.0	0	0.0	0	0.0	4	0.1	0	0.0	Grixazone biosynthesis, aspartate 4-semialdehyde => grixazone B
M00906	0	0.0	2	0.0	0	0.0	0	0.0	0	0.0	0	0.0	2	0.0	0	0.0	Ethynylserine biosynthesis, lysine => ethynylserine
M00909	0	0.0	5170	61.6	0	0.0	0	0.0	0	0.0	0	0.0	5170	64.8	0	0.0	UDP-N-acetyl-D-glucosamine biosynthesis, prokaryotes, glucose => UDP-GlcNAc
M00910	143	15.1	0	0.0	0	0.0	142	96.6	0	0.0	1	1.8	0	0.0	0	0.0	Phenylalanine biosynthesis, chorismate => arogenate => phenylalanine
M00911	37	3.9	0	0.0	0	0.0	0	0.0	37	23.3	0	0.0	0	0.0	0	0.0	Riboflavin biosynthesis, fungi, GTP => riboflavin/FMN/FAD
M00912	353	37.2	66	0.8	293	49.8	0	0.0	59	37.1	1	1.8	66	0.8	0	0.0	NAD biosynthesis, tryptophan => quinolinate => NAD
M00913	43	4.5	0	0.0	0	0.0	0	0.0	43	27.0	0	0.0	0	0.0	0	0.0	Pantothenate biosynthesis, 2-oxoisovalerate/spermine => pantothenate
M00914	0	0.0	149	1.8	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	149	35.9	Coenzyme A biosynthesis, archaea, 2-oxoisovalerate => 4-phosphopantoate => CoA
M00915	0	0.0	1	0.0	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	0	0.0	Caffeine degradation, caffeine => xanthine
M00916	333	35.1	2645	31.5	30	5.1	144	98.0	136	85.5	23	41.1	2278	28.5	367	88.4	Pyridoxal-P biosynthesis, R5P + glyceraldehyde-3P + glutamine => pyridoxal-P
M00917	126	13.3	0	0.0	0	0.0	126	85.7	0	0.0	0	0.0	0	0.0	0	0.0	Phytosterol biosynthesis, squalene 2,3-epoxide => campesterol/sitosterol
M00918	0	0.0	190	2.3	0	0.0	0	0.0	0	0.0	0	0.0	190	2.4	0	0.0	Aerobactin biosynthesis, lysine => aerobactin
M00919	0	0.0	304	3.6	0	0.0	0	0.0	0	0.0	0	0.0	304	3.8	0	0.0	Ectoine degradation, ectoine => aspartate
M00921	0	0.0	9	0.1	0	0.0	0	0.0	0	0.0	0	0.0	9	0.1	0	0.0	Cyclooctatin biosynthesis, dimethylallyl-PP + isopentenyl-PP => cyclooctatin
M00922	0	0.0	87	1.0	0	0.0	0	0.0	0	0.0	0	0.0	87	1.1	0	0.0	CMP-Neu5Ac biosynthesis
M00923	0	0.0	7	0.1	0	0.0	0	0.0	0	0.0	0	0.0	7	0.1	0	0.0	UDP-L-FucNAm biosynthesis
M00924	0	0.0	634	7.5	0	0.0	0	0.0	0	0.0	0	0.0	581	7.3	53	12.8	Cobalamin biosynthesis, anaerobic, uroporphyrinogen III => sirohydrochlorin => cobyrinate a,c-diamide
M00925	0	0.0	258	3.1	0	0.0	0	0.0	0	0.0	0	0.0	258	3.2	0	0.0	Cobalamin biosynthesis, aerobic, uroporphyrinogen III => precorrin 2 => cobyrinate a,c-diamide
M00926	0	0.0	1383	16.5	0	0.0	0	0.0	0	0.0	0	0.0	1383	17.3	0	0.0	Heme biosynthesis, bacteria, glutamyl-tRNA => coproporphyrin III => heme
M00927	127	13.4	0	0.0	0	0.0	127	86.4	0	0.0	0	0.0	0	0.0	0	0.0	Gibberellin A12 biosynthesis, GAPP => GA12
M00928	132	13.9	0	0.0	0	0.0	132	89.8	0	0.0	0	0.0	0	0.0	0	0.0	Gibberellin A4/A1 biosynthesis, GA12/GA53 => GA4/GA1
M00929	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Gibberellin A1 biosynthesis, GGPP => GA1
M00930	0	0.0	421	5.0	0	0.0	0	0.0	0	0.0	0	0.0	421	5.3	0	0.0	Menaquinone biosynthesis, futalosine pathway
M00931	0	0.0	221	2.6	0	0.0	0	0.0	0	0.0	0	0.0	221	2.8	0	0.0	Menaquinone biosynthesis, modified futalosine pathway
M00932	126	13.3	54	0.6	0	0.0	126	85.7	0	0.0	0	0.0	54	0.7	0	0.0	Phylloquinone biosynthesis, chorismate (+ phytyl-PP) => phylloquinol
M00933	140	14.7	0	0.0	0	0.0	138	93.9	0	0.0	2	3.6	0	0.0	0	0.0	Plastoquinone biosynthesis, homogentisate + solanesyl-PP => plastoquinol
M00934	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Mycinamicin biosynthesis, malonyl-CoA + methylmalonyl-CoA => protomycinolide IV => mycinamicin II
M00935	0	0.0	40	0.5	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	40	9.6	Methanofuran biosynthesis
M00936	62	6.5	0	0.0	0	0.0	62	42.2	0	0.0	0	0.0	0	0.0	0	0.0	Melatonin biosynthesis, plants, tryptophan => serotonin => melatonin
M00937	1	0.1	0	0.0	0	0.0	0	0.0	1	0.6	0	0.0	0	0.0	0	0.0	Aflatoxin biosynthesis, malonyl-CoA => aflatoxin B1
M00938	811	85.4	4336	51.6	505	85.9	145	98.6	123	77.4	38	67.9	4265	53.4	71	17.1	Pyrimidine deoxyribonucleotide biosynthesis, UDP => dTTP
M00939	0	0.0	307	3.7	0	0.0	0	0.0	0	0.0	0	0.0	307	3.8	0	0.0	Pyrimidine degradation, uracil => 3-hydroxypropanoate
M00940	39	4.1	0	0.0	0	0.0	39	26.5	0	0.0	0	0.0	0	0.0	0	0.0	Flavanone biosynthesis, p-coumaroyl-CoA => liquiritigenin
M00941	17	1.8	0	0.0	0	0.0	17	11.6	0	0.0	0	0.0	0	0.0	0	0.0	Isoflavone biosynthesis, liquiritigenin/naringenin => daidzein/genistein
M00942	16	1.7	0	0.0	0	0.0	16	10.9	0	0.0	0	0.0	0	0.0	0	0.0	Pterocarpan biosynthesis, daidzein => medicarpin
M00943	1	0.1	0	0.0	0	0.0	1	0.7	0	0.0	0	0.0	0	0.0	0	0.0	Reticuline biosynthesis, dopamine + 4HPAA => (S)-reticuline
M00944	1	0.1	0	0.0	0	0.0	1	0.7	0	0.0	0	0.0	0	0.0	0	0.0	Morphine biosynthesis, (S)-reticuline => morphine
M00945	1	0.1	0	0.0	0	0.0	1	0.7	0	0.0	0	0.0	0	0.0	0	0.0	Sanguinarine biosynthesis, (S)-reticuline => sanguinarine
M00946	1	0.1	0	0.0	0	0.0	1	0.7	0	0.0	0	0.0	0	0.0	0	0.0	Noscapine biosynthesis, (S)-reticuline => noscapine
M00947	0	0.0	108	1.3	0	0.0	0	0.0	0	0.0	0	0.0	108	1.4	0	0.0	D-Arginine racemization, D-arginine => L-arginine
M00948	0	0.0	619	7.4	0	0.0	0	0.0	0	0.0	0	0.0	619	7.8	0	0.0	Hydroxyproline degradation, trans-4-hydroxy-L-proline => 2-oxoglutarate
M00949	0	0.0	116	1.4	0	0.0	0	0.0	0	0.0	0	0.0	116	1.5	0	0.0	Staphylopine biosynthesis, L-histidine => staphylopine
M00950	0	0.0	131	1.6	0	0.0	0	0.0	0	0.0	0	0.0	92	1.2	39	9.4	Biotin biosynthesis, BioU pathway, pimeloyl-ACP/CoA => biotin
M00951	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Cremeomycin biosynthesis, aspartate/3,4-AHBA => cremeomycin
M00952	1	0.1	0	0.0	0	0.0	1	0.7	0	0.0	0	0.0	0	0.0	0	0.0	Benzoxazinoid biosynthesis, indoleglycerol phosphate => DIMBOA-glucoside
M00953	17	1.8	0	0.0	0	0.0	17	11.6	0	0.0	0	0.0	0	0.0	0	0.0	Mugineic acid biosynthesis, methionine => 3-epihydroxymugineic acid
M00956	0	0.0	180	2.1	0	0.0	0	0.0	0	0.0	0	0.0	180	2.3	0	0.0	Lysine degradation, bacteria, L-lysine => succinate
M00957	0	0.0	151	1.8	0	0.0	0	0.0	0	0.0	0	0.0	151	1.9	0	0.0	Lysine degradation, bacteria, L-lysine => glutarate => succinate/acetyl-CoA
M00958	593	62.4	2109	25.1	548	93.2	0	0.0	36	22.6	9	16.1	2109	26.4	0	0.0	Adenine ribonucleotide degradation, AMP => Urate
M00959	544	57.3	1705	20.3	526	89.5	0	0.0	14	8.8	4	7.1	1705	21.4	0	0.0	Guanine ribonucleotide degradation, GMP => Urate
M00960	0	0.0	23	0.3	0	0.0	0	0.0	0	0.0	0	0.0	23	0.3	0	0.0	Lysine degradation, bacteria, L-lysine => D-lysine => succinate
M00961	4	0.4	0	0.0	0	0.0	4	2.7	0	0.0	0	0.0	0	0.0	0	0.0	Betacyanin biosynthesis, L-tyrosine => amaranthin
M00962	1	0.1	0	0.0	0	0.0	0	0.0	1	0.6	0	0.0	0	0.0	0	0.0	Psilocybin biosynthesis, tryptophan => psilocybin
M00963	6	0.6	0	0.0	0	0.0	0	0.0	6	3.8	0	0.0	0	0.0	0	0.0	Chanoclavine aldehyde biosynthesis, tryptophan => chanoclavine-I aldehyde
M00964	1	0.1	0	0.0	0	0.0	0	0.0	1	0.6	0	0.0	0	0.0	0	0.0	Fumigaclavine biosynthesis, chanoclavine-I aldehyde => fumigaclavine C
M00965	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	Vindoline biosynthesis, tabersonine => vindoline
M00966	0	0.0	2	0.0	0	0.0	0	0.0	0	0.0	0	0.0	2	0.0	0	0.0	Equol biosynthesis, daidzein => equol
M00967	0	0.0	6	0.1	0	0.0	0	0.0	0	0.0	0	0.0	6	0.1	0	0.0	Flavone degradation, luteolin/apigenin => DHCA/phloretate
M00968	0	0.0	210	2.5	0	0.0	0	0.0	0	0.0	0	0.0	1	0.0	209	50.4	Pentose bisphosphate pathway (nucleoside degradation), archaea, nucleoside/NMP => 3-PGA/glycerone phosphate
M00969	1	0.1	0	0.0	0	0.0	0	0.0	1	0.6	0	0.0	0	0.0	0	0.0	Fumagillin biosynthesis, farnesyl-PP => fumagillin
M00970	824	86.7	5301	63.1	552	93.9	142	96.6	102	64.2	28	50.0	5282	66.2	19	4.6	Proline degradation, proline => glutamate
M00971	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	QS-7 biosynthesis, 2,3-epoxysqualene => QS-7
M00972	821	86.4	1354	16.1	567	96.4	138	93.9	99	62.3	17	30.4	1354	17.0	0	0.0	Proline metabolism
M00973	0	0.0	2	0.0	0	0.0	0	0.0	0	0.0	0	0.0	2	0.0	0	0.0	Anammox, nitrite + ammonia => nitrogen"""

# Build a lookup from KEGG module accession to module description. In every
# table row the accession is the first tab-separated field and the description
# is the last one. The first four lines of the pasted page (leading blank line,
# two title lines, blank line) are not table rows and are skipped.
_module_fields = [row.split("\t") for row in _kegg_module_page.split("\n")[4:]]
kegg_module_description = pd.Series(
    {fields[0]: fields[-1] for fields in _module_fields},
    name="description",
)
# Sanity check: every parsed key must look like a module accession.
assert kegg_module_description.index.str.startswith("M0").all()
len(kegg_module_description)

In [None]:
# Species IDs (as strings, de-duplicated) that belong to the
# "xjin_ucfmt_hmp2" species group.
_species_group = pd.read_table("meta/species_group.tsv")
_in_group = _species_group.species_group_id == "xjin_ucfmt_hmp2"
species_list = _species_group[_in_group].species_id.astype(str).unique()

In [None]:
def parse_taxonomy_string(taxonomy_string):
    """Split a semicolon-delimited lineage string into one entry per rank.

    Parameters
    ----------
    taxonomy_string : str
        Lineage with exactly seven ``;``-separated fields.

    Returns
    -------
    pandas.Series
        The seven fields, in their original order, labelled
        ``d__, p__, c__, o__, f__, g__, s__``.

    Raises
    ------
    ValueError
        If the number of fields does not match the seven rank labels.
    """
    rank_labels = ["d__", "p__", "c__", "o__", "f__", "g__", "s__"]
    fields = taxonomy_string.split(";")
    return pd.Series(data=fields, index=rank_labels)

In [None]:
species_taxonomy_inpath = "ref/uhgg_genomes_all_v2.tsv"

# Keep only the rows where the genome is its own species representative.
_uhgg_genomes = pd.read_table(species_taxonomy_inpath)
_species_reps = _uhgg_genomes[_uhgg_genomes.Genome == _uhgg_genomes.Species_rep]

# The species ID is "1" followed by the third dash-separated field of the
# MGnify accession; the lineage string is expanded to one column per rank.
species_taxonomy = (
    _species_reps.assign(
        species_id=lambda reps: "1" + reps.MGnify_accession.str.split("-").str[2]
    )
    .set_index("species_id")
    .Lineage.apply(parse_taxonomy_string)
)
species_taxonomy

In [None]:
# Palette keyed by phylum, built from the alphabetically sorted phylum labels
# using the "tab10" colormap.
_phyla = sorted(species_taxonomy.p__.unique())
phylum_palette = lib.plot.construct_ordered_palette(_phyla, cm="tab10")

# Disabled check that every phylum received a distinct color:
# assert len(set(phylum_palette.values())) == len((phylum_palette.values()))

### Strain Statistics

In [None]:
def classify_genome(x):
    """Assign a genome to a QC class from its type and filter flags.

    Parameters
    ----------
    x
        Row-like object (e.g. one row of the strain metadata table) exposing
        ``genome_type`` and ``passes_filter``. For SPGC genomes that fail
        ``passes_filter``, ``passes_geno_positions`` and
        ``passes_in_sample_list`` are read as well.

    Returns
    -------
    str
        One of ``'isolate'``, ``'isolate_fails_qc'``, ``'mag'``,
        ``'mag_fails_qc'``, ``'spgc'``, ``'sfacts_only'`` or
        ``'sfacts_fails_qc'``.

    Raises
    ------
    TypeError
        If a flag that is needed for the decision is missing (NaN/None).
    ValueError
        If ``genome_type`` is not ``'Isolate'``, ``'MAG'`` or ``'SPGC'``.
    """

    def _flag(name):
        # Coerce to a plain bool so the logic below can use `not`/`and`.
        # The bitwise `~`/`&` operators only act logically on NumPy booleans:
        # for a built-in bool `~True` is the integer -2, and Python 3.12+
        # deprecates that usage.
        value = getattr(x, name)
        if pd.isna(value):
            raise TypeError(f"Flag {name!r} is missing for genome: {x!r}")
        return bool(value)

    genome_type = x.genome_type
    if genome_type == "Isolate":
        return "isolate" if _flag("passes_filter") else "isolate_fails_qc"
    if genome_type == "MAG":
        return "mag" if _flag("passes_filter") else "mag_fails_qc"
    if genome_type == "SPGC":
        if _flag("passes_filter"):
            return "spgc"
        # Read both flags before combining them so that a missing value in
        # either one is reported instead of being skipped by short-circuiting.
        geno_ok = _flag("passes_geno_positions")
        sample_ok = _flag("passes_in_sample_list")
        return "sfacts_only" if (geno_ok and sample_ok) else "sfacts_fails_qc"
    raise ValueError("Genome did not match classification criteria:", x)

In [None]:
# Load the per-species strain metadata tables and stack them into one frame.
# Species whose table does not exist on disk are recorded and skipped.
missing_species = []
_per_species_tables = []

_species_list = species_list
# For a quick test run, restrict to a single species instead:
# _species_list = ["100003"]

for species in tqdm(_species_list):
    inpath = f"data/group/xjin_ucfmt_hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc-fit.eggnog-strain_gene.strain_meta_for_analysis.tsv"
    if os.path.exists(inpath):
        data = pd.read_table(inpath, index_col="genome_id").assign(species=species)
        _per_species_tables.append(data)
    else:
        missing_species.append(species)

# Tag every genome with its QC class once all tables are combined.
filt_stats = pd.concat(_per_species_tables).assign(
    genome_class=lambda table: table.apply(classify_genome, axis=1)
)

print(
    len(missing_species),
    "out of",
    len(_species_list),
    "species are missing stats.",
)

In [None]:
# Priority order used to pick the "highest" genome class of a cluster: the
# first class in this list with at least one genome wins.
genome_class_order = [
    "isolate",
    "mag",
    "spgc",
    "sfacts_only",
    "isolate_fails_qc",
    "mag_fails_qc",
    "sfacts_fails_qc",
]

# Genome counts per class for every (species, cluster), plus the highest class
# present in that cluster. `reindex` (rather than plain column selection) keeps
# this working when one of the classes never occurs in the data.
d0 = (
    filt_stats.groupby(["species", "clust"])
    .genome_class.value_counts()
    .unstack("genome_class", fill_value=0)
    .assign(
        highest_genome_class=lambda x: (
            x.reindex(columns=genome_class_order, fill_value=0) > 0
        ).idxmax(1)
    )
)

# Number of clusters per species for each highest class; classes that are never
# the highest one in any cluster become all-zero columns instead of a KeyError.
d1 = (
    d0.groupby("species")
    .highest_genome_class.value_counts()
    .unstack(fill_value=0)
    .reindex(columns=genome_class_order, fill_value=0)
)

# Attach taxonomy and order the species by lineage (domain first, species last).
d2 = d1.join(species_taxonomy).sort_values(list(species_taxonomy.columns.values))

row_colors = d2.p__.map(phylum_palette)
sns.clustermap(
    d2[["isolate", "mag", "spgc", "sfacts_only"]],
    norm=mpl.colors.SymLogNorm(1),
    col_cluster=False,
    row_cluster=False,
    row_colors=row_colors,
)

In [None]:
# 2D histogram of `min_ref_gene_raw_diss` (x) against `min_ref_gene_filt_diss`
# (y) for genomes that pass the filter and have a `ref_nn_genome_id`.
_passing_with_ref = filt_stats[lambda x: x.passes_filter].dropna(
    subset=["ref_nn_genome_id"]
)
plt.hist2d(
    "min_ref_gene_raw_diss",
    "min_ref_gene_filt_diss",
    data=_passing_with_ref,
    bins=50,
    # Power-law color scale (gamma = 1/3).
    norm=mpl.colors.PowerNorm(1 / 3),
)
# Trailing None keeps the cell from echoing the hist2d return value.
None

In [None]:
# Species skipped by the loading loop because their strain metadata file was not found.
missing_species

In [None]:
# Show the column labels of the combined strain metadata table.
filt_stats.columns

In [None]:
# What fraction of StrainFacts genotypes (with enough positions == 100) have a genotype dissimilarity of less than 1%?
_is_genotyped_spgc = (
    filt_stats.genome_type.isin(["SPGC"])
    & filt_stats.passes_geno_positions
    & filt_stats.passes_in_sample_list
)
(filt_stats[_is_genotyped_spgc].min_ref_geno_diss < 0.01).mean()

In [None]:
# Set of species for dereplication analysis: every species with at least one
# SPGC genome that passes both the genotyped-positions and sample-list flags.
_is_derep_spgc = (
    filt_stats.genome_type.isin(["SPGC"])
    & filt_stats.passes_geno_positions
    & filt_stats.passes_in_sample_list
)
d = filt_stats[_is_derep_spgc]
species_list1 = list(d["species"].unique())
len(species_list1)

In [None]:
# Dereplication analysis: how many clusters are solely represented by an SPGC (or MAG/Isolate)?

# Keep genomes that pass the genotyped-positions flag. SPGC genomes must also
# pass the sample-list flag; Isolate and MAG genomes need no further flag.
_is_spgc_in_sample_list = (
    filt_stats.genome_type.isin(["SPGC"]) & filt_stats.passes_in_sample_list
)
_is_isolate_or_mag = filt_stats.genome_type.isin(["Isolate", "MAG"])
d = filt_stats[
    filt_stats.passes_geno_positions & (_is_spgc_in_sample_list | _is_isolate_or_mag)
]
d["genome_type"].value_counts()

In [None]:
# Genome types in order of preference when tagging a cluster.
_genome_type_priority = ["Isolate", "MAG", "SPGC"]

d = (
    # Take all genomes with enough genotyped positions, and drop any SPGC
    # genomes that are only found in UCFMT or XJIN samples.
    filt_stats[
        lambda x: x.passes_geno_positions
        & (
            (x.genome_type.isin(["SPGC"]) & x.passes_in_sample_list)
            | (x.genome_type.isin(["Isolate", "MAG"]))
        )
    ]
    # Count the number of genomes of each type in each cluster.
    [["species", "clust", "genome_type"]]
    .value_counts()
    .unstack(fill_value=0)
    # Guarantee one column per genome type even if a type is absent.
    .reindex(columns=_genome_type_priority, fill_value=0)
    # Tag each cluster by its "best type": the first type in priority order
    # with at least one genome. Comparing to zero first matters: idxmax on the
    # raw counts would pick the most numerous type instead, e.g. tagging a
    # cluster with 1 Isolate and 3 SPGC genomes as "SPGC".
    .assign(
        best_genome_type=lambda x: (x[_genome_type_priority] > 0).idxmax(1)
    )
    # Count for each species the number of clusters with each tag.
    .groupby("species")
    .best_genome_type.value_counts()
    .unstack(fill_value=0)
    # Tags that never occur become all-zero columns, so sorting by "SPGC"
    # below cannot fail with a KeyError.
    .reindex(columns=_genome_type_priority, fill_value=0)
)

# Per-species cluster counts and fractions by tag, with taxonomy, for the 20
# species with the most SPGC-tagged clusters.
(
    d.join(d.divide(d.sum(1), axis=0), rsuffix="_frac")
    .join(species_taxonomy[["p__", "f__", "g__", "s__"]])
    .sort_values("SPGC", ascending=False)
    .head(20)
)

In [None]:
# Genome types in order of preference when tagging a cluster.
_genome_type_priority = ["Isolate", "MAG", "SPGC"]

d = (
    # Take all genomes that pass the full filter.
    filt_stats[lambda x: x.passes_filter]
    # Count the number of genomes of each type in each cluster.
    [["species", "clust", "genome_type"]]
    .value_counts()
    .unstack(fill_value=0)
    # Guarantee one column per genome type even if a type is absent.
    .reindex(columns=_genome_type_priority, fill_value=0)
    # Tag each cluster by its "best type": the first type in priority order
    # with at least one genome. Comparing to zero first matters: idxmax on the
    # raw counts would pick the most numerous type instead, e.g. tagging a
    # cluster with 1 Isolate and 3 SPGC genomes as "SPGC".
    .assign(
        best_genome_type=lambda x: (x[_genome_type_priority] > 0).idxmax(1)
    )
    # Count for each species the number of clusters with each tag.
    .groupby("species")
    .best_genome_type.value_counts()
    .unstack(fill_value=0)
    # Tags that never occur become all-zero columns, so sorting by "SPGC"
    # below cannot fail with a KeyError.
    .reindex(columns=_genome_type_priority, fill_value=0)
)

# Per-species cluster counts and fractions by tag, with taxonomy, for the 40
# species with the most SPGC-tagged clusters.
(
    d.join(d.divide(d.sum(1), axis=0), rsuffix="_frac")
    .join(species_taxonomy[["f__", "g__", "s__"]])
    .sort_values("SPGC", ascending=False)
    .head(40)
)

In [None]:
# Report, per species, how many genomes pass the genotyped-positions flag but
# have no `min_ref_geno_diss` value.
print(
    filt_stats[
        lambda x: x.min_ref_geno_diss.isna() & x.passes_geno_positions
    ].species.value_counts()
)

# Every genome passing the genotyped-positions flag must have both values.
# `not` is used instead of bitwise `~`: `~` applied to a built-in bool yields
# -2 or -1, both truthy, so such an assertion could never fail.
_genotyped = filt_stats[lambda x: x.passes_geno_positions]
assert not _genotyped.min_ref_geno_diss.isna().any()
assert not _genotyped.ref_nn_gene_raw_diss.isna().any()

In [None]:
# Species with at least 10 SPGC genomes that pass the full filter.
d = filt_stats[filt_stats.genome_type.isin(["SPGC"]) & filt_stats.passes_filter]
_n_spgc_per_species = d.species.value_counts()
species_list2 = idxwhere(_n_spgc_per_species >= 10)

In [None]:
# Fraction of genomes per species with a missing `min_ref_gene_raw_diss`;
# show the 10 species with the largest fraction.
_frac_missing_by_species = filt_stats.groupby("species").min_ref_gene_raw_diss.apply(
    lambda values: values.isna().mean()
)
_frac_missing_by_species.sort_values().tail(10)

In [None]:
# Quartiles of the four gene dissimilarity columns for SPGC genomes that pass
# the filter and have a `ref_nn_genome_id`; one row per column after `.T`.
_gene_diss_columns = [
    "min_ref_gene_raw_diss",
    "min_ref_gene_filt_diss",
    "ref_nn_gene_raw_diss",
    "ref_nn_gene_filt_diss",
]
_spgc_with_ref = filt_stats[
    lambda x: x.passes_filter & x.genome_type.isin(["SPGC"])
].dropna(subset=["ref_nn_genome_id"])
_spgc_with_ref[_gene_diss_columns].quantile([0.25, 0.5, 0.75]).T

### Relationship between genotype and gene distance

In [None]:
# Spearman correlation between genotype dissimilarity and each gene-content
# dissimilarity column, separately for SPGC genomes and reference genomes.
x = "min_ref_geno_diss"

d0 = filt_stats[lambda x: x.passes_filter].dropna(subset=["ref_nn_genome_id"])

for y, genome_set in product(
    [
        "min_ref_gene_raw_diss",
        "min_ref_gene_filt_diss",
        "ref_nn_gene_raw_diss",
        "ref_nn_gene_filt_diss",
    ],
    [["SPGC"], ["Isolate", "MAG"]],
):
    d1 = d0[d0.genome_type.isin(genome_set)]
    print(y, genome_set, sp.stats.spearmanr(d1[x], d1[y]))

In [None]:
d0 = filt_stats[lambda x: x.passes_filter].dropna(subset=["ref_nn_genome_id"])

# OLS of gene-content dissimilarity on log10 genotype dissimilarity with
# sum-coded per-species intercepts, fit on reference genomes (Isolate/MAG) only.
# Candidate terms that are NOT part of the model:
#   C(species, Sum):genome_type
#   C(species, Sum):np.log10(min_ref_geno_diss)
#   genome_type:np.log10(min_ref_geno_diss)
#   C(species, Sum):genome_type:np.log10(min_ref_geno_diss)
fit_raw_ref = smf.ols(
    "ref_nn_gene_raw_diss ~ 0 + C(species, Sum) + np.log10(min_ref_geno_diss)",
    data=d0[d0.genome_type.isin(["Isolate", "MAG"])],
).fit()
print(fit_raw_ref.aic)

In [None]:
# Observed vs. predicted gene-content dissimilarity (predictions from
# `fit_raw_ref`), one 2D-histogram panel per genome type.
fig, axs = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(7, 9))
bins = np.linspace(0, 0.5, num=50)  # Same bin edges on both axes of every panel.

for (genome_type, d1), ax in zip(
    d0.assign(predict=lambda x: fit_raw_ref.predict(x)).groupby("genome_type"),
    axs.flatten(),
):
    ax.set_title(genome_type)
    ax.hist2d(
        "predict",
        "ref_nn_gene_raw_diss",
        data=d1,
        bins=bins,
        norm=mpl.colors.PowerNorm(1 / 2),
    )
    ax.set_aspect(1)
    # Identity line: points on it are predicted perfectly.
    ax.plot([0, 0.6], [0, 0.6], c="w", lw=1)
    # Label every populated panel. Setting the labels after the loop only
    # affected whichever axis the loop variable happened to end on.
    ax.set_xlabel("predicted")
    ax.set_ylabel("observed")
# Lay out after the labels exist so that they are accounted for.
fig.tight_layout()

In [None]:
# Apply the reference-genome model to SPGC genomes and summarise how observed
# dissimilarity compares with the prediction.
d1 = d0.loc[d0.genome_type == "SPGC"].assign(predict=fit_raw_ref.predict)
print(d1.ref_nn_gene_raw_diss.quantile([0.25, 0.5, 0.75]))
print(d1.predict.quantile([0.25, 0.5, 0.75]))
print(sp.stats.pearsonr(d1.ref_nn_gene_raw_diss, d1.predict))
# Residual = observed - predicted.
print((d1["ref_nn_gene_raw_diss"] - d1["predict"]).quantile([0.25, 0.5, 0.75]))
plt.hist(d1["ref_nn_gene_raw_diss"] - d1["predict"], bins=50)

In [None]:
# Quartiles of the residual (observed - predicted) for the current `d1`,
# shown as rich cell output.
(d1.ref_nn_gene_raw_diss - d1.predict).quantile([0.25, 0.5, 0.75])

In [None]:
d = filt_stats[lambda x: x.passes_filter].dropna(subset=["ref_nn_genome_id"])

# Same model form as `fit_raw_ref` plus a genome_type main effect, fit on all
# genome types together.
# Candidate interaction terms that are NOT part of the model:
#   C(species, Sum):genome_type
#   C(species, Sum):np.log10(min_ref_geno_diss)
#   genome_type:np.log10(min_ref_geno_diss)
#   C(species, Sum):genome_type:np.log10(min_ref_geno_diss)
fit_raw_ref2 = smf.ols(
    "ref_nn_gene_raw_diss ~ "
    "0 + C(species, Sum) + np.log10(min_ref_geno_diss) + genome_type",
    data=d,
).fit()

In [None]:
# Display the regression summary table for the model that includes genome_type.
fit_raw_ref2.summary()

In [None]:
# Genotype vs. gene-content dissimilarity for filtered genomes: Isolate first,
# then SPGC drawn on top (MAGs are not plotted). Log-scaled x axis.
d = filt_stats[lambda x: x.passes_filter]
for _genome_type in ["Isolate", "SPGC"]:
    plt.scatter(
        "min_ref_geno_diss",
        "ref_nn_gene_raw_diss",
        data=d[d.genome_type.isin([_genome_type])],
        s=1,
    )

plt.xscale("log")

## Taxonomic diversity of strains

In [None]:
# Number of filtered SPGC strains per species, restricted to Euryarchaeota.
(
    filt_stats[lambda x: x.passes_filter & x.genome_type.isin(["SPGC"])]
    .species.value_counts()
    .to_frame("num_spgc_strains")
    .join(species_taxonomy)
    .loc[lambda x: x.p__ == "p__Euryarchaeota"]
)

In [None]:
# Number of filtered SPGC strains per species, joined with taxonomy.
d0 = (
    filt_stats[lambda x: x.passes_filter & x.genome_type.isin(["SPGC"])]
    .species.value_counts()
    .to_frame("num_spgc_strains")
    .join(species_taxonomy)
)

fig, ax = plt.subplots(figsize=(5, 20))
ax.invert_yaxis()

# Phyla ordered by their total SPGC strain count (largest first).
_phylum_list = (
    d0.groupby("p__")["num_spgc_strains"].sum().sort_values(ascending=False).index
)
_phylum_palette = lib.plot.construct_ordered_palette(_phylum_list, cm="rainbow")

# One marker per species; species are stacked along y in contiguous phylum blocks.
y_start = 0
for p__ in _phylum_list:
    d1 = d0[d0.p__ == p__]
    num_species = d1.shape[0]
    yy = y_start + np.arange(num_species)
    ax.scatter(
        d1.num_spgc_strains,
        yy,
        color=_phylum_palette[p__],
        s=50,
        marker="x",
        lw=2,
        # Legend entry: (phylum, number of species, total strains).
        label=(p__, num_species, d1.num_spgc_strains.sum()),
    )
    y_start = y_start + num_species
ax.set_xscale("log")
ax.legend(bbox_to_anchor=(1, 1))

In [None]:
d0 = (
    filt_stats[lambda x: x.passes_filter & x.genome_type.isin(["SPGC"])]
    .species.value_counts()
    .to_frame("num_spgc_strains")
    .join(species_taxonomy)
)

# Total filtered SPGC strains per phylum, largest first.
d0.groupby("p__")["num_spgc_strains"].sum().sort_values(ascending=False)

In [None]:
# Per species: number of filtered SPGC strains and the median number of
# genotyped positions, joined with taxonomy.
d0 = (
    filt_stats[lambda x: x.passes_filter & x.genome_type.isin(["SPGC"])]
    .groupby("species")
    .apply(
        lambda x: pd.Series(
            {
                "num_spgc_strains": len(x),
                "num_geno_positions": x.num_geno_positions.median(),
            }
        )
    )
    .join(species_taxonomy)
)


# Print the eight species with the most SPGC strains in each selected phylum.
for p__ in (
    "p__Firmicutes_A",
    "p__Bacteroidota",
    "p__Proteobacteria",
    "p__Euryarchaeota",
):
    d1 = d0[d0.p__ == p__].sort_values(by="num_spgc_strains", ascending=False)
    print(p__)
    print(d1[["num_spgc_strains", "s__"]].head(8))
    print()

## How many species/genomes for pangenomics?

In [None]:
# Load the per-species gene metadata tables and stack them into one frame,
# tagging each row with its species. Species without a file are recorded.
_species_list = species_list

gene_meta = []
missing_species = []
for species in tqdm(_species_list):
    inpath = f"data/species/sp-{species}/midasdb.gene_meta.tsv"
    if os.path.exists(inpath):
        data = pd.read_table(inpath, index_col="gene_id").assign(species=species)
        gene_meta.append(data)
    else:
        missing_species.append(species)
gene_meta = pd.concat(gene_meta)

print(len(missing_species), "out of", len(_species_list), "species are missing stats.")

In [None]:
# Load the per-species gene-to-COG-category tables (long format) and stack them.
_species_list = species_list

_result = []
missing_species = []
for species in tqdm(_species_list):
    inpath = f"data/species/sp-{species}/midasdb.gene_x_cog_category.tsv"
    if os.path.exists(inpath):
        data = pd.read_table(inpath, index_col="gene_id")
        _result.append(data)
    else:
        missing_species.append(species)
gene_x_cog_category = pd.concat(_result)

print(len(missing_species), "out of", len(_species_list), "species are missing stats.")

# Pivot to a boolean gene x category membership matrix (False where unlisted).
gene_x_cog_category_matrix = (
    gene_x_cog_category.reset_index()
    .set_index(["gene_id", "cog_category"])
    .assign(present=True)
    .present.unstack("cog_category", fill_value=False)
)

In [None]:
# COG category letter -> human-readable description (the color column is dropped).
cog_category_description = pd.read_table(
    "ref/cog-20.categories.tsv",
    names=["cog_category", "color", "description"],
    index_col="cog_category",
)["description"]
cog_category_description.sort_index()

In [None]:
# Dereplication analysis: how many clusters are solely represented by an SPGC (or MAG/Isolate)?

# Filtered genomes: SPGC strains must additionally be in the sample list;
# reference genomes (Isolate/MAG) only need to pass the filter.
d = filt_stats[
    lambda x: x.passes_filter
    & (
        (x.genome_type.isin(["SPGC"]) & x.passes_in_sample_list)
        | (x.genome_type.isin(["Isolate", "MAG"]))
    )
]
print("Num strains of each type:", d.genome_type.value_counts())

# NOTE: despite the variable name, the threshold is inclusive (>= 10 SPGC strains).
species_with_gt10_spgc_strains = idxwhere(
    d[d.genome_type.isin(["SPGC"])]["species"].value_counts().ge(10)
)
print("Num species with >=10 genomes:", len(species_with_gt10_spgc_strains))

print(
    "Num strains of each type in pangenomics species:",
    d[d.species.isin(species_with_gt10_spgc_strains)]["genome_type"].value_counts(),
)

In [None]:
# Quartiles of the per-species SPGC strain count among the pangenomics species
# ("nearest" interpolation keeps the result an observed count).
(
    d[lambda x: x.species.isin(species_with_gt10_spgc_strains)]
    .loc[:, ["species", "genome_type"]]
    .value_counts()
    .unstack("genome_type", fill_value=0)
    .SPGC.quantile([0.25, 0.5, 0.75], interpolation="nearest")
)

In [None]:
# Per-species genome counts by type, with the ratio of SPGC strains to reference
# genomes (Isolate + MAG), sorted by that ratio in ascending order.
(
    d[lambda x: x.species.isin(species_with_gt10_spgc_strains)]
    .loc[:, ["species", "genome_type"]]
    .value_counts()
    .unstack("genome_type", fill_value=0)
    .assign(spgc_ratio=lambda x: x.SPGC / (x.Isolate + x.MAG))
    .sort_values(by="spgc_ratio")
)

## Pangenomics (TODO: this doesn't really belong in this notebook)

In [None]:
# Gene prevalence per species, once among SPGC strains and once among reference
# genomes. Genes present in only one of the two tables get prevalence 0 in the other.
prevalence = []

for species in tqdm(species_with_gt10_spgc_strains):
    spgc_inpath = f"data/group/xjin_ucfmt_hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc-fit.uhgg-strain_gene.prevalence-hmp2.tsv"
    ref_inpath = (
        f"data/species/sp-{species}/midasdb.gene75_new.uhgg-strain_gene.prevalence.tsv"
    )
    # Both files are headerless two-column tables: gene_id, prevalence.
    spgc_prev, ref_prev = (
        pd.read_table(
            _path, names=["gene_id", "prevalence"], index_col="gene_id"
        ).prevalence
        for _path in (spgc_inpath, ref_inpath)
    )
    data = pd.DataFrame({"spgc": spgc_prev, "ref": ref_prev}).assign(species=species)
    prevalence.append(data)
prevalence = pd.concat(prevalence).fillna(0)

In [None]:
# Agreement between reference and SPGC gene prevalence: Pearson correlation and
# a log-scaled 2D histogram.
print(sp.stats.pearsonr(prevalence["ref"], prevalence["spgc"]))
plt.hist2d(
    "ref",
    "spgc",
    data=prevalence,
    bins=20,
    norm=mpl.colors.LogNorm(),
)
plt.colorbar();

In [None]:
# Distribution of SPGC prevalence over genes seen in at least one SPGC strain.
plt.hist(
    prevalence["spgc"].loc[lambda x: x > 0],
    bins=np.linspace(0, 1, num=20),
);

In [None]:
# Classify each gene by its SPGC prevalence:
#   > 0.9 -> "core";  > 0.15 (and <= 0.9) -> "shell";  otherwise -> "cloud".
# Vectorised with np.select so the result holds plain strings. Mapping np.where
# over single elements produced one 0-d ndarray per gene, which is unhashable
# (e.g. breaks value_counts/groupby on this Series) and slow for large tables.
prevalence_class = pd.Series(
    np.select(
        [prevalence.spgc > 0.9, prevalence.spgc > 0.15],
        ["core", "shell"],
        default="cloud",
    ),
    index=prevalence.index,
    name=prevalence.spgc.name,
)

In [None]:
# Gene table with the prevalence class, the COG-category membership columns and
# one boolean indicator column per prevalence class.
d0 = (
    prevalence_class.to_frame("prevalence_class")
    .join(gene_x_cog_category_matrix)
    .assign(
        cloud=lambda x: x.prevalence_class == "cloud",
        shell=lambda x: x.prevalence_class == "shell",
        core=lambda x: x.prevalence_class == "core",
    )
)

# For every (prevalence class, COG category) pair, test the association between
# class membership and category membership.
result = []
for _prevalence_class, _cog_category in tqdm(
    list(product(["core", "shell", "cloud"], gene_x_cog_category_matrix.columns))
):
    # 2x2 contingency table: rows = in class?, columns = in category?
    d1 = (
        d0[[_prevalence_class, _cog_category]]
        .value_counts()
        .unstack()
        .reindex(index=[True, False], columns=[True, False])
        .fillna(0)
    )
    # A pseudocount of 1 per cell keeps the odds ratio finite.
    d1_pc = d1 + 1
    _odds_in_class = d1_pc.loc[True, True] / d1_pc.loc[True, False]
    _odds_out_of_class = d1_pc.loc[False, True] / d1_pc.loc[False, False]
    log_oddsratio = np.log2(_odds_in_class / _odds_out_of_class)
    # Fisher's exact test uses the raw counts (no pseudocount).
    result.append(
        (
            _prevalence_class,
            _cog_category,
            log_oddsratio,
            sp.stats.fisher_exact(d1)[1],
        )
    )
prevalence_class_cog_category_enrichment = pd.DataFrame(
    result,
    columns=["prevalence_class", "cog_category", "log2_oddsratio", "pvalue"],
).set_index(["prevalence_class", "cog_category"])

In [None]:
# Heatmap of log2 odds ratios (COG category x prevalence class), with a dot in
# cells whose (uncorrected) Fisher p-value is below 0.05.
d = prevalence_class_cog_category_enrichment

d_oddsr = d["log2_oddsratio"].unstack("prevalence_class")
d_signf = (
    d["pvalue"].map(lambda x: np.where(x < 0.05, "·", "")).unstack("prevalence_class")
)

prevalence_class_order = ["core", "shell", "cloud"]
# Rows ordered from most core-enriched to most core-depleted.
cog_category_order = d_oddsr["core"].sort_values(ascending=False).index

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    d_oddsr.loc[cog_category_order, prevalence_class_order],
    norm=mpl.colors.PowerNorm(1, vmin=-4, vmax=+4),
    cmap="coolwarm",
    ax=ax,
)
# Annotations (because seaborn annotations are failing)
for i, _cog_category in enumerate(cog_category_order):
    for j, _prevalence_class in enumerate(prevalence_class_order):
        ax.annotate(
            d_signf.loc[_cog_category, _prevalence_class],
            xy=(j + 0.5, i + 0.5),
            ha="center",
            va="center",
        )

In [None]:
# COG categories ranked by enrichment among core genes, with descriptions.
(
    prevalence_class_cog_category_enrichment.loc["core"]
    .sort_values(by=["log2_oddsratio"], ascending=False)
    .join(cog_category_description)
)

In [None]:
# COG categories ranked by enrichment among shell genes, with descriptions.
(
    prevalence_class_cog_category_enrichment.loc["shell"]
    .sort_values(by=["log2_oddsratio"], ascending=False)
    .join(cog_category_description)
)

In [None]:
# COG categories ranked by enrichment among cloud genes, with descriptions.
(
    prevalence_class_cog_category_enrichment.loc["cloud"]
    .sort_values(by=["log2_oddsratio"], ascending=False)
    .join(cog_category_description)
)

In [None]:
# Load, for every pangenomics species, the per-genome prevalence-class tables
# for SPGC strains and for reference genomes, then keep only genomes flagged by
# `filt_stats.passes_filter`.
spgc_prevalence_class_tally = []
ref_prevalence_class_tally = []

for species in tqdm(species_with_gt10_spgc_strains):
    spgc_inpath = f"data/group/xjin_ucfmt_hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc-fit.uhgg-strain_gene.prevalence_class_fraction-hmp2.tsv"
    # Index rows by (species, genome_id); the file's "strain" column becomes
    # genome_id and is cast to str.
    spgc_data = (
        pd.read_table(spgc_inpath)
        .assign(species=species)
        .rename(columns={"strain": "genome_id"})
        .astype({"genome_id": str})
        .set_index(["species", "genome_id"])
    )
    spgc_prevalence_class_tally.append(spgc_data)

    # Same table layout for the reference genomes of this species.
    ref_inpath = f"data/species/sp-{species}/midasdb.gene75_new.uhgg-strain_gene.prevalence_class_fraction.tsv"
    ref_data = (
        pd.read_table(ref_inpath)
        .assign(species=species)
        .rename(columns={"strain": "genome_id"})
        .astype({"genome_id": str})
        .set_index(["species", "genome_id"])
    )
    ref_prevalence_class_tally.append(ref_data)

# Filter SPGC: select rows using the boolean `passes_filter` Series, re-indexed
# by (species, genome_id).
# NOTE(review): boolean .loc indexing needs the mask index to align with the
# tally index — confirm both contain the same (species, genome_id) pairs.
spgc_prevalence_class_tally = (
    pd.concat(spgc_prevalence_class_tally)
    .fillna(0)
    .loc[
        filt_stats[
            lambda x: x.genome_type.isin(["SPGC"])
            & x.species.isin(species_with_gt10_spgc_strains)
        ]
        .reset_index()
        .set_index(["species", "genome_id"])
        .passes_filter
    ]
)

# Filter Ref: same selection for MAG and Isolate genomes.
ref_prevalence_class_tally = (
    pd.concat(ref_prevalence_class_tally)
    .fillna(0)
    .loc[
        filt_stats[
            lambda x: x.genome_type.isin(["MAG", "Isolate"])
            & x.species.isin(species_with_gt10_spgc_strains)
        ]
        .reset_index()
        .set_index(["species", "genome_id"])
        .passes_filter
    ]
)

In [None]:
# Row-normalise the tallies so that each genome's prevalence classes sum to 1.
spgc_prevalence_class_frac = spgc_prevalence_class_tally.div(
    spgc_prevalence_class_tally.sum(axis=1), axis="index"
)
ref_prevalence_class_frac = ref_prevalence_class_tally.div(
    ref_prevalence_class_tally.sum(axis=1), axis="index"
)

In [None]:
# Per-species median pangenome fractions among SPGC strains.
d1 = spgc_prevalence_class_frac.groupby("species").median()

for frac in ("core", "shell", "cloud"):
    plt.hist(d1[frac], label=frac, alpha=0.7)
plt.legend()

# `d1` already has exactly one row per species, so its quartiles across species
# can be taken directly.
d1.quantile([0.25, 0.5, 0.75])

In [None]:
# Per-species median pangenome fractions among reference genomes.
d1 = ref_prevalence_class_frac.groupby("species").median()

for frac in ("core", "shell", "cloud"):
    plt.hist(d1[frac], label=frac, alpha=0.7)
plt.legend()

# `d1` already has exactly one row per species, so its quartiles across species
# can be taken directly.
d1.quantile([0.25, 0.5, 0.75])

In [None]:
# Long table of per-species median fractions: reference vs. SPGC, one row per
# (species, pangenome fraction).
d0 = pd.DataFrame(
    {
        "ref": ref_prevalence_class_frac.groupby("species").median().stack(),
        "spgc": spgc_prevalence_class_frac.groupby("species").median().stack(),
    }
).rename_axis(index=["species", "pangenome_fraction"])
# One scatter series (and colour) per pangenome fraction.
for pangenome_fraction, d1 in d0.groupby("pangenome_fraction"):
    plt.scatter("ref", "spgc", data=d1, alpha=0.7, s=10)

In [None]:
# The 20 species with the largest median cloud fraction among SPGC strains,
# shown with the reference fractions (suffix "_ref"), the reference genome
# count and taxonomy.
(
    spgc_prevalence_class_frac.groupby("species")
    .median()
    .join(ref_prevalence_class_frac.groupby("species").median(), rsuffix="_ref")
    .assign(
        total_num_ref_genomes=filt_stats.loc[
            filt_stats.genome_type.isin(["MAG", "Isolate"]), "species"
        ].value_counts()
    )
    .join(species_taxonomy[["f__", "g__", "s__"]])
    .sort_values(by="cloud", ascending=False)
    .head(20)
)

In [None]:
# The 20 species with the smallest median cloud fraction among SPGC strains,
# shown with the reference fractions (suffix "_ref"), the reference genome
# count and taxonomy.
(
    spgc_prevalence_class_frac.groupby("species")
    .median()
    .join(ref_prevalence_class_frac.groupby("species").median(), rsuffix="_ref")
    .assign(
        total_num_ref_genomes=filt_stats.loc[
            filt_stats.genome_type.isin(["MAG", "Isolate"]), "species"
        ].value_counts()
    )
    .join(species_taxonomy[["f__", "g__", "s__"]])
    .sort_values(by="cloud", ascending=True)
    .head(20)
)

In [None]:
# Long-format per-species median fractions with taxonomy, for plotting.
d2 = (
    spgc_prevalence_class_frac.groupby("species")
    .median()
    .rename_axis(columns="pangenome_fraction")
    .stack()
    .to_frame(name="frac")
    .join(species_taxonomy, on="species")
    .reset_index()
)

# One strip per pangenome fraction, dodged and coloured by phylum.
sns.stripplot(data=d2, x="pangenome_fraction", y="frac", hue="p__", dodge=True)
lib.plot.rotate_xticklabels()
plt.legend(bbox_to_anchor=(1, 1))

In [None]:
# Per-species median pangenome fractions for SPGC strains, joined with the
# reference fractions (suffix "_ref"), genome counts per type and taxonomy.
d = (
    spgc_prevalence_class_frac.groupby("species")
    .median()
    .join(ref_prevalence_class_frac.groupby("species").median(), rsuffix="_ref")
    .assign(
        total_num_ref_genomes=filt_stats[
            lambda x: x.genome_type.isin(["MAG", "Isolate"])
        ]["species"].value_counts(),
        total_num_spgc_genomes=filt_stats[lambda x: x.genome_type.isin(["SPGC"])][
            "species"
        ].value_counts(),
    )
    .join(species_taxonomy[["p__", "f__", "g__", "s__"]])
)

_phylum_palette = lib.plot.construct_ordered_palette(d.p__.unique(), cm="tab10")


# Imported here rather than at the top of the notebook.
# NOTE(review): presumably the import is what registers the "ternary" projection
# used below — confirm before moving or removing it.
import mpltern

fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(projection="ternary", ternary_sum=100.0)
ax.grid()


ax.set_tlabel("Core (%)")
ax.set_llabel("Shell (%)")
ax.set_rlabel("Cloud (%)")

# Zoom the three axes to a hand-picked sub-region.
ax.set_tlim(20, 100)
ax.set_llim(10, 62)
ax.set_rlim(0, 35)

# One open circle per species, coloured by phylum and sized by the number of
# SPGC genomes of that species.
for p__, d1 in d.groupby("p__"):
    ax.scatter(
        "core",
        "shell",
        "cloud",
        data=d1,
        s=d1["total_num_spgc_genomes"],
        marker="o",
        lw=2,
        edgecolor=_phylum_palette[p__],
        facecolor="none",
        alpha=0.85,
    )
    # Empty scatter that only carries the legend label for this phylum.
    ax.scatter(
        [],
        [],
        [],
        edgecolor=_phylum_palette[p__],
        label=p__,
        lw=2,
        facecolor="none",
    )

ax.legend(loc="upper left")

In [None]:
# Ternary plot of the per-species median pangenome fractions of the REFERENCE
# genomes (the "*_ref" columns).
# phylum_palette = lib.plot.construct_ordered_palette(species_taxonomy.p__.unique(), cm='tab10')

d = (
    spgc_prevalence_class_frac.groupby("species")
    .median()
    .join(ref_prevalence_class_frac.groupby("species").median(), rsuffix="_ref")
    .assign(
        total_num_ref_genomes=filt_stats[
            lambda x: x.genome_type.isin(["MAG", "Isolate"])
        ]["species"].value_counts()
    )
    .join(species_taxonomy[["f__", "g__", "s__"]])
)

# NOTE(review): presumably this import is what registers the "ternary"
# projection used below — confirm before moving or removing it.
import mpltern

fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(projection="ternary", ternary_sum=100.0)

ax.set_tlabel("Core (%)")
ax.set_llabel("Shell (%)")
ax.set_rlabel("Cloud (%)")

# ax.grid()

# One point per species.
ax.scatter("core_ref", "shell_ref", "cloud_ref", data=d, s=10)

In [None]:
# Ternary plot of the per-species median pangenome fractions of SPGC strains.
# phylum_palette = lib.plot.construct_ordered_palette(species_taxonomy.p__.unique(), cm='tab10')

d = (
    spgc_prevalence_class_frac.groupby("species")
    .median()
    .rename_axis(columns="pangenome_fraction")
    # .stack()
    # .to_frame("frac")
    .join(species_taxonomy, on="species")
    # .assign(phylum_color=lambda x: x.p__.map(phylum_palette))
    # .reset_index()
)

# NOTE(review): presumably this import is what registers the "ternary"
# projection used below — confirm before moving or removing it.
import mpltern

fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(projection="ternary", ternary_sum=100.0)

ax.set_tlabel("Core (%)")
ax.set_llabel("Shell (%)")
ax.set_rlabel("Cloud (%)")

# ax.grid()

# One point per species.
ax.scatter("core", "shell", "cloud", data=d, s=10)

In [None]:
# Load the per-species Moran's I tables and stack them. Species without a file
# are recorded in `missing_species`.
_species_list = species_list

morans_i = []
missing_species = []
for species in tqdm(_species_list):
    inpath = f"data/group/xjin_ucfmt_hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc-fit.uhgg-strain_gene.morans_i.tsv"
    if os.path.exists(inpath):
        data = pd.read_table(inpath, index_col="gene_id")
        morans_i.append(data)
    else:
        missing_species.append(species)
# Add each gene's percentile rank of the `spgc` column within its species
# (species looked up from `gene_meta`).
morans_i = pd.concat(morans_i).assign(
    spgc_species_percentile=lambda x: x.spgc.groupby(gene_meta.species).rank(pct=True)
)

print(len(missing_species), "out of", len(_species_list), "species are missing stats.")

In [None]:
# Debugging leftover: displays whatever path `inpath` currently holds (when run
# in order, the last path built by the preceding loading loop).
inpath

In [None]:
# Species with at least 20 SPGC genomes passing the filter.
d = filt_stats[lambda x: x.passes_filter & x.genome_type.isin(["SPGC"])]
species_list3 = idxwhere(d["species"].value_counts().ge(20))
len(species_list3)

In [None]:
# Reference vs. SPGC Moran's I for genes of species in `species_list2`
# (rows with any missing value are dropped before plotting and correlating).
d = morans_i.join(gene_meta.species).loc[lambda x: x.species.isin(species_list2)]

plt.hist2d(
    "ref",
    "spgc",
    data=d.dropna(),
    bins=20,
    norm=mpl.colors.PowerNorm(1 / 3),
)
sp.stats.pearsonr(d.dropna().ref, d.dropna().spgc)

In [None]:
# Reference vs. SPGC Moran's I for genes of species in `species_list3`
# (rows with any missing value are dropped before plotting and correlating).
d = morans_i.join(gene_meta.species).loc[lambda x: x.species.isin(species_list3)]

plt.hist2d(
    "ref",
    "spgc",
    data=d.dropna(),
    bins=20,
    norm=mpl.colors.PowerNorm(1 / 5),
)
sp.stats.pearsonr(d.dropna().ref, d.dropna().spgc)

In [None]:
# Overlaid distributions of SPGC and reference Moran's I (log-scaled counts).
for _column in ("spgc", "ref"):
    plt.hist(d[_column].dropna(), bins=100, alpha=0.5)

plt.yscale("log");

In [None]:
# Gene metadata with COG memberships, the reference Moran's I (as `cmi`,
# missing values set to 0) and the reference prevalence.
d = (
    gene_meta.join(gene_x_cog_category_matrix)
    .assign(cmi=morans_i.ref, prevalence=prevalence.ref)
    .fillna({"cmi": 0})
)
d

In [None]:
# NOTE: Can take 15 or more minutes to run.

# Robust regressions of SPGC Moran's I (`cmi`) on a prevalence spline plus one
# COG-category indicator, restricted to shell genes (0.15 < prevalence < 0.9).
d0 = (
    pd.DataFrame({"cmi": morans_i.spgc, "prevalence": prevalence.spgc})
    .assign(species=gene_meta.species)
    .join(gene_x_cog_category_matrix)
    .dropna()
)

results = {}
# Per-species fits: one model per (species, category).
for species in tqdm(species_list3):
    d1 = d0[
        lambda x: (x.species == species) & (x.prevalence > 0.15) & (x.prevalence < 0.9)
    ]
    for category in gene_x_cog_category_matrix.columns:
        # Skip categories that are constant within this species (no contrast).
        if d1[category].mean() in (0, 1):
            continue
        fit = smf.rlm(f"cmi ~ cr(prevalence, 4) + {category}", data=d1).fit()
        _term = f"{category}[T.True]"
        results[(species, category)] = [fit.params[_term], fit.pvalues[_term]]

# Pooled fits across all selected species, with species as a covariate.
d1 = d0[
    lambda x: (x.species.isin(species_list3))
    & (x.prevalence > 0.15)
    & (x.prevalence < 0.9)
]
for category in tqdm(gene_x_cog_category_matrix.columns):
    fit = smf.rlm(f"cmi ~ cr(prevalence, 4) + {category} + species", data=d1).fit()
    _term = f"{category}[T.True]"
    results[("overall", category)] = [fit.params[_term], fit.pvalues[_term]]

spgc_species_by_cog_category_cdi_bias_test = pd.DataFrame(
    results.values(), index=results.keys(), columns=["coef", "pvalue"]
)

In [None]:
# Baseline robust regression of SPGC Moran's I on a prevalence spline and
# species (no COG category term), for species in `species_list3`.
d0 = (
    pd.DataFrame({"cmi": morans_i.spgc, "prevalence": prevalence.spgc})
    .assign(species=gene_meta.species)
    .join(gene_x_cog_category_matrix)
    .dropna()
)
d1 = d0[d0.species.isin(species_list3)]
_fit = smf.rlm("cmi ~ cr(prevalence, 10) + species", data=d1).fit()

In [None]:
# Display the summary table of the baseline robust regression `_fit`.
_fit.summary()

In [None]:
# Attach fitted values and residuals of `_fit`, then show residual vs. fitted
# 2D histograms for four COG categories.
d2 = d1.assign(predict=_fit.predict(), resid=_fit.resid)

bins = np.linspace(d1.cmi.min(), d1.cmi.max(), num=10)

fig, axs = plt.subplots(2, 2, sharex=True, sharey=True)
for cat, ax in zip("GWXJ", axs.flatten()):
    ax.hist2d(
        "predict",
        "resid",
        data=d2[d2[cat]],
        bins=bins,
        norm=mpl.colors.PowerNorm(1 / 2),
        density=True,
    )
    ax.set_title(cat)

In [None]:
# Jittered strip plot of gene prevalence for COG categories W (x in [1, 2)) and
# G (x in [2, 3)). The horizontal jitter is random and unseeded.
for _x_offset, _cat, _alpha in [(1, "W", None), (2, "G", 0.2)]:
    plt.scatter(
        np.random.rand(d2[_cat].sum()) + _x_offset,
        d2[d2[_cat]].prevalence,
        s=1,
        alpha=_alpha,
    )

In [None]:
# Residual of `_fit` against prevalence for genes in COG category X.
plt.scatter(
    "prevalence",
    "resid",
    data=d2.loc[d2["X"]],
    s=3,
    alpha=0.2,
)

In [None]:
# Observed Moran's I vs. prevalence for one species, with the fitted trend of
# `_fit` overlaid. The curve must be predicted for the SAME species as the
# plotted points: species enters the model as an additive term, so predicting
# for another species (previously '100022') shifts the curve relative to the data.
_focal_species = "101337"
d3 = d2[lambda x: (x.species == _focal_species)]

plt.scatter("prevalence", "cmi", data=d3, s=3, alpha=0.2)
pp = np.linspace(0, 1)  # Prevalence grid for the fitted curve.
plt.plot(
    pp,
    _fit.predict(pd.DataFrame(dict(prevalence=pp, species=_focal_species))),
    color="tab:orange",
)

In [None]:
# Median within-species percentile rank of Moran's I over all genes of species 101337.
(
    d2[lambda x: x.species == "101337"]
    .assign(cmi_rank=lambda x: x.cmi.rank(pct=True))
    .cmi_rank.median()
)

In [None]:
# Same percentile ranks (computed over all genes of species 101337), but the
# median is taken over genes in COG category G only.
(
    d2[lambda x: x.species == "101337"]
    .assign(cmi_rank=lambda x: x.cmi.rank(pct=True))
    .loc[lambda x: x.G]
    .cmi_rank.median()
)

In [None]:
# Quantile-quantile comparison of `_fit` residuals: category W (x) vs. G (y),
# with a reference identity segment.
qq = np.linspace(0, 1, num=100)
plt.scatter(
    d2.loc[d2["W"], "resid"].quantile(qq),
    d2.loc[d2["G"], "resid"].quantile(qq),
)
plt.plot([0, 0.15], [0, 0.15])

In [None]:
# NOTE: Can take 15 or more minutes to run.

# Same analysis as for SPGC strains, but using the REFERENCE Moran's I and
# prevalence: robust regressions of `cmi` on a prevalence spline plus one
# COG-category indicator, restricted to shell genes (0.15 < prevalence < 0.9).
d0 = (
    pd.DataFrame({"cmi": morans_i.ref, "prevalence": prevalence.ref})
    .assign(species=gene_meta.species)
    .join(gene_x_cog_category_matrix)
    .dropna()
)

results = {}
# Per-species fits: one model per (species, category).
for species in tqdm(species_list3):
    d1 = d0[
        lambda x: (x.species == species) & (x.prevalence > 0.15) & (x.prevalence < 0.9)
    ]
    for category in gene_x_cog_category_matrix.columns:
        # Skip categories that are constant within this species (no contrast).
        if d1[category].mean() in (0, 1):
            continue
        fit = smf.rlm(f"cmi ~ cr(prevalence, 4) + {category}", data=d1).fit()
        _term = f"{category}[T.True]"
        results[(species, category)] = [fit.params[_term], fit.pvalues[_term]]

# Pooled fits across all selected species, with species as a covariate.
d1 = d0[
    lambda x: (x.species.isin(species_list3))
    & (x.prevalence > 0.15)
    & (x.prevalence < 0.9)
]
for category in tqdm(gene_x_cog_category_matrix.columns):
    fit = smf.rlm(f"cmi ~ cr(prevalence, 4) + {category} + species", data=d1).fit()
    _term = f"{category}[T.True]"
    results[("overall", category)] = [fit.params[_term], fit.pvalues[_term]]

ref_species_by_cog_category_cdi_bias_test = pd.DataFrame(
    results.values(), index=results.keys(), columns=["coef", "pvalue"]
)

In [None]:
# Heatmap of the COG-category coefficients (species x category) from the SPGC
# fits; a bullet marks cells with (uncorrected) p < 0.05. Untested
# species/category pairs are shown as coefficient 0 / p-value 1.
d0 = spgc_species_by_cog_category_cdi_bias_test
d1_coef = d0["coef"].unstack(fill_value=0)
d1_signf = (
    d0["pvalue"].unstack(fill_value=1.0).map(lambda p: np.where(p < 0.05, "•", ""))
)

# Categories ordered by mean coefficient; species by mean squared coefficient
# (largest first), with the pooled "overall" row placed last.
cmi_category_order = d1_coef.mean().sort_values().index.tolist()
cmi_species_order = [
    *(d1_coef**2).mean(axis=1).sort_values(ascending=False).drop("overall").index,
    "overall",
]

d2_coef = d1_coef.loc[cmi_species_order, cmi_category_order]
d2_signf = d1_signf.loc[cmi_species_order, cmi_category_order]

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    d2_coef,
    norm=mpl.colors.SymLogNorm(linthresh=0.001, vmin=-0.1, vmax=0.1),
    xticklabels=1,
    yticklabels=1,
    cmap="PuOr",
    ax=ax,
)

# Annotations (because seaborn annotations are failing)
for i, species in enumerate(cmi_species_order):
    for j, category in enumerate(cmi_category_order):
        ax.annotate(
            d2_signf.loc[species, category],
            xy=(j + 0.5, i + 0.5),
            ha="center",
            va="center",
        )

In [None]:
# Pooled ("overall") coefficients and p-values, in the heatmap's category order.
(
    spgc_species_by_cog_category_cdi_bias_test.loc["overall"].reindex(
        cmi_category_order
    )
)

In [None]:
# COG category descriptions, in the heatmap's category order.
cog_category_description.reindex(index=cmi_category_order)

In [None]:
# Toy example: Pearson correlation between two binary vectors of length n that
# differ only in where the 0 -> 1 boundary falls (shifted by `offset` entries).
n = 20
r = 0.1
offset = 1

# Number of zeros (p) and ones (q) in the first vector.
p, q = (int(round(n * frac)) for frac in (r, 1 - r))
print(p, q)
x = [0 for _ in range(p)] + [1 for _ in range(q)]
y = [0 for _ in range(p - offset)] + [1 for _ in range(q + offset)]
print(x)
print(y)
sp.stats.pearsonr(x, y)

In [None]:
# Comma-separated string of the species in `species_list3`.
','.join(species_list3)

In [None]:
# Heatmap of the COG-category coefficients from the REFERENCE fits, using the
# row/column order already stored in `cmi_species_order` / `cmi_category_order`.
# A bullet marks cells with (uncorrected) p < 0.05.
d0 = ref_species_by_cog_category_cdi_bias_test
d1_coef = d0["coef"].unstack(fill_value=0)
d1_signf = (
    d0["pvalue"].unstack(fill_value=1.0).map(lambda p: np.where(p < 0.05, "•", ""))
)

d2_coef = d1_coef.loc[cmi_species_order, cmi_category_order]
d2_signf = d1_signf.loc[cmi_species_order, cmi_category_order]

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    d2_coef,
    norm=mpl.colors.SymLogNorm(linthresh=0.001, vmin=-0.1, vmax=0.1),
    xticklabels=1,
    yticklabels=1,
    cmap="PuOr",
    ax=ax,
)

# Annotations (because seaborn annotations are failing)
for i, species in enumerate(cmi_species_order):
    for j, category in enumerate(cmi_category_order):
        ax.annotate(
            d2_signf.loc[species, category],
            xy=(j + 0.5, i + 0.5),
            ha="center",
            va="center",
        )

In [None]:
# Load, per species, the gene -> cluster assignments derived from reference
# genomes and from SPGC strains, side by side. Species are skipped (and
# recorded) when the SPGC file is missing.
_species_list = species_list

co_clust = []
missing_species = []
for species in tqdm(_species_list):
    ref_inpath = f"data/species/sp-{species}/midasdb.gene75_new.uhgg-strain_gene.gene_clust-t10.tsv"
    spgc_inpath = f"data/group/xjin_ucfmt_hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc-fit.uhgg-strain_gene.gene_clust-t10.tsv"
    if not os.path.exists(spgc_inpath):
        missing_species.append(species)
        continue
    # Both files are headerless two-column tables: gene_id, cluster.
    ref_data, spgc_data = (
        pd.read_table(_path, names=["gene_id", "cluster"], index_col="gene_id").cluster
        for _path in (ref_inpath, spgc_inpath)
    )
    co_clust.append(
        pd.DataFrame({"ref_clust": ref_data, "spgc_clust": spgc_data}).assign(
            species=species
        )
    )
# Genes present in only one of the two tables get the sentinel cluster -5.
co_clust = (
    pd.concat(co_clust)
    .fillna({"spgc_clust": -5, "ref_clust": -5})
    .astype({"spgc_clust": int, "ref_clust": int})
)

print(len(missing_species), "out of", len(_species_list), "species are missing stats.")

In [None]:
# Sizes of SPGC gene clusters (non-negative cluster ids, species in
# `species_list3`) that contain more than one gene.
d = (
    co_clust[lambda x: (x.spgc_clust >= 0) & x.species.isin(species_list3)]
    .loc[:, ["species", "spgc_clust"]]
    .value_counts()
    .loc[lambda x: x > 1]
)
clust_list = d.index
# (number of clusters, total genes in them, mean cluster size)
len(d), d.sum(), d.mean()

In [None]:
# Median number of multi-gene clusters per species.
d.groupby("species").size().median()

In [None]:
# Long gene -> KEGG module mapping: one row per (gene, module) pair, built by
# splitting the comma-separated KEGG_Module annotation.
gene_x_module = gene_meta["KEGG_Module"].dropna().str.split(",").explode()
gene_x_module.value_counts().head(20)

In [None]:
# Long gene -> KEGG pathway mapping, keeping only identifiers that start with "map".
gene_x_pathway = (
    gene_meta["KEGG_inpathway"]
    .dropna()
    .str.split(",")
    .explode()
    .loc[lambda x: x.str.startswith("map")]
)
gene_x_pathway.value_counts().head(20)

In [None]:
# One row per (gene, KEGG module) for genes that are in an SPGC cluster and have
# a module annotation. Cluster ids are prefixed with the species so that they
# are unique across species.
d0 = (
    co_clust[lambda x: (x.spgc_clust >= 0) & (x.index.isin(gene_x_module.index))]
    .join(gene_x_module)
    .assign(spgc_clust=lambda x: x.species + "-" + x.spgc_clust.astype(str))
)
d0

In [None]:
thresh = 1

# Observed statistic: the number of SPGC gene clusters in which more than
# `thresh` member genes are annotated with the same KEGG module.
obs_count_module_multihit = (
    d0[["spgc_clust", "KEGG_Module"]]
    .value_counts()
    .gt(thresh)
    .groupby("spgc_clust")
    .any()
    .sum()
)

# Null distribution: the same statistic after shuffling cluster labels among
# the rows of each species. Seeded so that the result is reproducible.
_perm_rng = np.random.default_rng(0)
perm_count_module_multihit = []
for i in tqdm(range(999)):
    perm_count_module_multihit.append(
        d0.assign(
            # Permute clusters within species. `transform` hands the shuffled
            # labels back aligned to the original rows. The previous
            # `groupby(...).sample(frac=1).values` returned them ordered by
            # group, which only matches the row order if `d0` is already
            # sorted by species.
            spgc_clust=lambda x: x.groupby("species").spgc_clust.transform(
                lambda s: _perm_rng.permutation(s.to_numpy())
            )
        )[["spgc_clust", "KEGG_Module"]]
        .value_counts()
        .gt(thresh)
        .groupby("spgc_clust")
        .any()
        .sum()
    )
perm_count_module_multihit = np.array(perm_count_module_multihit)
plt.hist(perm_count_module_multihit)
plt.axvline(obs_count_module_multihit)

# Null mean, null standard deviation, observed count, observed / null mean.
print(
    perm_count_module_multihit.mean(),
    perm_count_module_multihit.std(),
    obs_count_module_multihit,
    obs_count_module_multihit / perm_count_module_multihit.mean(),
)

In [None]:
thresh = 2

# Observed statistic: the number of SPGC gene clusters in which more than
# `thresh` member genes are annotated with the same KEGG module.
obs_count_module_multihit = (
    d0[["spgc_clust", "KEGG_Module"]]
    .value_counts()
    .gt(thresh)
    .groupby("spgc_clust")
    .any()
    .sum()
)

# Null distribution: the same statistic after shuffling cluster labels among
# the rows of each species. Seeded so that the result is reproducible.
_perm_rng = np.random.default_rng(0)
perm_count_module_multihit = []
for i in tqdm(range(999)):
    perm_count_module_multihit.append(
        d0.assign(
            # Permute clusters within species. `transform` hands the shuffled
            # labels back aligned to the original rows. The previous
            # `groupby(...).sample(frac=1).values` returned them ordered by
            # group, which only matches the row order if `d0` is already
            # sorted by species.
            spgc_clust=lambda x: x.groupby("species").spgc_clust.transform(
                lambda s: _perm_rng.permutation(s.to_numpy())
            )
        )[["spgc_clust", "KEGG_Module"]]
        .value_counts()
        .gt(thresh)
        .groupby("spgc_clust")
        .any()
        .sum()
    )
perm_count_module_multihit = np.array(perm_count_module_multihit)
plt.hist(perm_count_module_multihit)
plt.axvline(obs_count_module_multihit)

# Null mean, null standard deviation, observed count, observed / null mean.
print(
    perm_count_module_multihit.mean(),
    perm_count_module_multihit.std(),
    obs_count_module_multihit,
    obs_count_module_multihit / perm_count_module_multihit.mean(),
)

In [None]:
thresh = 3

# Observed statistic: the number of SPGC gene clusters in which more than
# `thresh` member genes are annotated with the same KEGG module.
obs_count_module_multihit = (
    d0[["spgc_clust", "KEGG_Module"]]
    .value_counts()
    .gt(thresh)
    .groupby("spgc_clust")
    .any()
    .sum()
)

# Null distribution: the same statistic after shuffling cluster labels among
# the rows of each species. Seeded so that the result is reproducible.
_perm_rng = np.random.default_rng(0)
perm_count_module_multihit = []
for i in tqdm(range(999)):
    perm_count_module_multihit.append(
        d0.assign(
            # Permute clusters within species. `transform` hands the shuffled
            # labels back aligned to the original rows. The previous
            # `groupby(...).sample(frac=1).values` returned them ordered by
            # group, which only matches the row order if `d0` is already
            # sorted by species.
            spgc_clust=lambda x: x.groupby("species").spgc_clust.transform(
                lambda s: _perm_rng.permutation(s.to_numpy())
            )
        )[["spgc_clust", "KEGG_Module"]]
        .value_counts()
        .gt(thresh)
        .groupby("spgc_clust")
        .any()
        .sum()
    )
perm_count_module_multihit = np.array(perm_count_module_multihit)
plt.hist(perm_count_module_multihit)
plt.axvline(obs_count_module_multihit)

# Null mean, null standard deviation, observed count, observed / null mean.
print(
    perm_count_module_multihit.mean(),
    perm_count_module_multihit.std(),
    obs_count_module_multihit,
    obs_count_module_multihit / perm_count_module_multihit.mean(),
)

In [None]:
# Subject metadata; cross-tabulate donor against remission, keeping missing
# values as their own category.
subject = pd.read_table("meta/ucfmt/subject.tsv", index_col="subject_id")
(
    subject.loc[:, ["donor_subject_id", "remission"]]
    .value_counts(dropna=False)
    .unstack()
)

In [None]:
# Donor x remission table restricted to subjects explicitly recorded as NOT
# withdrawn due to failure (a missing flag is treated as withdrawn).
(
    subject[lambda x: ~x.withdrawal_due_to_failure.fillna(True)]
    .loc[:, ["donor_subject_id", "remission"]]
    .value_counts()
    .unstack()
)

Unfortunately, no evidence that donors performed differently. :-/

In [None]:
# NOTE(review): in this part of the notebook `donor_comparison` is first
# assigned in the NEXT code cell. Unless it is also defined earlier, this cell
# raises NameError on Restart & Run All — confirm, then move or remove it.
donor_comparison

In [None]:
# Load the per-species UCFMT gene tables and label each gene by which of the
# two donors (D0044, D0097) carries it.
_species_list = species_list

donor_comparison = []
missing_species = []
for species in tqdm(_species_list):
    inpath = f"data/group/xjin_ucfmt_hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc-fit.uhgg-strain_gene-ucfmt.tsv"
    if os.path.exists(inpath):
        donor_comparison.append(pd.read_table(inpath, index_col="gene_id"))
    else:
        missing_species.append(species)

donor_comparison = (
    pd.concat(donor_comparison)
    .loc[:, ["D0044", "D0097"]]
    .dropna()
    .assign(
        # Encode the two donor columns as a 2-bit number (D0097 = bit 0,
        # D0044 = bit 1), then look the code up.
        label=lambda x: (x.D0097 * 2**0 + x.D0044 * 2**1).map(
            {0: "neither", 1: "d97", 2: "d44", 3: "both"}
        )
    )
)

print(len(missing_species), "out of", len(_species_list), "species are missing stats.")

In [None]:
# Per-species number of genes with each donor label, summarized across species
# by quartiles (one row per label in the displayed table).
d = (
    donor_comparison.groupby(gene_meta.species)
    .label.value_counts()
    .unstack()
)
d.quantile(q=[0.25, 0.5, 0.75]).transpose()

In [None]:
# As above, but each species' label counts are first converted to fractions of that
# species' row total before taking quartiles across species.
d = (
    donor_comparison.groupby(gene_meta.species)
    .label.value_counts()
    .unstack()
)
d.div(d.sum(axis=1), axis=0).quantile(q=[0.25, 0.5, 0.75]).transpose()

In [None]:
_species_list = species_list

# Read one MWAS result table per species, tagging rows with their species; species
# without a file on disk are recorded in `missing_species`.
missing_species = []
_mwas_by_species = []
for species in tqdm(_species_list):
    inpath = f"data/group/xjin_ucfmt_hmp2/species/sp-{species}/r.proc.gtpro.sfacts-fit.gene99_new-v22-agg75.spgc-fit.uhgg-strain_gene.hmp2_mwas-f30-n1.tsv"
    if os.path.exists(inpath):
        _mwas_by_species.append(
            pd.read_table(inpath, index_col="gene_id").assign(species=species)
        )
    else:
        missing_species.append(species)

mwas = (
    pd.concat(_mwas_by_species)
    # Transformed effect size / p-value and per-gene subject tallies.
    .assign(
        log2_oddsratio_pc_ibd=lambda x: np.log2(x.oddsratio_pc_ibd),
        neg_log10_pvalue=lambda x: -np.log10(x.fisher_exact_pvalue_ibd),
        num_subjects_present=lambda x: x["present-nonIBD"] + x["present-IBD"],
        num_subjects_absent=lambda x: x["absent-nonIBD"] + x["absent-IBD"],
    )
    # Quantities derived from the tallies above.
    .assign(
        num_subjects_total=lambda x: x.num_subjects_present + x.num_subjects_absent,
        prevalence=lambda x: x.num_subjects_present / x.num_subjects_total,
    )
)

print(len(missing_species), "out of", len(_species_list), "species are missing stats.")

In [None]:
# The ten species with the fewest "d44" genes (species x label count table).
# No species has 0 d44/d97 genes, meaning they're always different strains across the two donors.
(
    donor_comparison.join(gene_meta.species)
    .value_counts(subset=["species", "label"])
    .unstack()
    .sort_values("d44")
    .head(10)
)

In [None]:
# Number of rows of co_clust in each (species, cluster) group. `size()` is the built-in
# equivalent of `apply(len)` and avoids a Python-level call per group.
clust_size = co_clust.groupby(['species', 'spgc_clust']).size()
clust_size

In [None]:
# Restrict cluster assignments to species_list3 and align them with the donor table.
_clust, _donor = align_indexes(
    co_clust[lambda x: x.species.isin(species_list3)], donor_comparison
)
# Cluster sizes come from ALL of co_clust (cluster IDs >= 0), not only the aligned genes.
_clust_size = co_clust[lambda x: x.spgc_clust >= 0][
    ["species", "spgc_clust"]
].value_counts()

d = _donor.join(_clust)
_by_cluster = d.groupby(["species", "spgc_clust"])
_label_counts_by_cluster = _by_cluster.label.value_counts().unstack(fill_value=0)

donor_strain_cluster_membership = (
    _by_cluster[["D0044", "D0097"]]
    .sum()  # Per-cluster sum of each donor column.
    .astype(int)
    .assign(clust_size=_clust_size)[lambda x: x.clust_size > 1]  # Drop singletons.
    .join(_label_counts_by_cluster)
    .assign(
        D0044_frac=lambda x: x.D0044 / x.clust_size,
        D0097_frac=lambda x: x.D0097 / x.clust_size,
    )
)

donor_strain_cluster_membership.sort_values("clust_size", ascending=False)

In [None]:
# Joint distribution of the per-cluster donor fractions, in 20 equal-width bins per axis.
_frac_bins = np.linspace(0, 1, num=21)
plt.hist2d(
    "D0097_frac",
    "D0044_frac",
    data=donor_strain_cluster_membership[lambda x: x.clust_size > 1],
    bins=_frac_bins,
)
plt.colorbar()
None

In [None]:
# Per-species totals: label counts, plus the number of clusters counted as "present"
# (donor fraction > 0.9) in each donor and in both.
(
    donor_strain_cluster_membership[lambda x: x.clust_size > 1]
    .assign(
        d44_present=lambda x: x.D0044_frac > 0.9,
        d97_present=lambda x: x.D0097_frac > 0.9,
        both_present=lambda x: x.d44_present & x.d97_present,
    )[["both", "d44", "d97", "d44_present", "d97_present", "both_present"]]
    .groupby("species")
    .sum()
    .join(species_taxonomy.s__)
)

In [None]:
# Number of species being considered (distinct values of the "species" index level).
donor_strain_cluster_membership.index.get_level_values("species").value_counts().shape

In [None]:
def _is_donor_specific(x):
    """Flag rows where one donor fraction is > 0.9 and the other is < 0.1."""
    only_d44 = (x.D0044_frac > 0.9) & (x.D0097_frac < 0.1)
    only_d97 = (x.D0097_frac > 0.9) & (x.D0044_frac < 0.1)
    return only_d44 | only_d97


differential_cluster_membership = donor_strain_cluster_membership.loc[
    lambda x: x.clust_size > 1, ["D0044_frac", "D0097_frac"]
].loc[_is_donor_specific]

# After rounding, each fraction is 0 or 1, so the sums count clusters per donor and species.
differential_cluster_membership.round().groupby("species").sum()

In [None]:
# Shape of the per-species table and the grand total over all of its cells.
d = differential_cluster_membership.round().groupby("species").sum()
(d.shape, d.to_numpy().sum())

In [None]:
# Donor-specific clusters of species 102506, largest first.
(
    differential_cluster_membership.assign(
        num_genes=donor_strain_cluster_membership.clust_size
    )
    .sort_values("num_genes", ascending=False)
    .xs("102506", level="species")
)

In [None]:
# Inspect the available gene annotation columns (exploratory check; no state is changed).
gene_meta.columns

In [None]:
_species = "102506"

# Text report for up to 50 of this species' donor-specific clusters, largest first:
# donor gene counts, COG category totals, gene annotations, and MWAS statistics.
for _clust in (
    differential_cluster_membership.assign(
        num_genes=donor_strain_cluster_membership.clust_size
    )
    .sort_values("num_genes", ascending=False)
    .xs(_species, level="species")
    .head(50)
    .index
):
    # Genes assigned to this cluster; looked up once and reused by every section below.
    _clust_genes = co_clust[
        lambda x: (x.species == _species) & (x.spgc_clust == _clust)
    ].index

    print("-------------")
    print("Cluster", _clust)
    print()
    print(
        donor_strain_cluster_membership.loc[[(_species, _clust)]][
            ["D0044", "D0097", "clust_size"]
        ]
    )
    print()
    # COG categories with at least one gene in the cluster, most frequent first.
    print(
        gene_x_cog_category_matrix.loc[_clust_genes]
        .sum()[lambda x: x > 0]
        .sort_values(ascending=False)
        .to_frame()
        .T
    )
    print()
    # One line per gene that has a preferred name and/or a description.
    print(
        "\n".join(
            gene_meta.loc[_clust_genes][
                lambda x: ~(x.Preferred_name.isna() & x.Description.isna())
            ]
            .astype(str)
            .apply(
                lambda x: f"({x.Preferred_name}) {x.Description} ({x.KEGG_ko})", axis=1
            )
            .values
        )
    )
    print()
    # Distinct (rounded) MWAS statistic profiles among the cluster's genes.
    print(
        mwas.loc[_clust_genes]
        .rename(
            columns={
                "num_subjects_total": "nsubj",
                "log2_oddsratio_pc_ibd": "lor",
                "fisher_exact_pvalue_ibd": "p",
            }
        )[["nsubj", "prevalence", "lor", "p"]]
        .round(2)
        .assign(morans_i=morans_i.spgc_species_percentile)
        .value_counts()
    )
    print()

In [None]:
_species = "101337"

# Text report for up to 50 of this species' donor-specific clusters, largest first:
# donor gene counts, COG category totals, gene annotations, and MWAS statistics.
for _clust in (
    differential_cluster_membership.assign(
        num_genes=donor_strain_cluster_membership.clust_size
    )
    .sort_values("num_genes", ascending=False)
    .xs(_species, level="species")
    .head(50)
    .index
):
    # Genes assigned to this cluster; looked up once and reused by every section below.
    _clust_genes = co_clust[
        lambda x: (x.species == _species) & (x.spgc_clust == _clust)
    ].index

    print("-------------")
    print("Cluster", _clust)
    print()
    print(
        donor_strain_cluster_membership.loc[[(_species, _clust)]][
            ["D0044", "D0097", "clust_size"]
        ]
    )
    print()
    # COG categories with at least one gene in the cluster, most frequent first.
    print(
        gene_x_cog_category_matrix.loc[_clust_genes]
        .sum()[lambda x: x > 0]
        .sort_values(ascending=False)
        .to_frame()
        .T
    )
    print()
    # One line per gene that has a preferred name and/or a description.
    print(
        "\n".join(
            gene_meta.loc[_clust_genes][
                lambda x: ~(x.Preferred_name.isna() & x.Description.isna())
            ]
            .astype(str)
            .apply(
                lambda x: f"({x.Preferred_name}) {x.Description} ({x.KEGG_ko})", axis=1
            )
            .values
        )
    )
    print()
    # Distinct (rounded) MWAS statistic profiles among the cluster's genes.
    print(
        mwas.loc[_clust_genes]
        .rename(
            columns={
                "num_subjects_total": "nsubj",
                "log2_oddsratio_pc_ibd": "lor",
                "fisher_exact_pvalue_ibd": "p",
            }
        )[["nsubj", "prevalence", "lor", "p"]]
        .round(2)
        .assign(morans_i=morans_i.spgc_species_percentile)
        .value_counts()
    )
    print()

In [None]:
# MWAS results for donor-specific ("d97" or "d44") shell genes longer than 300, with at
# least 10 subjects on each side of the presence/absence split; FDR is computed over
# the rows that pass these filters.
d = (
    mwas[lambda x: (x.num_subjects_present >= 10) & (x.num_subjects_absent >= 10)]
    .assign(
        donor=donor_comparison.label,
        species=gene_meta.species,
        prevalence_class=prevalence_class,
        nlength=gene_meta.nlength,
        clust=co_clust.spgc_clust,
    )
    .loc[
        lambda x: x.donor.isin(["d97", "d44"])
        & (x.prevalence_class == "shell")
        & (x.nlength > 300)
    ]
    .assign(fdr=lambda x: fdrcorrection(x.fisher_exact_pvalue_ibd)[1])
    .sort_values(["fisher_exact_pvalue_ibd", "clust"])
)

d[lambda x: x.fisher_exact_pvalue_ibd < 1e-2]

In [None]:
# Row count of every (species, cluster) pair in co_clust, joined on below as `clust_size`.
_size_by_cluster = (
    co_clust[["species", "spgc_clust"]].value_counts().rename("clust_size")
)

# Genes carried by either or both donors ("d97", "d44", "both"), shell prevalence class,
# nlength > 300, in a cluster (ID >= 0) with more than one member.
d = (
    mwas[lambda x: (x.num_subjects_present >= 10) & (x.num_subjects_absent >= 10)]
    .assign(
        donor=donor_comparison.label,
        species=gene_meta.species,
        prevalence_class=prevalence_class,
        nlength=gene_meta.nlength,
        clust=co_clust.spgc_clust[lambda x: x >= 0],
    )
    .join(_size_by_cluster, on=["species", "clust"])
    .loc[
        lambda x: x.donor.isin(["d97", "d44", "both"])
        & (x.prevalence_class == "shell")
        & (x.nlength > 300)
        & (x.clust_size > 1)
    ]
    .assign(fdr=lambda x: fdrcorrection(x.fisher_exact_pvalue_ibd)[1])
    .sort_values("fisher_exact_pvalue_ibd")
)

# Volcano-style plot: p-value on an inverted log axis vs. log2 odds ratio, colored by FDR.
plt.scatter('log2_oddsratio_pc_ibd', 'fisher_exact_pvalue_ibd', c='fdr', data=d, lw=1, facecolor='none', marker='o', cmap='viridis_r')
plt.yscale("log")
plt.gca().invert_yaxis()
plt.colorbar()

_display_columns = [
    "fisher_exact_pvalue_ibd",
    "oddsratio_pc_ibd",
    "species",
    "log2_oddsratio_pc_ibd",
    "neg_log10_pvalue",
    "num_subjects_present",
    "num_subjects_absent",
    "prevalence",
    "donor",
    "prevalence_class",
    "nlength",
    "clust",
    "clust_size",
    "fdr",
]
d.loc[lambda x: x.fisher_exact_pvalue_ibd < 1e-2, _display_columns].head(30)

In [None]:
# Hits with p < 1e-3, shown next to their gene annotations.
_hit_columns = [
    "fisher_exact_pvalue_ibd",
    "log2_oddsratio_pc_ibd",
    "num_subjects_total",
    "prevalence",
    "donor",
    "fdr",
    "clust",
    "clust_size",
]
_annotation_columns = [
    "species",
    "PFAMs",
    "eggNOG_OGs",
    "COG_category",
    "Description",
    "Preferred_name",
]
d.loc[lambda x: x.fisher_exact_pvalue_ibd < 1e-3, _hit_columns].join(
    gene_meta[_annotation_columns]
)

In [None]:
from statsmodels.graphics.gofplots import qqplot

# Donor-specific shell genes (nlength > 300, >= 10 subjects present and absent),
# sorted by ascending p-value.
d0 = (
    mwas[lambda x: (x.num_subjects_present >= 10) & (x.num_subjects_absent >= 10)]
    .assign(
        donor=donor_comparison.label,
        species=gene_meta.species,
        prevalence_class=prevalence_class,
        nlength=gene_meta.nlength,
        clust=co_clust.spgc_clust,
    )
    .loc[
        lambda x: x.donor.isin(["d97", "d44"])
        & (x.prevalence_class == "shell")
        & (x.nlength > 300)
    ]
    .assign(fdr=lambda x: fdrcorrection(x.fisher_exact_pvalue_ibd)[1])
    .sort_values("fisher_exact_pvalue_ibd")
)

d1 = d0.join(gene_x_cog_category_matrix)

fig, ax = plt.subplots(figsize=(10, 10))

# The ten COG categories with the largest column totals among the tested genes.
cog_category_list = (
    gene_x_cog_category_matrix.loc[d0.index]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# Q-Q style plot: p-values (already sorted) against evenly spaced quantiles on (0, 1].
for cat in cog_category_list:
    d2 = d1[d1[cat]]
    _uniform_quantiles = np.linspace(0, 1, num=d2.shape[0] + 1)[1:]
    ax.scatter(_uniform_quantiles, d2.fisher_exact_pvalue_ibd, label=cat, s=40)
ax.plot([0, 1], [0, 1], color="k")  # Identity line for reference.
ax.legend(bbox_to_anchor=(1, 1), markerscale=1)
ax.set_yscale("log")
ax.set_xscale("log")

In [None]:
# Annotated MWAS table for species 102506 only.
# NOTE(review): d0 is not used by the histograms below, which draw from the full
# `mwas` table (all species) — confirm that this is intended.
d0 = (
    mwas[
        lambda x: (x.species == "102506")
        & (x.num_subjects_present >= 10)
        & (x.num_subjects_absent >= 10)
    ]
    .assign(
        donor=donor_comparison.label,
        species=gene_meta.species,
        prevalence_class=prevalence_class,
        nlength=gene_meta.nlength,
        clust=co_clust.spgc_clust,
    )
    .assign(fdr=lambda x: fdrcorrection(x.fisher_exact_pvalue_ibd)[1])
    .sort_values("fisher_exact_pvalue_ibd")
)


def _low_pvalue_log2_oddsratio(label):
    """log2 odds ratios of genes with the given donor label and p < 1e-2."""
    return mwas.loc[idxwhere(donor_comparison.label == label)][
        lambda x: x.fisher_exact_pvalue_ibd < 1e-2
    ].log2_oddsratio_pc_ibd


# Overlaid effect-size distributions for the two donor-specific gene sets.
bins = np.linspace(-5, 5)
plt.hist(_low_pvalue_log2_oddsratio("d97"), bins=bins, alpha=0.5)
plt.hist(_low_pvalue_log2_oddsratio("d44"), bins=bins, alpha=0.5)

In [None]:
# Donor label x direction-of-effect counts among genes with p < 1e-2.
donor_enrichment_contingency = (
    mwas.assign(
        donor=donor_comparison.label,
        enriched_in_ibd=lambda x: x.log2_oddsratio_pc_ibd > 0,
    )
    .dropna(subset=["donor"])
    .loc[lambda x: x.fisher_exact_pvalue_ibd < 1e-2, ["donor", "enriched_in_ibd"]]
    .value_counts()
    .unstack()
)
donor_enrichment_contingency

In [None]:
# Chi-squared test of independence on the donor x enrichment-direction table.
# NOTE(review): the table is built with `.unstack()` without a fill value, so an empty
# donor/direction combination would appear as NaN rather than 0 — confirm none occur.
sp.stats.chi2_contingency(donor_enrichment_contingency)

In [None]:
# Low p-value (< 1e-2) genes assigned to any of COG categories G, L, M or R, together
# with their annotations.
_focal_cog_categories = ["G", "L", "M", "R"]
d3 = (
    d1.loc[
        lambda x: x[_focal_cog_categories].any(axis=1)
        & (x.fisher_exact_pvalue_ibd < 1e-2),
        ["oddsratio_pc_ibd", "fisher_exact_pvalue_ibd", "fdr", "donor", "species"],
    ]
    .join(
        gene_meta[
            ["eggNOG_OGs", "nlength", "COG_category", "Description", "Preferred_name"]
        ]
    )
    .sort_values("oddsratio_pc_ibd")
)
d3.sort_values("fdr")

In [None]:
# Donor-specific shell genes, flagging hits at p < 5e-3, then donor x direction counts
# among those hits.
d0 = (
    mwas.assign(donor=donor_comparison.label, prevalence_class=prevalence_class)
    .loc[
        lambda x: (x.num_subjects_present >= 10)
        & (x.num_subjects_absent >= 10)
        & x.donor.isin(["d97", "d44"])
        & (x.prevalence_class == "shell")
    ]
    .assign(
        fdr=lambda x: fdrcorrection(x.fisher_exact_pvalue_ibd)[1],
        enriched_in_ibd=lambda x: x.log2_oddsratio_pc_ibd > 0,
        low_pvalue_hit=lambda x: x.fisher_exact_pvalue_ibd < 5e-3,
    )
    .sort_values("fisher_exact_pvalue_ibd")
)
d0.loc[lambda x: x.low_pvalue_hit, ["donor", "enriched_in_ibd"]].value_counts().unstack()

In [None]:
# Donor-specific shell genes, flagging hits at the stricter p < 1e-3, then
# donor x direction counts among those hits.
d0 = (
    mwas.assign(donor=donor_comparison.label, prevalence_class=prevalence_class)
    .loc[
        lambda x: (x.num_subjects_present >= 10)
        & (x.num_subjects_absent >= 10)
        & x.donor.isin(["d97", "d44"])
        & (x.prevalence_class == "shell")
    ]
    .assign(
        fdr=lambda x: fdrcorrection(x.fisher_exact_pvalue_ibd)[1],
        enriched_in_ibd=lambda x: x.log2_oddsratio_pc_ibd > 0,
        low_pvalue_hit=lambda x: x.fisher_exact_pvalue_ibd < 1e-3,
    )
    .sort_values("fisher_exact_pvalue_ibd")
)
d0.loc[lambda x: x.low_pvalue_hit, ["donor", "enriched_in_ibd"]].value_counts().unstack()

In [None]:
# Volcano-style 2D histogram over all MWAS rows; the power-law color norm keeps
# sparsely populated bins visible.
_density_norm = mpl.colors.PowerNorm(1 / 10)
plt.hist2d(
    "log2_oddsratio_pc_ibd",
    "neg_log10_pvalue",
    data=mwas,
    norm=_density_norm,
    bins=100,
)
None

In [None]:
# Genes with more than 20 subjects on each side of the presence/absence split,
# sorted by ascending p-value.
d = mwas[
    lambda x: (x.num_subjects_present > 20)
    & (x.num_subjects_absent > 20)
].sort_values("fisher_exact_pvalue_ibd")
# NOTE(review): the original final line was syntactically incomplete (unclosed
# parenthesis, `assign` given a positional lambda). Reconstructed from the accompanying
# commented-out `fdrcorrection(d.fisher_exact_pvalue_ibd)[1]` — confirm intent.
d.assign(fdr=lambda x: fdrcorrection(x.fisher_exact_pvalue_ibd)[1])

In [None]:
# Per-species counts of significant (FDR < 0.05) vs. non-significant tests, showing the
# 20 species with the most significant hits alongside their taxonomy.
(
    fdr_by_species.assign(signif=lambda x: x.fdr < 0.05)[["species", "signif"]]
    .value_counts()
    .unstack(fill_value=0)
    .sort_values(True, ascending=False)
    .join(species_taxonomy)
    .head(20)
)

In [None]:
# NOTE(review): the original assignment was syntactically incomplete (unclosed
# parenthesis, empty `.pipe()`). Reconstructed as a per-species Benjamini-Hochberg
# correction over the rows selected by `test_filter`; other rows get NaN through index
# alignment — confirm intent. `test_filter` is not defined in this part of the notebook.
mwas = mwas.assign(
    fdr=lambda x: x[test_filter]
    .groupby("species")
    .fisher_exact_pvalue_ibd.transform(lambda p: fdrcorrection(p)[1])
)
plt.hist2d('log2_oddsratio_pc_ibd', 'neg_log10_pvalue', data=mwas[test_filter], norm=mpl.colors.PowerNorm(1/10), bins=100)
None

In [None]:
# Histogram of tested p-values on log-spaced bins, overlaid with the count expected in
# each bin under a uniform distribution (bin width x number of tests).
bins = np.logspace(-5, 0, num=40)
_tested = mwas[test_filter]
plt.hist(_tested.fisher_exact_pvalue_ibd, bins=bins)
plt.plot(bins[1:], np.diff(bins) * _tested.shape[0])
plt.xscale("log")
plt.yscale("log")

In [None]:
plt.plot(np.sort(fdrcorrection(mwas[test_filter].fisher_exact_pvalue_ibd)[1]))