In [1]:
import numpy as np
import pandas as pd
import os
import requests
import json
import xarray as xr
import scipy.stats as stats

from pathlib import Path
from scipy.stats import mannwhitneyu
from statsmodels.stats import multitest


# arrange metadata

In [2]:
# load genome + ortholog metadata and relabel viral genomes for this analysis

# filepaths
filepath_genome_metadata = '../../data/metadata/genome-metadata.csv'
filepath_ortholog_metadata = '../../data/metadata/ortholog-metadata.csv'

# read both metadata tables
genome_df = pd.read_csv(filepath_genome_metadata)
ortho_df = pd.read_csv(filepath_ortholog_metadata)

# 1) fold the uncultured marine viruses into the general 'Virus' group
is_uncultured_virus = genome_df['Group'] == 'Uncultured-marine-virus'
genome_df.loc[is_uncultured_virus, 'Group'] = 'Virus'

# 2) every virus (including the ones relabelled in step 1) gets a placeholder clade
is_virus = genome_df['Group'] == 'Virus'
genome_df.loc[is_virus, 'Clade'] = 'N/A Virus'

# 3) genomes flagged in the boolean 'Virocell' column become their own group and clade;
#    this runs last, so it overrides the labels set in steps 1-2
is_virocell = genome_df['Virocell']
genome_df.loc[is_virocell, 'Group'] = 'Virocell'
genome_df.loc[is_virocell, 'Clade'] = 'N/A Virocell'

genome_df


Unnamed: 0,GenomeID,GenomeName,Type,Group,Clade,Virocell,Completeness
0,2716884698,AG-316-L16,SAG,Prochlorococcus,AMZ-II,False,20.69
1,2716884700,AG-316-N23,SAG,Prochlorococcus,AMZ-II,False,20.69
2,2716884701,AG-316-P23,SAG,Prochlorococcus,AMZ-II,False,12.07
3,2716884699,AG-316-L21,SAG,Prochlorococcus,AMZ-II,False,10.34
4,2716884642,AG-316-A05,SAG,Prochlorococcus,AMZ-II,False,6.90
...,...,...,...,...,...,...,...
765,651703106,S-ShM2,ISOLATE,Virus,N/A Virus,False,0.00
766,651703107,Syn19,ISOLATE,Virus,N/A Virus,False,0.00
767,641201056,Syn5,ISOLATE,Virus,N/A Virus,False,0.00
768,2595698410,metaG-MbCM1,ISOLATE,Virus,N/A Virus,False,0.00


In [3]:
# add genome id and clade info into ortholog df

# index the genome table by name once, then look up each metadata column through it
genome_lookup = genome_df.set_index('GenomeName')
for meta_col in ('GenomeID', 'Group', 'Clade'):
    ortho_df[meta_col] = ortho_df['GenomeName'].map(genome_lookup[meta_col])

ortho_df


Unnamed: 0,MappingName,CyCOGID,GenomeName,GeneID,Annotation,GenomeID,Group,Clade
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit,2606217514,Synechococcus,5.1A-III
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit,2681812859,Prochlorococcus,LLI
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit,2716884419,Prochlorococcus,HLII
3,scB245a_521A19_2655604637,60000001,scB245a_521A19,2655604637,membrane protease FtsH catalytic subunit,2654587735,Prochlorococcus,HLII
4,GFB01_2638208352,60000001,GFB01,2638208352,membrane protease FtsH catalytic subunit,2636415834,Synechococcus,5.2
...,...,...,...,...,...,...,...,...
964917,AG-363-C02_2667889608,60040295,AG-363-C02,2667889608,hypothetical protein,2667527365,Prochlorococcus,LLII.LLIII
964918,AG-363-C02_2667889615,60040295,AG-363-C02,2667889615,hypothetical protein,2667527365,Prochlorococcus,LLII.LLIII
964919,AG-363-C02_2667890048,60040295,AG-363-C02,2667890048,hypothetical protein,2667527365,Prochlorococcus,LLII.LLIII
964920,AG-363-C02_2667890054,60040295,AG-363-C02,2667890054,hypothetical protein,2667527365,Prochlorococcus,LLII.LLIII


In [4]:
# start ko mapping file

def _pivot_counts(frame, column):
    """Count the values of `column` per CyCOGID in `frame` and return them as a wide table.

    Rows are CyCOGIDs, columns are the distinct values of `column`, and combinations
    that never occur are filled with 0.
    """
    tallies = frame.groupby('CyCOGID')[column].value_counts()
    # the rename only matters on pandas versions that name the tally after `column`
    long_form = pd.DataFrame(tallies).rename(columns={column: 'count'}).reset_index()
    return long_form.pivot(columns=column, index='CyCOGID', values='count').fillna(0)


# one row per CyCOG with the number of member genes
total_refs = ortho_df.groupby('CyCOGID').GeneID.count()
ko_map_df = total_refs.reset_index().rename(columns={'GeneID': 'TotalRefs'})

# add in genus counts
ko_map_df = ko_map_df.join(_pivot_counts(ortho_df, 'Group'), on='CyCOGID', how='left')

# add in clade counts (only of select clades we deal with in this study)
clades = ['HLI', 'HLII', 'LLI', '5.1B-I', '5.1B-CRD1', '5.1A-II', '5.1A-III', '5.1A-IV', '5.1A-CRD2', '5.1A-UC-A-EnvC']
in_study_clades = ortho_df.Clade.isin(clades)
ko_map_df = ko_map_df.join(
    _pivot_counts(ortho_df[in_study_clades], 'Clade'),
    on='CyCOGID',
    how='left'
).fillna(0)    # CyCOGs with no member in the selected clades get 0 instead of NaN

# convert dataframe back to ints (the joins / fillna leave float columns)
ko_map_df = ko_map_df.astype(int)

# add back CyCOG annotation
cycog_annotations = ortho_df[['CyCOGID', 'Annotation']].drop_duplicates().set_index('CyCOGID')['Annotation']
ko_map_df['DescriptionCyCOG'] = ko_map_df['CyCOGID'].map(cycog_annotations)

ko_map_df


Unnamed: 0,CyCOGID,TotalRefs,Prochlorococcus,Synechococcus,Virocell,Virus,5.1A-CRD2,5.1A-II,5.1A-III,5.1A-IV,5.1A-UC-A-EnvC,5.1B-CRD1,5.1B-I,HLI,HLII,LLI,DescriptionCyCOG
0,60000001,1376,1211,158,7,0,17,17,8,18,9,37,19,196,532,184,membrane protease FtsH catalytic subunit
1,60000002,1453,1287,157,9,0,10,19,9,16,9,40,18,198,552,198,ATP-dependent Clp protease ATP-binding subunit...
2,60000003,1387,1222,156,9,0,12,16,9,21,8,41,14,203,523,192,ATP-dependent Clp protease proteolytic subunit...
3,60000004,1631,1369,255,7,0,22,24,16,27,26,62,29,173,542,265,hypothetical protein
4,60000005,990,877,108,5,0,8,11,5,14,7,25,13,128,383,131,chaperonin GroEL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40290,60040291,4,4,0,0,0,0,0,0,0,0,0,0,0,0,4,hypothetical protein
40291,60040292,4,4,0,0,0,0,0,0,0,0,0,0,0,0,4,Tryptophan-rich Synechocystis species C-termin...
40292,60040293,5,0,5,0,0,0,0,0,0,0,0,0,0,0,0,Putative transposase
40293,60040294,6,0,6,0,0,0,0,0,0,0,0,0,0,0,0,sulfate transport system substrate-binding pro...


# import kegg annotations from genomes

In [5]:
# check that there is a directory for each genome

data_path = Path('../../data/refseqs/genomes/')

# report every genome id without a matching sub-directory and tally them
count = 0
for g_id in ortho_df['GenomeID'].unique():
    genome_dir = data_path / str(g_id)
    if genome_dir.is_dir():
        continue
    print(f'{g_id} directory not found: {genome_dir}')
    count += 1

print(f'{count} total missing directories')


0 total missing directories


In [6]:
# import all kegg annotations as a df

# Read each genome's tab-separated KO table (<genome id>/<genome id>.ko.tab.txt under
# data_path) and concatenate once at the end. Growing a DataFrame with pd.concat inside
# the loop re-copies all previously loaded rows on every iteration.
ko_tables = [
    pd.read_csv(data_path / str(g_id) / f'{g_id}.ko.tab.txt', sep='\t')
    for g_id in ortho_df['GenomeID'].unique()
]

# pd.concat raises on an empty list, so fall back to an empty frame when there are no genomes;
# ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True)
ko_df = pd.concat(ko_tables, ignore_index=True) if ko_tables else pd.DataFrame()

ko_df


Unnamed: 0,gene_oid,gene_length,percent_identity,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
0,2607658051,465,99.8,1,465,1,465,0.000000e+00,906.4,KO:K02313,chromosomal replication initiator protein,,Yes
1,2607658053,410,100.0,1,410,11,420,0.000000e+00,856.7,KO:K00799,glutathione S-transferase [EC:2.5.1.18],EC:2.5.1.18,Yes
2,2607658055,455,100.0,1,455,1,455,0.000000e+00,895.6,KO:K00383,glutathione reductase (NADPH) [EC:1.8.1.7],EC:1.8.1.7,Yes
3,2607658058,198,100.0,1,198,1,198,0.000000e+00,409.1,KO:K02276,cytochrome c oxidase subunit III [EC:1.9.3.1],EC:1.9.3.1,Yes
4,2607658059,562,100.0,1,562,1,562,0.000000e+00,1173.7,KO:K02274,cytochrome c oxidase subunit I [EC:1.9.3.1],EC:1.9.3.1,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
492987,2610145142,243,35.7,19,239,48,257,4.300000e-27,123.6,KO:K03791,putative chitinase,,Yes
492988,2610145144,309,37.5,4,292,17,311,0.000000e+00,198.4,KO:K03086,RNA polymerase primary sigma factor,,Yes
492989,2610145149,109,41.2,1,95,1,97,1.500000e-13,77.4,KO:K03111,single-strand DNA-binding protein,,Yes
492990,2610145152,261,40.5,5,254,10,269,2.000000e-43,177.9,KO:K10906,exodeoxyribonuclease VIII [EC:3.1.11.-],EC:3.1.11.-,Yes


# Deduplicate gene annotations

**Problem:** Some genes (specific nucleotide sequence from particular reference genome) have more than one KO annotation hit. 
- Based on the counts below, this comes out to 25,218 genes (23,227 + 1,973 + 15 + 2 + 1) out of 465,760 annotated genes (492,992 annotation rows in total), or ~5.41%. 
- Most of these are annotations that share the same KO number and differ only in another field, such as the EC number. Only 3,382 genes (3,333 + 48 + 1; ~0.73% of annotated genes) have multiple distinct KO annotations
- Of these, qualitatively it seems like the annotations are usually quite similar to one another (e.g. same pathway, or different subunits of the same protein complex)

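**Implemented solution:** Since the KO annotations are based on an HMM search, first select the annotation with the lowest E-value. If the E-values are exactly the same, keep the annotation that appears first in the table (`idxmin` returns the first occurrence of the minimum); this tie-break is arbitrary but reproducible, not random.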


In [7]:
# how many genes have more than one ko number assigned?

# first tally: annotation rows per gene; second tally: number of genes at each multiplicity
annotations_per_gene = ko_df['gene_oid'].value_counts()
print(annotations_per_gene.value_counts())


count
1    440542
2     23227
3      1973
4        15
6         2
5         1
Name: count, dtype: int64


In [8]:
# deeper look: examine genes with more than one annotation

# genes that appear in more than one annotation row
counts = ko_df['gene_oid'].value_counts()
multi_hit_genes = counts[counts > 1].index
multi_hit_rows = ko_df[ko_df['gene_oid'].isin(multi_hit_genes)]

# number of distinct KO ids among each of those genes' annotations
ko_counts = multi_hit_rows.groupby('gene_oid')['ko_id'].nunique()
print(ko_counts.value_counts())    # most have the same ko id, just different EC numbers or something

# show every annotation row of the genes that carry more than one distinct KO id
discordant_genes = ko_counts[ko_counts > 1].index
ko_df[ko_df['gene_oid'].isin(discordant_genes)]


ko_id
1    21836
2     3333
3       48
4        1
Name: count, dtype: int64


Unnamed: 0,gene_oid,gene_length,percent_identity,query_start,query_end,subj_start,subj_end,evalue,bit_score,ko_id,ko_name,EC,img_ko_flag
1549,2681971726,240,93.75,1,240,1,240,0.000000e+00,513.0,KO:K06182,23S rRNA pseudouridine2604 synthase [EC:5.4.99...,EC:5.4.99.21,Yes
1550,2681971726,240,93.75,1,240,1,240,0.000000e+00,513.0,KO:K06183,16S rRNA pseudouridine516 synthase [EC:5.4.99.19],EC:5.4.99.19,Yes
1561,2681971798,467,89.08,1,467,1,467,0.000000e+00,960.0,KO:K04094,methylenetetrahydrofolate--tRNA-(uracil-5-)-me...,EC:2.1.1.74,Yes
1562,2681971798,467,90.15,1,467,1,467,0.000000e+00,969.0,KO:K03495,tRNA uridine 5-carboxymethylaminomethyl modifi...,,Yes
1620,2681971886,348,80.77,1,338,1,338,0.000000e+00,680.0,KO:K08919,chlorophyll a/b binding light-harvesting prote...,,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
492225,2717343036,149,41.86,15,140,4,132,6.500000e-31,126.0,KO:K00980,glycerol-3-phosphate cytidylyltransferase [EC:...,EC:2.7.7.39,Yes
492226,2717343036,149,44.78,15,145,4,137,3.000000e-36,141.0,KO:K03272,D-beta-D-heptose 7-phosphate kinase / D-beta-D...,EC:2.7.1.167,Yes
492227,2717343036,149,44.78,15,145,4,137,3.000000e-36,141.0,KO:K03272,D-beta-D-heptose 7-phosphate kinase / D-beta-D...,EC:2.7.7.70,Yes
492269,2717343185,112,100.00,1,112,1,112,1.400000e-73,243.0,KO:K04751,nitrogen regulatory protein P-II 1,,Yes


In [None]:
# deduplicate genes with more than one annotation

# For each gene_oid keep the single annotation row with the lowest e-value.
# idxmin returns the first row label at the minimum, so when several rows tie
# (same KO with different EC numbers, or different KOs with the same e-value)
# the row listed first in ko_df is the one that is propagated.
best_hit_rows = ko_df.groupby('gene_oid')['evalue'].idxmin()
ko_df = ko_df.loc[best_hit_rows, :].reset_index(drop=True)
ko_df


# Deduplicate CyCOG annotations

**Problem:** Some CyCOGs (Clusters of Orthologous Genes) consist of genes with discordant KO annotations
- Out of 40,295 orthologous gene families in CyCOGs v6, 36,693 have no KO annotation, meaning just 3,602 (8.94%) contain at least one gene with a KO annotation.
- In most of the annotated CyCOGs, all gene members had a concordant KO annotation. In 128 (3.55% of annotated CyCOGs), the KO annotations were discordant among gene members.
- Of these, qualitatively it seems like the discordant annotations are usually quite similar to one another (e.g. same pathway, or different subunits of the same protein complex)

**Implemented solution:** For each CyCOG, select the KO annotation applied to the largest number of members (a plurality vote; ties go to the first KO listed) as the representative CyCOG annotation. 


In [None]:
# join kegg annotations onto reference gene set

# left join keeps every reference gene; genes without a KO hit get NaN in the ko_* columns
annot_df = ortho_df.merge(ko_df, how='left', left_on='GeneID', right_on='gene_oid')

# how many unique ko annotations per ortholog group?
kos_per_cycog = annot_df.groupby('CyCOGID').ko_id.nunique()
print(kos_per_cycog.value_counts())

# drop the sequences without a ko annotation
annot_df = annot_df.dropna(subset=['ko_id'])

annot_df


In [None]:
# look at non-unique ko mappings

# CyCOGs whose annotated members carry more than one distinct KO id
counts = annot_df.groupby('CyCOGID').ko_id.nunique()
discordant_cycogs = counts[counts > 1].index
discordant_rows = annot_df[annot_df['CyCOGID'].isin(discordant_cycogs)]

# tally each (ko_id, ko_name) pair within those CyCOGs
collisions_df = discordant_rows.groupby('CyCOGID')[['ko_id', 'ko_name']].value_counts().reset_index()

# show the KO names untruncated (the option is reset by a later cell)
pd.set_option('max_colwidth', None)
collisions_df


In [None]:
# deduplicate the discordant ortholog-ko mapping by simple majority vote

# tally how many member genes of each CyCOG carry each KO id ...
ko_count_df = pd.DataFrame(annot_df.groupby('CyCOGID').ko_id.value_counts()).rename(
    columns={'ko_id': 'count'}).reset_index()
# ... and keep the most frequent KO per CyCOG (idxmax returns the first row on ties)
ko_count_df = ko_count_df.loc[ko_count_df.groupby('CyCOGID')['count'].idxmax(), :]

# add in KO annotation
name_map = annot_df[['ko_id', 'ko_name']].drop_duplicates()
# strip the trailing " [EC:...]" suffix from the KO name; the pattern is a raw string so that
# "\[" is not parsed as an (invalid) Python escape sequence, and regex=True states explicitly
# that it is a regular expression
name_map['DescriptionKO'] = name_map['ko_name'].str.split(r' \[EC:', regex=True).str[0]
name_map = name_map.loc[name_map['ko_id'].drop_duplicates().index]    # pick one description from duplicates
ko_count_df = pd.merge(left=ko_count_df, right=name_map[['ko_id', 'DescriptionKO']], on='ko_id', how='left')

# remove prefix from KOID (drops the first three characters, i.e. the leading "KO:")
ko_count_df['KOID'] = ko_count_df['ko_id'].str[3:]
# rename count column and drop old ko_id column
ko_count_df = ko_count_df[['CyCOGID', 'KOID', 'DescriptionKO', 'count']].rename(
    columns={'count': 'NRefsKO'}).set_index('CyCOGID')
# calculate refs with KO mapping other than the one chosen
# (both sides are indexed by CyCOGID, so the subtraction aligns per CyCOG)
ko_count_df['NRefsOtherKO'] = annot_df.groupby('CyCOGID').GeneID.count() - ko_count_df['NRefsKO']

# add ko_count_df into ko_map_df (CyCOGs without any KO annotation get NaN in the new columns)
ko_map_df = pd.merge(ko_map_df, ko_count_df, on='CyCOGID', how='left')

pd.reset_option('max_colwidth')
ko_map_df


In [None]:
# there are many different CyCOG orthologies annotated with the same KO group -- look at some of these

# number of CyCOGs assigned to each KO, and the subset of KOs shared by several CyCOGs
ko_map_counts = ko_map_df['KOID'].value_counts()
shared_kos = ko_map_counts[ko_map_counts > 1]

print('There are {} of {} KOs with a non-unique CyCOGID-KOID mapping:'.format(
    len(shared_kos), ko_map_df['KOID'].nunique()))
print(shared_kos.head(20))

# example: the CyCOGs that all map to K06147
ko_map_df[ko_map_df['KOID'].eq('K06147')].head(10)


# Get KEGG pathways

In [None]:
# get full reference brite hierarchy of kegg orthologies 

# Fetch over HTTPS, bound the wait with a timeout, and fail here on an HTTP error
# status rather than later when the response body is parsed as JSON.
url = 'https://rest.kegg.jp/get/br:ko00001/json'
file = requests.get(url, allow_redirects=True, timeout=60)
file.raise_for_status()

# url = f'https://rest.kegg.jp/list/ko/'
# file = requests.get(url, allow_redirects=True)

file


In [None]:
# parse pathways into dictionary

pathway_dict = {}     # pathway id -> list of member KO ids
pathway_names = {}    # pathway id -> pathway entry name minus its first 6 characters
ko_dict = {}          # KO id -> list of pathway ids it belongs to
ko_names = {}         # KO id -> text following the KO id in the entry name

# exclude pathways specific to eukaryotes, multicellular organisms, and humans
excluded_systems = ['09144 Cellular community - eukaryotes', '09150 Organismal Systems', '09160 Human Diseases']

# walk the hierarchy level by level: supersystem -> system -> pathway -> ko
brite_root = json.loads(file.content)
for supersystem in brite_root['children']:
    print(f'{supersystem["name"]}')
    for system in supersystem['children']:
        print(f'\t{system["name"]}')
        # an exclusion may be named at either the supersystem or the system level
        if any(level["name"] in excluded_systems for level in (supersystem, system)):
            print('\t\tEXCLUDED')
            continue
        for pathway in system['children']:
            # pathway id is "ko" followed by the first space-delimited token of the name
            path_id = 'ko' + pathway["name"].split(" ")[0]
            path_members = []
            # only pathways that list children are recorded; the rest are just printed
            if 'children' in pathway:
                for ko in pathway['children']:
                    # first token is the KO id, everything after the first space is its name
                    ko_id, _, ko_label = ko['name'].partition(' ')
                    ko_names[ko_id] = ko_label.strip(' ')
                    path_members.append(ko_id)
                    # add pathway to ko_dict
                    ko_dict.setdefault(ko_id, []).append(path_id)
                pathway_dict[path_id] = path_members
                pathway_names[path_id] = pathway["name"][6:]
            print(f'\t\t{path_id}: {pathway["name"]} ({len(path_members)})')
      
      

In [None]:
# There are some KO descriptions from the most recent KEGG download that do not match the descriptions 
# of the old annotation. A rough screen shows 243/1752. I want to look at these in more detail

name_df = ko_map_df[['KOID', 'DescriptionKO']].drop_duplicates().reset_index(drop=True)

# current KEGG name for every KOID (NaN when the KO is absent from the parsed hierarchy)
names = [ko_names.get(koid, np.nan) for koid in name_df['KOID']]
# check if previous KO Description is a substring of current KEGG names
# (both sides are compared as strings, so missing values are compared as their str() form)
substrings = [
    str(old_description) in str(kegg_name)
    for old_description, kegg_name in zip(name_df['DescriptionKO'], names)
]
# add to dataframe
name_df['KEGGNames'] = names
name_df['SubstringMatch'] = substrings

# # fix display options
# pd.set_option('display.max_rows', None)
# pd.set_option('max_colwidth', None)

# # screen all those that don't have a match
# name_df[~name_df['SubstringMatch']]

# manual inspection of some incongruent (updated?) KOs:
problem_kos = ['K02500', 'K02428', 'K02259', 'K01594', 'K00870', 'K02501', 'K05808', 'K03606', 'K01144', 
               'K03186', 'K01234', 'K03651', 'K07011', 'K03082', 'K01003', 'K05663', 'K03152']
name_df[name_df['KOID'].isin(problem_kos)]


In [None]:
# match up pathway information to pangenome ko list

# keep only the CyCOGs that were assigned a KO
has_ko = ko_map_df['KOID'].notna()
pathway_df = ko_map_df[has_ko].reset_index(drop=True)

# data_dict: pathway id -> integer Series aligned to pathway_df rows, incremented once for
# every (KO, pathway) pairing whose KO matches that row's KOID
data_dict = {}
for koid in pathway_df['KOID'].unique():
    # KOs missing from the parsed hierarchy contribute to no pathway
    if koid not in ko_dict:
        continue
    membership = (pathway_df['KOID'] == koid).astype(int)
    for pathway in ko_dict[koid]:
        if pathway in data_dict:
            data_dict[pathway] = data_dict[pathway] + membership
        else:
            data_dict[pathway] = membership

# append one column per pathway, matched on the row index
pathway_flags = pd.DataFrame(data_dict)
pathway_df = pd.merge(pathway_df, pathway_flags, left_index=True, right_index=True)
pathway_df


In [None]:
# look at pathway statistics and compile dictionary of CyCOGs associated with each pathway

cycog_pathway_dict = {}

# visit pathways from the largest column total to the smallest
pathway_totals = pd.DataFrame(data_dict).sum().sort_values(ascending=False)
for path, cycog_count in pathway_totals.items():
    # rows of pathway_df with a positive entry for this pathway
    in_path = pathway_df[path] > 0
    ko_count = pathway_df.loc[in_path, 'KOID'].nunique()
    total_kos = len(pathway_dict[path])
    print(f'{path}: {pathway_names[path]}\n\t{cycog_count} CyCOGs representing {ko_count}/{total_kos} unique KOs')
    # append list of cycogs associated with each pathway to dictionary
    cycog_pathway_dict[path] = pathway_df.loc[in_path, 'CyCOGID'].to_list()
    
    

# Calculate Enrichment

In [None]:
# set up parameters and data

# significance level
alpha = 0.01
# starts empty; rows and columns are added by the enrichment listing cells
enrichments_df = pd.DataFrame()


def adjust_pvals(pvals):
    """Return the corrected p-values for `pvals`.

    This is element [1] of the result of `multitest.fdrcorrection`, called with
    the notebook-level `alpha`.
    """
    correction = multitest.fdrcorrection(pvals, alpha=alpha)
    return correction[1]


# read in cluster data
data_dir = Path('../../data')
pro_ds = xr.open_dataset(data_dir / '5-models/pro-aligned-models.nc')
syn_ds = xr.open_dataset(data_dir / '5-models/syn-aligned-models.nc')

# make sure the output directory exists
out_dir = data_dir / '7-interpretation'
out_dir.mkdir(parents=True, exist_ok=True)

# save pathway_df to out_dir
pathway_df.to_csv(out_dir / 'cycog-kegg-pathways.csv', index=False)

# analyze only "robust" clusters: pro1..pro15 and syn1..syn15
pro_clusters = [f'pro{number}' for number in range(1, 16)]
syn_clusters = [f'syn{number}' for number in range(1, 16)]


In [None]:
# pull out median bootstrap weights

def _median_gene_weights(model_ds, prefix, clusters):
    """Return the median GeneWeight table of `model_ds` restricted to `clusters`.

    The dataset is reduced by its median over the 'Bootstrap' and 'Replicate' dimensions,
    GeneWeight is transposed and converted to a DataFrame, every column label is
    prefixed with `prefix`, and only the columns named in `clusters` are kept (in that order).
    """
    weights = model_ds.median(dim=['Bootstrap', 'Replicate']).GeneWeight.T.to_pandas()
    weights = weights.rename(columns=lambda label: f"{prefix}{label}")
    # select only robust components
    return weights.loc[:, clusters]


# pull out pro gene weight data
pro_gene_df = _median_gene_weights(pro_ds, 'pro', pro_clusters)

# pull out syn gene weight data
syn_gene_df = _median_gene_weights(syn_ds, 'syn', syn_clusters)

syn_gene_df
    
    

In [None]:
# run Mann-Whitney U test for every pathway against every pro cluster

input_df = pro_gene_df

# one row of p-values per pathway, one p-value per cluster column (axis=0)
p_vals = []
for pathway, member_cycogs in cycog_pathway_dict.items():
    # split the rows into those whose index is in the pathway (x) and all the others (y)
    in_pathway = input_df.index.isin(member_cycogs)
    x = input_df.loc[in_pathway, :]
    y = input_df.loc[~in_pathway, :]
    # one-sided test with alternative='greater' (x versus y)
    result = mannwhitneyu(x, y, alternative='greater', axis=0, method='asymptotic')
    p_vals.append(result.pvalue)
pro_enrich_df = pd.DataFrame(p_vals, index=list(cycog_pathway_dict), columns=input_df.columns)
# drop the pathways that are all NAs
pro_enrich_df = pro_enrich_df.dropna(how='all')
# adjust p-values (applied to each cluster column separately)
pro_enrich_df = pro_enrich_df.apply(adjust_pvals, raw=True)

pro_enrich_df


In [None]:
# list the most enriched pathways for each cluster

for cluster in pro_enrich_df.columns:
    # adjusted p-values below alpha for this cluster, smallest first
    cluster_pvals = pro_enrich_df[cluster]
    pathways = cluster_pvals[cluster_pvals < alpha].sort_values(ascending=True)
    print(f'\n{cluster} ({len(pathways)} enriched pathways)')
    for path, pval in pathways.items():
        # record the hit in the shared summary table (row = pathway, column = cluster)
        enrichments_df.loc[path, 'description'] = pathway_names[path]
        enrichments_df.loc[path, cluster] = pval
        print(f'\t{path} (p={pval:.2e}): {pathway_names[path]}')


In [None]:
# run Mann-Whitney U test for every pathway against every syn cluster

input_df = syn_gene_df

# one row of p-values per pathway, one p-value per cluster column (axis=0)
p_vals = []
for pathway, member_cycogs in cycog_pathway_dict.items():
    # split the rows into those whose index is in the pathway (x) and all the others (y)
    in_pathway = input_df.index.isin(member_cycogs)
    x = input_df.loc[in_pathway, :]
    y = input_df.loc[~in_pathway, :]
    # one-sided test with alternative='greater' (x versus y)
    result = mannwhitneyu(x, y, alternative='greater', axis=0, method='asymptotic')
    p_vals.append(result.pvalue)
syn_enrich_df = pd.DataFrame(p_vals, index=list(cycog_pathway_dict), columns=input_df.columns)
# drop the pathways that are all NAs
syn_enrich_df = syn_enrich_df.dropna(how='all')
# adjust p-values (applied to each cluster column separately)
syn_enrich_df = syn_enrich_df.apply(adjust_pvals, raw=True)

syn_enrich_df


In [None]:
# list the most enriched pathways for each cluster

for cluster in syn_enrich_df.columns:
    # adjusted p-values below alpha for this cluster, smallest first
    cluster_pvals = syn_enrich_df[cluster]
    pathways = cluster_pvals[cluster_pvals < alpha].sort_values(ascending=True)
    print(f'\n{cluster} ({len(pathways)} enriched pathways)')
    for path, pval in pathways.items():
        # record the hit in the shared summary table (row = pathway, column = cluster)
        enrichments_df.loc[path, 'description'] = pathway_names[path]
        enrichments_df.loc[path, cluster] = pval
        print(f'\t{path} (p={pval:.2E}): {pathway_names[path]}')


In [None]:
# reformat and save enrichments dataframe

# order rows by pathway id, then turn the pathway-id index into a regular column
enrichments_sorted = enrichments_df.sort_index().reset_index()
enrichments_df = enrichments_sorted.rename(columns={'index': 'KEGG pathway'})

# write the table next to the other interpretation outputs
enrichments_df.to_csv(out_dir / 'kegg-enrichments.csv', index=False)

enrichments_df
