In [1]:
import os
import pandas as pd
import seaborn as sns
from scipy.stats import fisher_exact
import numpy as np
import glob


In [86]:
# define function for making presence/absence table per species
def gen_pres_abs(input_path, metadata, metad_col, group1, group2):
    """Load a presence/absence table and pair it with per-sample metadata.

    The tab-separated file at ``input_path`` is read with its first column as
    the index and then transposed, so the file's columns become the rows of
    the working table (presumably one row per sample and one column per gene).
    The transposed table is joined to ``metadata`` on their indexes.

    Parameters
    ----------
    input_path : path to the tab-separated presence/absence file.
    metadata : DataFrame whose index matches the transposed table's index.
    metad_col : name of the metadata column holding the group labels.
    group1, group2 : the two group labels to count in ``metad_col``.

    Returns
    -------
    tuple
        ``(merged, tot_md_df, gene_names)`` where ``merged`` is the joined
        table, ``tot_md_df`` has one ``total_count`` row per group (indexed
        ``[group1, group2]``), and ``gene_names`` is the column index of the
        transposed presence/absence table.

    Raises
    ------
    KeyError
        If ``group1`` or ``group2`` does not occur in ``metad_col`` after
        the join.
    """
    # Read and transpose in one go.
    samples_by_gene = pd.read_csv(input_path, sep='\t', index_col=0).T

    # Merge presence/absence with metadata (index-on-index; default inner join).
    merged = pd.merge(samples_by_gene, metadata, left_index=True, right_index=True)

    # Number of merged samples in each of the two groups being compared.
    group_sizes = merged[metad_col].value_counts()
    tot_md_df = pd.DataFrame(
        [group_sizes[group1], group_sizes[group2]],
        index=[group1, group2],
        columns=['total_count'],
    )

    return merged, tot_md_df, samples_by_gene.columns
    
# define contingency table function
def make_contig_table(pres_abs_table, tot_md_df, metad_col, gene):
    """Build the per-group present/absent contingency table for one gene.

    Parameters
    ----------
    pres_abs_table : DataFrame containing at least the ``gene`` column and
        the ``metad_col`` group-label column.
    tot_md_df : DataFrame indexed by group label with a ``total_count`` column.
    metad_col : name of the group-label column in ``pres_abs_table``.
    gene : name of the gene column to tabulate.

    Returns
    -------
    DataFrame
        One row per group found in both inputs, with two columns: ``gene``
        (sum of that column within the group) and ``no`` (``total_count``
        minus that sum).
    """
    # Per-group sum of the gene column.
    present = pres_abs_table[[gene, metad_col]].groupby(by=[metad_col]).sum()

    # Attach the group totals so the complementary count can be derived.
    table = pd.merge(present, tot_md_df, left_index=True, right_index=True)
    absent = table['total_count'] - table[gene]

    return table.assign(no=absent).drop(columns=['total_count'])


# define fisher calculation for both groups
def fisher_test(contig_table, species, group1, group2):
    """Run a two-sided Fisher's exact test on a two-group contingency table.

    Parameters
    ----------
    contig_table : DataFrame with one row per group and two count columns;
        the first column's name is reported as the gene name.
    species : label passed through unchanged to the result.
    group1, group2 : the two group labels being compared.

    Returns
    -------
    tuple
        ``(gene_name, oddsr, p, species, '<group1>,<group2>')``.

    Notes
    -----
    The odds ratio depends on row order. When both group labels are present
    in a unique index, rows are picked by label so the ratio always reads
    ``group1`` versus ``group2``, matching the returned label. Picking rows
    by position would silently return the reciprocal whenever the table's
    row order differs from ``(group1, group2)``, e.g. after a sorted groupby.
    Tables that do not carry both labels fall back to the first two rows.
    """
    index = contig_table.index
    select_by_label = (
        group1 != group2
        and index.is_unique
        and group1 in index
        and group2 in index
    )
    if select_by_label:
        rows = contig_table.loc[[group1, group2]]
    else:
        # Positional fallback: first row is treated as group 1, second as group 2.
        rows = contig_table.iloc[[0, 1]]

    table = rows.to_numpy().tolist()
    oddsr, p = fisher_exact(table, alternative='two-sided')

    gene_name = contig_table.columns[0]
    return gene_name, oddsr, p, species, f'{group1},{group2}'
