In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats

In [4]:
# Load the position-0 COG category summary; the 'Strain' column becomes the row index.
file_path = 'results/Actinokineospora/cog_categories_summary_position_0.csv'
cog_tta_1 = pd.read_csv(file_path, index_col = 'Strain')

In [5]:
# Keep only the columns whose label is a single character, then discard the
# categories treated as underrepresented (labels not present are ignored).
cog_tta_1 = (
    cog_tta_1
    .loc[:, [label for label in cog_tta_1.columns if len(label) == 1]]
    .drop(columns=['-', 'S', 'B', 'N', 'A', 'Z', 'W'], errors='ignore')
)

In [6]:
# Show the filtered position-0 counts (rows: strains, columns: retained COG letters).
cog_tta_1

Unnamed: 0_level_0,C,D,E,F,G,H,I,J,K,L,M,O,P,Q,T,U,V
Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
GCF_003182415.1,1,0,2,1,1,0,0,0,5,2,0,1,1,3,0,0,3
GCF_003663795.1,5,0,8,0,7,0,3,0,7,4,6,6,6,12,4,0,2
GCF_004362515.1,2,0,12,2,18,3,4,1,22,9,12,5,5,7,8,0,4
GCF_014648415.1,3,0,2,1,6,1,4,1,6,6,5,3,5,1,5,0,4
GCF_016907695.1,2,0,10,1,9,2,5,0,11,6,6,8,9,14,1,0,4
GCF_024171925.1,2,1,12,0,8,4,6,1,12,9,14,14,10,12,7,1,4
GCF_024760565.1,3,0,1,1,6,0,3,2,7,6,5,3,5,1,3,0,3
GCF_030268905.1,6,1,10,0,9,0,1,2,18,4,6,5,11,11,5,0,2
GCF_030268945.1,5,0,9,0,10,2,1,2,16,5,6,4,13,11,5,0,2
GCF_900101685.1,1,1,9,1,4,1,4,0,7,7,7,1,4,9,5,0,3


In [7]:
# Read the file for COG categories for TTA containing genes
# NOTE(review): the folder is spelled "Actinokinesospora" here but
# "Actinokineospora" in every other path of this notebook — confirm which
# directory actually exists on disk before re-running.
file_path = "results/Actinokinesospora/cog_categories_summary_TTA.csv"
cog_tta = pd.read_csv(file_path)

In [8]:
# Use the same key column name ("Strain") as the other tables.
cog_tta = cog_tta.rename(columns={"GCF_ID": "Strain"})

In [9]:
# Strip every literal ".gbk" occurrence from the strain identifiers
# (plain-text match, not a regular expression).
cog_tta = cog_tta.assign(
    Strain=cog_tta["Strain"].str.replace(".gbk", "", regex=False)
)

In [10]:
# Index the TTA table by strain so rows can be looked up with .loc[strain, cog].
cog_tta = cog_tta.set_index('Strain')

In [11]:
# Select the single-character columns listed in cog_tta_1 (not cog_tta's own
# column list), so both tables end up with the same COG columns; a column
# missing from cog_tta raises KeyError here.
# Remove underrepresented categories (labels not present are ignored).
cog_tta = (
    cog_tta
    .loc[:, [label for label in cog_tta_1.columns if len(label) == 1]]
    .drop(columns=['-', 'S', 'B', 'N', 'A', 'Z', 'W'], errors='ignore')
)

In [12]:
# Read file with summary of COG categories for all genes; the 'Strain' column
# becomes the row index.
# NOTE(review): this is an absolute path, unlike the relative 'results/...'
# paths used for the two CSV inputs — confirm it is intended.
file_path = '/input/eggnog/Actinokineospora/cog_summary.xlsx'
cog_all = pd.read_excel(file_path, index_col = 'Strain')

In [13]:
# Restrict the all-genes table to the single-character columns of cog_tta_1,
# then drop the same category labels as for the other tables (labels not
# present are ignored).
cog_all = (
    cog_all
    .loc[:, [label for label in cog_tta_1.columns if len(label) == 1]]
    .drop(columns=['-', 'S', 'B', 'N', 'A', 'Z', 'W'], errors='ignore')
)

In [14]:
# Show the filtered all-genes counts (rows: strains, columns: retained COG letters).
cog_all

Unnamed: 0_level_0,C,D,E,F,G,H,I,J,K,L,M,O,P,Q,T,U,V
Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
GCF_003182415.1,252,65,386,96,203,163,253,187,659,208,239,151,215,256,219,40,108
GCF_003663795.1,246,63,432,110,188,160,203,204,619,208,230,188,192,216,280,45,106
GCF_004362515.1,256,61,416,107,275,150,257,190,657,197,258,184,193,224,207,50,100
GCF_014648415.1,284,59,393,101,276,159,218,208,634,190,219,182,198,232,207,46,125
GCF_016907695.1,246,57,446,107,187,164,201,205,626,215,238,201,196,213,270,44,117
GCF_024171925.1,252,65,434,111,185,159,219,211,669,198,257,203,218,245,287,49,118
GCF_024760565.1,270,59,379,96,268,154,221,206,640,208,217,180,192,224,194,45,116
GCF_030268905.1,246,67,414,111,197,153,208,203,641,207,242,191,207,218,287,47,115
GCF_030268945.1,243,57,418,107,200,154,207,202,617,196,244,195,204,218,269,41,116
GCF_900101685.1,228,62,356,92,176,150,229,203,536,250,226,150,166,200,185,32,85


In [15]:
# Function to perform hypergeometric test for each strain and COG category
def perform_hypergeometric_test(cog_tta_1, cog_all, cog_tta):
    """Run an upper-tail hypergeometric test for every (strain, COG) pair.

    Only strains present in the index of all three tables are tested. The
    COG categories tested are the columns of ``cog_tta_1``; each must also
    be a column of ``cog_all`` and ``cog_tta`` (otherwise ``.loc`` raises
    ``KeyError``).

    Parameters
    ----------
    cog_tta_1 : pandas.DataFrame
        Strain x COG counts used as the observed value ``k``.
    cog_all : pandas.DataFrame
        Strain x COG counts used for ``n`` (per category) and ``N`` (row sum).
    cog_tta : pandas.DataFrame
        Strain x COG counts used as ``K``.

    Returns
    -------
    pandas.DataFrame
        One row per tested (strain, COG) pair with the columns "Strain",
        "COG", "k (TTA in bin 1)", "K (Total TTA in COG)",
        "n (Total in COG)", "N (Total genes)" and "p-value". The columns are
        present even when no strain is shared between the tables, so
        downstream filtering on "p-value" keeps working on an empty result.
    """
    result_columns = [
        "Strain",
        "COG",
        "k (TTA in bin 1)",
        "K (Total TTA in COG)",
        "n (Total in COG)",
        "N (Total genes)",
        "p-value",
    ]
    results = []

    for strain in cog_tta_1.index:
        # Skip strains that are missing from either reference table.
        if strain not in cog_all.index or strain not in cog_tta.index:
            continue

        # Population size: sum over every column of cog_all for this strain.
        # It does not depend on the category, so compute it once per strain.
        N = cog_all.loc[strain].sum()

        for cog in cog_tta_1.columns:
            # Get values for hypergeometric test
            k = cog_tta_1.loc[strain, cog]  # Successes in sample (TTA in bin 1)
            K = cog_tta.loc[strain, cog]  # Total TTA in COG category
            n = cog_all.loc[strain, cog]  # Sample size (all genes in COG category)

            # Perform hypergeometric test only if valid numbers
            if k > 0 and K > 0 and n > 0 and N > 0:
                # scipy signature is sf(k, M, n, N): M = population size,
                # n = success states in the population, N = number of draws.
                # sf(k - 1, ...) is therefore P(X >= k).
                # NOTE(review): K and n are both counts for this single COG
                # while N spans all retained COGs — confirm this is the
                # intended null model.
                p_value = stats.hypergeom.sf(k - 1, N, K, n)
            else:
                p_value = 1.0  # Default to non-significant if values are invalid

            # Store results
            results.append({
                "Strain": strain,
                "COG": cog,
                "k (TTA in bin 1)": k,
                "K (Total TTA in COG)": K,
                "n (Total in COG)": n,
                "N (Total genes)": N,
                "p-value": p_value
            })

    # Convert to DataFrame; explicit columns keep the schema stable when
    # `results` is empty.
    return pd.DataFrame(results, columns=result_columns)

# Run the test on the three filtered tables prepared in the cells above
# (cog_tta_1, cog_all and cog_tta must already be loaded and indexed by strain).
hypergeometric_results = perform_hypergeometric_test(cog_tta_1, cog_all, cog_tta)

In [16]:
# Preview the first rows of the per-strain / per-COG test results.
hypergeometric_results.head()

Unnamed: 0,Strain,COG,k (TTA in bin 1),K (Total TTA in COG),n (Total in COG),N (Total genes),p-value
0,GCF_003182415.1,C,1,3,252,3700,0.190772
1,GCF_003182415.1,D,0,2,65,3700,1.0
2,GCF_003182415.1,E,2,4,386,3700,0.05648
3,GCF_003182415.1,F,1,2,96,3700,0.051226
4,GCF_003182415.1,G,1,4,203,3700,0.202125


In [17]:
# Save the complete results table (all strain/COG pairs, significant or not).
# NOTE(review): absolute '/results/...' path, whereas the inputs were read
# from the relative 'results/...' folder — confirm the intended location.
hypergeometric_results.to_excel('/results/Actinokineospora/Actinokineospora_enrichment_pos_0.xlsx')

In [18]:
# Rows with an unadjusted p-value below 0.05.
# NOTE(review): no multiple-testing correction is applied — confirm this is intended.
filtered_df = hypergeometric_results.loc[hypergeometric_results["p-value"].lt(0.05)]

In [19]:
# For each COG category, count the distinct strains with p-value < 0.05.
# The result has two columns: "Category" (the COG letter) and
# "Number of Enriched Strains".
enrichment_summary = (
    hypergeometric_results
    .loc[lambda frame: frame["p-value"] < 0.05]
    .groupby("COG")["Strain"]
    .nunique()
    .reset_index(name="Number of Enriched Strains")
    .rename(columns={"COG": "Category"})
)

In [20]:
# Turn the 'Strain' index back into a regular column. Guarded so that
# re-running this cell is a no-op instead of adding stray 'index'/'level_0'
# columns (and eventually raising) on repeated execution.
if "Strain" not in cog_all.columns:
    cog_all = cog_all.reset_index()

In [21]:
# Number of distinct strains in cog_all, used as the denominator.
# NOTE(review): the test only covers strains present in all three tables, so
# this may exceed the number of strains actually tested — confirm.
total_strains = cog_all["Strain"].nunique()

# Share of strains (in percent) in which each category is enriched.
enrichment_summary["Percentage Enriched"] = (
    enrichment_summary["Number of Enriched Strains"].div(total_strains).mul(100)
)

In [22]:
# Map full COG category names.
# The mapping is defined here because no earlier cell creates it, so the
# notebook would otherwise fail with NameError on a fresh kernel.
# NOTE(review): the labels for C, D, E, F, G, H, I, M, P and V are copied from
# the saved output of this notebook; the remaining ones follow the standard
# COG functional-category descriptions — confirm they match the original mapping.
cog_full_names = {
    "C": "Energy production and conversion",
    "D": "Cell cycle control",
    "E": "Amino acid transport and metabolism",
    "F": "Nucleotide transport and metabolism",
    "G": "Carbohydrate transport and metabolism",
    "H": "Coenzyme transport and metabolism",
    "I": "Lipid transport and metabolism",
    "J": "Translation, ribosomal structure and biogenesis",
    "K": "Transcription",
    "L": "Replication, recombination and repair",
    "M": "Cell wall/membrane/envelope biogenesis",
    "O": "Posttranslational modification, protein turnover, chaperones",
    "P": "Inorganic ion transport and metabolism",
    "Q": "Secondary metabolites biosynthesis, transport and catabolism",
    "T": "Signal transduction mechanisms",
    "U": "Intracellular trafficking, secretion, and vesicular transport",
    "V": "Defense mechanisms",
}

# Values without an entry (including names that are already mapped) are kept
# unchanged, so re-running this cell does not turn the labels into NaN.
enrichment_summary["Category"] = enrichment_summary["Category"].map(
    lambda label: cog_full_names.get(label, label)
)

In [23]:
# Sort the enrichment_summary dataframe alphabetically by the "Category" column
# (ascending); the rows are not ordered by "Percentage Enriched".
enrichment_summary = enrichment_summary.sort_values(by="Category", ascending=True)

In [24]:
# Show the per-category enrichment summary.
enrichment_summary

Unnamed: 0,Category,Number of Enriched Strains,Percentage Enriched
2,Amino acid transport and metabolism,8,72.727273
4,Carbohydrate transport and metabolism,10,90.909091
1,Cell cycle control,1,9.090909
10,Cell wall/membrane/envelope biogenesis,10,90.909091
5,Coenzyme transport and metabolism,2,18.181818
15,Defense mechanisms,8,72.727273
0,Energy production and conversion,6,54.545455
12,Inorganic ion transport and metabolism,10,90.909091
6,Lipid transport and metabolism,6,54.545455
3,Nucleotide transport and metabolism,1,9.090909


In [25]:
# Save the per-category summary.
# NOTE(review): this file is written directly under /results/, whereas the
# full results table goes to /results/Actinokineospora/ — confirm the folder.
enrichment_summary.to_excel('/results/Actinokineospora_enrichment_summary_pos_0_TTA.xlsx')