In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import hypergeom

### Number of genes TTA/total

In [7]:
file_path = '/results/TTA_summary_Actinokineospora.csv'
TTA_summary = pd.read_csv(file_path)

In [8]:
TTA_summary["GCF_ID"] = TTA_summary["GCF_ID"].str.replace(r"\.gbk$", "", regex=True)

In [41]:
TTA_summary.head()

Unnamed: 0,GCF_ID,Organism,Total Genes,Genes with UUA Codons,Percentage with UUA Codons
0,GCF_024171925.1,Actinokineospora diospyrosa,6890,784,11.37881
1,GCF_030268945.1,Actinokineospora globicatena,6566,720,10.96558
2,GCF_030268905.1,Actinokineospora globicatena,6650,707,10.631579
3,GCF_016907695.1,Actinokineospora baliensis,6698,612,9.137056
4,GCF_900111175.1,Actinokineospora terrae,6611,585,8.848888


### COG per strain

In [13]:
# Load summary for COGs for TTA containing genes
file_path = "/results/cog_categories_summary_TTA.csv"
cog_summary_tta = pd.read_csv(file_path)

In [14]:
cog_summary_tta

Unnamed: 0,GCF_ID,-,BDLTU,C,CE,CG,CH,CO,D,DL,...,NU,O,P,Q,QT,S,T,U,Unknown,V
0,GCF_900111175.1,60,0,16,1,0,3,0,2,0,...,3,19,20,39,1,102,37,3,25,20
1,GCF_004362515.1,45,1,11,0,0,1,0,4,0,...,3,12,15,20,2,84,20,2,28,10
2,GCF_024760565.1,20,1,6,0,0,2,1,1,0,...,1,6,5,7,1,48,10,2,15,6
3,GCF_900101685.1,39,0,10,0,1,2,0,7,0,...,1,7,8,23,2,98,17,3,25,12
4,GCF_014648415.1,26,1,7,0,0,0,1,0,0,...,2,5,7,7,1,49,14,3,24,8
5,GCF_030268905.1,69,0,22,0,2,2,0,6,1,...,4,20,23,48,1,99,38,6,37,18
6,GCF_003663795.1,55,0,15,0,1,5,0,1,0,...,1,17,18,40,0,88,38,2,27,16
7,GCF_030268945.1,71,0,18,1,2,5,0,3,0,...,4,17,30,48,2,114,44,3,29,18
8,GCF_003182415.1,10,0,3,0,0,0,0,2,0,...,0,4,2,11,1,19,4,0,8,5
9,GCF_024171925.1,83,1,19,1,2,6,0,6,0,...,4,31,24,42,1,110,35,6,37,24


In [15]:
# Rename 'GCF_ID' to 'Strain' and set it as the index
cog_summary_tta.rename(columns={"GCF_ID": "Strain"}, inplace=True)
cog_summary_tta.set_index("Strain", inplace=True)

In [16]:
# Keep only columns with single-letter COG categories
cog_summary_tta = cog_summary_tta.loc[:, [col for col in cog_summary_tta.columns if len(col) == 1]]

In [17]:
cog_summmary_tta = cog_summary_tta.drop(['-', 'S', 'B', 'N', 'A', 'Z', 'W'], axis=1, errors='ignore')

In [18]:
cog_summmary_tta = cog_summary_tta.drop('-', axis=1, errors='ignore')

In [19]:
file_path = 'input/eggnog//cog_summary.xlsx'
cog_summary_overall = pd.read_excel(file_path, index_col = 'Strain')

In [20]:
cog_summary_overall = cog_summary_overall.loc[:, [col for col in cog_summary_tta.columns if len(col) == 1]]

In [21]:
# Remove underepresented categories
cog_summary_overall = cog_summary_overall.drop(['-', 'S', 'B', 'N', 'A', 'Z', 'W'], axis=1, errors='ignore')

In [22]:
cog_summary_overall.reset_index(inplace = True)

In [23]:
cog_summary_tta.reset_index(inplace = True)

In [24]:
# Remove ".gbk" from the "Strain" column
cog_summary_tta["Strain"] = cog_summary_tta["Strain"].str.replace(".gbk", "", regex=False)

In [34]:
# Initialize an empty list for storing enrichment results
enrichment_results = []

# Debugging counters
successful_calculations = 0
skipped_categories = 0
skipped_strains = 0
error_log = []

# Ensure 'Strain' is treated correctly in DataFrames
if "Strain" not in cog_summary_tta.columns:
    error_log.append("Error: 'Strain' column not found in cog_summary_tta")
if "Strain" not in cog_summary_overall.columns:
    error_log.append("Error: 'Strain' column not found in cog_summary_overall")
if "GCF_ID" not in TTA_summary.columns:
    error_log.append("Error: 'GCF_ID' column not found in TTA_summary")

# Iterate through each strain in cog_summary_tta
for _, row in cog_summary_tta.iterrows():
    strain_id = row["Strain"]
    
    # Check if strain_id exists in both cog_summary_overall and TTA_summary
    if strain_id in cog_summary_overall["Strain"].values and strain_id in TTA_summary["GCF_ID"].values:
        
        total_genes = TTA_summary.loc[TTA_summary["GCF_ID"] == strain_id, "Total Genes"].values[0]
        total_tta = TTA_summary.loc[TTA_summary["GCF_ID"] == strain_id, "Genes with UUA Codons"].values[0]
        
        # Iterate through each category in cog_summary_tta
        for category in cog_summary_tta.columns[1:]:  # Skipping 'Strain' column
            
            # Ensure the category exists in cog_summary_overall
            if category in cog_summary_overall.columns:
                try:
                    K = cog_summary_overall.loc[cog_summary_overall["Strain"] == strain_id, category].values[0]
                    k = row[category]  # Get value from cog_summary_tta directly
                    
                    # Ensure values are valid for statistical calculations
                    if pd.notna(K) and pd.notna(k) and K > 0 and k >= 0:
                        p_value = hypergeom.sf(k-1, total_genes, K, total_tta)
                        enrichment_results.append((strain_id, category, k, K, total_tta, total_genes, p_value))
                        successful_calculations += 1
                    else:
                        skipped_categories += 1
                except (KeyError, IndexError) as e:
                    error_log.append((strain_id, category, str(e)))
                    skipped_categories += 1
            else:
                skipped_categories += 1
    else:
        skipped_strains += 1
        error_log.append((strain_id, "Strain not found in one of the datasets"))

# Convert results to a DataFrame
enrichment_df = pd.DataFrame(
    enrichment_results,
    columns=["Strain", "Category", "TTA Genes", "Total in Category", "Total TTA", "Total Genes", "P-Value"]
)


In [35]:
enrichment_df.head(5)

Unnamed: 0,Strain,Category,TTA Genes,Total in Category,Total TTA,Total Genes,P-Value
0,GCF_900111175.1,C,16,251,585,6611,0.941367
1,GCF_900111175.1,D,2,69,585,6611,0.987434
2,GCF_900111175.1,E,26,423,585,6611,0.986046
3,GCF_900111175.1,F,2,110,585,6611,0.999593
4,GCF_900111175.1,G,27,194,585,6611,0.011618


In [36]:
enrichment_df.to_excel('/results/Actinokineospora_enrichment_cog.xlsx')

In [37]:
# Count how many strains show enrichment (P-Value < 0.05) for each category
enrichment_summary = enrichment_df[enrichment_df["P-Value"] < 0.05].groupby("Category")["Strain"].nunique().reset_index()

# Rename columns for clarity
enrichment_summary.columns = ["Category", "Number of Enriched Strains"]

In [38]:
# Get the total number of strains from cog_summary_overall
total_strains = cog_summary_overall["Strain"].nunique()

# Calculate the percentage of strains in which each category is enriched
enrichment_summary["Percentage Enriched"] = (enrichment_summary["Number of Enriched Strains"] / total_strains) * 100

In [39]:
enrichment_summary

Unnamed: 0,Category,Number of Enriched Strains,Percentage Enriched
0,G,4,36.363636
1,K,1,9.090909
2,L,9,81.818182
3,M,1,9.090909
4,Q,8,72.727273
5,T,4,36.363636
6,V,6,54.545455


In [40]:
enrichment_summary.to_excel('/results/Actinokineospora_enrichment_summary_TTA_significant_percentages.xlsx')