In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# File with TTA containing Actinokineospora genes 
file_path = '/results/Actinokineospora/all_first_TTA_Actinokineospora.csv'
TTA_df = pd.read_csv(file_path)

In [5]:
# File with eggnog redults for Actinokineospora
file_path = '/input/eggnog/Actinokineospora/concatenated_annotations.csv'
concatenated_df = pd.read_csv(file_path)

In [6]:
cog_df = pd.merge(TTA_df, concatenated_df, on = 'Locus_ID', how = 'left')

In [7]:
cog_df.columns

Index(['Locus_ID', 'Product', 'Location', 'UUA Codon Position',
       'Relative_Position', 'Gene_Length', 'UUA Codon Count', 'Start_Codon',
       'Stop_Codon', 'Strand', 'Sequence', 'Organism', 'GCF_ID',
       'seed_ortholog', 'evalue', 'score', 'eggNOG_OGs', 'max_annot_lvl',
       'COG_category', 'Description', 'Preferred_name', 'GOs', 'EC', 'KEGG_ko',
       'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE',
       'KEGG_TC', 'CAZy', 'BiGG_Reaction', 'PFAMs', 'Strain'],
      dtype='object')

In [10]:
columns_to_keep = ['Locus_ID', 'Product', 'Location', 'UUA Codon Position',
       'Relative_Position', 'Gene_Length', 'UUA Codon Count', 'Start_Codon',
       'Stop_Codon', 'Strand', 'Organism', 'GCF_ID',
       'COG_category','GOs', 'EC','Strain']

In [11]:
cog_df = cog_df[columns_to_keep]

In [12]:
cog_df.to_excel('/results/Actinokineospora/cog_TTA_Actinokineospora.xlsx')

### Count COGs per strain

In [13]:
# File with summary of how many genes there are for each category per each Actinokineospora genome
file_path = '/input/eggnog/Actinokineospora/cog_summary.xlsx'
cog_summary_overall = pd.read_excel(file_path, index_col = 'Strain')

In [14]:
cog_summary_overall

Unnamed: 0_level_0,-,A,BDLTU,BQ,C,CE,CG,CH,CI,CK,...,QT,QU,QV,S,T,U,UW,V,W,Z
Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCF_003182415.1,686,2,1,1,252,3,11,22,1,1,...,17,0,0,1254,219,40,0,108,0,0
GCF_003663795.1,703,2,1,2,246,2,10,26,1,1,...,8,1,0,1207,280,45,0,106,1,0
GCF_004362515.1,676,1,1,1,256,2,5,13,1,1,...,14,0,0,1273,207,50,0,100,0,0
GCF_014648415.1,614,2,1,2,284,2,11,18,2,1,...,18,0,0,1228,207,46,1,125,0,0
GCF_016907695.1,724,2,1,2,246,2,6,23,1,1,...,8,1,1,1226,270,44,0,117,0,0
GCF_024171925.1,722,2,1,2,252,2,8,25,1,1,...,10,1,0,1239,287,49,0,118,0,1
GCF_024760565.1,612,1,1,2,270,2,11,19,2,1,...,18,0,0,1216,194,45,1,116,0,0
GCF_030268905.1,693,2,1,1,246,1,8,32,1,1,...,6,1,0,1231,287,47,0,115,0,0
GCF_030268945.1,711,2,1,1,243,2,10,26,1,1,...,6,1,0,1195,269,41,0,116,0,0
GCF_900101685.1,703,2,1,1,228,2,8,15,1,1,...,16,0,0,1199,185,32,0,85,0,0


### Count TTA-containing genes per category per genome

In [15]:
def count_cog_categories_per_strain(df, cog_column="COG_category", strain_column="GCF_ID"):
    """
    Count occurrences of each COG category for each strain.
    
    Parameters:
        df (pd.DataFrame): DataFrame containing the data.
        cog_column (str): Name of the column containing COG categories.
        strain_column (str): Name of the column containing strain names.
    
    Returns:
        pd.DataFrame: Summary DataFrame with counts of each COG category per strain.
    """
    if cog_column not in df.columns or strain_column not in df.columns:
        raise ValueError(f"'{cog_column}' or '{strain_column}' column not found in the DataFrame.")

    # Standardize and fill missing values
    df[strain_column] = df[strain_column].astype(str).str.strip()
    df[cog_column] = df[cog_column].fillna("Unknown")

    # Group by strain and COG category and count occurrences
    summary_df = df.groupby([strain_column, cog_column]).size().reset_index(name="Count")

    # Pivot table
    summary_df = summary_df.pivot(index=strain_column, columns=cog_column, values="Count").fillna(0).astype(int)

    # Ensure all strains are included
    all_strains = df[strain_column].unique()
    summary_df = summary_df.reindex(all_strains, fill_value=0)

    return summary_df

# Run the function
cog_summary_tta = count_cog_categories_per_strain(cog_df, cog_column="COG_category", strain_column="GCF_ID")

# Save the summary to a CSV file
output_file = '/results/cog_categories_summary_TTA.csv'
cog_summary_tta.to_csv(output_file)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[strain_column] = df[strain_column].astype(str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cog_column] = df[cog_column].fillna("Unknown")


In [16]:
# Keep only columns with single-letter COG categories
filtered_df = cog_summary_tta.loc[:, [col for col in cog_summary_tta.columns if len(col) == 1]]

In [17]:
# Remove underrepresented COG categories
filtered_df = filtered_df.drop(['-', 'S', 'B', 'N', 'A', 'Z', 'W'], axis=1, errors='ignore')

In [18]:
filtered_df

COG_category,C,D,E,F,G,H,I,J,K,L,M,O,P,Q,T,U,V
GCF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
GCF_900111175.1,16,2,26,2,27,5,15,10,56,27,20,19,20,39,37,3,20
GCF_004362515.1,11,4,24,9,28,10,8,6,46,21,23,12,15,20,20,2,10
GCF_024760565.1,6,1,4,4,10,3,6,3,28,20,10,6,5,7,10,2,6
GCF_900101685.1,10,7,29,2,10,8,11,6,48,32,22,7,8,23,17,3,12
GCF_014648415.1,7,0,6,4,9,3,7,3,30,13,8,5,7,7,14,3,8
GCF_030268905.1,22,6,38,4,32,12,19,9,77,32,27,20,23,48,38,6,18
GCF_003663795.1,15,1,26,5,22,5,18,6,55,26,23,17,18,40,38,2,16
GCF_030268945.1,18,3,37,4,33,14,19,11,75,27,28,17,30,48,44,3,18
GCF_003182415.1,3,2,4,2,4,1,2,1,17,7,3,4,2,11,4,0,5
GCF_024171925.1,19,6,40,7,23,15,24,7,98,34,40,31,24,42,35,6,24
