In [1]:
from Bio import SeqIO
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from os import path
from pymodulon.io import *
from pymodulon.plotting import *
import os

---
According to the web search results, not all of the biosynthetic gene clusters in *Streptomyces* produce antibiotics. Some of them are involved in the production of other types of natural products, such as pigments, terpenes, siderophores, and osmolytes. Here is a list of the biosynthetic gene clusters that are known or predicted to produce antibiotics, along with their corresponding compounds and references:

- **SGR PTMs**: This gene cluster is responsible for the biosynthesis of **SGR PTMs**, a group of post-translationally modified peptides with antibacterial activity against Gram-positive bacteria¹.
- **Paulomycin**: This gene cluster encodes the enzymes and regulators for the synthesis of **paulomycin**, a glycosylated polyene macrolide with antifungal and antitumor properties².
- **Cyclofaulknamycin**: This gene cluster is involved in the formation of **cyclofaulknamycin**, a cyclic peptide with antibacterial activity against methicillin-resistant Staphylococcus aureus (MRSA)³.
- **synechobactin**: This gene cluster directs the production of **synechobactin**, a siderophore with antimicrobial activity against Gram-negative bacteria⁴.
- **fluostatins**: This gene cluster is responsible for the biosynthesis of **fluostatins**, a family of polyketides with antifungal and cytotoxic activities⁵.
- **goadsporin**: This gene cluster encodes the enzymes and regulators for the synthesis of **goadsporin**, a linear heptapeptide with antibacterial activity against Gram-positive bacteria⁶.
- **SAL-2242**: This gene cluster is involved in the formation of **SAL-2242**, a cyclic lipopeptide with antibacterial activity against Gram-positive bacteria⁷.
- **WS9326**: This gene cluster directs the production of **WS9326**, a cyclic peptide with antibacterial activity against Gram-positive bacteria.
- **surugamide**: This gene cluster is responsible for the biosynthesis of **surugamide**, a cyclic lipopeptide with antibacterial activity against Gram-positive bacteria.
- **dudomycin A**: This gene cluster encodes the enzymes and regulators for the synthesis of **dudomycin A**, a glycosylated anthracycline with antitumor activity.
- **desferrioxamin B**: This gene cluster is involved in the formation of **desferrioxamin B**, a siderophore with antibacterial activity against Gram-negative bacteria.
- **minimycin**: This gene cluster directs the production of **minimycin**, a glycopeptide with antibacterial activity against Gram-positive bacteria.
- **streptamidine**: This gene cluster is responsible for the biosynthesis of **streptamidine**, an aminoglycoside with antibacterial activity against Gram-negative bacteria.
- **candicidin**: This gene cluster encodes the enzymes and regulators for the synthesis of **candicidin**, a polyene macrolide with antifungal activity.
- **antimycin**: This gene cluster is involved in the formation of **antimycin**, a depsipeptide with antifungal and antitumor activities.

The other biosynthetic gene clusters in *Streptomyces* do not produce antibiotics, but their products may have other biological functions or applications. For example:

- **hopene**: This gene cluster is responsible for the biosynthesis of **hopene**, a pentacyclic triterpene that serves as a membrane lipid and a precursor for hopanoids.
- **hexacosalactone A**: This gene cluster encodes the enzymes and regulators for the synthesis of **hexacosalactone A**, a butyrolactone that acts as an autoregulator for secondary metabolism in *Streptomyces*.
- **geosmin**: This gene cluster is involved in the formation of **geosmin**, a sesquiterpene that gives *Streptomyces* their characteristic earthy odor and flavor.
- **julichrome**: This gene cluster directs the production of **julichrome**, a siderophore that binds iron and enhances growth and sporulation in *Streptomyces*.
- **ectoine**: This gene cluster is responsible for the biosynthesis of **ectoine**, an osmolyte that protects *Streptomyces* from salt stress and dehydration.
- **isorenieratene**: This gene cluster encodes the enzymes and regulators for the synthesis of **isorenieratene**, a carotenoid that gives *Streptomyces* their yellow-orange color and protects them from oxidative damage.
- **valinomycin**: This gene cluster is involved in the formation of **valinomycin**, a cyclic depsipeptide that acts as a potassium ionophore and disrupts the membrane potential of cells.
---

In [4]:
# Directory holding the processed data, relative to this notebook's location.
# Reused by later cells when reading and writing CSV files.
data_dir = path.join('../..','data','processed')
# Load the saved model object from the gzipped JSON file via pymodulon's loader.
ica_data = load_json_model(path.join(data_dir,'salb.json.gz'))

In [6]:
# The iModulon names are taken from the 'Unnamed: 0' column of imodulon_table.csv
# (presumably the table's saved index column -- verify against the file).
imodulon_names_df = pd.read_csv(path.join(data_dir, 'imodulon_table.csv'))
imodulon_names = imodulon_names_df['Unnamed: 0'].tolist()

# One (gene, iModulon) tuple for every row label of each view_imodulon() table,
# in the same order as the names appear in the CSV.
all_genes = [
    (gene, imodulon_name)
    for imodulon_name in imodulon_names
    for gene in ica_data.view_imodulon(imodulon_name).index.tolist()
]

# Long-format table: one row per gene/iModulon membership.
df = pd.DataFrame(all_genes, columns=['Gene', 'Imodulon'])

# Persist next to the other processed data, without the positional index.
df.to_csv(path.join(data_dir,'genes_with_imodulon.csv'), index=False)

In [10]:
# Paths of the antiSMASH region GenBank files (regions 001-023 of NC_020990.1).
# They are built relative to the notebook instead of using machine-specific
# absolute paths, so the cell runs on any checkout of the repository.
# NOTE(review): assumes all 23 region files sit in the same directory as
# region001 (data/external/S_albus_antismash) -- confirm before committing.
antismash_dir = path.join("..", "..", "data", "external", "S_albus_antismash")
gene_cluster_files = [
    path.join(antismash_dir, f"NC_020990.1.region{region:03d}.gbk")
    for region in range(1, 24)
]

# List to store gene information (one {"locus_tag": ...} dict per CDS feature)
genes_list = []

# Iterate over each gene cluster file
for gene_cluster_file in gene_cluster_files:
    with open(gene_cluster_file, "r") as file:
        # Each region file is expected to hold exactly one record;
        # SeqIO.read raises otherwise.
        record = SeqIO.read(file, "genbank")

    # The record is fully parsed at this point, so the file can already be closed.
    for feature in record.features:
        if feature.type == "CDS":
            # Keep only the locus tag of the coding sequence
            gene = {"locus_tag": feature.qualifiers["locus_tag"][0]}
            genes_list.append(gene)

# Print the extracted gene information
for gene in genes_list:
    print(gene)

# Write the extracted gene information to a file (one dict repr per line,
# created in the current working directory)
with open("gene_list.txt", "w") as file:
    for gene in genes_list:
        file.write(str(gene) + "\n")


{'locus_tag': 'XNR_RS00010'}
{'locus_tag': 'XNR_RS00015'}
{'locus_tag': 'XNR_RS00020'}
{'locus_tag': 'XNR_RS00025'}
{'locus_tag': 'XNR_RS00030'}
{'locus_tag': 'XNR_RS00035'}
{'locus_tag': 'XNR_RS00040'}
{'locus_tag': 'XNR_RS00045'}
{'locus_tag': 'XNR_RS00050'}
{'locus_tag': 'XNR_RS00055'}
{'locus_tag': 'XNR_RS00060'}
{'locus_tag': 'XNR_RS00065'}
{'locus_tag': 'XNR_RS00070'}
{'locus_tag': 'XNR_RS00075'}
{'locus_tag': 'XNR_RS00080'}
{'locus_tag': 'XNR_RS00085'}
{'locus_tag': 'XNR_RS00090'}
{'locus_tag': 'XNR_RS31480'}
{'locus_tag': 'XNR_RS00100'}
{'locus_tag': 'XNR_RS31485'}
{'locus_tag': 'XNR_RS00110'}
{'locus_tag': 'XNR_RS31490'}
{'locus_tag': 'XNR_RS00115'}
{'locus_tag': 'XNR_RS00120'}
{'locus_tag': 'XNR_RS00125'}
{'locus_tag': 'XNR_RS00130'}
{'locus_tag': 'XNR_RS00135'}
{'locus_tag': 'XNR_RS00140'}
{'locus_tag': 'XNR_RS00145'}
{'locus_tag': 'XNR_RS00150'}
{'locus_tag': 'XNR_RS00155'}
{'locus_tag': 'XNR_RS00160'}
{'locus_tag': 'XNR_RS30400'}
{'locus_tag': 'XNR_RS00165'}
{'locus_tag': 

In [5]:
# Load the BGC gene list and the gene/iModulon membership table
df_bgc = pd.read_csv("../../data/external/S_albus_antismash/bgc_gene_list.csv")
df_imodulon = pd.read_csv("../../data/processed/genes_with_imodulon.csv")

# genes_with_imodulon.csv is written by this notebook with a 'Gene' column,
# while the merge key below is 'gene_id'. Normalise the column name so the
# merge does not fail with a KeyError on a fresh run; rename() leaves the
# frame unchanged when the file already uses 'gene_id'.
df_imodulon = df_imodulon.rename(columns={"Gene": "gene_id"})

# Outer merge keeps genes that appear in only one of the two tables
# (their missing BGC or Imodulon value becomes NaN)
df_merged = pd.merge(df_bgc, df_imodulon, on="gene_id", how="outer")


print(df_merged)

# Write the merged DataFrame to a CSV file
file_path = os.path.join(data_dir, "merged_gene_list.csv")
df_merged.to_csv(file_path, index=False)

          gene_id  BGC     Imodulon
0     XNR_RS00010  1.0   DNA Damage
1     XNR_RS00015  1.0          NaN
2     XNR_RS00020  1.0          NaN
3     XNR_RS00025  1.0          NaN
4     XNR_RS00030  1.0   DNA Damage
...           ...  ...          ...
2985  XNR_RS27840  NaN        UC-14
2986  XNR_RS28670  NaN        UC-14
2987  XNR_RS28680  NaN        UC-14
2988  XNR_RS12320  NaN  hrt Complex
2989  XNR_RS12325  NaN  hrt Complex

[2990 rows x 3 columns]


In [30]:
# Count the genes shared by every (BGC, iModulon) pair. Rows whose BGC or
# Imodulon is NaN are dropped by groupby, so only genes present in both
# tables contribute. The count is stored as 'total_bgc_genes'.
grouped = df_merged.groupby(['BGC', 'Imodulon'])
total_genes = (
    grouped['gene_id']
    .count()
    .reset_index()
    .rename(columns={'gene_id': 'total_bgc_genes'})
)

# Number of non-null gene_id rows per iModulon across the whole merged table
gene_counts = df_merged.groupby('Imodulon')['gene_id'].count()

# Attach the per-iModulon count to each (BGC, iModulon) row
result = pd.merge(
    total_genes,
    gene_counts.rename('genes_in_imodulon'),
    left_on='Imodulon',
    right_index=True,
    how='left',
)

# Share of each iModulon's genes that fall inside the given BGC, in percent.
# NOTE(review): the denominator is the iModulon size, not the BGC size --
# confirm this is the intended direction of the ratio.
result = result.assign(
    percentage=lambda frame: frame['total_bgc_genes'] / frame['genes_in_imodulon'] * 100
)

# Show the table
print(result)

# Save the table alongside the other processed data
file_path = os.path.join(data_dir, "result.csv")
result.to_csv(file_path, index=False)

      BGC              Imodulon  total_bgc_genes  genes_in_imodulon  \
0     1.0  BGC-1 and 23 Related               11                 17   
1     1.0           BGC-General                7                197   
2     1.0      Del14, Unknown-5                1                187   
3     1.0              Fructose                5                  5   
4     1.0           Nuo Complex                2                183   
..    ...                   ...              ...                ...   
251  23.0            Unknown-15                9                241   
252  23.0            Unknown-16                4                514   
253  23.0            Unknown-17                1                411   
254  23.0             Unknown-2                1                248   
255  23.0             Unknown-9                5                431   

     percentage  
0     64.705882  
1      3.553299  
2      0.534759  
3    100.000000  
4      1.092896  
..          ...  
251    3.734440  
252