In [1]:
from Bio import SeqIO
import os
import pandas as pd

### For each GenBank file, create a file with its antiSMASH summary

In [2]:
# Set the base folder path.
# The base folder contains one sub-folder per input GenBank file, each holding
# the antiSMASH output for that input.
base_folder_path = "/input/antiSMASH_8/Actinokineospora"

# Columns present in every output table; the numbered Cand_Cluster_<i> /
# Cluster_Product_<i> pairs are appended after these.
BASE_COLUMNS = [
    "Locus_ID",
    "CDS_Product",
    "Gene_Kind",
    "Is_Core_Gene",
    "Region_Nr",
    "Strain",
]


def is_region_genbank(filename):
    """Return True if `filename` is a GenBank file that should be summarised.

    Only ``.gbk`` files whose names do not start with ``GCF`` or ``NBC`` are
    processed (presumably those prefixes mark the whole-genome files -- confirm).
    """
    return filename.endswith(".gbk") and not filename.startswith(("GCF", "NBC"))


def strain_from_folder(folder_name):
    """Return the strain label: the first two ``_``-separated tokens of the folder name."""
    return "_".join(folder_name.split("_")[:2])


def ordered_columns(max_cand_clusters):
    """Return the output column order for a table with `max_cand_clusters` candidate clusters.

    The base columns come first, followed by alternating
    ``Cand_Cluster_<i>`` / ``Cluster_Product_<i>`` pairs for i = 1..max_cand_clusters.
    """
    columns = list(BASE_COLUMNS)
    for i in range(1, max_cand_clusters + 1):
        columns.append(f"Cand_Cluster_{i}")
        columns.append(f"Cluster_Product_{i}")
    return columns


def summarise_record(record, strain):
    """Collect one row per CDS feature of a single GenBank record.

    Parameters
    ----------
    record : object with a ``features`` list (e.g. a Biopython ``SeqRecord``)
    strain : str
        Label stored in the ``Strain`` column of every row.

    Returns
    -------
    (rows, max_cand_clusters) : (list[dict], int)
        ``rows`` holds the base columns for every CDS plus
        ``Cand_Cluster_<i>`` / ``Cluster_Product_<i>`` keys for each candidate
        cluster that fully contains the CDS. ``max_cand_clusters`` is the
        highest candidate cluster number seen in the record (0 if none).

    Candidate clusters are matched against CDS features of the same record
    only, because feature coordinates and cluster numbers from different
    records are not comparable.
    """
    region_number = None
    cand_clusters = {}
    max_cand_clusters = 0

    # First pass: region number and candidate cluster boundaries.
    for feature in record.features:
        if feature.type == "region" and "region_number" in feature.qualifiers:
            region_number = feature.qualifiers["region_number"][0]
        elif feature.type.lower() == "cand_cluster":
            cc_number = feature.qualifiers.get("candidate_cluster_number", [None])[0]
            try:
                cc_index = int(cc_number)
            except (TypeError, ValueError):
                # Without a numeric cluster number there is no numbered column
                # to put this cluster in, so skip it instead of crashing.
                continue
            cand_clusters[cc_number] = {
                "index": cc_index,
                "start_pos": int(feature.location.start),
                "end_pos": int(feature.location.end),
                "products": ", ".join(feature.qualifiers.get("product", [])),
            }
            max_cand_clusters = max(max_cand_clusters, cc_index)

    # Second pass: one row per CDS, tagged with every cluster that contains it.
    rows = []
    for feature in record.features:
        if feature.type != "CDS":
            continue
        qualifiers = feature.qualifiers
        gene_kind = qualifiers.get("gene_kind", ["unknown"])[0]
        cds_start, cds_end = int(feature.location.start), int(feature.location.end)

        row = {
            "Locus_ID": qualifiers.get("locus_tag", ["unknown"])[0],
            "CDS_Product": qualifiers.get("product", ["unknown"])[0],
            "Gene_Kind": gene_kind,
            "Is_Core_Gene": gene_kind == "biosynthetic",
            "Region_Nr": region_number,
            "Strain": strain,
        }
        for cc_number, cc_info in cand_clusters.items():
            if cc_info["start_pos"] <= cds_start and cds_end <= cc_info["end_pos"]:
                row[f"Cand_Cluster_{cc_info['index']}"] = cc_number
                row[f"Cluster_Product_{cc_info['index']}"] = cc_info["products"]
        rows.append(row)

    return rows, max_cand_clusters


def build_table(rows, max_cand_clusters):
    """Turn CDS rows into a DataFrame with the canonical column order.

    Missing values, ``None`` and empty strings are all written as the string
    ``"None"`` so that every cell of the exported CSV is populated.
    """
    columns = ordered_columns(max_cand_clusters)
    cleaned_rows = [
        {
            column: "None" if row.get(column) in (None, "") else row[column]
            for column in columns
        }
        for row in rows
    ]
    return pd.DataFrame(cleaned_rows, columns=columns)


# One DataFrame per antiSMASH output folder, kept for concatenation below
summary_data_list = []
overall_max_cand_clusters = 0

# Traverse the folder structure
for root, dirs, files in os.walk(base_folder_path):
    dirs.sort()  # visit sub-folders in a reproducible order
    gbk_filenames = sorted(name for name in files if is_region_genbank(name))
    if not gbk_filenames:
        continue

    # Extract strain name from folder structure
    strain = strain_from_folder(os.path.basename(root))

    # Gather the rows of *all* matching files in this folder before writing:
    # the output path depends only on the folder, so writing once per file
    # would overwrite the CSV and keep just the last file's rows.
    folder_rows = []
    folder_max_cand_clusters = 0
    for filename in gbk_filenames:
        filepath = os.path.join(root, filename)
        for record in SeqIO.parse(filepath, "genbank"):
            record_rows, record_max = summarise_record(record, strain)
            folder_rows.extend(record_rows)
            folder_max_cand_clusters = max(folder_max_cand_clusters, record_max)

    df_candidate_clusters = build_table(folder_rows, folder_max_cand_clusters)

    # Save the folder-specific table next to the GenBank files
    strain_file_path = os.path.join(root, f"{strain}_antismash.csv")
    df_candidate_clusters.to_csv(strain_file_path, index=False)

    summary_data_list.append(df_candidate_clusters)
    overall_max_cand_clusters = max(overall_max_cand_clusters, folder_max_cand_clusters)

# Save the summary DataFrame in the base folder
summary_file_path = os.path.join(base_folder_path, "summary_antismash_Actinokineospora.csv")

if summary_data_list:
    # Folders can have different numbers of candidate clusters; concat leaves
    # NaN in the columns a folder lacks, so apply the same "None" marker and
    # column order used in the per-folder tables.
    df_summary = (
        pd.concat(summary_data_list, ignore_index=True)
        .reindex(columns=ordered_columns(overall_max_cand_clusters))
        .fillna("None")
    )
    df_summary.to_csv(summary_file_path, index=False)
else:
    # pd.concat([]) would raise an opaque ValueError; report the real problem.
    df_summary = pd.DataFrame(columns=BASE_COLUMNS)
    print(f"No matching GenBank files found under {base_folder_path}; no summary written.")
