In [5]:
import os
import pandas as pd
from Bio import SeqIO
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import re

### TTA codon search in CDS features (reverse-strand genes are reverse-complemented on extraction)

In [6]:
def find_TTA_genes_summary(genbank_file):
    """Scan every CDS feature of a GenBank file for in-frame TTA codons.

    Each CDS is extracted from its record, trimmed according to its
    ``codon_start`` qualifier, upper-cased and split into codons. A trailing
    TAA/TAG/TGA codon is dropped before searching, so a stop codon is never
    reported as a hit. CDSs whose trimmed length is not a multiple of three
    are not searched; their locus IDs are logged instead.

    Parameters
    ----------
    genbank_file : path or handle accepted by ``SeqIO.parse(..., "genbank")``

    Returns
    -------
    tuple of five pandas DataFrames, in this order:
        summary_df
            One row: "Total Genes" (unique CDS locus tags), "Genes with TTA
            Codons" (unique locus tags with at least one TTA),
            "Percentage with TTA Codons", "Organism".
        TTA_genes_df
            One row per TTA occurrence.
        most_common_products_df
            TTA occurrences per product (products named "Unknown" are not
            counted), sorted by "Count" descending.
        first_TTA_df
            One row per TTA-containing CDS, describing its first TTA.
        skipped_sequences_df
            Locus IDs of CDSs skipped because of their length.
    """
    target_codon = "TTA"  # any other codon can be substituted here
    valid_stop_codons = {"TAA", "TAG", "TGA"}

    TTA_genes = []
    first_TTA_appearance = []
    product_counter = Counter()
    unique_locus_ids = set()
    TTA_locus_ids = set()
    skipped_sequences = []
    organism_name = None

    # Single pass over the file: the organism name is taken from the first
    # record, so no second parse (and no abandoned file handle) is needed.
    for record_index, record in enumerate(SeqIO.parse(genbank_file, "genbank")):
        if record_index == 0:
            organism_name = record.annotations.get("organism", "Unknown")

        for feature in record.features:
            if feature.type != "CDS":
                continue

            # CDSs without a locus_tag all share the ID "Unknown".
            locus_id = feature.qualifiers.get("locus_tag", ["Unknown"])[0]
            unique_locus_ids.add(locus_id)

            # codon_start is 1-based in GenBank; convert to a 0-based offset.
            codon_start = int(feature.qualifiers.get("codon_start", [1])[0]) - 1
            coding_sequence_str = str(feature.extract(record.seq)[codon_start:]).upper()

            # A length that is not a multiple of three cannot be read in frame.
            if len(coding_sequence_str) % 3 != 0:
                skipped_sequences.append(locus_id)
                continue

            codons = [
                coding_sequence_str[i:i + 3]
                for i in range(0, len(coding_sequence_str), 3)
            ]

            # The last codon is reported as "Stop_Codon" whatever it is, but it
            # is only removed from the search when it is a real stop codon.
            stop_codon = codons[-1] if codons else None
            if stop_codon in valid_stop_codons:
                codons = codons[:-1]

            hit_indices = [i for i, codon in enumerate(codons) if codon == target_codon]
            if not hit_indices:
                continue

            TTA_locus_ids.add(locus_id)
            TTA_count = len(hit_indices)
            product = feature.qualifiers.get("product", ["Unknown"])[0]
            location = f"{feature.location.start}:{feature.location.end}"
            strand = "Forward" if feature.location.strand == 1 else "Reverse"
            gene_length = len(coding_sequence_str)

            rows = [
                {
                    "Locus_ID": locus_id,
                    "Product": product,
                    "Location": location,
                    "TTA Codon Position": i + 1,
                    "Relative_Position": (i / len(codons)) * 100,
                    "Gene_Length": gene_length,
                    "TTA Codon Count": TTA_count,
                    "Start_Codon": codons[0],
                    "Stop_Codon": stop_codon,
                    "Strand": strand,
                    "Sequence": coding_sequence_str,
                    "Organism": organism_name,
                }
                for i in hit_indices
            ]
            TTA_genes.extend(rows)
            # Copy so the two result tables never share a row object.
            first_TTA_appearance.append(dict(rows[0]))

            # Products are tallied once per TTA occurrence.
            if product != "Unknown":
                product_counter[product] += TTA_count

    # Summary statistics: count GENES (unique locus IDs), not TTA occurrences,
    # so the percentage is bounded by 100 and matches the denominator.
    total_genes = len(unique_locus_ids)
    num_TTA_genes = len(TTA_locus_ids)
    percentage_TTA = (num_TTA_genes / total_genes) * 100 if total_genes > 0 else 0

    summary_df = pd.DataFrame({
        "Total Genes": [total_genes],
        "Genes with TTA Codons": [num_TTA_genes],
        "Percentage with TTA Codons": [percentage_TTA],
        "Organism": [organism_name]
    })

    TTA_genes_df = pd.DataFrame(TTA_genes)
    TTA_genes_df["Organism"] = organism_name

    first_TTA_df = pd.DataFrame(first_TTA_appearance)

    most_common_products_df = pd.DataFrame(product_counter.items(), columns=["Product", "Count"])
    most_common_products_df["Organism"] = organism_name
    most_common_products_df = most_common_products_df.sort_values(by="Count", ascending=False).reset_index(drop=True)

    skipped_sequences_df = pd.DataFrame({"Skipped Locus_ID": skipped_sequences})

    return summary_df, TTA_genes_df, most_common_products_df, first_TTA_df, skipped_sequences_df


In [7]:
# Input folder holding the GenBank files, and the folder intended for results.
main_folder = '/Users/annasve/Desktop/data/ncbi_genomes/all_genbank/Streptomyces/'
output_path = '/Users/annasve/Desktop/data/ncbi_genomes/analysis/rare_codon_percentage_and_location/Streptomyces/TTA/'

# Accumulators: each list collects one DataFrame per processed GenBank file.
all_summary_df, all_TTA_genes_df, all_first_TTA_df = [], [], []
all_most_common_products_df, all_skipped_sequences = [], []

In [8]:
# Run the TTA analysis on every GenBank file in the main folder.
# The listing is sorted so the row order of the results is reproducible.
# NOTE: the all_* lists are appended to, so re-running this cell without
# re-running the cell that creates them duplicates every file's rows.
for filename in sorted(os.listdir(main_folder)):
    if not filename.endswith((".gbk", ".gbff")):  # Only process GenBank files
        continue

    file_path = os.path.join(main_folder, filename)

    # GCF ID = first two '_'-separated parts of the file name
    gcf_id = '_'.join(filename.split('_')[:2])

    try:
        (summary_df, TTA_genes_df, most_common_products_df,
         first_TTA_df, skipped_sequences_df) = find_TTA_genes_summary(file_path)

        organism_name = summary_df["Organism"].iloc[0]

        # Tag every per-file table with its assembly and organism so rows
        # stay traceable after concatenation (the skipped-sequence log too).
        for frame in (summary_df, TTA_genes_df, first_TTA_df,
                      most_common_products_df, skipped_sequences_df):
            frame["GCF_ID"] = gcf_id
            frame["Organism"] = organism_name

        all_summary_df.append(summary_df)
        all_TTA_genes_df.append(TTA_genes_df)
        all_first_TTA_df.append(first_TTA_df)
        all_skipped_sequences.append(skipped_sequences_df)
        all_most_common_products_df.append(most_common_products_df)
    except Exception as e:
        # Best-effort batch run: report the failing file and keep going.
        print(f"Error processing file {filename}: {e}")


def _concat_frames(frames):
    """Concatenate per-file frames; return an empty DataFrame when there are none."""
    # pd.concat raises ValueError on an empty list, so handle that case here.
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


if not all_summary_df:
    print(f"No GenBank files were processed successfully in {main_folder}")

# Concatenate all dataframes; the final_* names are always defined afterwards.
final_summary_df = _concat_frames(all_summary_df)
final_TTA_genes_df = _concat_frames(all_TTA_genes_df)
final_most_common_products_df = _concat_frames(all_most_common_products_df)
final_all_first_TTA_df = _concat_frames(all_first_TTA_df)
final_skipped_sequences_df = _concat_frames(all_skipped_sequences)

In [10]:
# Order genomes from the highest to the lowest 'Percentage with TTA Codons'
final_summary_df = final_summary_df.sort_values('Percentage with TTA Codons', ascending=False)

In [None]:
# Per-genome summary: total genes, genes with TTA codons, percentage.
# File name spelling fixed ("Actinokineospra" -> "Actinokineospora") so it
# matches the other exports written to the same folder.
final_summary_df.to_csv('/results/Actinokineospora/summary_TTA_Actinokineospora.csv', index=False)

In [12]:
# Write one row per TTA occurrence (all TTA-containing genes) to CSV
final_TTA_genes_df.to_csv(
    path_or_buf='/results/Actinokineospora/all_genes_TTA_Actinokineospora.csv',
    index=False,
)

In [13]:
# Write the per-product TTA occurrence counts to CSV
final_most_common_products_df.to_csv(
    path_or_buf='/results/Actinokineospora/gene_products_Actinokineospora.csv',
    index=False,
)

In [15]:
# First occurrence of TTA within each gene.
# Use the concatenated DataFrame: all_first_TTA_df is the Python list of
# per-file frames and has no .to_csv method.
final_all_first_TTA_df.to_csv('/results/Actinokineospora/all_first_TTA_Actinokineospora.csv', index=False)