In [6]:
import os
from Bio import SeqIO
import pandas as pd
from tqdm import tqdm

In [7]:
# Scan every genus folder under base_dir, parse each GenBank file found there and
# record, for every GenBank record, how many annotated tRNA features carry the
# anticodon TCT (reads codon AGA) or TAA (reads codon TTA).
# Result: trna_df with one row per GenBank record (not per file).

# Root directory; each sub-directory is treated as one genus.
base_dir = '/input/all_genbank/'

# Row dicts collected during the scan, converted to a DataFrame at the end.
records = []

# Keep only the entries of base_dir that are directories.
genus_folders = []
for entry in os.listdir(base_dir):
    if os.path.isdir(os.path.join(base_dir, entry)):
        genus_folders.append(entry)

# Pre-count the GenBank files so the progress bar knows its total.
total_files = 0
for genus in genus_folders:
    for f in os.listdir(os.path.join(base_dir, genus)):
        if f.endswith(('.gbff', '.gbk')):
            total_files += 1

# The progress bar advances once per file, while rows are appended once per record.
with tqdm(total=total_files, desc="Processing GenBank files") as pbar:
    for genus in genus_folders:
        genus_path = os.path.join(base_dir, genus)
        for file_name in os.listdir(genus_path):
            # Skip anything that is not a GenBank flat file.
            if not file_name.endswith(('.gbff', '.gbk')):
                continue

            file_path = os.path.join(genus_path, file_name)
            # Identifier = first two underscore-separated tokens of the file name.
            gcf_id = "_".join(file_name.split("_", 2)[:2])

            for record in SeqIO.parse(file_path, "genbank"):
                organism = record.annotations.get("organism", "Unknown")

                # First anticodon qualifier (lower-cased) of every tRNA feature that has one.
                anticodons = [
                    feature.qualifiers['anticodon'][0].lower()
                    for feature in record.features
                    if feature.type == "tRNA" and "anticodon" in feature.qualifiers
                ]

                # AGA codon = anticodon 'TCT'
                aga_count = sum("seq:tct" in a for a in anticodons)
                # TTA codon = anticodon 'TAA'; only counted when the TCT test did not match.
                tta_count = sum(
                    "seq:tct" not in a and "seq:taa" in a for a in anticodons
                )

                records.append(dict(
                    Genus=genus,
                    Organism=organism,
                    GCF_ID=gcf_id,
                    tRNA_AGA=aga_count,
                    tRNA_TTA=tta_count,
                ))

            pbar.update()

# Assemble the collected rows into a DataFrame.
trna_df = pd.DataFrame(data=records)

Processing GenBank files: 100%|█████████████| 1937/1937 [11:15<00:00,  2.87it/s]


ModuleNotFoundError: No module named 'ace_tools'

In [11]:
# Collapse trna_df to one row per (Genus, Organism, GCF_ID).
# trna_df holds one row per GenBank record, so an assembly split over several
# records (e.g. contigs or plasmids) contributes several rows. The counts are summed
# so that tRNA genes located on different records of the same assembly add up;
# taking the max would report only the single record with the most hits.
# NOTE(review): assumes each assembly is present in only one file per genus folder —
# a duplicated file (e.g. both .gbk and .gbff) would be counted twice; confirm.
df_unique = trna_df.groupby(['Genus', 'Organism', 'GCF_ID'], as_index=False)[['tRNA_AGA', 'tRNA_TTA']].sum()

In [15]:
# Write the per-strain table to an Excel workbook, omitting the DataFrame index column.
# NOTE(review): the destination is an absolute, user-specific path — confirm it exists
# on the machine running the notebook.
df_unique.to_excel(
    excel_writer='/Users/annasve/Desktop/data/ncbi_genomes/tRNA_per_strain.xlsx',
    index=False,
)

In [13]:
# Preview the first 50 rows of the per-strain table.
df_unique.iloc[:50]

Unnamed: 0,Genus,Organism,GCF_ID,tRNA_AGA,tRNA_TTA
0,Actinokineospora,Actinokineospora alba,GCF_004362515.1,1,1
1,Actinokineospora,Actinokineospora baliensis,GCF_016907695.1,1,1
2,Actinokineospora,Actinokineospora cianjurensis,GCF_003663795.1,1,1
3,Actinokineospora,Actinokineospora diospyrosa,GCF_024171925.1,1,1
4,Actinokineospora,Actinokineospora fastidiosa,GCF_014648415.1,1,1
5,Actinokineospora,Actinokineospora globicatena,GCF_030268905.1,1,1
6,Actinokineospora,Actinokineospora globicatena,GCF_030268945.1,1,1
7,Actinokineospora,Actinokineospora iranica,GCF_900101685.1,1,1
8,Actinokineospora,Actinokineospora sp. UTMC 2448,GCF_024760565.1,1,1
9,Actinokineospora,Actinokineospora spheciospongiae,GCF_003182415.1,1,1
