In [None]:
import pandas as pd
from Bio import SeqIO
import os

In [None]:
# Directory that is scanned for "*_vir.fasta" inputs and that receives the generated GTF files.
datapath = "/data/cephfs-1/work/groups/buchauer/h5n1risk/genomes/individualGenomes/"

# os.listdir() returns entries in arbitrary order; sort them so every run
# processes the files (and fills the summary table) in the same order.
fasta_files = sorted(f for f in os.listdir(datapath) if f.endswith('_vir.fasta'))

# Each GTF is named after its FASTA: "<name>_vir.fasta" -> "<name>_vir.gtf".
# listdir() yields bare file names, so no basename() call is needed.
gtf_files = [os.path.splitext(f)[0] + ".gtf" for f in fasta_files]

In [None]:
# Build one summary row per sequence across all input FASTA files.
record_columns = ["input_fasta", "seq_id", "seq_length", "seq_description"]
all_records = []

for input_fasta in fasta_files:
    # os.path.join works whether or not datapath ends with a path separator.
    fasta_path = os.path.join(datapath, input_fasta)

    # Iterate the parser directly; there is no need to hold every record of a
    # file in memory just to loop over it once.
    for record in SeqIO.parse(fasta_path, "fasta"):
        all_records.append({
            "input_fasta": input_fasta,
            "seq_id": record.id,
            "seq_length": len(record.seq),
            # Keep only the text after the last " | " in the record description.
            "seq_description": record.description.split(" | ")[-1],
        })

# Create DataFrame. Passing the columns explicitly keeps the schema stable even
# when no records were found, so later column lookups (e.g. sorting by
# "seq_id") do not fail with a KeyError on an empty frame.
df = pd.DataFrame(all_records, columns=record_columns)

In [None]:
# Display the per-sequence summary ordered by sequence ID.
# sort_values() returns a new frame here; `df` itself keeps its original row order.
df.sort_values(by="seq_id")

In [None]:
# Write one GTF file per input FASTA. Every sequence is annotated with a
# "gene" and an "exon" feature that both span the full sequence on the "+" strand.
for input_fasta, output_gtf in zip(fasta_files, gtf_files):
    # os.path.join works whether or not datapath ends with a path separator.
    with open(os.path.join(datapath, output_gtf), "w") as out_gtf:
        for record in SeqIO.parse(os.path.join(datapath, input_fasta), "fasta"):
            gene_name = record.id
            chrom = record.id
            start = 1  # the feature begins at the first base
            end = len(record.seq)
            strand = "+"

            if end < start:
                # A zero-length sequence would give the interval 1..0, which is
                # not a valid GTF feature; skip it instead of writing a bad line.
                print(f"Skipping empty sequence {record.id} in {input_fasta}")
                continue

            # The sequence ID doubles as contig name, gene_id and transcript_id.
            for feature in ["gene", "exon"]:
                out_gtf.write(
                    f"{chrom}\tcustom\t{feature}\t{start}\t{end}\t.\t{strand}\t.\t"
                    f'gene_id "{gene_name}"; transcript_id "{gene_name}";\n'
                )

    # Report each GTF once it has been fully written and closed.
    print(f"{output_gtf}")