In [29]:
def get_ids(filename):
    """Return the leading identifier of every non-blank line in a text file.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are ignored. For the remaining lines, the text before
    the first ":" is taken as the identifier (the whole line when it has
    no ":").

    Args:
        filename: Path of the text file to read.

    Returns:
        List of identifier strings, in file order (duplicates are kept).
    """
    with open(filename, "r") as handle:
        entries = (raw.strip() for raw in handle)
        # partition(":")[0] is everything before the first colon.
        return [entry.partition(":")[0] for entry in entries if entry]


In [39]:
from Bio import SeqIO
from textwrap import wrap

def extract_to_fasta(filename, id_list, output_fasta):
    """Write the CDS protein translations of selected GenBank records to FASTA.

    Records are read with ``SeqIO.parse(filename, "genbank")`` and kept only
    when ``record.name`` is one of ``id_list``. Every CDS feature of a kept
    record that carries a ``translation`` qualifier produces one entry::

        >{name}.{n}| product: {product} | {description}

    where ``n`` is the 1-based ordinal of the CDS within its record and
    ``product`` falls back to ``"unknown_product"`` when the qualifier is
    missing. Sequences are written 60 residues per line.

    CDS features without a ``translation`` qualifier are skipped (they would
    otherwise yield a header followed by an empty line), but they still
    advance ``n`` so the numbering of the remaining entries is unaffected.

    Args:
        filename: Path of the GenBank file to read.
        id_list: Iterable of record names to export.
        output_fasta: Path of the FASTA file to create (overwritten).
    """
    line_width = 60
    # Set membership avoids rescanning the whole ID list for every record.
    wanted = set(id_list)
    with open(output_fasta, "w") as out:
        for record in SeqIO.parse(filename, "genbank"):
            acc = record.name
            if acc not in wanted:
                continue
            definition = record.description
            cds_index = 0
            for feature in record.features:
                if feature.type != "CDS":
                    continue
                cds_index += 1
                translation = feature.qualifiers.get("translation", [""])[0]
                if not translation:
                    # No protein sequence to write; keep the counter, skip the entry.
                    continue
                product = feature.qualifiers.get("product", ["unknown_product"])[0]
                out.write(f">{acc}.{cds_index}| product: {product} | {definition}\n")
                # Fixed-width slicing: a sequence has no word boundaries to wrap on.
                for start in range(0, len(translation), line_width):
                    out.write(translation[start:start + line_width] + "\n")

In [74]:
from Bio import SeqIO
from textwrap import wrap
from Bio.Seq import Seq

def extract_to_fasta_noCDS(filename, id_list, output_fasta, min_length=10, line_width=50):
    """Translate the three forward reading frames of selected GenBank records.

    Records are read with ``SeqIO.parse(filename, "genbank")`` and kept only
    when ``record.name`` is one of ``id_list``. For each frame offset 0, 1
    and 2 the nucleotide sequence from that offset onward is translated with
    ``to_stop=True`` (translation ends at the first stop codon). Translations
    shorter than ``min_length`` residues are dropped; the others are written
    as::

        >{name}.n+{frame} | {description}

    followed by the protein sequence, ``line_width`` residues per line.

    Args:
        filename: Path of the GenBank file to read.
        id_list: Iterable of record names to export.
        output_fasta: Path of the FASTA file to create (overwritten).
        min_length: Minimum protein length to keep (default 10).
        line_width: Residues per FASTA sequence line (default 50).
    """
    # Set membership avoids rescanning the whole ID list for every record.
    wanted = set(id_list)
    with open(output_fasta, "w") as out:
        for record in SeqIO.parse(filename, "genbank"):
            acc = record.name
            if acc not in wanted:
                continue
            definition = record.description
            nucl_seq = str(record.seq)
            for frame in range(3):
                sub_seq = nucl_seq[frame:]
                # Trim to whole codons: translating a trailing partial codon
                # makes Biopython emit a "Partial codon" warning.
                sub_seq = sub_seq[:len(sub_seq) - len(sub_seq) % 3]
                prot_seq = str(Seq(sub_seq).translate(to_stop=True))
                if len(prot_seq) < min_length:
                    continue
                out.write(f">{acc}.n+{frame} | {definition}\n")
                # Fixed-width slicing: a sequence has no word boundaries to wrap on.
                for start in range(0, len(prot_seq), line_width):
                    out.write(prot_seq[start:start + line_width] + "\n")

In [40]:
# Export the CDS translations of the records named in the
# "conflictingannot" ID file from the GenBank file to FASTA.
conflict_annot_ids = get_ids("Astroviridae_15102025_conflictingannot.txt")
filename = "Astroviridae_15102025.gb"
output_fasta = "conflict_annot_ids.fasta"
extract_to_fasta(filename, conflict_annot_ids, output_fasta)


In [37]:
# Export the CDS translations of the records named in the
# "noannot_targetCDS" ID file from the GenBank file to FASTA.
not_annot_targetCDS_ids = get_ids("Astroviridae_15102025_noannot_targetCDS.txt")
filename = "Astroviridae_15102025.gb"
output_fasta = "not_annot_targetCDS_ids.fasta"
extract_to_fasta(filename, not_annot_targetCDS_ids, output_fasta)


In [75]:
# Records named in the "noannotCDS" ID file are exported as translations of
# their three forward reading frames instead of annotated CDS features.
not_annot_CDS_ids = get_ids("Astroviridae_15102025_noannotCDS.txt")
filename = "Astroviridae_15102025.gb"
output_fasta = "not_annot_CDS_ids.fasta"
extract_to_fasta_noCDS(filename, not_annot_CDS_ids, output_fasta)

In [129]:
import pandas as pd
from Bio import SeqIO
# Input table, the FASTA whose headers carry the product names, and the output path.
tsv_file = "interpro_выдача/not_annot_targetCDS_ids_part2.tsv"
fasta_file = "interpro_input/not_annot_targetCDS_ids_part2.fasta"
output_file = "not_annot_targetCDS_ids_part2_2.tsv"

df = pd.read_csv(tsv_file, sep="\t")
print(df["ID белка"])

# Build a lookup "<id>|" -> product from the FASTA headers.
fasta_dict = {}
for record in SeqIO.parse(fasta_file, "fasta"):
    description = record.description
    # The key is the text before the first '|'; when the header has no '|',
    # fall back to record.id. A trailing '|' is appended in both cases.
    head, bar, _ = description.partition("|")
    fasta_id = (head if bar else record.id) + "|"
    # Product is the text between "product:" and the next '|', if present.
    if "product:" in description:
        product = description.split("product:")[1].split("|")[0].strip()
    else:
        product = None
    fasta_dict[fasta_id] = product

# Attach the product to every row and keep only rows that received one.
df["product"] = df["ID белка"].map(fasta_dict)
df = df[df["product"].notna()]

# Move 'product' so that it becomes the second column.
cols = df.columns.tolist()
cols.remove("product")
cols.insert(1, "product")
df = df[cols]

df.to_csv(output_file, sep="\t", index=False)

0      OQ709194.1|
1      OQ709194.1|
2      OQ709194.1|
3      OQ709194.1|
4      OQ709194.1|
          ...     
540    MW347540.1|
541    MW347540.1|
542    MW347540.1|
543    MW347540.1|
544    MW347540.1|
Name: ID белка, Length: 545, dtype: object
