In [139]:
def get_ids(filename):
    """Return the leading identifier of every non-blank line in a text file.

    Each line is stripped of surrounding whitespace and blank lines are
    skipped. The identifier is the text before the first ':' (the whole
    stripped line when it contains no ':').

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: Identifiers in file order; duplicates are kept.
    """
    with open(filename, "r") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # The comprehension is fully consumed before the file is closed.
        return [entry.split(":")[0] for entry in stripped_lines if entry]


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [140]:
from Bio import SeqIO
from textwrap import wrap

def extract_to_fasta(filename, id_list, output_fasta):
    """Write the CDS translations of selected GenBank records to a FASTA file.

    Every record whose ``record.name`` is in ``id_list`` contributes one FASTA
    entry per CDS feature that carries a non-empty ``translation`` qualifier.
    Headers have the form ``>{name}.{n}| product: {product} | {description}``
    where ``n`` is the 1-based position of the CDS among the record's CDS
    features.

    CDS features without a translation are skipped instead of being written as
    a header followed by a blank line (an empty FASTA record). They still
    advance ``n`` so the identifiers of the remaining entries do not shift.

    Args:
        filename: Path to the GenBank file.
        id_list: Iterable of record names to export.
        output_fasta: Path of the FASTA file to create (overwritten).

    Returns:
        None. The result is written to ``output_fasta``.
    """
    wanted = set(id_list)  # constant-time membership checks per record
    with open(output_fasta, "w") as out:
        for record in SeqIO.parse(filename, "genbank"):
            acc = record.name
            if acc not in wanted:
                continue
            definition = record.description
            cds_index = 0
            for feature in record.features:
                if feature.type != "CDS":
                    continue
                cds_index += 1
                translation = feature.qualifiers.get("translation", [""])[0]
                seq_lines = wrap(translation, 60)
                if not seq_lines:
                    # No protein sequence available for this CDS; an entry
                    # without sequence lines would not be a valid record.
                    continue
                product = feature.qualifiers.get("product", ["unknown_product"])[0]
                header = f">{acc}.{cds_index}| product: {product} | {definition}"
                out.write(header + "\n")
                out.write("\n".join(seq_lines) + "\n")

In [74]:
from Bio import SeqIO
from textwrap import wrap
from Bio.Seq import Seq

def extract_to_fasta_noCDS(filename, id_list, output_fasta, min_length=10):
    """Translate selected GenBank records in three forward frames to FASTA.

    For every record whose ``record.name`` is in ``id_list`` the nucleotide
    sequence is translated starting at offsets 0, 1 and 2. Translation runs
    from the start of the frame up to the first stop codon
    (``to_stop=True``), so only the leading open stretch of each frame is
    reported. Products shorter than ``min_length`` residues are dropped.
    Headers have the form ``>{name}.n+{frame} | {description}`` and sequences
    are wrapped at 50 characters per line.

    Args:
        filename: Path to the GenBank file.
        id_list: Iterable of record names to export.
        output_fasta: Path of the FASTA file to create (overwritten).
        min_length: Minimum protein length (in residues) to keep.

    Returns:
        None. The result is written to ``output_fasta``.
    """
    wanted = set(id_list)  # constant-time membership checks per record
    with open(output_fasta, "w") as out:
        for record in SeqIO.parse(filename, "genbank"):
            acc = record.name
            if acc not in wanted:
                continue
            definition = record.description
            nucl_seq = str(record.seq)
            for frame in range(3):
                sub_seq = nucl_seq[frame:]
                # Drop the trailing partial codon: Biopython's translate()
                # emits a partial-codon warning when the length is not a
                # multiple of three. The resulting protein is unchanged.
                sub_seq = sub_seq[:len(sub_seq) - len(sub_seq) % 3]
                prot_seq = str(Seq(sub_seq).translate(to_stop=True))
                if len(prot_seq) < min_length:
                    continue
                header_prot = f">{acc}.n+{frame} | {definition}"
                out.write(header_prot + "\n")
                for line in wrap(prot_seq, 50):
                    out.write(line + "\n")

In [40]:
# Export CDS translations for the record IDs listed in the
# conflicting-annotation report.
conflict_annot_ids = get_ids("Astroviridae_15102025_conflictingannot.txt")
filename = "Astroviridae_15102025.gb"
output_fasta = "conflict_annot_ids.fasta"
extract_to_fasta(
    filename=filename,
    id_list=conflict_annot_ids,
    output_fasta=output_fasta,
)


In [37]:
# Export CDS translations for the record IDs listed in the
# "noannot_targetCDS" report.
not_annot_targetCDS_ids = get_ids("Astroviridae_15102025_noannot_targetCDS.txt")
filename = "Astroviridae_15102025.gb"
output_fasta = "not_annot_targetCDS_ids.fasta"
extract_to_fasta(
    filename=filename,
    id_list=not_annot_targetCDS_ids,
    output_fasta=output_fasta,
)


In [75]:
# Records listed in the "noannotCDS" report are exported as three-frame
# translations of the whole nucleotide sequence.
not_annot_CDS_ids = get_ids("Astroviridae_15102025_noannotCDS.txt")
filename = "Astroviridae_15102025.gb"
output_fasta = "not_annot_CDS_ids.fasta"
extract_to_fasta_noCDS(
    filename=filename,
    id_list=not_annot_CDS_ids,
    output_fasta=output_fasta,
)

In [129]:
import pandas as pd
from Bio import SeqIO
# Add a "product" column to the InterProScan table by looking up each
# protein ID in the headers of the FASTA file that was submitted.
tsv_file = "interpro_выдача/not_annot_targetCDS_ids_part2.tsv"
df = pd.read_csv(tsv_file, sep="\t")
fasta_file = "interpro_input/not_annot_targetCDS_ids_part2.fasta"
print(df["ID белка"])

# Maps "<header text before the first '|'>|" to the product parsed from the
# header (None when the header has no "product:" field).
fasta_dict = {}
for record in SeqIO.parse(fasta_file, "fasta"):
    description = record.description
    # Fall back to record.id when the header contains no '|'.
    fasta_id = (description.split("|", 1)[0] if "|" in description else record.id) + "|"
    product = (
        description.split("product:")[1].split("|")[0].strip()
        if "product:" in description
        else None
    )
    fasta_dict[fasta_id] = product

df["product"] = df["ID белка"].map(fasta_dict)
df = df.dropna(subset=["product"])

# Place 'product' in the second column.
cols = [name for name in df.columns.tolist() if name != "product"]
cols.insert(1, "product")
df = df[cols]
output_file = "not_annot_targetCDS_ids_part2_2.tsv"
df.to_csv(output_file, sep="\t", index=False)

0      OQ709194.1|
1      OQ709194.1|
2      OQ709194.1|
3      OQ709194.1|
4      OQ709194.1|
          ...     
540    MW347540.1|
541    MW347540.1|
542    MW347540.1|
543    MW347540.1|
544    MW347540.1|
Name: ID белка, Length: 545, dtype: object


In [135]:
import pandas as pd
# Remove MobiDBLite predictions from the part-2 InterProScan table.
# NOTE(review): the filtered table is written back over the input file, so
# the unfiltered table is not kept on disk.
tsv_file = "interpro_выдача/not_annot_targetCDS_ids_part2.tsv"
df = pd.read_csv(tsv_file, sep="\t")
keep_rows = df["Источник предсказания"] != "MobiDBLite"
df = df.loc[keep_rows].copy()
df.to_csv(tsv_file, sep="\t", index=False)

In [138]:
import pandas as pd
# Split each InterProScan table into two CSV files:
#   *_pfam.csv      - the Pfam rows of proteins that have a Pfam hit
#   *_not_pfam.csv  - all rows of proteins that have no Pfam hit at all
tsv_files = [
    "interpro_выдача/not_annot_targetCDS_ids_part1.tsv",
    "interpro_выдача/not_annot_targetCDS_ids_part2.tsv",
    "interpro_выдача/not_annot_CDS.tsv",
    "interpro_выдача/conflict_annot_ids.tsv",
]

for tsv_file in tsv_files:
    df = pd.read_csv(tsv_file, sep="\t")

    is_pfam_row = df['Источник предсказания'] == 'Pfam'
    ids_with_pfam = df.loc[is_pfam_row, 'ID белка'].unique()
    has_pfam_hit = df['ID белка'].isin(ids_with_pfam)

    pfam_df = df[has_pfam_hit & is_pfam_row]
    non_pfam_df = df[~has_pfam_hit]

    # Output names are derived from the input file name without its directory
    # and without the ".tsv" suffix.
    base_name = tsv_file.rsplit("/", 1)[-1].replace(".tsv", "")
    pfam_file = f"interpro_выдача/{base_name}_pfam.csv"
    non_pfam_file = f"interpro_выдача/{base_name}_not_pfam.csv"

    pfam_df.to_csv(pfam_file, index=False)
    non_pfam_df.to_csv(non_pfam_file, index=False)
    print(f"Обработан файл {tsv_file}: Pfam -> {pfam_file}, Non-Pfam -> {non_pfam_file}")


Обработан файл interpro_выдача/not_annot_targetCDS_ids_part1.tsv: Pfam -> interpro_выдача/not_annot_targetCDS_ids_part1_pfam.csv, Non-Pfam -> interpro_выдача/not_annot_targetCDS_ids_part1_not_pfam.csv
Обработан файл interpro_выдача/not_annot_targetCDS_ids_part2.tsv: Pfam -> interpro_выдача/not_annot_targetCDS_ids_part2_pfam.csv, Non-Pfam -> interpro_выдача/not_annot_targetCDS_ids_part2_not_pfam.csv
Обработан файл interpro_выдача/not_annot_CDS.tsv: Pfam -> interpro_выдача/not_annot_CDS_pfam.csv, Non-Pfam -> interpro_выдача/not_annot_CDS_not_pfam.csv
Обработан файл interpro_выдача/conflict_annot_ids.tsv: Pfam -> interpro_выдача/conflict_annot_ids_pfam.csv, Non-Pfam -> interpro_выдача/conflict_annot_ids_not_pfam.csv


In [154]:
import pandas as pd

# Exploratory cell: show the part-2 Pfam table and its column names, then
# load the first two columns of the ORF names table as [keyword, orf] pairs.
csv_file_interpro = "interpro_выдача/not_annot_targetCDS_ids_part2_pfam.csv"
csv_file = "../ORF_names.csv"

df = pd.read_csv(csv_file_interpro)
print(df, df.columns.tolist(), sep="\n")

keywords_df = pd.read_csv(csv_file, header=None)
keywords_df = keywords_df.fillna('')  # empty cells become empty strings
keywords_list = keywords_df.loc[:, [0, 1]].values.tolist()
results = []

        ID белка                    product                         Хэш-белка  \
0    OQ709194.1|                polyprotein  18d966d090947c45f8538882a1f19b67   
1    OQ709194.1|                polyprotein  18d966d090947c45f8538882a1f19b67   
2    OQ802757.1|  nonstructural polyprotein  c78aeaafb716a38f041aafa6fad40af7   
3    OQ802714.1|  nonstructural polyprotein  e03db3e4bc1779e965cb203a0478205b   
4    MW239205.1|       hypothetical protein  a0968cbf78f8ffe73e82bd81a694e5af   
..           ...                        ...                               ...   
119  MT568535.1|                polyprotein  ee025da0c0f35e0304d2d3af2b58eab4   
120  MT568535.1|                polyprotein  ee025da0c0f35e0304d2d3af2b58eab4   
121  MW347540.1|  nonstructural polyprotein  3dda82de8bf678a74bfade9665c808ac   
122  MW347540.1|  nonstructural polyprotein  3dda82de8bf678a74bfade9665c808ac   
123  MW347540.1|  nonstructural polyprotein  3dda82de8bf678a74bfade9665c808ac   

     Длина белка Источник п

In [176]:
import pandas as pd

# Label every Pfam hit with an ORF by looking for keywords from the ORF names
# table inside the hit's description; hits without a keyword get '-'.
csv_files = [
    "interpro_выдача/not_annot_targetCDS_ids_part1_pfam.csv",
    "interpro_выдача/not_annot_targetCDS_ids_part2_pfam.csv",
    "interpro_выдача/not_annot_CDS_pfam.csv",
    "interpro_выдача/conflict_annot_ids_pfam.csv",
]

csv_file_orf = "../ORF_names.csv"

# First two columns of the ORF table are used as (keyword, orf) pairs.
orf_df = pd.read_csv(csv_file_orf, header=None).fillna('')
orf_list = orf_df[[0, 1]].values.tolist()

all_results = []

for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    for _, row in df.iterrows():
        description = str(row.get('Описание предсказания', '')).lower()

        # Defaults describe a hit for which no keyword matched.
        hit_keyword, hit_orf = '-', '-'
        for keyword, orf in orf_list:
            # Empty keywords and the entries '2', '3', 'S' are never matched.
            if not keyword or keyword in ('2', '3', 'S'):
                continue
            # A keyword matches when any of its whitespace-separated words
            # occurs as a substring of the lower-cased description.
            if any(token in description for token in keyword.lower().split()):
                hit_keyword, hit_orf = keyword, orf
                break  # first matching keyword wins

        all_results.append({
            'ID белка': row['ID белка'],
            'Название семейства/белка': row.get('Название семейства/белка', ''),
            'Описание предсказания': row.get('Описание предсказания', ''),
            'Источник предсказания': row.get('Источник предсказания', ''),
            'Вхождение из orf': hit_keyword,
            'ORF': hit_orf,
            'Start': row.get('Начало', ''),
            'End': row.get('Конец', ''),
            'Source File': csv_file
        })

results_df = pd.DataFrame(all_results)
results_df.to_csv("matches_from_all_files.csv", index=False)

print(f"Обработано {len(csv_files)} файлов, всего записано {len(results_df)} строк с совпадениями и без.")


Обработано 4 файлов, всего записано 256 строк с совпадениями и без.
