In [8]:
import pandas as pd
import numpy as np
import os
import re

from pathlib import Path
import gdt

from Bio import Entrez

In [9]:
# Iteration number of this run. The validation cell below raises if this is greater than 1
# while most_recent_gdt_file is left empty.
nth_iteration = 1

In [10]:
# Defines all the global variables used in this notebook.
# Change these variables to match your local setup.
# The most_recent_gdt_file variable (assigned in the next cell) should be set to the path of
# the most recent GDT file, OR the stripped GDT file used in the filter command, if applicable.
# Example: most_recent_gdt_file = "../test/Test_group16/Test_group16.gdt"

DATA_DIR = "../test/Test_group16"
AN_missing_dbxref = "../test/Test_group16/AN_missing_dbxref"
remove_orfs = True
organelle_type = "MT"
gff_suffix = ".gff3"

# NCBI credentials come from the environment so that no secret is stored in the notebook.
# Export NCBI_API_KEY (and optionally ENTREZ_EMAIL) before starting the kernel.
# SECURITY: an API key was previously committed in this cell; it should be revoked in the
# NCBI account settings and replaced by a new one.
# When NCBI_API_KEY is unset, api_key stays None, i.e. requests are sent without a key.
Entrez.email = os.environ.get('ENTREZ_EMAIL', 'dupin@alunos.utfpr.edu.br')
Entrez.api_key = os.environ.get('NCBI_API_KEY')

In [11]:
# Path to the most recent (or stripped) GDT file. Leaving it empty makes the notebook start
# from an empty gene_dict, which the validation cell only accepts when nth_iteration is 1.
most_recent_gdt_file = ""

In [12]:
# Resolve the configured paths and fail early when any of them is unusable.
DATA_DIR = Path(DATA_DIR).resolve()
AN_missing_dbxref = Path(AN_missing_dbxref).resolve()

# is_dir()/is_file() already return False for missing paths, so a single call covers both
# "does not exist" and "exists but has the wrong type". The previous
# `not exists() and not is_dir()` form let a wrong-typed path slip through.
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"Data directory {DATA_DIR} does not exist or is not a directory.")

if not AN_missing_dbxref.is_file():
    raise FileNotFoundError(f"AN missing dbxref {AN_missing_dbxref} does not exist or is not a file.")

if most_recent_gdt_file:
    most_recent_gdt_file = Path(most_recent_gdt_file).resolve()
    if not most_recent_gdt_file.is_file():
        raise FileNotFoundError(f"Most recent GDT file {most_recent_gdt_file} does not exist or is not a file.")
elif nth_iteration > 1:
    # Later iterations build on a previous GDT file, so an empty path is an error here.
    raise FileNotFoundError(
        f"most_recent_gdt_file is not set, but it is required when nth_iteration > 1 "
        f"(nth_iteration={nth_iteration})."
    )
else:
    # First iteration: a GDT file is optional, so only remind the user.
    print("If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable.")
    print("Otherwise, ignore this message.")

If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable.
Otherwise, ignore this message.


In [13]:
# Create the notebook logger: DEBUG level on the console, TRACE level in a log file
# stored inside DATA_DIR. The first value returned by logger_creater is not used here.
_, logger = gdt.logger_setup.logger_creater(
    log_file=DATA_DIR / '0_test_2.log',
    console_level="DEBUG",
    file_level="TRACE",
)
logger.debug("Running from notebook AN_missing_dbxref")

2025-05-08 15:57:30,197 - DEBUG - Logger setup complete. Logging to /home/brenodupin/matheus/gdt/test/Test_group16/0_test_2.log
2025-05-08 15:57:30,198 - DEBUG - Running from notebook AN_missing_dbxref


In [15]:
# Read the AN_missing_dbxref file: one entry per line, surrounding whitespace removed,
# blank lines skipped.
with open(AN_missing_dbxref, "r") as f:
    ANs = [entry for entry in map(str.strip, f) if entry]
print(f"len(ANs): {len(ANs)}")

len(ANs): 118


In [16]:
# Output directory for this notebook's report, created inside DATA_DIR if it is not there yet.
missing_dbxref_path = DATA_DIR.joinpath("missing_dbxref")
missing_dbxref_path.mkdir(exist_ok=True)

In [17]:
# Build gene_dict from the GDT file when one was configured; otherwise start empty.
gene_dict = (
    gdt.gene_dict.create_gene_dict(most_recent_gdt_file, max_an_sources=0)
    if most_recent_gdt_file
    else {}
)

temp_gene_dict = {}

In [20]:
# For every AN, load the "gene" rows of its GFF3 file and collect one
# {'gene_id', 'seqid'} record per gene whose ID is not found in gene_dict.
temp_list = []
for AN in ANs:
    an_path = DATA_DIR / f'{AN}{gff_suffix}'
    if not an_path.exists():
        # Missing file: report it and move on to the next AN.
        logger.error(f'Error: {AN} does not exist (an_path: {an_path})')
        continue

    df = gdt.gff3_utils.load_gff3(
        an_path,
        query_string='type == "gene"',
        usecols=['seqid', 'start', 'end', 'type', 'attributes'],
    )
    if remove_orfs:
        df = gdt.gff3_utils.filter_orfs(df)

    # The gene ID is taken from the first ';'-separated attribute, with 'ID=' removed.
    # NOTE(review): this assumes ID is always the first attribute - confirm for all inputs.
    df['gene_id'] = (
        df['attributes']
        .str.split(';')
        .str[0]
        .str.replace('ID=', '', regex=False)
    )
    df['in_gene_dict'] = df['gene_id'].isin(gene_dict)
    df_missing = df.loc[~df['in_gene_dict']].copy()

    temp_list += df_missing[['gene_id', 'seqid']].to_dict('records')

In [21]:
# Map each missing gene_id to the list of seqids it was collected from.
# Passing `columns` keeps both columns present even when temp_list is empty (nothing
# missing, or no GFF3 file found), so the groupby yields an empty Series instead of
# raising KeyError: 'gene_id'.
agg_dbxref = (
    pd.DataFrame(temp_list, columns=['gene_id', 'seqid'])
    .groupby('gene_id')['seqid']
    .agg(list)
    .sort_index()  # Sort by gene_id
)

In [22]:
# Write the report: one line per gene_id, followed by ' || ' and its space-separated seqids.
with (missing_dbxref_path / 'dbxref_problems').open('w+') as f1:
    for gene_id, seqid in agg_dbxref.items():
        f1.write(f'{gene_id} || {" ".join(seqid)}\n')