### Imports and functions

In [9]:
import concurrent.futures as cf
import os
import re
from collections import defaultdict
from datetime import datetime
from itertools import islice
from pathlib import Path

import pandas as pd
from Bio import Entrez

import gdt

RE_ID = re.compile(r"ID=([^;]+)")
RE_gene = re.compile(r"gene=([^;]+)")
RE_dbxref_GeneID = re.compile(r"Dbxref=.*GeneID:([^;,]+)")


def increment_gdict_file(path):
    """Return the next GDICT path in the numbered series, plus its number.

    The stem is split on "_" and its last piece is treated as the iteration
    counter. A stem ending in "stripped" is treated as iteration 0 of a
    "pilot" series, so its successor is "<name>_pilot_01".

    Example: fungi-ncbi_pilot_03.gdict -> fungi-ncbi_pilot_04.gdict

    Args:
        path (Path): Path of the current GDICT file.

    Returns:
        tuple: (Path of the incremented file in the same directory, new number).

    Raises:
        ValueError: If the last "_"-separated piece of the stem is not a number.
    """
    *head, tail = path.stem.split("_")
    if tail == "stripped":
        # "<name>_stripped" starts a fresh pilot series at 0
        head.append("pilot")
        tail = 0

    try:
        number = int(tail) + 1
    except ValueError:
        raise ValueError(
            f"Invalid GDICT file name: {path.name}. Expected format: <preferred_name>_##.gdict, where ## is a number."
        )

    new_stem = "_".join([*head, f"{number:02d}"])
    return path.parent / f"{new_stem}{path.suffix}", number


def most_recent_gdict(dir_path, prefix):
    """Pick the newest GDICT file for a prefix, by natural sort of the stems.

    Arguments:
        dir_path (Path): Directory to search for GDICT files.
        prefix (str): Prefix of the GDICT files to search for. Files are
            matched with the glob "*<prefix>*.gdict".

    Returns:
        Path: The last match after `gdt.natural_sort` on the file stems, or
        "<dir_path>/<prefix>_00.gdict" when nothing matches (that path is
        only built here, not created).

    """
    # maybe change to check for numbers after prefix?
    candidates = [*dir_path.glob(f"*{prefix}*.gdict")]
    if candidates:
        return gdt.natural_sort(candidates, key=lambda p: p.stem)[-1]
    return dir_path / f"{prefix}_00.gdict"


def data_process(
    df_missing,
    an,
    gene_dict,
    temp_gene_dict,
    gct,
    temp_count,
    log,
    use_col="desc",
    temp_name="temp_desc",
    c_text="ncbi_desc",
    gn_tag="NCBI",
):
    """Assign every row of df_missing to a label, in gene_dict or temp_gene_dict.

    For each row the value of column `use_col` is the lookup key:
      1. key already in gene_dict      -> the row's gene_id is added to gene_dict
         under that key's label;
      2. key already in temp_gene_dict -> the row's gene_id is added to
         temp_gene_dict under that key's label;
      3. otherwise                     -> a new "<gct>-TEMP-<n>" label is created
         in temp_gene_dict for the key, and the gene_id is added under it.

    Args:
        df_missing (pd.DataFrame): DataFrame containing missing gene information.
            Rows are read through the attributes gene_id, dbxref, attributes,
            desc, gene, gene_symbol and other_aliases.
        an (str): Annotation source identifier.
        gene_dict (GeneDict): Dictionary to store gene information.
        temp_gene_dict (GeneDict): Temporary dictionary for gene descriptions.
        gct (str): Genetic compartment, MIT or PLT (for now).
        temp_count (int): Counter for temporary labels.
        log (GDTLogger): Logger instance for logging debug information.
        use_col (str): Column to use for checking against gene_dict, defaults to 'desc'.
        temp_name (str): Name for the temporary dictionary (used in log messages).
        c_text (str): Text to be used in the '#c' field of the GeneDbxref object.
        gn_tag (str): Tag for the source of the gene description.

    Returns:
        tuple: Updated gene_dict, temp_gene_dict, and temp_count.

    """
    key_is_ncbi_desc = use_col == "desc"

    for row in df_missing.itertuples():
        check_var = getattr(row, use_col)
        # when the key is not the NCBI description, keep the description in the comment
        extra = "" if key_is_ncbi_desc else f" | ncbi_desc: {row.desc}"

        log.trace(
            f"gene_id: {row.gene_id} | dbxref: {row.dbxref} | att: {row.attributes}"
        )
        log.trace(f"\t{check_var = } | {use_col = }")
        log.trace(
            f"\t {row.desc = } | {row.gene = } | {row.gene_symbol = } | {row.other_aliases = }"
        )

        # shared by every branch: the '#c' comment and the tail of the trace line
        comment = f"{c_text}: {check_var}{extra}"
        adding = f"adding: {row.gene_id} #dx {an}:{row.dbxref} #c {comment}"

        if check_var in gene_dict:
            gene_label = gene_dict[check_var].label
            log.trace(
                f"\t[1st T]check_var found in gene_dict, L: |{gene_label}| {adding}"
            )
            gene_dict[row.gene_id] = gdt.DbxrefGeneID(
                label=gene_label,
                an_source=an,
                GeneID=row.dbxref,
                c=comment,
            )
            continue

        log.trace(
            f"\t[1st F]check_var not found in gene_dict | checking {temp_name} | Label: {check_var}"
        )

        if check_var in temp_gene_dict:
            gene_label = temp_gene_dict[check_var].label
            log.trace(
                f"\t[2nd T]check_var found in {temp_name}, L: |{gene_label}| {adding}"
            )
        else:
            # unseen key: mint a new temporary label and register the key under it
            temp_count += 1
            gene_label = f"{gct}-TEMP-{temp_count}"
            log.trace(
                f"\t[2nd F]check_var not found in {temp_name}, new label |{gene_label}| {adding}"
            )
            temp_gene_dict[check_var] = gdt.GeneDescription(
                label=gene_label,
                source=gn_tag,
                c=None,
            )

        temp_gene_dict[row.gene_id] = gdt.DbxrefGeneID(
            label=gene_label,
            an_source=an,
            GeneID=row.dbxref,
            c=comment,
        )

    return gene_dict, temp_gene_dict, temp_count


def batched(iterable, n):
    """Yield successive tuples of up to n items; only the last may be shorter.

    Raises ValueError (on first iteration, since this is a generator) when n < 1.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            return
        yield chunk


def check_envs(env_path):
    """Check and load environment variables for NCBI Entrez email and API key.

    If the 'dotenv' package is installed, it will load the environment variables
    from the specified .env file. If not, it will only check the environment variables.
    If the email is not set, it raises a ValueError with instructions on how to set it.
    If the API key is not set, it will print a warning about the default request limits.

    Args:
        env_path: Path to the .env file handed to `dotenv.load_dotenv`.

    Raises:
        ValueError: If 'ENTREZ_EMAIL' is not available in the environment.
    """
    try:
        import dotenv

        dotenv.load_dotenv(env_path)

    except ImportError:
        print(
            "Since the 'dotenv' package is not installed, check_envs will only check environment variables."
        )

    email = os.getenv("ENTREZ_EMAIL")
    api_key = os.getenv("ENTREZ_API_KEY")

    if email:
        print(f"Loading email: {email}")
        Entrez.email = email
    else:
        raise ValueError(
            "Entrez email is not set in the environment variables either,\n"
            "so please set it in the Setup cell, or set the environment variable 'ENTREZ_EMAIL' directly,\n"
            "or create an .env file with it, and pass its path to check_envs."
        )

    if api_key:
        print("NCBI API Key found, loading it.")
        Entrez.api_key = api_key
    elif not Entrez.api_key:
        # neither the Setup cell nor the environment provided a key
        print(
            "Entrez API key is not defined, nor set up in the environment variables.\n"
            "This will limit the number of requests per second to NCBI to 3, as per\n"
            "Biopython Entrez guidelines: https://biopython.org/docs/1.76/api/Bio.Entrez.html "
        )

### Setup

#### A.

In [11]:
# Defines all the global variables used in the script.
# Change these variables to match your local setup.
# The newest_gdict_file variable should be set to the file name (inside misc/gdt)
# of the most recent GDICT file, OR of the stripped GDICT file used in the filter
# command, if applicable.
# gct = genetic_compartment

DATA_DIR = "/home/brenodupin/matheus/tigre/sandbox/metazoans_mit"
newest_gdict_file = "metazoans_mit_pilot_01.gdict"
global_query_string = gdt.QS_GENE_TRNA_RRNA
remove_orfs = True
in_folder = True
gct = "MIT"
gff_ext = ".gff3"
gff_suffix = ""
# Leave one core free, but always keep at least one worker.
# os.cpu_count() may return None, hence the `or 1` fallback.
workers = max((os.cpu_count() or 1) - 1, 1)

Entrez.email = ""
Entrez.api_key = ""

print(f"Chosen feature query string: '{global_query_string}'\n")

# just checking
DATA_DIR = Path(DATA_DIR).resolve()
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"Path {DATA_DIR} is not a directory.")

MISC_DIR = DATA_DIR / "misc"
GDT_DIR = MISC_DIR / "gdt"
GDT_DIR.mkdir(mode=0o777, parents=True, exist_ok=True)

AN_missing_gene_dict = MISC_DIR / "AN_missing_gene_dict.txt"

if not AN_missing_gene_dict.is_file():
    raise FileNotFoundError(
        f"Missing {AN_missing_gene_dict}, did you run gdt-cli filter?"
    )

# The globals() check lets the user comment out newest_gdict_file above to
# start from an empty GDICT file.
if "newest_gdict_file" in globals():
    gdict_path = GDT_DIR / newest_gdict_file
    if not gdict_path.is_file():
        print(
            f"Not found {gdict_path.name}, does it exist in misc/gdt?\nGDICTs in {GDT_DIR}:"
        )
        for gdict_file in sorted(GDT_DIR.glob("*.gdict")):
            print(f" - {gdict_file.name}")
        raise FileNotFoundError(
            f"Most recent GDICT file {gdict_path.name} does not exist in {GDT_DIR}."
        )
else:
    print(
        "Warning: 'newest_gdict_file' variable not set.\n\n"
        "If you have a previous GDICT file:\n"
        "• Set the newest_gdict_file variable\n"
        "• Re-run this cell\n\n"
        "If you intend to run this without a GDICT file, this warning can be ignored."
    )
    # to simplify the code, an execution without newest_gdict_file is
    # basically the same as with one, but with an empty gdt file
    gdict_path = GDT_DIR / "pilot_00.gdict"
    gdt.create_empty_gdt(gdict_path)

# NOTE(review): the recorded output of this cell prints ext='' and suffix='.gff3',
# which suggests the builders take (base, ext, suffix) while suffix is passed
# before ext here. Harmless while gff_suffix is "" — confirm against gdt.
if in_folder:
    gff_builder = gdt.GFFPathBuilder().use_folder_builder(
        DATA_DIR,
        gff_suffix,
        gff_ext,
    )
else:
    gff_builder = gdt.GFFPathBuilder().use_standard_builder(
        DATA_DIR,
        gff_suffix,
        gff_ext,
    )
print(f"Using GFF builder: {gff_builder}\n")

if not Entrez.email:
    print("Entrez email not set, checking if it's in a .env or in the environment.")
    check_envs(".env")

print("\nEverything is set up, no problems found.")

Chosen feature query string: 'type in ('gene', 'tRNA', 'rRNA')'

Using GFF builder: GFFPathBuilder(build='folder', base='/home/brenodupin/matheus/tigre/sandbox/metazoans_mit', ext='', suffix='.gff3')

Entrez email not set, checking if it's in a .env or in the environment.
Loading email: dupin@alunos.utfpr.edu.br
NCBI API Key found, loading it.

Everything is set up, no problems found.


#### B.

In [12]:
# Log file for this notebook, stored in the misc/ directory.
log_file = MISC_DIR / "03_missing_gene_dict.log"

# Console is set to level INFO; the file is set to level TRACE, which is the
# level used by the per-row log.trace calls in data_process.
log = gdt.create_simple_logger(
    print_to_console=True,
    console_level="INFO",
    save_to_file=True,
    file_level="TRACE",
    log_file=log_file,
)
log.info("Running from notebook AN_missing_gene_dict.ipynb")

2025-08-18 18:49:11,318 - INFO - Running from notebook AN_missing_gene_dict.ipynb


### TEMP First Pass

#### A.

In [13]:
# Read the accession numbers (one per line), dropping blank lines and
# surrounding whitespace.
with open(AN_missing_gene_dict, "r") as f:
    ANs = list(filter(None, map(str.strip, f)))
log.info(f"Found {len(ANs)} ANs in {AN_missing_gene_dict}")
log.trace(f"ANs: {ANs}")

2025-08-18 18:49:12,847 - INFO - Found 7303 ANs in /home/brenodupin/matheus/tigre/sandbox/metazoans_mit/misc/AN_missing_gene_dict.txt


#### B.

In [14]:
# Load the GDICT file (even if empty)
gene_dict = gdt.read_gdict(gdict_path, lazy_info=False)
log.info(f"GeneDict loaded from {gdict_path.name}")
log.debug(f"path: {gdict_path}")

# Echo the GDICT header lines to the log, one per line.
log.info("Header:")
for x in gene_dict.header:
    log.info(f"\t{x}")

log.info("GDT Info:")
gdt.log_info(log, gene_dict)

# set up the temporary gene dictionary
# temp_count is the running counter that data_process turns into "<gct>-TEMP-<n>" labels.
temp_gene_dict = gdt.GeneDict()
temp_count = 0

2025-08-18 18:49:18,253 - INFO - GeneDict loaded from metazoans_mit_pilot_01.gdict
2025-08-18 18:49:18,254 - INFO - Header:
2025-08-18 18:49:18,254 - INFO - 	version 0.0.2
2025-08-18 18:49:18,255 - INFO - 	Metazoans
2025-08-18 18:49:18,255 - INFO - 	2025-04-11 18:11 - Conversion from gdt to gdt2
2025-08-18 18:49:18,256 - INFO - GDT Info:
2025-08-18 18:49:18,256 - INFO - 	Labels: 818
2025-08-18 18:49:18,257 - INFO - 	Total Entries   : 267844
2025-08-18 18:49:18,257 - INFO - 	GeneDescriptions: 4647
2025-08-18 18:49:18,257 - INFO - 	GeneGenerics    : 2608
2025-08-18 18:49:18,258 - INFO - 	DbxrefGeneIDs   : 260589


#### C.

In [None]:
def get_data(an, gff_path, gene_dict, log, return_df=False):
    """Load the GFF3 of one AN and flag which features are missing from gene_dict.

    The GFF is loaded with the notebook-level `global_query_string` and, when
    `remove_orfs` is set, passed through `gdt.filter_orfs`. These columns are
    then derived from the `attributes` column:
      gene_id      - value of 'ID='
      in_gene_dict - whether gene_id is a key of gene_dict
      gene         - value of 'gene=' (NaN when absent)
      has_gene     - whether 'gene=' was present
      dbxref       - GeneID found in 'Dbxref=' (NaN when absent)

    Args:
        an (str): Accession number, used in log messages only.
        gff_path (Path): Path of the GFF3 file.
        gene_dict (GeneDict): Dictionary checked for membership of each gene_id.
        log (GDTLogger): Logger instance.
        return_df (bool): If True, return the full annotated DataFrame.

    Returns:
        None if gff_path does not exist; the annotated DataFrame if return_df
        is True; otherwise the array of unique dbxref values of the features
        whose gene_id is not in gene_dict.
    """
    if not gff_path.exists():
        log.error(f"Error: {an} does not exist (an_path: {gff_path})")
        return None

    df = gdt.load_gff3(gff_path, query_string=global_query_string)
    if remove_orfs:
        df = gdt.filter_orfs(df)

    attributes = df["attributes"]
    df["gene_id"] = attributes.str.extract(RE_ID, expand=False)
    df["in_gene_dict"] = df["gene_id"].isin(gene_dict)
    df["gene"] = attributes.str.extract(RE_gene, expand=False)
    df["has_gene"] = df["gene"].notna()
    df["dbxref"] = attributes.str.extract(RE_dbxref_GeneID, expand=False)

    if return_df:
        return df

    total = len(df)
    missing = df[~df["in_gene_dict"]]
    log.debug(f"{an}, total dbxrefs {total}, not in gene_dict {len(missing)}")

    query_ids = missing["dbxref"].unique()
    log.trace(f"query_ids: {list(query_ids)}")
    return query_ids


def query_ncbi(batch_queries):
    """Query NCBI Gene (esummary) for the GeneIDs of a batch of ANs in one request.

    Args:
        batch_queries: Iterable of (an, dbxrefs) pairs, where dbxrefs is an
            iterable of GeneID strings.

    Returns:
        tuple: (ans, results, error)
            ans     - list of the ANs of the batch, in input order;
            results - DataFrame with columns dbxref, other_aliases, desc and
                      gene_symbol (one row per returned summary), or None
                      when the query failed;
            error   - False on success, otherwise the caught exception.
    """
    ans = []
    all_dbxref = []
    for an, query in batch_queries:
        ans.append(an)
        all_dbxref.extend(query)

    try:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # query string is deterministic. It is built inside the try so that a
        # non-string id (e.g. NaN from a feature without a GeneID dbxref) is
        # reported through the error slot instead of crashing the worker.
        big_query = ",".join(dict.fromkeys(all_dbxref))
        with Entrez.esummary(db="gene", id=big_query) as search_handle:
            search_results = Entrez.read(search_handle)["DocumentSummarySet"]["DocumentSummary"]  # type: ignore

    except Exception as ex:
        return ans, None, ex

    rows = [
        {
            "dbxref": x.attributes["uid"],
            "other_aliases": x.get("OtherAliases", "no_other_aliases"),
            "desc": x.get("Description", "no_description"),
            "gene_symbol": x.get("Name", "no_gene_symbol"),
        }
        for x in search_results
    ]
    # explicit columns keep the frame mergeable on "dbxref" even with zero rows
    results = pd.DataFrame(
        rows, columns=["dbxref", "other_aliases", "desc", "gene_symbol"]
    )
    return ans, results, False  # no error


def get_dbxref_data(AN, gene_dict, log):
    """Collect the missing dbxrefs of one AN, as a (found, AN, dbxrefs) tuple.

    `found` is False (and dbxrefs None) when get_data returned None, which it
    does when the GFF file of the AN does not exist.
    """
    dbxrefs = get_data(AN, gff_builder.build(AN), gene_dict, log, return_df=False)
    return dbxrefs is not None, AN, dbxrefs

In [None]:
errors = []

log.info(
    " ---- [Starting 'AN_missing_gene_dict parsing gene= + NCBI Description'] ----"
)
# One slot per AN, so the results keep the order of ANs regardless of the
# order in which the futures complete.
an_dbxref = [None] * len(ANs)

with cf.ThreadPoolExecutor(max_workers=workers) as ex:
    # map each future back to the position of its AN
    futures = {
        ex.submit(get_dbxref_data, AN, gene_dict, log): i for i, AN in enumerate(ANs)
    }

    for future in cf.as_completed(futures):
        i = futures[future]
        # get_dbxref_data returns (found, AN, dbxrefs); found is False when
        # the GFF file of the AN does not exist
        found, an, dbxrefs = future.result()
        if not found:
            log.error(f"Skipping {an}: its GFF file was not found.")
            errors.append(
                (an, f"File not found, expected path: {gff_builder.build(an)}")
            )
            continue

        an_dbxref[i] = (an, dbxrefs)

# drop the slots of the ANs that failed
an_dbxref = [entry for entry in an_dbxref if entry is not None]

if errors:
    log.error("Errors found during processing:")
    for an, error in errors:
        log.error(f" - {an}: {error}")
    # raised after the loop so that every error is logged first
    raise ValueError("There were errors during processing, see log for details.")

In [None]:
errors = []

log.info(
    " ---- [Starting 'AN_missing_gene_dict parsing gene= + NCBI Description'] ----"
)

# Phase 1: collect, per AN, the GeneIDs (dbxrefs) of the features that are
# missing from gene_dict.
an_dbxref = []
for AN in ANs:
    an_path = gff_builder.build(AN)
    dbxrefs = get_data(AN, an_path, gene_dict, log, return_df=False)

    if dbxrefs is None:
        errors.append((AN, f"File not found, expected path: {an_path}"))
        continue

    an_dbxref.append((AN, dbxrefs))

if errors:
    log.error("Errors found during processing:")
    for an, error in errors:
        log.error(f" - {an}: {error}")
    # raised after the loop so that every error is logged first
    raise ValueError("There were errors during processing, see log for details.")


# Phase 2: query NCBI once per batch of ANs (in worker processes) and merge
# the answers back into each AN's features in this process.
batch_size = 9
batches = batched(an_dbxref, batch_size)
total_batches = -(-len(an_dbxref) // batch_size)  # ceiling division
log.debug(f"Total batches: {total_batches} (batch size: {batch_size})")

# NOTE(review): 8 processes query NCBI concurrently; confirm this stays within
# the NCBI request-rate limit for the configured email/API key.
with cf.ProcessPoolExecutor(max_workers=8) as ex:
    for ans, result, error in ex.map(query_ncbi, batches):
        log.info(
            f"Processing batch ANs: {ans}, {len(result) if result is not None else 'None'} results"
        )

        if error:
            log.warning(
                "Error in batch result, running this batch stepwise, querying each AN separately."
            )
            log.debug(f"ANs: {ans}")
            log.debug(f"Error: {error}")

        for an in ans:
            log.debug(f"-- [Processing: {an}] --")

            an_path = gff_builder.build(an)
            df = get_data(an, an_path, gene_dict, log, return_df=True)

            df_missing = df[~df["in_gene_dict"]].copy()

            if error:  # trying to recover from batch error, by single querying each AN
                _, result, error_single = query_ncbi(
                    [(an, list(df_missing["dbxref"].unique()))]
                )
                if error_single:
                    log.error(f"Error querying NCBI for {an}: {error_single}")
                    log.error(f"Skipping {an} due to error.")
                    # map(str, ...) so a non-string id cannot break the log line
                    log.trace(
                        f'big query: {",".join(map(str, set(df_missing["dbxref"].unique())))}'
                    )
                    errors.append((an, str(error_single)))
                    continue

            df_merged = df_missing.merge(result, on="dbxref", how="left", copy=False)
            log.debug(
                f"After merging with {len(result)}, NCBI results: {len(df_merged)}."
            )

            if len(df_merged) != len(df_missing):
                log.error(
                    f"There should not be a mismatch in the number of rows: {len(df_merged)} != {len(df_missing)}"
                )
                # record it, so the skipped AN also shows up in the final summary
                errors.append(
                    (
                        an,
                        f"Row mismatch after merge: {len(df_merged)} != {len(df_missing)}",
                    )
                )
                continue

            # in case NCBI did not return any results for some dbxrefs, or no gff gene
            df_merged["other_aliases"] = df_merged["other_aliases"].fillna(
                "no_other_aliases"
            )
            df_merged["desc"] = df_merged["desc"].fillna("no_description")
            df_merged["gene_symbol"] = df_merged["gene_symbol"].fillna("no_gene_symbol")
            df_merged["gene"] = df_merged["gene"].fillna("no_gff_gene")

            # process gene= first
            df_gene = df_merged[df_merged["has_gene"]]
            if not df_gene.empty:
                log.debug(f"Found {len(df_gene)} feature(s) with `gene=`.")

                # `an` (the AN being processed), not the leftover global `AN`
                gene_dict, temp_gene_dict, temp_count = data_process(
                    df_gene,
                    an,
                    gene_dict,
                    temp_gene_dict,
                    gct,
                    temp_count,
                    log,
                    use_col="gene",
                    temp_name="temp_gene",
                    c_text="gff_gene",
                    gn_tag="gff_gene",
                )

            # process of ncbi description second
            df_desc = df_merged[~df_merged["has_gene"]]
            if not df_desc.empty:
                log.debug(f"Found {len(df_desc)} feature(s) with no `gene=`.")

                gene_dict, temp_gene_dict, temp_count = data_process(
                    df_desc,
                    an,
                    gene_dict,
                    temp_gene_dict,
                    gct,
                    temp_count,
                    log,
                )

log.info(
    " ---- [Finished 'AN_missing_gene_dict parsing gene= + NCBI Description'] ----"
)
if errors:
    log.warning(f"Errors: {len(errors)}")
    for an, msg in errors:
        log.warning(f"  {an} - {msg}")

    log.warning(
        "Entrez.read errors: This is usually a sporadic event or invalid database references."
    )
    log.warning("Next steps to diagnose:")
    log.warning(
        "1. Manually verify a few dbxrefs from your GFF file by searching them in NCBI"
    )
    log.warning("2. If the dbxrefs are valid in NCBI:")
    log.warning("   - Save the current gene_dict as your latest GDICT file,")
    log.warning("     and update it at the Setup.")
    log.warning("   - Re-run this section (the issue was likely temporary)")
    log.warning("3. If the dbxrefs are invalid/obsolete:")
    log.warning("   - Option A: Remove said ANs from your dataset")
    log.warning("   - Option B: Manually remove ANs from AN_missing_gene_dict.txt)")
    log.warning("     and add those ANs to AN_missing_dbxref.txt instead.")

2025-08-18 18:49:28,866 - INFO -  ---- [Starting 'AN_missing_gene_dict parsing gene= + NCBI Description'] ----
2025-08-18 19:00:55,719 - INFO - Processing batch ANs: ['NC_000834.1', 'NC_000845.1', 'NC_000857.1', 'NC_000886.1', 'NC_000878.1', 'NC_000934.1', 'NC_001328.1', 'NC_001453.1', 'NC_001325.1'], 216 results
2025-08-18 19:00:56,660 - INFO - Processing batch ANs: ['NC_000891.1', 'NC_001566.1', 'NC_001322.1', 'NC_001643.1', 'NC_001573.1', 'NC_001644.1', 'NC_001640.1', 'NC_001626.1', 'NC_001665.2'], 216 results
2025-08-18 19:00:57,598 - INFO - Processing batch ANs: ['NC_001700.1', 'NC_001788.1', 'NC_001808.1', 'NC_001913.1', 'NC_001821.1', 'NC_002008.4', 'NC_002073.3', 'NC_002012.1', 'NC_002069.2'], 216 results
2025-08-18 19:00:58,560 - INFO - Processing batch ANs: ['NC_001941.1', 'NC_002080.2', 'NC_002083.1', 'NC_002078.1', 'NC_002084.1', 'NC_002355.1', 'NC_002503.2', 'NC_002545.1', 'NC_002504.1'], 217 results
2025-08-18 19:00:59,556 - INFO - Processing batch ANs: ['NC_002631.2', 'N

In [8]:
# Deliberate stop: a bare `raise` with no active exception raises RuntimeError
# (see this cell's output) — presumably to halt a "Run All" at this point.
raise

RuntimeError: No active exception to reraise

In [None]:
errors = []

log.info(
    " ---- [Starting 'AN_missing_gene_dict parsing gene= + NCBI Description'] ----"
)
# Sequential variant: one NCBI esummary request per AN.
for AN in ANs:
    log.debug(f"-- [Processing: {AN}] --")

    an_path = gff_builder.build(AN)
    if not an_path.exists():
        log.error(f"Error: {AN} does not exist (an_path: {an_path})")
        errors.append((AN, f"File not found, expected path: {an_path}"))
        continue

    df = gdt.load_gff3(an_path, query_string=global_query_string)
    df = gdt.filter_orfs(df) if remove_orfs else df

    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    df["in_gene_dict"] = df["gene_id"].isin(gene_dict)
    df["gene"] = df["attributes"].str.extract(RE_gene, expand=False)
    df["has_gene"] = df["gene"].notna()
    df["dbxref"] = df["attributes"].str.extract(RE_dbxref_GeneID, expand=False)

    df_missing = df[~df["in_gene_dict"]].copy()
    log.debug(f"Total: {len(df)}, querying {len(df_missing)}")

    if df_missing.empty:
        log.warning(
            f"An earlier query added all missing gene_ids, making {AN} a 'good to go', with nothing to do. "
            "This breaks the 1 to 1 relationship between #dx entries and features with dbxrefs. "
            "This happens when two features from different ANs, with different dbxref values, have the same 'ID=' "
        )
        log.trace(f"All gene_ids: {df['gene_id'].to_list()}")
        continue

    # A feature whose attributes do not match RE_dbxref_GeneID has NaN here.
    # It cannot be queried and would break the ",".join below, so the AN is
    # reported as an error instead of aborting the whole run.
    if df_missing["dbxref"].isna().any():
        log.error(f"{AN} has feature(s) without a GeneID dbxref, skipping it.")
        errors.append((AN, "Feature(s) without a GeneID dbxref"))
        continue

    unique_dbxrefs = df_missing["dbxref"].unique()
    query_ids = ",".join(unique_dbxrefs)
    log.trace(f"query_ids: {query_ids}")

    try:
        with Entrez.esummary(db="gene", id=query_ids) as search_handle:
            search_results = Entrez.read(search_handle)["DocumentSummarySet"]["DocumentSummary"]  # type: ignore

    except Exception as ex:
        log.error(f"{AN} got the error in try/except: {ex}")
        errors.append((AN, f"Entrez.read: {ex}"))
        continue

    if len(search_results) != len(unique_dbxrefs):
        log.warning(
            f"Number of search results ({len(search_results)}) does not match number of dbxrefs ({len(unique_dbxrefs)}) for {AN}."
        )
        missing_dbxrefs = set(unique_dbxrefs) - set(
            x.attributes["uid"] for x in search_results
        )
        log.trace(f"Missing dbxrefs: {missing_dbxrefs}")
        log.warning("The missing dbxrefs will be under the 'no_description' label.")

    # format the search results into a DataFrame
    # (explicit columns keep it mergeable on "dbxref" even with zero results)
    temp_df = pd.DataFrame(
        [
            {
                "dbxref": x.attributes["uid"],
                "other_aliases": x.get("OtherAliases", "no_other_aliases"),
                "desc": x.get("Description", "no_description"),
                "gene_symbol": x.get("Name", "no_gene_symbol"),
            }
            for x in search_results
        ],
        columns=["dbxref", "other_aliases", "desc", "gene_symbol"],
    )

    df_merged = df_missing.merge(temp_df, on="dbxref", how="left", copy=False)
    log.debug(f"After merging with {len(temp_df)} NCBI results: {len(df_merged)}.")

    # in case NCBI did not return any results for some dbxrefs, or no gff gene
    df_merged["other_aliases"] = df_merged["other_aliases"].fillna("no_other_aliases")
    df_merged["desc"] = df_merged["desc"].fillna("no_description")
    df_merged["gene_symbol"] = df_merged["gene_symbol"].fillna("no_gene_symbol")
    df_merged["gene"] = df_merged["gene"].fillna("no_gff_gene")

    # process gene= first
    df_gene = df_merged[df_merged["has_gene"]]
    if not df_gene.empty:
        log.debug(f"Found {len(df_gene)} feature(s) with `gene=`.")

        gene_dict, temp_gene_dict, temp_count = data_process(
            df_gene,
            AN,
            gene_dict,
            temp_gene_dict,
            gct,
            temp_count,
            log,
            use_col="gene",
            temp_name="temp_gene",
            c_text="gff_gene",
            gn_tag="gff_gene",
        )

    # process of ncbi description second
    df_desc = df_merged[~df_merged["has_gene"]]
    if not df_desc.empty:
        log.debug(f"Found {len(df_desc)} feature(s) with no `gene=`.")

        gene_dict, temp_gene_dict, temp_count = data_process(
            df_desc,
            AN,
            gene_dict,
            temp_gene_dict,
            gct,
            temp_count,
            log,
        )

log.info(
    " ---- [Finished 'AN_missing_gene_dict parsing gene= + NCBI Description'] ----"
)
if errors:
    log.warning(f"Errors: {len(errors)}")
    for an, msg in errors:
        log.warning(f"  {an} - {msg}")

    log.warning(
        "Entrez.read errors: This is usually a sporadic event or invalid database references."
    )
    log.warning("Next steps to diagnose:")
    log.warning(
        "1. Manually verify a few dbxrefs from your GFF file by searching them in NCBI"
    )
    log.warning("2. If the dbxrefs are valid in NCBI:")
    log.warning("   - Save the current gene_dict as your latest GDICT file,")
    log.warning("     and update it at the Setup.")
    log.warning("   - Re-run this section (the issue was likely temporary)")
    log.warning("3. If the dbxrefs are invalid/obsolete:")
    log.warning("   - Option A: Remove said ANs from your dataset")
    log.warning("   - Option B: Manually remove ANs from AN_missing_gene_dict.txt)")
    log.warning("     and add those ANs to AN_missing_dbxref.txt instead.")

#### D.

In [None]:
# Only write a TEMP_Description file when the first pass produced temporary entries.
if temp_gene_dict:
    # Next file name in the TEMP_Description series found in GDT_DIR.
    temp_path = most_recent_gdict(GDT_DIR, prefix="TEMP_Description")
    new_path, nth_iteration = increment_gdict_file(temp_path)
    log.info(f"Writing TEMP_Description to {new_path} | Iteration: {nth_iteration}")
    # Header layout mirrors the one logged when loading a GDICT: version, name, history line.
    temp_gene_dict.header = [
        "version 0.0.2",
        f"TEMP_Description - {nth_iteration}",
        f"{gdt.time_now()} - Automatically generated from 'AN_missing_gene_dict parsing gene= + NCBI Description'",
    ]
    temp_gene_dict.to_gdict(new_path, overwrite=True)

#### E.

In [None]:
# saving gene_dict with the new data, dont forget to change the newest_gdict_file variable
new_path, nth_iteration = increment_gdict_file(gdict_path)
log.info(f"Writing GeneDict to {new_path} | Iteration: {nth_iteration}")
gene_dict.header.append(
    f"{gdt.time_now()} - Data added from 'AN_missing_gene_dict parsing gene= + NCBI Description'"
)
gene_dict.to_gdict(new_path, overwrite=True)
log.info(f"{new_path.name} was created in misc/gdt!")
log.info(
    "You must now add it to newest_gdict_file in the Setup cell, and rerun the cell"
)

In [None]:
# Deliberate stop: a bare `raise` with no active exception raises RuntimeError —
# presumably to keep a "Run All" from continuing into the workaround cells below.
raise

#### workaround

In [None]:
# Load a previously written TEMP_Description file as the "donor" GDICT.
# NOTE: the file name is hard-coded; adjust it to the iteration you want to reuse.
donor_path = Path(GDT_DIR / "TEMP_Description_01.gdict")
donor_gdict = gdt.read_gdict(donor_path, lazy_info=False)
log.info(f"Donor GDICT loaded from {donor_path.name}")
log.debug(f"path: {donor_path}")

# Echo the donor's header lines to the log, one per line.
log.info("Header:")
for x in donor_gdict.header:
    log.info(f"\t{x}")

log.info("GDT Info:")
gdt.log_info(log, donor_gdict)

In [None]:
# NOTE(review): parse_via_comments is defined in gdt. Judging from the cells below,
# it returns the updated recipient (gene_dict side) and the remaining donor — confirm
# its exact semantics in gdt.
recipient, donor = gdt.parse_via_comments(gene_dict, donor_gdict, lazy_info=False)

In [None]:
# Report the donor GeneDict as returned by parse_via_comments.
log.info(f"Donor GeneDict ({donor_path.name}) after parsing")

log.info("Header:")
for x in donor.header:
    log.info(f"\t{x}")

log.info("GDT Info:")
gdt.log_info(log, donor)

In [None]:
# Write the parsed donor as the next file of the TEMP_Description series in GDT_DIR.
temp_path = most_recent_gdict(GDT_DIR, prefix="TEMP_Description")
new_path, nth_iteration = increment_gdict_file(temp_path)
log.info(f"Writing GeneDict to {new_path} | Iteration: {nth_iteration}")

donor.to_gdict(new_path, overwrite=True)
log.info(f"{new_path.name} was created in misc/gdt!")

In [None]:
# Report the recipient GeneDict as returned by parse_via_comments.
log.info("Recipient GeneDict (gene_dict) after parsing")

log.info("Header:")
for x in recipient.header:
    log.info(f"\t{x}")

log.info("GDT Info:")
gdt.log_info(log, recipient)

In [None]:
# saving gene_dict with the new data, dont forget to change the newest_gdict_file variable
new_path, nth_iteration = increment_gdict_file(gdict_path)
log.info(f"Writing GeneDict to {new_path} | Iteration: {nth_iteration}")

recipient.to_gdict(new_path, overwrite=True)
log.info(f"{new_path.name} was created in misc/gdt!")
log.info(
    "You must now add it to newest_gdict_file in the Setup cell, and rerun the cell"
)

### TEMP Second Pass

#### A.

In [None]:
# Collect the ANs referenced by the '#dx <AN>:<GeneID>' entries of seed_TEMP_Symbol.txt.
ANS_Symbol = set()

with open(MISC_DIR / "seed_TEMP_Symbol.txt", "r") as f:
    for line in f:
        line = line.strip()

        # skip blank lines, comment lines, "[...]" lines and '#gd' entries
        if not line or line.startswith(("#", "[")) or "#gd" in line:
            continue

        _, dx_marker, after_dx = line.partition("#dx")
        if not dx_marker:
            # no '#dx' on this line, nothing to extract
            log.warning(f"Check this line: {line}")
            continue

        # the AN is whatever precedes the first ':' after '#dx'
        ANS_Symbol.add(after_dx.strip().partition(":")[0])
log.debug("If you had any warnings, check them and rerun this step.")
log.info(f"Found {len(ANS_Symbol)} ANs in seed_TEMP_Symbol.txt")

#### B.

In [None]:
# Load the GDICT file (even if empty),
# if you are running this right after the previous step,
# dont forget to change the newest_gdict_file variable
# and re-run the setup

gene_dict = gdt.read_gdict(gdict_path, lazy_info=False)
log.info(f"GeneDict loaded from {gdict_path.name}")
log.debug(f"path: {gdict_path}")

# Echo the GDICT header lines to the log, one per line.
log.info("Header:")
for x in gene_dict.header:
    log.info(f"\t{x}")

log.info("GDT Info:")
gdt.log_info(log, gene_dict)

# Fresh temporary dictionary and label counter for the gene-symbol pass.
temp_symbol_gene_dict = gdt.GeneDict()
temp_count = 0

#### C.

In [None]:
# Collects (AN, reason) pairs for every accession that could not be processed.
errors = []

# Columns built from each NCBI document summary. Passed explicitly to the
# DataFrame constructor so the frame keeps its schema (and the merge on
# "dbxref" keeps working) even when NCBI returns no summaries at all.
summary_columns = ["dbxref", "other_aliases", "desc", "gene_symbol"]

log.info(" ---- [Starting 'AN_missing_gene_dict parsing NCBI Gene Symbol'] ----")
for i, AN in enumerate(ANS_Symbol):
    log.debug(f"-- [Processing: {AN}] --")

    an_path = gff_builder.build(AN)
    if not an_path.exists():
        log.error(f"Error: {AN} does not exist (an_path: {an_path})")
        errors.append((AN, "File not found"))
        continue

    df = gdt.load_gff3(an_path, query_string=global_query_string)
    df = gdt.filter_orfs(df) if remove_orfs else df

    # getting the gene_id and if it is in the gene_dict
    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    df["in_gene_dict"] = df["gene_id"].isin(gene_dict)
    df["gene"] = df["attributes"].str.extract(RE_gene, expand=False)
    df["has_gene"] = df["gene"].notna()
    df["dbxref"] = df["attributes"].str.extract(RE_dbxref_GeneID, expand=False)

    df_missing = df[~df["in_gene_dict"]].copy()
    if df_missing.empty:
        log.debug("All features are in gene_dict. Skipping.")
        continue

    # Rows whose attributes carry no GeneID produce NaN here; NaN cannot be
    # joined into the query string, so only real IDs are sent to NCBI. Those
    # rows still go through the left merge below and get the placeholders.
    dbxrefs = df_missing["dbxref"].dropna().unique()
    if len(dbxrefs) == 0:
        log.error(f"No GeneID dbxref found in the missing features of {AN}")
        errors.append((AN, "No GeneID dbxref"))
        continue

    # search NCBI
    # The request itself (esummary) can fail just like the parsing (read),
    # so both live inside the try: one bad AN must not abort the whole run.
    try:
        with Entrez.esummary(db="gene", id=",".join(dbxrefs)) as search_handle:
            search_results = Entrez.read(search_handle)["DocumentSummarySet"]["DocumentSummary"]  # type: ignore
    except Exception as ex:
        log.error(f"{ex} in Entrez.read for {AN}")
        errors.append((AN, "Entrez.read"))
        continue

    if len(search_results) != len(dbxrefs):
        log.warning(
            f"Number of search results ({len(search_results)}) does not match number of dbxrefs ({len(dbxrefs)}) for {AN}."
        )
        missing_dbxrefs = set(dbxrefs) - {x.attributes["uid"] for x in search_results}
        log.warning(f"Missing dbxrefs: {missing_dbxrefs}")
        # This section keys on the gene symbol, whose placeholder (set by the
        # fillna below) is 'no_gene_symbol'.
        log.warning("The missing dbxrefs will be under the 'no_gene_symbol' tag.")

    # format the search results into a DataFrame
    temp_df = pd.DataFrame(
        [
            {
                "dbxref": x.attributes["uid"],
                "other_aliases": x.get("OtherAliases", "no_other_aliases"),
                "desc": x.get("Description", "no_description"),
                "gene_symbol": x.get("Name", "no_gene_symbol"),
            }
            for x in search_results
        ],
        columns=summary_columns,
    )

    df_merged = df_missing.merge(temp_df, on="dbxref", how="left", copy=False)

    # in case NCBI did not return any results for some dbxrefs
    df_merged["other_aliases"] = df_merged["other_aliases"].fillna("no_other_aliases")
    df_merged["desc"] = df_merged["desc"].fillna("no_description")
    df_merged["gene_symbol"] = df_merged["gene_symbol"].fillna("no_gene_symbol")
    df_merged["gene"] = df_merged["gene"].fillna("no_gff_gene")

    # process of ncbi gene symbol
    gene_dict, temp_symbol_gene_dict, temp_count = data_process(
        df_merged,
        AN,
        gene_dict,
        temp_symbol_gene_dict,
        gct,
        temp_count,
        log,
        use_col="gene_symbol",
        temp_name="temp_symbol",
        c_text="ncbi_symbol",
    )


log.info(" ---- [Finished 'AN_missing_gene_dict parsing NCBI Gene Symbol'] ----")
if errors:
    log.warning(f"Errors: {len(errors)}")
    for an, msg in errors:
        log.warning(f"  {an} - {msg}")
    log.warning(
        "Entrez.read errors: This is usually a sporadic event or invalid database references."
    )
    log.warning("Next steps to diagnose:")
    log.warning(
        "1. Manually verify a few dbxrefs from your GFF file by searching them in NCBI"
    )
    log.warning("2. If the dbxrefs are valid in NCBI:")
    log.warning("   - Save the current gene_dict as your latest GDT file")
    log.warning("   - Re-run this section (the issue was likely temporary)")
    log.warning("3. If the dbxrefs are invalid/obsolete:")
    log.warning("   - Option A: Remove said ANs from your dataset")
    log.warning("   - Option B: Manually remove ANs from AN_missing_gene_dict.txt)")
    log.warning("     and add those ANs to AN_missing_dbxref.txt instead")

#### D.

In [None]:
# Write the temporary symbol entries to their own GDICT file, but only when
# the previous cell actually collected some.
if not temp_symbol_gene_dict:
    log.info(
        "No TEMP Symbol GDT file created, meaning no unknown gene symbols were found."
    )
    symbol_iteration = 0
else:
    # Continue the numbering from the latest TEMP_Symbol file found in GDT_DIR.
    temp_path = most_recent_gdict(GDT_DIR, prefix="TEMP_Symbol")
    new_path, symbol_iteration = increment_gdict_file(temp_path)
    log.info(f"Writing TEMP Symbol to {new_path} | Iteration: {symbol_iteration}")

    temp_symbol_gene_dict.header = [
        "version 0.0.2",
        f"TEMP Symbol - {symbol_iteration}",
        f"{gdt.time_now()} - Automatically generated from 'AN_missing_gene_dict parsing NCBI Gene Symbol'",
    ]
    temp_symbol_gene_dict.to_gdict(new_path, overwrite=True)

#### E.

In [None]:
# Save gene_dict, including the data added in this section, under the next
# file name in the sequence. Remember to update the newest_gdict_file
# variable afterwards.
new_path, nth_iteration = increment_gdict_file(gdict_path)
log.info(f"Writing GeneDict to {new_path} | pilot itr: {nth_iteration}")

# Record in the header where this batch of data came from.
header_note = f"{gdt.time_now()} - Data added from 'AN_missing_gene_dict parsing NCBI Gene Symbol'"
gene_dict.header.append(header_note)
gene_dict.to_gdict(new_path, overwrite=True)

log.info(f"{new_path.name} was created in misc/gdt!")
log.info("You must now add it to newest_gdict_file in the Setup cell, and rerun the cell")

### Genes exclusion of to_exclude.txt

#### A.

In [None]:
append_string = "discard-"
genes_to_exclude = "to_exclude.txt"

# Maps each AN to the set of gene IDs listed for it in the exclusion file.
exclude_gene_ids = defaultdict(set)
with open(MISC_DIR / genes_to_exclude, "r") as f:
    for line in f:
        # Blank lines, lines starting with "#" or "[", and any line that
        # contains "#gd" are not exclusion entries for this step.
        is_blank = not line.strip()
        if is_blank or line.startswith(("#", "[")) or "#gd" in line:
            continue

        # Drop everything from "#c" onwards, then split once on "#dx":
        # the left part is the gene ID, the right part starts with the AN
        # (anything after the first ":" is ignored).
        entry = line.split("#c", 1)[0]
        raw_id, marker, raw_an = entry.partition("#dx")
        if not marker:
            log.warning(f"Check this line: {line.strip()}")
            continue

        an = raw_an.split(":", 1)[0].strip()
        exclude_gene_ids[an].add(raw_id.strip())

log.info(f"Found {len(exclude_gene_ids)} ANs in {genes_to_exclude}")

#### B.

In [None]:
# For every AN listed in exclude_gene_ids, rewrite its GFF3 file in place so
# that the type column (column 3) of each listed feature is prefixed with
# append_string. Features that already carry the prefix are left untouched,
# which keeps re-runs of this cell idempotent.
log.info(" ---- [Starting 'AN_missing_gene_dict excluding gene IDs from GFF3s'] ----")
for an, gene_ids in exclude_gene_ids.items():
    log.info(f"Processing {an} for excluding {len(gene_ids)} gene IDs")
    log.trace(f" excluding gene IDs: {gene_ids}")
    an_path = gff_builder.build(an)
    if not an_path.exists():
        # Same handling as the NCBI parsing step: report and move on, so one
        # missing file does not leave the remaining ANs unprocessed.
        log.error(f"Error: {an} does not exist (an_path: {an_path})")
        continue

    with open(an_path, "r") as f:
        lines = f.readlines()

    # Leading "#" lines form the header; the bound check covers files that
    # are empty or consist of comment lines only.
    headers, index = [], 0
    while index < len(lines) and lines[index].startswith("#"):
        headers.append(lines[index].strip())
        index += 1

    # Match "ID=<gene_id>" followed by ";" or the end of the attributes, so
    # an ID that is the last (or only) attribute is also found, in line with
    # how RE_ID extracts gene IDs.
    pattern = re.compile(
        "ID=(?:" + "|".join(re.escape(x) for x in gene_ids) + ")(?:;|$)"
    )
    log.trace(f"Pattern for exclusion: {pattern.pattern}")
    contents = []

    for line in lines[index:]:
        if not (line := line.strip()):
            continue
        fields = line.split("\t")

        # fields[2] is type, fields[8] is attributes. Rows with fewer than
        # nine columns (e.g. "###" directives after the header) have no
        # attributes column and are written back unchanged.
        if (
            len(fields) >= 9
            and pattern.search(fields[8])
            and append_string not in fields[2]
        ):
            fields[2] = append_string + fields[2]

        contents.append("\t".join(fields))

    with open(an_path, "w") as f:
        f.write("\n".join(headers))
        f.write("\n")
        f.write("\n".join(contents))
        f.write("\n\n")

log.info(" ---- [Finished 'AN_missing_gene_dict excluding gene IDs from GFF3s'] ----")