In [None]:
# imports and functions
import re
import gdt
import pandas as pd
from Bio import Entrez
from pathlib import Path
from datetime import datetime


# Patterns applied to the GFF3 attributes column.
# RE_ID captures the value of "ID=" up to the next ';'.
RE_ID = re.compile(r"ID=([^;]+)")
# RE_gene captures the value of "gene=" up to the next ';'.
RE_gene = re.compile(r"gene=([^;]+)")
# RE_dbxref captures the GeneID that directly follows "Dbxref=", up to the next ';' or ','.
RE_dbxref = re.compile(r"Dbxref=GeneID:([^;,]+)")
# Looser alternative (GeneID anywhere in the attributes); currently disabled.
# RE_dbxref = re.compile(r"GeneID:([^;,]+)")

# todo talk about the dbxref2


def increment_gdt_file(path):
    """
    Build the path of the next GDT file in a numbered series.

    The stem is split on "_" and its last piece is read as the iteration
    number. A stem ending in "stripped" is treated as the start of a pilot
    series (iteration 0), so it becomes "<name>_pilot_01".

    Example: fungi-ncbi_pilot_03.gdt -> fungi-ncbi_pilot_04.gdt

    Arguments:
        path (Path): Path of the current GDT file.
    Returns:
        tuple: (Path of the incremented file in the same directory, new iteration number).
    Raises:
        ValueError: If the last "_"-separated piece of the stem is not a number.
    """
    pieces = path.stem.split("_")
    if pieces[-1] == "stripped":
        # swap the "stripped" marker for a pilot series starting at 0
        pieces[-1:] = ["pilot", 0]

    try:
        number = int(pieces[-1]) + 1
    except ValueError:
        raise ValueError(
            f"Invalid GDT file name: {path.name}. Expected format: <preferred_name>_##.gdt, where ## is a number."
        )

    pieces[-1] = f"{number:02d}"
    new_name = "_".join(pieces) + path.suffix
    return path.parent / new_name, number


def get_most_recent_gdt(dir_path, prefix="TEMP_"):
    """
    Get the most recent gdt file in the directory.

    Only files whose stem ends in "<prefix><digits>" are considered. Without
    this restriction a short prefix such as "TEMP_" would also pick up files
    of a longer series (e.g. "TEMP_Symbol_01.gdt"), and the next file of the
    short series would be numbered inside the wrong series.

    Arguments:
        dir_path (Path): Directory to search for GDT files.
        prefix (str): Prefix of the GDT files to search for. It will match files like "*<prefix>##.gdt".
    Returns:
        Path: Path to the most recent GDT file. When no file matches, the
            placeholder "<dir_path>/<prefix>00.gdt" is returned (it is not created).
    """
    # stem must finish with the prefix immediately followed by the iteration number
    numbered_stem = re.compile(rf"{re.escape(prefix)}\d+$")
    temp_files = [
        candidate
        for candidate in dir_path.glob(f"*{prefix}*.gdt")
        if numbered_stem.search(candidate.stem)
    ]
    if not temp_files:
        return dir_path / f"{prefix}00.gdt"
    return gdt.gene_dict_impl.natural_sort(temp_files, key=lambda x: x.stem)[-1]


def data_process(
    df_missing,
    AN,
    gene_dict,
    temp_gene_dict,
    organelle_type,
    temp_count,
    log,
    use_NCBI_symbol=False,
    use_gene=False,
    temp_name="temp_desc",
    c_text="ncbi_desc",
    gn_tag="NCBI",
):
    """
    Assign a label to every row of df_missing and record it in gene_dict or temp_gene_dict.

    For each row a lookup key is chosen (the 'gene' field when use_gene is set,
    otherwise the NCBI symbol when use_NCBI_symbol is set, otherwise the NCBI
    description). The row's gene_id is then stored as a GeneDbxref:
      1. in gene_dict, reusing the label of the key, when the key is in gene_dict;
      2. in temp_gene_dict, reusing the label of the key, when the key is in temp_gene_dict;
      3. in temp_gene_dict under a new "<organelle_type>-TEMP-<n>" label otherwise,
         together with a GeneDescription entry for the key itself.

    Args:
        df_missing (pd.DataFrame): Rows to process. The rows are read through the
            attributes gene_id, dbxref, start, attributes, gene, desc, gene_symbol
            and other_aliases.
        AN (str): Annotation source identifier.
        gene_dict (dict): Dictionary to store gene information (updated in place).
        temp_gene_dict (dict): Temporary dictionary for new labels (updated in place).
        organelle_type (str): Type of organelle, MT or PT.
        temp_count (int): Counter for temporary labels.
        log (Logger): Logger instance; its debug and trace methods are used.
        use_NCBI_symbol (bool): Use the NCBI gene symbol as key instead of the description.
        use_gene (bool): Use the 'gene' field as key; takes precedence over use_NCBI_symbol.
        temp_name (str): Name of the temporary dictionary, used in log messages only.
        c_text (str): Text to be used in the '#c' field of the GeneDbxref object.
        gn_tag (str): Source tag stored in newly created GeneDescription entries.
    Returns:
        tuple: Updated gene_dict, temp_gene_dict, and temp_count.
    """

    def _dbxref_entry(row, label, check_desc):
        # All three cases store the same kind of record; only the label differs.
        return gdt.gene_dict_impl.GeneDbxref(
            label=label,
            an_source=AN,
            dbxref=row.dbxref,
            c=f"{c_text}: {check_desc}",
        )

    for row in df_missing.itertuples():
        ncbi_key = row.gene_symbol if use_NCBI_symbol else row.desc
        check_var = row.gene if use_gene else ncbi_key
        if use_NCBI_symbol:
            check_desc = f"{check_var} | ncbi_desc: {row.desc}"
        else:
            check_desc = check_var

        log.debug(
            f"gene_id: {row.gene_id} | dbxref: {row.dbxref} | s: {row.start} | att: {row.attributes}"
        )
        log.trace(f"\t{check_var = } | {use_NCBI_symbol = } | {use_gene = }")
        log.trace(f"\t{row.other_aliases = } | {row.desc = } | {row.gene_symbol = }")

        # Case 1: the key already has a label in gene_dict
        if check_var in gene_dict:
            gene_label = gene_dict[check_var].label
            log.debug(
                f"\t[1st T]Label in gene_dict, L: |{gene_label}| adding: {row.gene_id} #dx {AN}:{row.dbxref} #c {c_text}: {check_desc}"
            )
            gene_dict[row.gene_id] = _dbxref_entry(row, gene_label, check_desc)
            continue

        log.trace(
            f"\t[1st F]Label not found gene_dict | checking {temp_name} | Label: {check_var}"
        )

        # Case 2: the key was given a temporary label earlier
        if check_var in temp_gene_dict:
            gene_label = temp_gene_dict[check_var].label
            log.debug(
                f"\t[2nd T]Label in {temp_name}, L: |{gene_label}| adding: {row.gene_id} #dx {AN}:{row.dbxref} #c {c_text}: {check_desc}"
            )
            temp_gene_dict[row.gene_id] = _dbxref_entry(row, gene_label, check_desc)
            continue

        # Case 3: unknown key, open a new temporary label
        temp_count += 1
        label = f"{organelle_type}-TEMP-{temp_count}"
        log.debug(
            f"\t[2nd F]Label not in {temp_name}, new label |{label}| adding: {row.gene_id} #dx {AN}:{row.dbxref} #c {c_text}: {check_desc}"
        )
        temp_gene_dict[check_var] = gdt.gene_dict_impl.GeneDescription(
            label=label, source=gn_tag, c=None
        )
        temp_gene_dict[row.gene_id] = _dbxref_entry(row, label, check_desc)

    return gene_dict, temp_gene_dict, temp_count

### Setup

In [None]:
# Defines all the global variables used in the script.
# Change these variables to match your local setup.
# The most_recent_gdt_filename variable should be set to the path of the most recent GDT file,
# OR the stripped GDT file used in filter command, if applicable.

DATA_DIR = "/home/brenodupin/matheus/gdt/sandbox/fungi_mt_model2"
most_recent_gdt_filename = "fungi_mt_model_pilot_01.gdt"
global_query_string = gdt.gff3_utils.QS_GENE_TRNA_RRNA
remove_orfs = True
organelle_type = "MT"
gff_suffix = ".gff3"


# NOTE(review): the NCBI e-mail and API key are hard-coded below. Avoid
# committing real credentials; consider loading them from the environment.
Entrez.email = "dupin@alunos.utfpr.edu.br"
Entrez.api_key = "b3abc1ac7ae9ac035af84ec1abf895878d09"
print(f"Chosen feature query string: '{global_query_string}'\n")

# just checking
DATA_DIR = Path(DATA_DIR).resolve()
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"Path {DATA_DIR} is not a directory.")

MISC_DIR = DATA_DIR / "misc"
GDT_DIR = MISC_DIR / "gdt"
GDT_DIR.mkdir(mode=0o777, parents=True, exist_ok=True)

AN_missing_gene_dict = MISC_DIR / "AN_missing_gene_dict.txt"

if not AN_missing_gene_dict.is_file():
    raise FileNotFoundError(
        f"Missing {AN_missing_gene_dict}, did you run geneDict filter?"
    )

# The else branch is only reached when the most_recent_gdt_filename
# assignment above has been removed or commented out.
if "most_recent_gdt_filename" in globals():
    gdt_path = GDT_DIR / most_recent_gdt_filename
    if not gdt_path.is_file():
        print(
            f"Not found {gdt_path.name}, does it exist in misc/gdt?\nGDTs in {GDT_DIR}:"
        )
        # list the available GDT files to help picking the right name
        for gdt_file in sorted(GDT_DIR.glob("*.gdt")):
            print(f" - {gdt_file.name}")
        raise FileNotFoundError(
            f"Most recent GDT file {gdt_path.name} does not exist in {GDT_DIR}."
        )
else:
    print(
        "Warning: 'most_recent_gdt_filename' variable not set.\n\n"
        "If you have a previous GDT file:\n"
        "• Set the most_recent_gdt_filename variable\n"
        "• Re-run this cell\n\n"
        "If you intend to run this without a GDT file, this warning can be ignored."
    )
    # To simplify the code, an execution without most_recent_gdt_filename is
    # basically the same as one with it, but using an empty GDT file.
    gdt_path = GDT_DIR / "pilot_00.gdt"
    gdt.gene_dict_impl.create_empty_gdt(gdt_path)

In [None]:
# Log file for this notebook, stored in the misc directory.
log_file = MISC_DIR / "01_missing_gene_dict.log"

# logger_creater returns a pair; only its second element (the logger) is used.
# The console level is set to DEBUG and the file level to TRACE.
_, log = gdt.logger_setup.logger_creater(
    log_file=log_file, console_level="DEBUG", file_level="TRACE"
)
log.debug("Running from notebook AN_missing_gene_dict")

### TEMP using gff 'gene=' + NCBI description

In [None]:
# Read the ANs, one per line, ignoring blank lines and surrounding whitespace.
with open(AN_missing_gene_dict) as f:
    ANs = [entry for entry in map(str.strip, f) if entry]
print(f"{len(ANs) = }")

In [None]:
# Load the GDT file (even if empty)
gene_dict = gdt.gene_dict_impl.create_gene_dict(gdt_path, max_an_sources=0)
log.info(f"Loaded gene_dict from {gdt_path}")
log.info("Header:")
for header_line in gene_dict["gdt_header"]:
    log.info(f"\t{header_line}")
log.info("GDT Info:")
for info_line in gene_dict["gdt_info"]:
    log.info(f"\t{info_line}")

# start with no temporary labels
temp_gene_dict = {}

In [None]:
temp_gene_dict = {}
temp_count = 0
# (AN, reason) pairs for every AN that could not be fully processed
errors = []

log.info(" ---- [Starting TEMP process] ----")
for AN in ANs:
    log.debug(f"-- [Processing: {AN}] --")

    an_path = DATA_DIR / f"{AN}{gff_suffix}"
    if not an_path.exists():
        log.error(f"Error: {AN} does not exist (an_path: {an_path})")
        errors.append((AN, "File not found"))
        continue

    df = gdt.gff3_utils.load_gff3(an_path, query_string=global_query_string)
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    # getting the gene_id and if it is in the gene_dict
    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    df["gene"] = df["attributes"].str.extract(RE_gene, expand=False)
    df["in_gene_dict"] = df["gene_id"].isin(gene_dict)
    df["has_gene"] = df["gene"].notna()

    # GeneID taken from the Dbxref attribute; NaN when the attribute is absent
    df["dbxref"] = df["attributes"].str.extract(RE_dbxref, expand=False)

    df_missing = df[~df["in_gene_dict"] & ~df["has_gene"]].copy()
    df_gene = df[~df["in_gene_dict"] & df["has_gene"]].copy()

    # process of 'gene='
    if not df_gene.empty:
        log.debug(
            f"Found {len(df_gene)} feature(s) with gene= in {AN}, not in gene_dict."
        )
        # data_process reads these columns; there is no NCBI data for gene= features
        df_gene[["other_aliases", "desc", "gene_symbol"]] = [
            "no_other_aliases",
            "no_description",
            "no_gene_symbol",
        ]
        gene_dict, temp_gene_dict, temp_count = data_process(
            df_gene,
            AN,
            gene_dict,
            temp_gene_dict,
            organelle_type,
            temp_count,
            log,
            use_gene=True,
            temp_name="temp_gene",
            c_text="gff_gene",
            gn_tag="gff_gene",
        )

    if df_missing.empty:
        log.debug(
            "All features are either in gene_dict or have a gene= attribute. Skipping."
        )
        continue

    # A feature without 'Dbxref=GeneID:' yields NaN, which cannot be joined into
    # the query string below; report the AN instead of crashing the whole run.
    if df_missing["dbxref"].isna().any():
        log.error(
            f"Error: {AN} has feature(s) without 'Dbxref=GeneID:' in attributes, cannot query NCBI"
        )
        errors.append((AN, "Missing dbxref"))
        continue

    unique_dbxrefs = df_missing["dbxref"].unique()

    # search NCBI; the request itself is inside the try so that network
    # failures are recorded per AN as well
    try:
        with Entrez.esummary(
            db="gene", id=",".join(unique_dbxrefs)
        ) as search_handle:
            search_results = Entrez.read(search_handle)["DocumentSummarySet"]["DocumentSummary"]  # type: ignore
    except Exception as ex:
        log.error(f"{ex} in Entrez.read for {AN}")
        errors.append((AN, "Entrez.read"))
        continue

    if len(search_results) != len(unique_dbxrefs):
        log.warning(
            f"Number of search results ({len(search_results)}) does not match number of dbxrefs ({len(unique_dbxrefs)}) for {AN}."
        )
        missing_dbxrefs = set(unique_dbxrefs) - {
            x.attributes["uid"] for x in search_results
        }
        log.warning(f"Missing dbxrefs: {missing_dbxrefs}")
        log.warning("The missing dbxrefs will be under the 'no_description' tag.")

    # format the search results into a DataFrame; the explicit columns keep
    # the merge key present even when NCBI returned no summaries at all
    temp_df = pd.DataFrame(
        [
            {
                "dbxref": x.attributes["uid"],
                "other_aliases": x.get("OtherAliases", "no_other_aliases"),
                "desc": x.get("Description", "no_description"),
                "gene_symbol": x.get("Name", "no_gene_symbol"),
            }
            for x in search_results
        ],
        columns=["dbxref", "other_aliases", "desc", "gene_symbol"],
    )

    df_merged = df_missing.merge(temp_df, on="dbxref", how="left", copy=False)

    # in case NCBI did not return any results for some dbxrefs
    df_merged["other_aliases"] = df_merged["other_aliases"].fillna("no_other_aliases")
    df_merged["desc"] = df_merged["desc"].fillna("no_description")
    df_merged["gene_symbol"] = df_merged["gene_symbol"].fillna("no_gene_symbol")

    # process of ncbi description
    gene_dict, temp_gene_dict, temp_count = data_process(
        df_merged, AN, gene_dict, temp_gene_dict, organelle_type, temp_count, log
    )


log.info(" ---- [Finished] ----")
if errors:
    log.warning(f"Errors: {len(errors)}")
    for an, msg in errors:
        log.warning(f"  {an} - {msg}")
    log.warning(
        "Entrez.read errors: This is usually a sporadic event or invalid database references."
    )
    log.warning("Next steps to diagnose:")
    log.warning(
        "1. Manually verify a few dbxrefs from your GFF file by searching them in NCBI"
    )
    log.warning("2. If the dbxrefs are valid in NCBI:")
    log.warning("   - Save the current gene_dict as your latest GDT file")
    log.warning("   - Re-run this section (the issue was likely temporary)")
    log.warning("3. If the dbxrefs are invalid/obsolete:")
    log.warning("   - Option A: Remove said ANs from your dataset")
    log.warning("   - Option B: Manually remove ANs from AN_missing_gene_dict.txt)")
    log.warning("     and add those ANs to AN_missing_dbxref.txt instead")

In [None]:
if temp_gene_dict:
    temp_path = get_most_recent_gdt(GDT_DIR, prefix="TEMP_")
    new_path, nth_iteration = increment_gdt_file(temp_path)
    log.info(f"Writing TEMP GDT file: {new_path} | Iteration: {nth_iteration}")
    temp_gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(temp_gene_dict)
    temp_gene_dict["gdt_header"] = [
        "version 0.0.2",
        f"TEMP - {nth_iteration}",
        f"{datetime.now().strftime('%Y-%m-%d %H:%M')} - Automatically generated GDT file from AN_missing_gene_dict",
    ]
    gdt.gene_dict_impl.write_gdt_file(temp_gene_dict, new_path, overwrite=True)

In [None]:
# saving gene_dict with the new data, dont forget to change the most_recent_gdt_filename variable
new_path, nth_iteration = increment_gdt_file(gdt_path)
log.info(f"Writing gene_dict file: {new_path} | Iteration: {nth_iteration}")
gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(gene_dict)
gene_dict["gdt_header"].append(
    f"{datetime.now().strftime('%Y-%m-%d %H:%M')} - Data added from TEMP {nth_iteration:02}"
)
gdt.gene_dict_impl.write_gdt_file(gene_dict, new_path, overwrite=True)

### TEMP using NCBI Symbol

In [None]:
# Collect the ANs referenced by the '#dx' entries of the seed file.
ANS_Symbol = set()

with open(MISC_DIR / "seed_TEMP_Symbol.txt", "r") as f:
    for line in f:
        line = line.strip()

        # skip blanks, comments, section headers and '#gd' entries
        if not line or line.startswith(("#", "[")) or "#gd" in line:
            continue

        # only '<gene_id> #dx <AN>:<dbxref>' entries carry an AN; any other
        # entry would make the split below fail with an IndexError
        if "#dx" not in line:
            log.warning(f"Skipping line without '#dx' in seed_TEMP_Symbol.txt: {line}")
            continue

        ANS_Symbol.add(line.split("#dx", 1)[1].strip().split(":", 1)[0])

In [None]:
# Load the GDT file (even if empty),
# if you are running this right after the previous step,
# dont forget to change the most_recent_gdt_filename variable
# and re-run the setup

gene_dict = gdt.gene_dict_impl.create_gene_dict(gdt_path, max_an_sources=0)
log.info(f"Loaded gene_dict from {gdt_path}\nHeader:")
for header_line in gene_dict["gdt_header"]:
    log.debug(f"\t{header_line}")
log.debug("\nGDT Info:")
for info_line in gene_dict["gdt_info"]:
    log.debug(f"\t{info_line}")

temp_gene_dict = {}

In [None]:
temp_symbol_gene_dict = {}
temp_count = 0
# (AN, reason) pairs for every AN that could not be fully processed
errors = []
log.info(" ---- [Starting TEMP process] ----")
# ANS_Symbol is a set; sorting keeps the processing order, and therefore the
# numbering of the TEMP labels, the same from one run to the next.
for AN in sorted(ANS_Symbol):
    log.debug(f"-- [Processing: {AN}] --")

    an_path = DATA_DIR / f"{AN}{gff_suffix}"
    if not an_path.exists():
        log.error(f"Error: {AN} does not exist (an_path: {an_path})")
        errors.append((AN, "File not found"))
        continue

    df = gdt.gff3_utils.load_gff3(an_path, query_string=global_query_string)
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    # getting the gene_id and if it is in the gene_dict
    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    df["in_gene_dict"] = df["gene_id"].isin(gene_dict)

    # GeneID taken from the Dbxref attribute; NaN when the attribute is absent
    df["dbxref"] = df["attributes"].str.extract(RE_dbxref, expand=False)

    df_missing = df[~df["in_gene_dict"]].copy()
    if df_missing.empty:
        log.debug("All features are in gene_dict. Skipping.")
        continue

    # A feature without 'Dbxref=GeneID:' yields NaN, which cannot be joined into
    # the query string below; report the AN instead of crashing the whole run.
    if df_missing["dbxref"].isna().any():
        log.error(
            f"Error: {AN} has feature(s) without 'Dbxref=GeneID:' in attributes, cannot query NCBI"
        )
        errors.append((AN, "Missing dbxref"))
        continue

    unique_dbxrefs = df_missing["dbxref"].unique()

    # search NCBI; the request itself is inside the try so that network
    # failures are recorded per AN as well
    try:
        with Entrez.esummary(
            db="gene", id=",".join(unique_dbxrefs)
        ) as search_handle:
            search_results = Entrez.read(search_handle)["DocumentSummarySet"]["DocumentSummary"]  # type: ignore
    except Exception as ex:
        log.error(f"{ex} in Entrez.read for {AN}")
        errors.append((AN, "Entrez.read"))
        continue

    if len(search_results) != len(unique_dbxrefs):
        log.warning(
            f"Number of search results ({len(search_results)}) does not match number of dbxrefs ({len(unique_dbxrefs)}) for {AN}."
        )
        missing_dbxrefs = set(unique_dbxrefs) - {
            x.attributes["uid"] for x in search_results
        }
        log.warning(f"Missing dbxrefs: {missing_dbxrefs}")
        log.warning("The missing dbxrefs will be under the 'no_description' tag.")

    # format the search results into a DataFrame; the explicit columns keep
    # the merge key present even when NCBI returned no summaries at all
    temp_df = pd.DataFrame(
        [
            {
                "dbxref": x.attributes["uid"],
                "other_aliases": x.get("OtherAliases", "no_other_aliases"),
                "desc": x.get("Description", "no_description"),
                "gene_symbol": x.get("Name", "no_gene_symbol"),
            }
            for x in search_results
        ],
        columns=["dbxref", "other_aliases", "desc", "gene_symbol"],
    )

    df_merged = df_missing.merge(temp_df, on="dbxref", how="left", copy=False)

    # in case NCBI did not return any results for some dbxrefs
    df_merged["other_aliases"] = df_merged["other_aliases"].fillna("no_other_aliases")
    df_merged["desc"] = df_merged["desc"].fillna("no_description")
    df_merged["gene_symbol"] = df_merged["gene_symbol"].fillna("no_gene_symbol")

    # process of ncbi gene symbol
    gene_dict, temp_symbol_gene_dict, temp_count = data_process(
        df_merged,
        AN,
        gene_dict,
        temp_symbol_gene_dict,
        organelle_type,
        temp_count,
        log,
        temp_name="temp_symbol",
        use_NCBI_symbol=True,
        c_text="ncbi_symbol",
    )


log.info(" ---- [Finished] ----")
if errors:
    log.warning(f"Errors: {len(errors)}")
    for an, msg in errors:
        log.warning(f"  {an} - {msg}")
    log.warning(
        "Entrez.read errors: This is usually a sporadic event or invalid database references."
    )
    log.warning("Next steps to diagnose:")
    log.warning(
        "1. Manually verify a few dbxrefs from your GFF file by searching them in NCBI"
    )
    log.warning("2. If the dbxrefs are valid in NCBI:")
    log.warning("   - Save the current gene_dict as your latest GDT file")
    log.warning("   - Re-run this section (the issue was likely temporary)")
    log.warning("3. If the dbxrefs are invalid/obsolete:")
    log.warning("   - Option A: Remove said ANs from your dataset")
    log.warning("   - Option B: Manually remove ANs from AN_missing_gene_dict.txt)")
    log.warning("     and add those ANs to AN_missing_dbxref.txt instead")

In [None]:
# Write the labels created in the TEMP Symbol step to the next file of the
# TEMP_Symbol_## series. symbol_iteration is set in both branches because the
# cell that saves gene_dict formats it into the GDT header.
if temp_symbol_gene_dict:
    temp_path = get_most_recent_gdt(GDT_DIR, prefix="TEMP_Symbol_")
    new_path, symbol_iteration = increment_gdt_file(temp_path)
    log.info(
        f"Writing TEMP Symbol GDT file: {new_path} | Iteration: {symbol_iteration}"
    )
    temp_symbol_gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(
        temp_symbol_gene_dict
    )
    temp_symbol_gene_dict["gdt_header"] = [
        "version 0.0.2",
        f"TEMP Symbol - {symbol_iteration}",
        "Automagically generated by AN_missing_gene_dict.ipynb | TEMP Symbol using NCBI gene symbol",
    ]
    gdt.gene_dict_impl.write_gdt_file(temp_symbol_gene_dict, new_path, overwrite=True)
else:
    log.info(
        "No TEMP Symbol GDT file created, meaning no unknown gene symbols were found."
    )
    # nothing was written, so the header of gene_dict will report iteration 00
    symbol_iteration = 0

In [None]:
# saving gene_dict with the new data, dont forget to change the most_recent_gdt_filename variable
# nth_iteration is the iteration of the gene_dict (pilot) file itself;
# symbol_iteration, set by the TEMP Symbol writing cell, goes into the header.
new_path, nth_iteration = increment_gdt_file(gdt_path)
log.info(f"Writing gene_dict file: {new_path} | pilot itr: {nth_iteration}")
gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(gene_dict)
gene_dict["gdt_header"].append(
    f"{datetime.now().strftime('%Y-%m-%d %H:%M')} - Data added from TEMP Symbol {symbol_iteration:02d}"
)
gdt.gene_dict_impl.write_gdt_file(gene_dict, new_path, overwrite=True)

### Genes Discard using dbxref

In [None]:
# Prefix added to the type column (3rd GFF column) of every feature marked for discard.
remove_string = "discard-"
# Name of the file, inside MISC_DIR, that lists the features to discard.
genes_to_remove = "genome_features_to_remove.txt"

In [None]:
# Map each AN to the set of gene IDs that must be marked as discarded in its GFF.
remove_gene_ids = {}
with open(MISC_DIR / genes_to_remove, "r") as f:
    for line in f:
        # strip first so that indented comments / headers are recognised too
        line = line.strip()
        if not line or line.startswith(("#", "[")) or "#gd" in line:
            continue

        # drop the optional '#c' comment; what remains is '<gene_id> #dx <AN>:<dbxref>'
        entry = line.split("#c", 1)[0]
        if "#dx" not in entry:
            # without '#dx' there is no AN to attach the gene ID to
            log.warning(f"Skipping line without '#dx' in {genes_to_remove}: {line}")
            continue

        gene_id, an = entry.split("#dx", 1)
        gene_id = gene_id.strip()
        an = an.split(":", 1)[0].strip()

        remove_gene_ids.setdefault(an, set()).add(gene_id)

In [None]:
# Mark the listed features as discarded by prefixing their GFF type column
# with remove_string. Each affected GFF file is rewritten in place.
total_gene_ids = sum(len(ids) for ids in remove_gene_ids.values())
log.info(
    f"Removing {total_gene_ids} gene IDs from {len(remove_gene_ids)} GFF files."
)
for an, gene_ids in remove_gene_ids.items():
    log.trace(f"Processing {an} for removal of gene IDs {gene_ids}")
    an_path = DATA_DIR / f"{an}{gff_suffix}"
    with open(an_path, "r") as f:
        lines = f.readlines()

    # the leading run of '#' lines is the header (bounded, so a file made only
    # of header lines does not run past the end of the list)
    headers, index = [], 0
    while index < len(lines) and lines[index].startswith("#"):
        headers.append(lines[index].strip())
        index += 1

    # IDs are escaped because they may contain regex metacharacters; the ID
    # may be followed by ';' or be the last attribute of the column
    pattern = re.compile(
        "ID=(?:" + "|".join(re.escape(x) for x in gene_ids) + ")(?:;|$)"
    )
    log.trace(f"Pattern for removal: {pattern.pattern}")
    contents = []

    for line in lines[index:]:
        if not (line := line.strip()):
            continue
        fields = line.split("\t")

        # directives / comments after the header and rows without the nine
        # GFF columns are copied through untouched
        if line.startswith("#") or len(fields) < 9:
            contents.append(line)
            continue

        # fields[2] is the feature type, fields[8] is the attributes column
        if pattern.search(fields[8]) and remove_string not in fields[2]:
            fields[2] = remove_string + fields[2]

        contents.append("\t".join(fields))

    with open(an_path, "w") as f:
        f.write("\n".join(headers))
        f.write("\n")
        f.write("\n".join(contents))
        f.write("\n\n")

log.info(f"Finished removing gene IDs from {len(remove_gene_ids)} GFF files.")