In [1]:
# imports and functions
import re
import gdt
import pandas as pd

from pathlib import Path
from datetime import datetime
from collections import defaultdict

RE_ID = re.compile(r"ID=([^;]+)")
RE_parent = re.compile(r"Parent=([^;]+)")

# Regex patterns for extracting GFF attributes
RE_gene = re.compile(r"gene=([^;]+)")
RE_product = re.compile(r"product=([^;]+)")
RE_description = re.compile(r"description=([^;]+)")
RE_name = re.compile(r"Name=([^;]+)")
RE_note = re.compile(r"Note=([^;]+)")
RE_gene_synonym = re.compile(r"gene_synonym=([^;]+)")

# Features to extract from GFF files (must match the regex names above without RE_ prefix)
# ORDER MATTERS: Listed from most descriptive/best for identification to least descriptive
# This order determines the priority for filling the feature_name column - earlier entries take precedence
features_name = ["gene", "product", "description", "name", "note", "gene_synonym"]

# To add a new feature for extraction:
# 1. Create a regex pattern with the naming convention: RE_{feature_name}
#    Use any regex pattern that captures the desired value from GFF files.
#    Common pattern: RE_{feature_name} = re.compile(r"{attribute_name}=([^;]+)")
#    but you can customize the regex to capture exactly what you need.
#
# 2. Add the feature name (without RE_ prefix) to the features_name list
#    IMPORTANT: Place it in the appropriate position based on how descriptive/useful
#    it is for identification. More descriptive features should come first in the list.
#
# Example - to extract 'locus_tag' attributes:
# RE_locus_tag = re.compile(r"locus_tag=([^;]+)")  # or any custom regex
# features_name = ["locus_tag", "gene", "product", "description", "name", "note", "gene_synonym"]  # if locus_tag is most descriptive
# # OR
# features_name = ["gene", "product", "description", "locus_tag", "name", "note", "gene_synonym"]  # if locus_tag is moderately descriptive
# # OR
# features_name = ["gene", "product", "description", "name", "note", "gene_synonym", "locus_tag"] # if locus_tag is least descriptive
#
# CRITICAL: The regex variable name (after RE_) must exactly match the name
# you add to features_name for the extraction to work properly

re_features = {}
for name in features_name:
    try:
        re_features[name] = globals()[f"RE_{name}"]
    except KeyError:
        print(f"Warning: No regex found for '{name}' (expected variable: RE_{name})")


def increment_gdt_file(path):
    """
    Increment the GDT file name by 1.
    Example: fungi-ncbi_pilot_03.gdt -> fungi-ncbi_pilot_04.gdt
    """
    plist = path.stem.split("_")
    if plist[-1] == "stripped":
        plist[-1] = "pilot"
        plist.append(0)

    try:
        number = int(plist[-1]) + 1
        plist[-1] = f"{number:02d}"
    except ValueError:
        raise ValueError(
            f"Invalid GDT file name: {path.name}. Expected format: <preferred_name>_##.gdt, where ## is a number."
        )
    return path.parent / f'{"_".join(plist)}{path.suffix}', number


def get_most_recent_gdt(dir_path, prefix="TEMP_"):
    """
    Get the most recent gdt file in the directory.
    Arguments:
        dir_path (Path): Directory to search for GDT files.
        prefix (str): Prefix of the GDT files to search for. It will match files like "<prefix>*.gdt".
    Returns:
        Path: Path to the most recent GDT file.
    """
    temp_files = list(
        dir_path.glob(f"*{prefix}*.gdt")
    )  # TODO maybe change to check for numbers after prefix?
    if not temp_files:
        return dir_path / f"{prefix}_00.gdt"
    return gdt.gdt_impl.natural_sort(temp_files, key=lambda x: x.stem)[-1]


def time_now():
    """Returns current time in YYYY-MM-DD HH:MM format."""
    return datetime.now().strftime("%Y-%m-%d %H:%M")

In [16]:
# Defines all the global variables used in the script.
# Change these variables to match your local setup.
# The most_recent_gdt_file variable should be set to the path of the most recent GDT file,
# OR the stripped GDT file used in filter command, if applicable.

DATA_DIR = "../example/fungi_mt"
most_recent_gdt_filename = "fungi_mt_pilot_06.gdt"
global_query_string = gdt.gff3_utils.QS_GENE_TRNA_RRNA
remove_orfs = True
gct = "MIT"
gff_suffix = ".gff3"

print(f"Chosen feature query string: '{global_query_string}'")


# just checking
DATA_DIR = Path(DATA_DIR).resolve()
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"Path {DATA_DIR} is not a directory.")

MISC_DIR = DATA_DIR / "misc"
GDT_DIR = MISC_DIR / "gdt"
GDT_DIR.mkdir(511, True, True)  # 511 = 0o777

AN_missing_dbxref_GeneID = MISC_DIR / "AN_missing_dbxref_GeneID.txt"

if not AN_missing_dbxref_GeneID.is_file():
    raise FileNotFoundError(
        f"Missing {AN_missing_dbxref_GeneID}, did you run gdt-cli filter?"
    )

if "most_recent_gdt_filename" in globals():
    gdt_path = GDT_DIR / most_recent_gdt_filename
    if not gdt_path.is_file():
        print(
            f"Not found {gdt_path.name}, does it exist in misc/gdt?\nGDTs in {GDT_DIR}:"
        )
        [print(f" - {f.name}") for f in sorted(GDT_DIR.glob("*.gdt"))]
        raise FileNotFoundError(
            f"Most recent GDT file {gdt_path.name} does not exist in {GDT_DIR}."
        )
else:
    print(
        "Warning: 'most_recent_gdt_filename' variable not set.\n\n"
        "You should have a GDT file from AN_missing_gene_dict.ipynb:\n"
        "• Set the most_recent_gdt_filename variable\n"
        "• Re-run this cell\n\n"
        "If you intend to run this without a GDT file (e.g., because your GFF files "
        "don't have dbxrefs and AN_missing_dbxref_GeneID.ipynb isn't needed), this warning can be ignored."
    )
    # to simplify the code, a exetution without most_recent_gdt_filename is
    # basically the same as with one, but with and empty gdt file
    gdt_path = GDT_DIR / "pilot_00.gdt"
    gdt.gdt_impl.create_empty_gdt(gdt_path)

Chosen feature query string: 'type in ('gene', 'tRNA', 'rRNA')'


In [3]:
log_file = MISC_DIR / "01_missing_dbxref_GeneID.log"

log = gdt.logger_setup.create_simple_logger(
    print_to_console=True,
    console_level="DEBUG",
    save_to_file=True,
    file_level="TRACE",
    log_file=log_file,
)
log.debug("Running from notebook AN_missing_dbxref_GeneID.ipynb")

2025-06-14 15:50:41,223 - DEBUG - Simple log setup complete.
2025-06-14 15:50:41,224 - DEBUG - Console logging level DEBUG
2025-06-14 15:50:41,224 - DEBUG - File logging level TRACE at /home/brenodupin/matheus/gdt/example/fungi_mt/misc/01_missing_dbxref_GeneID.log
2025-06-14 15:50:41,224 - DEBUG - Running from notebook AN_missing_dbxref_GeneID.ipynb


### Deeper investigation using other gff attributes, primarily gene=

In [4]:
with open(AN_missing_dbxref_GeneID, "r") as f:
    ANs = [line.strip() for line in f.readlines() if line.strip()]
log.info(f"Found {len(ANs)} ANs in {AN_missing_dbxref_GeneID}")
log.trace(f"ANs: {ANs}")

2025-06-14 15:50:53,724 - INFO - Found 3 ANs in /home/brenodupin/matheus/gdt/example/fungi_mt/misc/AN_missing_dbxref_GeneID.txt


In [5]:
# Load the GDT file (even if empty)
gene_dict = gdt.gdt_impl.create_gene_dict(gdt_path, max_an_sources=0)
log.info(f"GeneDict loaded from {gdt_path.name}")
log.debug(f"path: {gdt_path}")

log.info("Header:")
for x in gene_dict.header:
    log.info(f"\t{x}")

log.info("GDT Info:")
for x in gene_dict.info:
    log.info(f"\t{x}")

2025-06-14 15:50:55,787 - INFO - GeneDict loaded from fungi_mt_pilot_04.gdt
2025-06-14 15:50:55,788 - DEBUG - path: /home/brenodupin/matheus/gdt/example/fungi_mt/misc/gdt/fungi_mt_pilot_04.gdt
2025-06-14 15:50:55,789 - INFO - Header:
2025-06-14 15:50:55,789 - INFO - 	version 0.0.2
2025-06-14 15:50:55,790 - INFO - 	Fungi_mt
2025-06-14 15:50:55,790 - INFO - 	2025-04-09 17:54 - Conversion from gdt to gdt2
2025-06-14 15:50:55,791 - INFO - 	2025-06-05 18:11 - Stripped GDT version from original GDT file Fungi_mt.gdt
2025-06-14 15:50:55,791 - INFO - 	2025-06-11 20:37 - Data added from TEMP 01
2025-06-14 15:50:55,791 - INFO - 	2025-06-11 20:50 - Data added from TEMP Symbol 01
2025-06-14 15:50:55,792 - INFO - GDT Info:
2025-06-14 15:50:55,793 - INFO - 	Labels: 56
2025-06-14 15:50:55,793 - INFO - 	Total Entries   : 820
2025-06-14 15:50:55,793 - INFO - 	GeneDescriptions: 540
2025-06-14 15:50:55,794 - INFO - 	GeneGenerics    : 0
2025-06-14 15:50:55,795 - INFO - 	DbxrefGeneIDs   : 280


In [6]:
temp_list = []
to_drop = ["source", "type", "start", "end", "score", "strand", "phase", "attributes"]

for AN in ANs:
    an_path = DATA_DIR / f"{AN}{gff_suffix}"
    df = gdt.gff3_utils.load_gff3(
        an_path, query_string=global_query_string, usecols=gdt.GFF3_COLUMNS
    )
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    df = df[~df["gene_id"].isin(gene_dict)]

    # Procedually extract features based on the regex patterns defined
    for name, pattern in re_features.items():
        df[name] = df["attributes"].str.extract(pattern, expand=False)

    if df[features_name].isna().all(axis=1).any():
        log.warning(f"{AN} has row(s) with no identifiable atribute.")
        log.warning(
            "Please modify this script to add a new possible identifiable attribute or just remove the AN from your dataset"
        )
        log.debug(df[df[features_name].isna().all(axis=1)])

    temp_list.extend(df.to_dict("records"))

features_info_df = pd.DataFrame(temp_list)
features_info_df = features_info_df.drop(columns=to_drop, errors="ignore")

drop_cols = [col for col in features_name if features_info_df[col].isna().all()]

# Procedurally fill 'best_feature' based on the order of features_name
features_info_df["best_feature"] = features_info_df[features_name[0]]
for col in features_name[1:]:
    features_info_df["best_feature"] = features_info_df["best_feature"].fillna(
        features_info_df[col]
    )

features_info_df = features_info_df.drop(columns=drop_cols)
features_info_df = features_info_df.sort_values(by="best_feature")
log.debug(f"Features info df, writing to {MISC_DIR / 'features_info.tsv'}")
features_info_df.to_csv(MISC_DIR / "features_info.tsv", sep="\t", index=False)

2025-06-14 15:51:00,524 - DEBUG - Features info df, writing to /home/brenodupin/matheus/gdt/example/fungi_mt/misc/features_info.tsv


In [7]:
add_gdt_compliance = True
comment = "Manual from missing_dbxref_GeneID feature names"

if add_gdt_compliance:
    gdt_str = f' #gd MANUAL{ " #c " + comment if comment else "" }'
else:
    gdt_str = ""

# df with 2 columns, one for feature_names and one for in_gene_dict
new_df = pd.DataFrame({"best_feature": features_info_df["best_feature"].unique()})
new_df["in_gene_dict"] = new_df["best_feature"].isin(gene_dict)

unique_names = new_df[~new_df["in_gene_dict"]]["best_feature"]
unique_names = gdt.gdt_impl.natural_sort(unique_names)
with open(MISC_DIR / "feature_names.txt", "w+") as f1:
    for name in unique_names:
        f1.write(f"{name}{gdt_str}\n")

The user must now parse feature_names.txt  

Features that can be easily identifiable must be added to the current  
version of the gdt, and features that need deeper investigation should be  
copied to a new file name 'feature_unks.txt'
  
The script will now try to automatically add the gene_ids with feature names   
that __are not in 'feature_unks.txt'__ to gene_dict.

In [8]:
# Check if the names exist in the gene_dict
features_info_df = pd.read_csv(MISC_DIR / "features_info.tsv", sep="\t")

names_unk = set()
with open(MISC_DIR / "feature_unks.txt", "r") as f1:
    for line in f1:
        line = line.strip()
        if not line:
            continue

        if "#gd" in line:
            line = line.split("#gd")[0].strip()

        names_unk.add(line)

In [13]:
log.info(
    "Verifing that all feature_names.txt values (excluding those in feature_unks.txt) exist in the most_recent_gdt_filename."
)
gene_dict = gdt.gdt_impl.create_gene_dict(gdt_path, max_an_sources=0)
log.info(f"GeneDict loaded from {gdt_path.name}")
log.debug(f"path: {gdt_path}")

log.info("Header:")
for x in gene_dict.header:
    log.info(f"\t{x}")

log.info("GDT Info:")
for x in gene_dict.info:
    log.info(f"\t{x}")

names_not_in_dict = set()
all_names = set(features_info_df["best_feature"].unique()) - names_unk
for name in all_names:
    if name not in gene_dict:
        names_not_in_dict.add(name)

if names_not_in_dict:
    log.debug(
        f"Warning: {len(names_not_in_dict)} name(s) not in most_recent_gdt_filename!"
    )
    log.debug(
        "These names are not in feature_unk, so you marked them as identifiable. Please identify them or add them feature_unk."
    )
    log.debug(
        "It could also be that you forgot to update and/or reload the most_recent_gdt_filename with the changes you made above."
    )
    [log.debug(name) for name in names_not_in_dict]
    raise ValueError(
        f"Error: {len(names_not_in_dict)} names not in most_recent_gdt_filename!"
    )

2025-06-14 16:00:19,924 - INFO - Verifing that all feature_names.txt values (excluding those in feature_unks.txt) exist in the most_recent_gdt_filename.
2025-06-14 16:00:19,928 - INFO - GeneDict loaded from fungi_mt_pilot_05.gdt
2025-06-14 16:00:19,930 - DEBUG - path: /home/brenodupin/matheus/gdt/example/fungi_mt/misc/gdt/fungi_mt_pilot_05.gdt
2025-06-14 16:00:19,930 - INFO - Header:
2025-06-14 16:00:19,931 - INFO - 	version 0.0.2
2025-06-14 16:00:19,932 - INFO - 	Fungi_mt
2025-06-14 16:00:19,933 - INFO - 	2025-04-09 17:54 - Conversion from gdt to gdt2
2025-06-14 16:00:19,933 - INFO - 	2025-06-05 18:11 - Stripped GDT version from original GDT file Fungi_mt.gdt
2025-06-14 16:00:19,934 - INFO - 	2025-06-11 20:37 - Data added from TEMP 01
2025-06-14 16:00:19,934 - INFO - 	2025-06-11 20:50 - Data added from TEMP Symbol 01
2025-06-14 16:00:19,935 - INFO - GDT Info:
2025-06-14 16:00:19,935 - INFO - 	Labels: 56
2025-06-14 16:00:19,936 - INFO - 	Total Entries   : 825
2025-06-14 16:00:19,937 - 

In [14]:
comment = "automated insertion from missing_dbxref_GeneID best feature"
unique_gene_ids = features_info_df[~features_info_df["best_feature"].isin(names_unk)][
    "gene_id"
].unique()

log.info(
    " ---- [Starting 'Automated insertion of gene_ids with known features from features_info.tsv'] ----"
)
for gene_id in unique_gene_ids:
    df = features_info_df[features_info_df["gene_id"] == gene_id]

    # sanity check, are all feature_names the same?
    if df["best_feature"].nunique() != 1:
        log.warning(
            f"{gene_id} has multiple best_features: {df['best_feature'].unique()}"
        )
        log.debug("\tChecking if they have the same label in gene_dict...")

        labels = {gene_dict[feat].label for feat in df["best_feature"].unique()}
        if len(labels) != 1:
            log.error(f"\tError: {gene_id} has multiple labels: {labels}")
            raise ValueError(
                f"Error: {gene_id} has multiple labels: {labels}. "
                "Please edit features_info.tsv to resolve this issue."
            )
        else:
            log.debug(f"\tAll best_features have the same label: {labels.pop()}")

    label = gene_dict[df["best_feature"].iloc[0]].label
    an_sources = df["seqid"].unique().tolist()
    log.debug(
        f"Adding {gene_id} with label '{label}', an_sources: {an_sources}, comment: {comment}"
    )
    gene_dict[gene_id] = gdt.gdt_impl.GeneGeneric(
        label=label, an_sources=an_sources, c=comment
    )
log.info(
    " ---- [Finished 'Automated insertion of gene_ids with known features from features_info.tsv'] ----"
)

2025-06-14 16:00:25,761 - INFO -  ---- [Starting 'Automated insertion of gene_ids with known features from features_info.tsv'] ----
2025-06-14 16:00:25,765 - DEBUG - Adding gene-atp6 with label 'MIT--ATP6', an_sources: ['HE983611.1'], comment: automated insertion from missing_dbxref_GeneID best feature
2025-06-14 16:00:25,766 - DEBUG - Adding gene-KX657746.1:43686..44465 with label 'MIT--ATP6', an_sources: ['KX657746.1'], comment: automated insertion from missing_dbxref_GeneID best feature
2025-06-14 16:00:25,770 - DEBUG - Adding gene-atp8 with label 'MIT--ATP8', an_sources: ['HE983611.1'], comment: automated insertion from missing_dbxref_GeneID best feature
2025-06-14 16:00:25,771 - DEBUG - Adding gene-KX657746.1:43243..43389 with label 'MIT--ATP8', an_sources: ['KX657746.1'], comment: automated insertion from missing_dbxref_GeneID best feature
2025-06-14 16:00:25,772 - DEBUG - Adding gene-KX657746.1:60840..61070 with label 'MIT--ATP9', an_sources: ['KX657746.1'], comment: automated i

In [15]:
new_path, nth_iteration = increment_gdt_file(gdt_path)
log.info(f"Writing gene_dict file: {new_path} | Iteration: {nth_iteration}")
gene_dict.info = gdt.gdt_impl.get_gene_dict_info(gene_dict)
gene_dict.header.append(
    f"{time_now()} - Data added from 'Automated insertion of gene_ids with known features from features_info.tsv'"
)
gdt.gdt_impl.write_gdt_file(gene_dict, new_path, overwrite=True)
log.info(f"{new_path.name} was created in misc/gdt!")
log.info(
    "You must now add it to most_recent_gdt_filename in the Setup cell, and rerun the cell"
)

2025-06-14 16:00:29,972 - INFO - Writing gene_dict file: /home/brenodupin/matheus/gdt/example/fungi_mt/misc/gdt/fungi_mt_pilot_06.gdt | Iteration: 6
2025-06-14 16:00:29,976 - INFO - fungi_mt_pilot_06.gdt was created in misc/gdt!
2025-06-14 16:00:29,977 - INFO - You must now add it to most_recent_gdt_filename in the Setup cell, and rerun the cell


#### TEMP Mapping

In [None]:
# Make sure the new GDT file is the most recent one in the setup cell!
# Otherwise you will create a new gdt file without the previous changes.

# TODO CRITIAL HERE

In [17]:
gene_dict = gdt.gdt_impl.create_gene_dict(gdt_path, max_an_sources=0)
log.info(f"GeneDict loaded from {gdt_path.name}")
log.debug(f"path: {gdt_path}")

log.info("Header:")
for x in gene_dict.header:
    log.info(f"\t{x}")

log.info("GDT Info:")
for x in gene_dict.info:
    log.info(f"\t{x}")

2025-06-14 16:02:32,393 - INFO - GeneDict loaded from fungi_mt_pilot_06.gdt
2025-06-14 16:02:32,394 - DEBUG - path: /home/brenodupin/matheus/gdt/example/fungi_mt/misc/gdt/fungi_mt_pilot_06.gdt
2025-06-14 16:02:32,395 - INFO - Header:
2025-06-14 16:02:32,395 - INFO - 	version 0.0.2
2025-06-14 16:02:32,396 - INFO - 	Fungi_mt
2025-06-14 16:02:32,397 - INFO - 	2025-04-09 17:54 - Conversion from gdt to gdt2
2025-06-14 16:02:32,397 - INFO - 	2025-06-05 18:11 - Stripped GDT version from original GDT file Fungi_mt.gdt
2025-06-14 16:02:32,398 - INFO - 	2025-06-11 20:37 - Data added from TEMP 01
2025-06-14 16:02:32,399 - INFO - 	2025-06-11 20:50 - Data added from TEMP Symbol 01
2025-06-14 16:02:32,401 - INFO - 	2025-06-14 16:00 - Data added from 'Automated insertion of gene_ids with known features from features_info.tsv'
2025-06-14 16:02:32,402 - INFO - GDT Info:
2025-06-14 16:02:32,403 - INFO - 	Labels: 56
2025-06-14 16:02:32,404 - INFO - 	Total Entries   : 981
2025-06-14 16:02:32,404 - INFO - 

In [18]:
names_unk = set()
with open(MISC_DIR / "feature_unks.txt", "r") as f1:
    for line in f1:
        line = line.strip()
        if not line:
            continue

        if "#gd" in line:
            line = line.split("#gd")[0].strip()

        names_unk.add(line)

features_info_df = pd.read_csv(MISC_DIR / "features_info.tsv", sep="\t")
features_unk_df = (
    features_info_df[features_info_df["best_feature"].isin(names_unk)]
    .copy()
    .reset_index(drop=True)
)
unk_dict = features_unk_df.groupby("seqid")["gene_id"].agg(list).to_dict()

In [19]:
temp_unk = gdt.gdt_impl.GeneDict()
label_count = 0
change_gene_dict = False
log.info(
    " ---- [Starting 'AN_missing_dbxref_GeneID matching 'child + parent' best feature pair'] ----"
)
for an in unk_dict.keys():
    gene_ids = unk_dict[an]
    log.debug(f"AN: {an}| gene_ids: {gene_ids}")
    an_path = DATA_DIR / f"{an}{gff_suffix}"

    df = gdt.gff3_utils.load_gff3(an_path, usecols=gdt.GFF3_COLUMNS)
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    df["parent"] = df["attributes"].str.extract(RE_parent, expand=False)

    for name, pattern in re_features.items():
        df[name] = df["attributes"].str.extract(pattern, expand=False)

    # Procedurally fill 'best_feature' based on the order of features_name
    df["best_feature"] = df[features_name[0]]
    for col in features_name[1:]:
        df["best_feature"] = df["best_feature"].fillna(df[col])

    for gene_id in gene_ids:
        candidates = df[df["parent"] == gene_id]
        log.debug(f" gene_id: {gene_id} | number of candidates: {len(candidates)}")

        # Handle case where no candidates found
        if len(candidates) == 0:
            log.debug(f" no candidate with parent={gene_id} found")
            log.debug("  adding it to UNKNOWN label\n")
            temp_unk[gene_id] = gdt.gdt_impl.GeneGeneric(
                label=f"{gct}-UNKNOWN",
                an_sources=[an],
                c=f"unknown gene_id from {an}{gff_suffix} | "
                f"a: {df[df['gene_id'] == gene_id]['attributes'].iloc[0] if not df[df['gene_id'] == gene_id].empty else 'N/A'}",
            )
            continue

        # Handle feature name conflicts | TODO check their gene_dict labels
        # IMPORTANT
        if candidates["best_feature"].nunique() > 1:
            log.warning(
                " more than one canditate found, but with best_feature conflict, chosing the first one."
            )
            log.warning(f"  best_features: {candidates['best_feature'].unique()}")

        best_feature = candidates["best_feature"].iloc[0]
        log.debug(f"  chosen best_feature: {best_feature}")
        [
            log.debug(f"\tt: {x.type} | bf: {x.best_feature} | a: {x.attributes}")
            for x in candidates.itertuples()
        ]

        # Handle case where feature_name is in gene_dict
        if best_feature in gene_dict:
            change_gene_dict = True
            label = gene_dict[best_feature].label
            log.debug(f"  best_feature in gene_dict, label: {label}\n")
            gene_dict[gene_id] = gdt.gdt_impl.GeneGeneric(
                label=label,
                an_sources=[an],
                c=f"insertion from missing_dbxref_GeneID feature mapping, source: {best_feature} | type: {candidates['type'].iloc[0]}",
            )
            continue

        # Handle case where feature_name is NOT in gene_dict
        log.debug(f"  best_feature not in gene_dict: {best_feature}")
        log.debug("  checking in temp_unk")

        if best_feature in temp_unk:
            label = temp_unk[best_feature].label
            log.debug(f"  found in temp_unk, label: {label}\n")
            gene_dict[gene_id] = gdt.gdt_impl.GeneGeneric(
                label=label,
                an_sources=[an],
                c=f"insertion from missing_dbxref_GeneID feature mapping, source: {best_feature} | type: {candidates['type'].iloc[0]}",
            )
        else:
            label_count += 1
            label = f"{gct}-TEMP-{label_count}"
            log.debug(f"  not found in temp_unk, new label: {label}\n")
            temp_unk[best_feature] = gdt.gdt_impl.GeneDescription(
                label=label, source="MANUAL", c=None
            )

            temp_unk[gene_id] = gdt.gdt_impl.GeneGeneric(
                label=label,
                an_sources=[an],
                c=f"insertion from missing_dbxref_GeneID feature mapping, source: {best_feature} | type: {candidates['type'].iloc[0]}",
            )
log.info(
    " ---- [Finished 'AN_missing_dbxref_GeneID matching 'child + parent' best feature pair'] ----"
)

2025-06-14 16:02:41,395 - INFO -  ---- [Starting 'AN_missing_dbxref_GeneID matching 'child + parent' best feature pair'] ----
2025-06-14 16:02:41,396 - DEBUG - AN: JQ346808.1| gene_ids: ['gene-AFUA_m0010', 'gene-AFUA_m0020', 'gene-AFUA_m0030', 'gene-AFUA_m0040', 'gene-AFUA_m0050', 'gene-AFUA_m0060', 'gene-AFUA_m0070', 'gene-AFUA_m0080', 'gene-AFUA_m0090', 'gene-AFUA_m0100', 'gene-AFUA_m0110', 'gene-AFUA_m0120', 'gene-AFUA_m0130', 'gene-AFUA_m0140', 'gene-AFUA_m0150', 'gene-AFUA_m0160', 'gene-AFUA_m0170', 'gene-AFUA_m0180', 'gene-AFUA_m0190', 'gene-AFUA_m0200', 'gene-AFUA_m0210', 'gene-AFUA_m0220', 'gene-AFUA_m0230', 'gene-AFUA_m0240', 'gene-AFUA_m0250', 'gene-AFUA_m0260', 'gene-AFUA_m0270', 'gene-AFUA_m0280', 'gene-AFUA_m0290', 'gene-AFUA_m0300', 'gene-AFUA_m0310', 'gene-AFUA_m0320', 'gene-AFUA_m0330', 'gene-AFUA_m0340', 'gene-AFUA_m0350', 'gene-AFUA_m0360', 'gene-AFUA_m0370', 'gene-AFUA_m0380', 'gene-AFUA_m0390', 'gene-AFUA_m0400', 'gene-AFUA_m0410', 'gene-AFUA_m0420', 'gene-AFUA_m043

In [20]:
if temp_unk:
    temp_path = get_most_recent_gdt(GDT_DIR, prefix="TEMP_Mapping_")
    new_path, map_iteration = increment_gdt_file(temp_path)
    log.info(f"Writing TEMP Mapping GDT file: {new_path} | Iteration: {map_iteration}")
    temp_unk.info = gdt.gdt_impl.get_gene_dict_info(temp_unk)
    temp_unk.header = [
        "version 0.0.2",
        f"TEMP_Mapping - {map_iteration}",
        f"{time_now()} - Automatically generated from 'AN_missing_dbxref_GeneID matching 'child + parent' best feature pair'",
    ]
    gdt.gdt_impl.write_gdt_file(temp_unk, new_path, overwrite=True)

2025-06-14 16:02:46,502 - INFO - Writing TEMP Mapping GDT file: /home/brenodupin/matheus/gdt/example/fungi_mt/misc/gdt/TEMP_Mapping__01.gdt | Iteration: 1


In [21]:
if change_gene_dict:
    new_path, nth_iteration = increment_gdt_file(gdt_path)
    log.info(f"Writing gene_dict file: {new_path} | pilot itr: {nth_iteration}")
    gene_dict.info = gdt.gdt_impl.get_gene_dict_info(gene_dict)
    gene_dict.header.append(
        f"{time_now()} - Data added from 'AN_missing_dbxref_GeneID matching 'child + parent' best feature pair'"
    )
    gdt.gdt_impl.write_gdt_file(gene_dict, new_path)
    log.info(f"{new_path.name} was created in misc/gdt!")
    log.info(
        "You must now add it to most_recent_gdt_filename in the Setup cell, and rerun the cell"
    )

2025-06-14 16:02:48,809 - INFO - Writing gene_dict file: /home/brenodupin/matheus/gdt/example/fungi_mt/misc/gdt/fungi_mt_pilot_07.gdt | pilot itr: 7
2025-06-14 16:02:48,816 - INFO - fungi_mt_pilot_07.gdt was created in misc/gdt!
2025-06-14 16:02:48,817 - INFO - You must now add it to most_recent_gdt_filename in the Setup cell, and rerun the cell


### Genes exclusion of to_remove_2.txt

In [22]:
append_string = "discard-"
genes_to_exclude = "to_exclude_2.txt"

exclude_gene_ids = defaultdict(set)
with open(MISC_DIR / genes_to_exclude, "r") as f:
    for line in f:
        if (
            not line.strip()
            or line.startswith("#")
            or line.startswith("[")
            or "#gd" in line
        ):
            continue

        gene_id, an = line.split("#c", 1)[0].split("#gn", 1)
        exclude_gene_ids[an.strip()].add(gene_id.strip())

In [23]:
log.info(
    " ---- [Starting 'AN_missing_dbxref_GeneID excluding gene IDs from GFF3s'] ----"
)
for an in exclude_gene_ids.keys():
    log.info(f"Processing {an} for excluding {len(exclude_gene_ids[an])} gene IDs")
    log.trace(f" excluding gene IDs: {exclude_gene_ids[an]}")
    an_path = DATA_DIR / f"{an}{gff_suffix}"
    with open(an_path, "r") as f:
        lines = f.readlines()

    headers, index = [], 0
    while lines[index].startswith("#"):
        headers.append(lines[index].strip())
        index += 1

    pattern = re.compile("|".join([f"ID={x};" for x in exclude_gene_ids[an]]))
    log.trace(f"Pattern for exclusion: {pattern.pattern}")
    contents = []

    for line in lines[index:]:
        if not (line := line.strip()):
            continue
        line = line.split("\t")

        # line[2] is type, line[8] is attributes
        if pattern.search(line[8]):
            if append_string not in line[2]:
                line[2] = append_string + line[2]

        contents.append("\t".join(line))

    with open(an_path, "w") as f:
        f.write("\n".join(headers))
        f.write("\n")
        f.write("\n".join(contents))
        f.write("\n\n")

log.info(
    " ---- [Finished 'AN_missing_dbxref_GeneID excluding gene IDs from GFF3s'] ----"
)

2025-06-14 16:06:40,148 - INFO -  ---- [Starting 'AN_missing_dbxref_GeneID excluding gene IDs from GFF3s'] ----
2025-06-14 16:06:40,149 - INFO - Processing JQ346808.1 for excluding 2 gene IDs
2025-06-14 16:06:40,153 - INFO - Processing KX657746.1 for excluding 2 gene IDs
2025-06-14 16:06:40,155 - INFO -  ---- [Finished 'AN_missing_dbxref_GeneID excluding gene IDs from GFF3s'] ----
