# Removal of a GFF file's annotations from a GeneDict

### Imports and functions

In [1]:
import re
from pathlib import Path

import gdt

# Captures the value that follows "ID=" up to (not including) the next ';'.
RE_ID = re.compile(r"ID=([^;]+)")

### Setup

#### A.

In [None]:
# Change all the variables below as needed
# gdict_path: path to the gdict file to be updated
# gff_path: path to the GFF3 file from which to gather gene IDs
# global_query_string: query string to be used when reading the GFF3 file
#                      (it should be the same one used to create the gdict)
# remove_orfs: if True, ORFs will not be considered when gathering gene IDs from the GFF3 file
# remove_uniques: if True, only the gene IDs that we are **sure** only this file uses will be
#                 removed from the gdict; the others will require manual curation.
#                 If False, all gene IDs found in the GFF3 file need to be checked manually.

gdict_path = "/home/brenodupin/matheus/gdt/resources/gdicts/fungi_mit.gdict"
gff_path = "/home/brenodupin/matheus/gdt/sandbox/update_gdicts/fungi_mit/AB568599.1/AB568599.1.gff3"
global_query_string = gdt.QS_GENE_TRNA_RRNA
remove_orfs = True
remove_uniques = False

gdict_path = Path(gdict_path)
gff_path = Path(gff_path)

# Fail fast on either missing input, so a wrong gdict path is reported here
# rather than later when the gdict is read.
for required_file in (gdict_path, gff_path):
    if not required_file.is_file():
        raise FileNotFoundError(f"File not found: {required_file}")

# Output files written by later cells are placed in the current working directory.
BASE_PATH = Path().resolve()
print(f"Working directory: {BASE_PATH}")

#### B.

In [None]:
# Logger configuration -- adjust the settings below as needed.
# To disable logging altogether, set both 'print_to_console' and 'save_to_file' to False;
# the 'log' object must still exist (even as a dummy) because the remaining cells call it.
log_file = BASE_PATH.joinpath("01_missing_gene_dict.log")

log = gdt.create_logger(
    log_file=log_file,
    save_to_file=False,
    file_level="TRACE",
    print_to_console=True,
    console_level="DEBUG",
)
log.info("Running from notebook AN_missing_gene_dict.ipynb")

2025-09-19 14:29:10,135 - DEBUG - Logger initialized.
2025-09-19 14:29:10,135 - DEBUG - Console logging level DEBUG
2025-09-19 14:29:10,136 - INFO - Running from notebook AN_missing_gene_dict.ipynb


### Gathering gene IDs from the GFF file

#### A.

In [8]:
# Read the GDICT file (with lazy_info disabled) and report where it came from
gene_dict = gdt.read_gdict(gdict_path, lazy_info=False)
log.info(f"GeneDict loaded from {gdict_path.name}")
log.debug(f"path: {gdict_path}")

# Echo each header entry on its own tab-indented log line
log.info("Header:")
for header_line in gene_dict.header:
    log.info(f"\t{header_line}")

# Hand the dictionary to gdt.log_info so it can report on it through the same logger
log.info("GDT Info:")
gdt.log_info(log, gene_dict)

2025-09-19 14:29:12,666 - INFO - GeneDict loaded from fungi_mit.gdict
2025-09-19 14:29:12,667 - DEBUG - path: /home/brenodupin/matheus/gdt/resources/gdicts/fungi_mit.gdict
2025-09-19 14:29:12,667 - INFO - Header:
2025-09-19 14:29:12,668 - INFO - 	version 0.0.2
2025-09-19 14:29:12,668 - INFO - 	Fungi_mit
2025-09-19 14:29:12,668 - INFO - 	2025-04-09 17:54 - Conversion from gdt to gdt2
2025-09-19 14:29:12,669 - INFO - 	2025-06-27 17:56 - Stripped GDICT version from original GDICT file
2025-09-19 14:29:12,669 - INFO - 	2025-06-27 17:56 - Labels removed
2025-09-19 14:29:12,670 - INFO - 	2025-06-27 17:56 - Labels renamed
2025-09-19 14:29:12,671 - INFO - 	2025-07-01 17:20 - Data added from 'AN_missing_gene_dict parsing gene= + NCBI Description'
2025-09-19 14:29:12,674 - INFO - 	2025-07-02 10:59 - Labels renamed
2025-09-19 14:29:12,676 - INFO - 	2025-07-02 11:09 - Data added from 'AN_missing_gene_dict parsing NCBI Gene Symbol'
2025-09-19 14:29:12,676 - INFO - 	2025-07-02 12:43 - Data added fro

#### B.

In [9]:
# Load the GFF3 features selected by the query string, keeping only the listed columns.
df = gdt.load_gff3(
    gff_path,
    query_string=global_query_string,
    usecols=("seqid", "type", "start", "end", "attributes"),
)
if remove_orfs:
    df = gdt.filter_orfs(df)

# Without this guard an empty table fails below with an opaque IndexError.
if df.empty:
    raise ValueError(f"No features left to process in {gff_path}")

# NOTE(review): only the first seqid is used; a GFF3 holding several sequences
# would have the IDs of the other sequences treated as non-unique -- confirm.
seqid = df["seqid"].unique()[0]

# Gene ID = value of the ID= attribute; label = its current label in the GeneDict.
df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
df["label"] = df["gene_id"].map(
    lambda x: gene_dict.get(x).label if x in gene_dict else "NOT_FOUND"
)
# regex=False: the seqid is a literal string. As a regex, the '.' in an accession
# such as 'AB568599.1' would match any character and could flag unrelated IDs.
df["unique"] = df["gene_id"].str.contains(seqid, na=False, regex=False)

# IDs that embed the seqid are the ones considered specific to this file.
keys_to_remove = df.loc[df["unique"], "gene_id"].to_list()

log.info(f"Number of features in GFF3: {len(df)}")

2025-09-19 14:29:14,646 - INFO - Number of features in GFF3: 42


#### C.

In [10]:
# Automatic removal step: only runs when 'remove_uniques' is enabled.
if remove_uniques:
    log.info(f"Number of unique features to remove: {len(keys_to_remove)}")
    # pop() with a default, so IDs that are absent from the gdict are skipped silently
    for unique_key in keys_to_remove:
        gene_dict.pop(unique_key, None)

    # Record the operation in the gdict header before writing it out
    header_note = f"{gdt.time_now()} - {len(keys_to_remove)} unique features removed from {gff_path.name}"
    gene_dict.header.append(header_note)

    # The result is written next to the original file, under a suffixed name
    new_path = gdict_path.with_name(f"{gdict_path.stem}_removed_uniques.gdict")
    gene_dict.to_gdict(new_path, overwrite=True)
    log.info(f"{new_path.name} was created in the same directory as the original gdict")
    log.info("You must now change 'gdict_path' to point to the new gdict file")

    # Safeguard: repointing gdict_path here only holds while the kernel stays alive.
    # After a notebook restart the user has to update gdict_path by hand.
    gdict_path = new_path

#### D.

In [None]:
# Features left for manual review: everything, or only the rows not flagged as
# unique when the automatic removal step was enabled.
if remove_uniques:
    rest_of_keys = df[~df["unique"]]
else:
    rest_of_keys = df
rest_of_keys = rest_of_keys[["gene_id", "label"]]

log.info(f"Number of features that's need checking: {len(rest_of_keys)}")
# One line per feature: "<gene_id> #label <label>"
with (BASE_PATH / "keys_to_check.txt").open("w") as f:
    f.writelines(
        f"{gene_id} #label {label}\n"
        for gene_id, label in zip(rest_of_keys["gene_id"], rest_of_keys["label"])
    )

2025-09-19 14:29:20,600 - INFO - Number of features that's need checking: 42


### Manual curation of `keys_to_check.txt`
At this point, you need to manually review the `keys_to_check.txt` file to identify which keys should be removed from the GeneDict. The goal is to remove only the keys that are **unique to this specific GFF file** and not shared with other GFF files.

How to identify keys to remove:

 - Keep keys with standard gene/trna/rrna names: Keys like `gene-nad2`, `gene-tRNA-Trp`, `gene-rrn18` should generally be kept because they represent standard gene nomenclature that might be used across multiple GFF files.
 - Remove keys with file-specific identifiers: Keys containing the sequence ID (like `AB568599.1`) or unique identifiers (like `A8G35_gp044`, `P416_r02`) should be removed as they are specific to this file only.

The curated list of keys to be removed should be saved into a new file `keys_to_remove.txt`, in the same directory as this notebook.

### Keys to remove from GeneDict

#### A.

In [None]:
# The curated list is expected in the working directory
file_path = BASE_PATH / "keys_to_remove.txt"
if not file_path.is_file():
    raise FileNotFoundError(f"File not found: {file_path}")

# Blank lines are skipped; for every other line only the text before the first
# "#label" marker is kept, so lines copied from keys_to_check.txt work as-is.
with open(file_path, "r") as f:
    remove_keys = {
        entry.partition("#label")[0].strip()
        for entry in map(str.strip, f)
        if entry
    }

#### B.

In [None]:
# Drop every manually curated key from the GeneDict; pop() with a default
# means keys that are not present are skipped without raising.
log.info(f"Number of features to remove: {len(remove_keys)}")
for key in remove_keys:
    gene_dict.pop(key, None)

# Write the result next to the gdict currently pointed to by gdict_path.
new_path = gdict_path.parent / f"{gdict_path.stem}_removed_manual.gdict"
# The header must report the curated set handled in this cell (remove_keys),
# not the automatically detected list (keys_to_remove) built earlier.
gene_dict.header.append(
    f"{gdt.time_now()} - {len(remove_keys)} unique features removed from {gff_path.name}"
)
gene_dict.to_gdict(new_path, overwrite=True)
log.info(f"{new_path.name} was created in the same directory as the original gdict")

# Checking conflicts in DbxrefGeneID annotations using 'gff_gene' vs 'ncbi_desc'

### Imports and functions

In [None]:
from pathlib import Path

import gdt

# Absolute path of ../resources/gdicts, taken relative to the current working directory.
# The bare expression on the last line makes the notebook display the value.
base = Path("..", "resources", "gdicts").resolve()
base

In [None]:
# Set this to the gdict file whose DbxrefGeneID entries should be checked.
gdict_path = ""

# An empty placeholder would resolve to the current working directory and be
# reported as a confusing "File not found: <cwd>", so it is rejected explicitly.
if not gdict_path:
    raise FileNotFoundError("gdict_path is empty: set it to the gdict file to check")

gdict_path = Path(gdict_path).resolve()

if not gdict_path.is_file():
    raise FileNotFoundError(f"File not found: {gdict_path}")

In [None]:
# Compare, for every DbxrefGeneID entry whose comment carries both a 'gff_gene:'
# and an 'ncbi_desc:' field, the labels those two names resolve to in the gdict.
gdict = gdt.read_gdict(gdict_path, lazy_info=False)

total = 0  # entries where both names resolve to a label
conflicts = 0  # subset of 'total' where the two labels differ
missing_label_gff3 = 0  # entries whose gff_gene name is not in the gdict
missing_label_ncbi = 0  # entries whose ncbi_desc name is not in the gdict

for key in gdict.keys():
    entry = gdict[key]
    if not isinstance(entry, gdt.DbxrefGeneID):
        continue
    if "gff_gene:" not in entry.c or "ncbi_desc:" not in entry.c:
        continue

    # gff_gene runs up to the next " |" separator; ncbi_desc runs to the end
    # of the comment.
    gff3_gene = entry.c.split("gff_gene:", 1)[1].split(" |")[0].strip()
    ncbi_desc = entry.c.split("ncbi_desc:", 1)[1].strip()

    gff3_label = gdict.get(gff3_gene).label if gff3_gene in gdict else "not_found"
    ncbi_label = gdict.get(ncbi_desc).label if ncbi_desc in gdict else "not_found"

    # Entries with an unresolved name are counted separately and excluded
    # from the conflict statistics.
    if gff3_label == "not_found" or ncbi_label == "not_found":
        if gff3_label == "not_found":
            missing_label_gff3 += 1
        if ncbi_label == "not_found":
            missing_label_ncbi += 1
        continue

    total += 1
    if ncbi_label != gff3_label:
        conflicts += 1
        # Uncomment to list every conflict:
        # print(f"{key} -> gff_gene: [{gff3_gene}] ncbi_desc: [{ncbi_desc}]")
        # print(f"  CONFLICT: label: {entry.label}  gff_label: {gff3_label} | ncbi_label: {ncbi_label}")

print(f"Total with both gff and ncbi: {total}")
if total == 0:
    print("  No entries with both gff_gene and ncbi_desc found.")
else:
    print(f"Conflicts: {conflicts} ({conflicts/total:.2%})")
# Always report the unresolved names: they are what explains a zero total.
print(f"gff3_gene missing label: {missing_label_gff3}")
print(f"ncbi_desc missing label: {missing_label_ncbi}")