## Imports and necessary functions

In [23]:
import pandas as pd
import numpy as np
import os
import re

from pathlib import Path
import gdt

from Bio import Entrez
from datetime import datetime


def increment_gdt_file(path):
    """
    Build the path of the next file in a numbered GDT series.

    The stem is split on "_" and its last piece is read as the current
    iteration; the returned name carries that number plus one, zero-padded
    to two digits. A stem whose last piece is "stripped" is treated as
    iteration 0 of the "pilot" series.
    Example: fungi-ncbi_pilot_03.gdt -> fungi-ncbi_pilot_04.gdt
    Example: fungi-ncbi_stripped.gdt -> fungi-ncbi_pilot_01.gdt

    Arguments:
        path (Path): Path of the current GDT file.
    Returns:
        tuple: (Path of the next GDT file, its iteration number as int).
    Raises:
        ValueError: If the last "_"-separated piece of the stem is not a number.
    """
    pieces = path.stem.split("_")
    if pieces[-1] == "stripped":
        # <name>_stripped seeds the series as <name>_pilot_00
        pieces[-1:] = ["pilot", "00"]

    try:
        number = int(pieces[-1]) + 1
    except ValueError:
        raise ValueError(
            f"Invalid GDT file name: {path.name}. Expected format: <preferred_name>_##.gdt, where ## is a number."
        )

    pieces[-1] = f"{number:02d}"
    next_name = "_".join(pieces) + path.suffix
    return path.parent / next_name, number


def get_most_recent_gdt(dir_path, prefix="TEMP_"):
    """
    Get the most recent numbered GDT file of a series in the directory.

    Only files named "<prefix><digits>.gdt" are considered. A plain
    "<prefix>*.gdt" glob would also pick up files of another series whose
    name merely starts with the same text (e.g. prefix "TEMP_" matching
    "TEMP_Symbol_01.gdt"), and the numbering of the wrong series would then
    be continued.

    Arguments:
        dir_path (Path): Directory to search for GDT files.
        prefix (str): Prefix of the GDT files to search for. It will match files like "<prefix>##.gdt".
    Returns:
        Path: Path to the candidate that comes last in the natural sort of
            the stems, or dir_path / "<prefix>00.gdt" when there is no candidate.
    """
    candidates = [
        candidate
        for candidate in dir_path.glob(f"{prefix}*.gdt")
        # keep only names whose remainder after the prefix is a number
        if candidate.stem[len(prefix):].isdecimal()
    ]
    if not candidates:
        return dir_path / f"{prefix}00.gdt"
    return gdt.gene_dict_impl.natural_sort(candidates, key=lambda x: x.stem)[-1]

## Setup

In [None]:
# Defines all the global variables used in the script.
# Change these variables to match your local setup.
# The most_recent_gdt_file variable should be set to the path of the most recent GDT file,
# OR the stripped GDT file used in filter command, if applicable.

DATA_DIR = "/home/brenodupin/matheus/gdt/sandbox/fungi_mt_model"
most_recent_gdt_file = "fungi_mt_model_pilot_01.gdt"
global_query_string = gdt.gff3_utils.QS_GENE_TRNA_RRNA
remove_orfs = True
organelle_type = "MT"
gff_suffix = ".gff3"


Entrez.email = "dupin@alunos.utfpr.edu.br"
# Credentials must not be stored in the notebook: the NCBI API key is read from
# the NCBI_API_KEY environment variable. When it is not set, api_key stays None
# and Entrez requests are sent without a key.
Entrez.api_key = os.environ.get("NCBI_API_KEY")
if Entrez.api_key is None:
    print("NCBI_API_KEY is not set; Entrez requests will be sent without an API key.")
print(f"Chosen feature query string: '{global_query_string}'")

In [20]:
# Validate the configured paths and derive the working directories.
DATA_DIR = Path(DATA_DIR).resolve()

# Validate DATA_DIR before creating anything under it; is_dir() is False both
# when the path is missing and when it exists but is not a directory.
if not DATA_DIR.is_dir():
    raise FileNotFoundError(
        f"Data directory {DATA_DIR} does not exist or is not a directory."
    )

MISC_DIR = DATA_DIR / "misc"
MISC_DIR.mkdir(exist_ok=True)
GDT_dir = MISC_DIR / "gdt"
GDT_dir.mkdir(exist_ok=True)

AN_missing_gene_dict = MISC_DIR / "AN_missing_gene_dict.txt"

if not AN_missing_gene_dict.is_file():
    raise FileNotFoundError(
        f"AN missing gene dictionary {AN_missing_gene_dict} does not exist or is not a file."
    )

# Test the configured value BEFORE joining it with GDT_dir: a Path object is
# always truthy, so the "no GDT file configured" case must be caught here.
if not most_recent_gdt_file:
    print(
        "If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable."
    )
    print("Otherwise, ignore this message.")
else:
    most_recent_gdt_file = (GDT_dir / most_recent_gdt_file).resolve()
    if not most_recent_gdt_file.is_file():
        raise FileNotFoundError(
            f"Most recent GDT file {most_recent_gdt_file} does not exist or is not a file."
        )

In [11]:
# Log file of this notebook, stored in the misc directory.
log_file = MISC_DIR / "01_missing_gene_dict.log"

# logger_creater returns a pair; only its second item is used, as the logger.
# The console level is set to DEBUG and the file level to TRACE.
_, log = gdt.logger_setup.logger_creater(
    log_file=log_file, console_level="DEBUG", file_level="TRACE"
)
log.debug("Running from notebook AN_missing_gene_dict")

2025-06-06 17:02:11,558 - DEBUG - Logger setup complete. Logging to /home/brenodupin/matheus/gdt/sandbox/fungi_mt_model/misc/01_missing_gene_dict.log
2025-06-06 17:02:11,566 - DEBUG - Running from notebook AN_missing_gene_dict


In [12]:
# Patterns applied to the GFF3 attributes column (";"-separated key=value pairs).
RE_ID = re.compile(r"ID=([^;]+)")  # captures the value of ID=
RE_GENE = re.compile(r"gene=([^;]+)")  # captures the value of gene=
# Captures the GeneID entry of the Dbxref attribute. Dbxref holds a
# comma-separated list, so GeneID may be preceded by other entries
# (e.g. "Dbxref=GenBank:YP_1.1,GeneID:42"); the lazy group skips them without
# ever crossing the ";" that ends the attribute.
RE_DBXREF = re.compile(r"Dbxref=(?:[^;,]*,)*?GeneID:([^;,]+)")


def data_process(
    df_missing,
    AN,
    gene_dict,
    temp_gene_dict,
    organelle_type,
    temp_count,
    log,
    use_NCBI_symbol=False,
    use_gene=False,
    temp_name="temp_desc",
    c_text="ncbi_desc",
    gn_tag="NCBI",
):
    """
    Register every row of df_missing in gene_dict or in temp_gene_dict.

    For each row a lookup key is chosen: row.gene when use_gene is set,
    otherwise row.gene_symbol when use_NCBI_symbol is set, otherwise row.desc.
      * key found in gene_dict: row.gene_id is added to gene_dict under the
        label of that key.
      * key found in temp_gene_dict: row.gene_id is added to temp_gene_dict
        under the label of that key.
      * key found in neither: temp_count is incremented, a new label
        "<organelle_type>-TEMP-<temp_count>" is created, and both the key
        and row.gene_id are added to temp_gene_dict under it.
    Both dictionaries are modified in place.

    Args:
        df_missing (pd.DataFrame): Rows to process. The rows are read through
            the attributes gene_id, dbxref, start, attributes, other_aliases,
            desc and gene_symbol (plus gene when use_gene is set).
        AN (str): The accession number, stored as an_source of the new entries.
        gene_dict (dict): Dictionary containing gene information.
        temp_gene_dict (dict): Temporary dictionary for gene information.
        organelle_type (str): Prefix of the temporary labels, e.g. "MT" or "PT".
        temp_count (int): Counter for temporary labels.
        log: Logger object; its debug and trace methods are used.
        use_NCBI_symbol (bool): Use row.gene_symbol instead of row.desc as the
            lookup key. Default is False.
        use_gene (bool): Use row.gene as the lookup key, regardless of
            use_NCBI_symbol. Default is False.
        temp_name (str): Name of the temporary dictionary shown in the log
            messages. Default is 'temp_desc'.
        c_text (str): Text that opens the comment of every added gene_id
            entry. Default is 'ncbi_desc'.
        gn_tag (str): Source stored in the entry created for a new lookup
            key. Default is 'NCBI'.
    Returns:
        tuple: Updated gene_dict, temp_gene_dict, and temp_count.
    """

    def dbxref_entry(label, row, comment):
        # Entry stored under row.gene_id, tying it to `label` and its dbxref.
        return gdt.gene_dict_impl.GeneDbxref(
            label=label, an_source=AN, dbxref=row.dbxref, c=comment
        )

    for row in df_missing.itertuples():
        # Lookup key: NCBI symbol or description, overridden by the gff gene.
        if use_NCBI_symbol:
            check_var = row.gene_symbol
        else:
            check_var = row.desc
        if use_gene:
            check_var = row.gene

        # With the NCBI symbol as key, the description is kept in the comment.
        if use_NCBI_symbol:
            check_desc = f"{check_var} | ncbi_desc: {row.desc}"
        else:
            check_desc = check_var

        log.debug(
            f"gene_id: {row.gene_id} | dbxref: {row.dbxref} | s: {row.start} | att: {row.attributes}"
        )
        log.trace(
            f"\tname: {row.other_aliases} | desc: {row.desc} | gene_symbol: {row.gene_symbol}"
        )
        log.trace(
            f"\tcheck_var: {check_var} | use_symbol: {use_NCBI_symbol} | use_gene: {use_gene}"
        )

        comment = f"{c_text}: {check_desc}"

        # Case 1: the key already has a label in gene_dict.
        if check_var in gene_dict:
            gene_label = gene_dict[check_var].label
            log.debug(
                f"\t[1st T]Label in gene_dict, L: |{gene_label}| adding: {row.gene_id} #dx {AN}:{row.dbxref} #c {c_text}: {check_desc}"
            )
            gene_dict[row.gene_id] = dbxref_entry(gene_label, row, comment)
            continue

        log.trace(
            f"\t[1st F]Label not found gene_dict | checking {temp_name} | Label: {check_var}"
        )

        # Case 2: the key already has a temporary label.
        if check_var in temp_gene_dict:
            gene_label = temp_gene_dict[check_var].label
            log.debug(
                f"\t[2nd T]Label in {temp_name}, L: |{gene_label}| adding: {row.gene_id} #dx {AN}:{row.dbxref} #c {c_text}: {check_desc}"
            )
            temp_gene_dict[row.gene_id] = dbxref_entry(gene_label, row, comment)
            continue

        # Case 3: unknown key, create a new temporary label for it.
        temp_count += 1
        label = f"{organelle_type}-TEMP-{temp_count}"
        log.debug(
            f"\t[2nd F]Label not in {temp_name}, new label |{label}| adding: {row.gene_id} #dx {AN}:{row.dbxref} #c {c_text}: {check_desc}"
        )
        temp_gene_dict[check_var] = gdt.gene_dict_impl.GeneDescription(
            label=label, source=gn_tag, c=None
        )
        temp_gene_dict[row.gene_id] = dbxref_entry(label, row, comment)

    return gene_dict, temp_gene_dict, temp_count

### TEMP using NCBI description

In [13]:
# Collect the accession numbers: one per non-blank line, surrounding whitespace removed.
with open(AN_missing_gene_dict, "r") as f:
    ANs = [entry for entry in map(str.strip, f) if entry]
print(f"len(ANs): {len(ANs)}")

len(ANs): 3


In [14]:
# Load the GDT file, or start from an empty dictionary when none is configured.
if not most_recent_gdt_file:
    gene_dict = {}
    print("No GDT file found, starting with an empty gene_dict.")
else:
    gene_dict = gdt.gene_dict_impl.create_gene_dict(
        most_recent_gdt_file, max_an_sources=0
    )
    print(f"Loaded gene_dict from {most_recent_gdt_file}\nHeader:")
    for header_line in gene_dict["gdt_header"]:
        print(header_line)
    print("\nGDT Info:")
    for info_line in gene_dict["gdt_info"]:
        print(info_line)

# Entries that cannot be resolved against gene_dict are collected here.
temp_gene_dict = {}

Loaded gene_dict from /home/brenodupin/matheus/gdt/sandbox/fungi_mt_model/misc/gdt/fungi_mt_model_stripped.gdt
Header:
version 0.0.2
Fungi_mt
2025-04-09 17:54 - Conversion from gdt to gdt2
2025-06-05 18:11 - Stripped GDT version from original GDT file Fungi_mt.gdt

GDT Info:
Gene dictionary length: 509
Label: 54
GeneDescription: 509
GeneGenerics: 0
GeneDbxref: 0


In [15]:
# Build the TEMP dictionary: for every accession number, the features that are
# not in gene_dict yet are resolved through their gff gene= attribute or,
# failing that, through the NCBI Gene summary of their GeneID.
temp_gene_dict = {}
temp_count = 0
errors = []  # (accession number, reason) pairs, reported after the loop
log.info(" ---- [Starting TEMP process] ----")
for AN in ANs:
    log.debug(f"-- [Processing: {AN}] --")

    an_path = DATA_DIR / f"{AN}{gff_suffix}"
    if not an_path.exists():
        log.error(f"Error: {AN} does not exist (an_path: {an_path})")
        errors.append((AN, "File not found"))
        continue

    df = gdt.gff3_utils.load_gff3(an_path, query_string=global_query_string)
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    # getting the gene_id and if it is in the gene_dict
    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    df["gene"] = df["attributes"].str.extract(RE_GENE, expand=False)
    df["in_gene_dict"] = df["gene_id"].isin(gene_dict)
    df["has_gene"] = df["gene"].notna()

    # GeneID taken from the Dbxref attribute; NaN when RE_DBXREF does not match
    df["dbxref"] = df["attributes"].str.extract(RE_DBXREF, expand=False)

    df_missing = df[~df["in_gene_dict"] & ~df["has_gene"]].copy()
    df_gene = df[~df["in_gene_dict"] & df["has_gene"]].copy()
    # number of features to resolve through NCBI, before the merge below
    n_missing = len(df_missing)

    if not df_gene.empty:
        log.debug(
            f"Found {len(df_gene)} feature(s) with gene= in {AN}, not in gene_dict."
        )
        # data_process reads these columns from every row, so they must exist
        df_gene[["other_aliases", "desc", "gene_symbol"]] = np.nan
        gene_dict, temp_gene_dict, temp_count = data_process(
            df_gene,
            AN,
            gene_dict,
            temp_gene_dict,
            organelle_type,
            temp_count,
            log,
            use_gene=True,
            temp_name="temp_gene",
            c_text="gff_gene",
            gn_tag="gff_gene",
        )

    if df_missing.empty:
        log.debug(
            f"All features are either in gene_dict or have a gene= attribute in {AN}. Skipping."
        )
        continue

    # A feature without GeneID cannot be queried (and NaN would break the
    # join below): record the accession number as an error and move on.
    missing_geneid = df_missing["dbxref"].isna()
    if missing_geneid.any():
        log.error(
            f"Error: {AN} has {int(missing_geneid.sum())} feature(s) without a GeneID Dbxref: "
            f"{df_missing.loc[missing_geneid, 'gene_id'].tolist()}"
        )
        errors.append((AN, "Missing GeneID Dbxref"))
        continue

    # search NCBI; the request itself is inside the try so that a failed
    # request is recorded in `errors` like a failed parse, not raised
    try:
        with Entrez.esummary(
            db="gene", id=",".join(df_missing["dbxref"])
        ) as search_handle:
            search_results = Entrez.read(search_handle)["DocumentSummarySet"]["DocumentSummary"]  # type: ignore
    except Exception as ex:
        log.error(f"{ex} in Entrez.read for {AN}")
        errors.append((AN, "Entrez.read"))
        continue

    mr_check = n_missing == len(search_results)
    log.trace(f"\tm: {n_missing} | r: {len(search_results)} | m/r check: {mr_check}")  # type: ignore

    # merge with search_results; the explicit columns keep the merge key
    # present even when NCBI returned no summaries
    temp_df = pd.DataFrame(
        [
            {
                "dbxref": x.attributes["uid"],
                "other_aliases": x.get("OtherAliases", "no_other_aliases"),
                "desc": x.get("Description", "no_description"),
                "gene_symbol": x.get("Name", "no_gene_symbol"),
            }
            for x in search_results
        ],
        columns=["dbxref", "other_aliases", "desc", "gene_symbol"],
    )

    df_missing = df_missing.merge(temp_df, on="dbxref", how="left", copy=False)

    # check if df_missing len is equal to search_results, and equal to the original df
    if (len(df_missing) != len(temp_df)) or (len(df_missing) != n_missing):
        log.warning(
            f"{AN} m/r check: {mr_check} | df_missing len {len(df_missing)} | temp_df len {len(temp_df)} | df len {n_missing}"
        )
        log.warning(
            "This is not expected, but can be caused by fragmented genes that have the same dbxref/gene_id. Please check the log file for more details in TRACE level."
        )

    # process the data
    gene_dict, temp_gene_dict, temp_count = data_process(
        df_missing, AN, gene_dict, temp_gene_dict, organelle_type, temp_count, log
    )


log.info(" ---- [Finished] ----")
if errors:
    log.warning(f"Errors: {len(errors)}")
    for an, msg in errors:
        log.warning(f"{an} - {msg}")

2025-06-06 17:02:24,242 - INFO -  ---- [Starting TEMP process] ----
2025-06-06 17:02:24,245 - DEBUG - -- [Processing: NC_031514.1] --
2025-06-06 17:02:25,348 - DEBUG - gene_id: gene-BK479_gr02 | dbxref: 29991917 | s: 1 | att: ID=gene-BK479_gr02;Dbxref=GeneID:29991917;Name=BK479_gr02;description=rnl;gbkey=Gene;gene_biotype=rRNA;locus_tag=BK479_gr02
2025-06-06 17:02:25,349 - DEBUG - 	[1st T]Label in gene_dict, L: |MT-RNR2| adding: gene-BK479_gr02 #dx NC_031514.1:29991917 #c ncbi_desc: rnl
2025-06-06 17:02:25,350 - DEBUG - gene_id: gene-BK479_gr02 | dbxref: 29991917 | s: 1 | att: ID=gene-BK479_gr02;Dbxref=GeneID:29991917;Name=BK479_gr02;description=rnl;gbkey=Gene;gene_biotype=rRNA;locus_tag=BK479_gr02
2025-06-06 17:02:25,350 - DEBUG - 	[1st T]Label in gene_dict, L: |MT-RNR2| adding: gene-BK479_gr02 #dx NC_031514.1:29991917 #c ncbi_desc: rnl
2025-06-06 17:02:25,351 - DEBUG - gene_id: rna-BK479_gr02 | dbxref: 29991917 | s: 1 | att: ID=rna-BK479_gr02;Parent=gene-BK479_gr02;Dbxref=GeneID:2999

In [16]:
# Write the TEMP dictionary to the next TEMP_##.gdt file of GDT_dir.
# Nothing is written when temp_gene_dict is empty.
if temp_gene_dict:
    temp_path = get_most_recent_gdt(GDT_dir, prefix="TEMP_")
    new_path, nth_iteration = increment_gdt_file(temp_path)
    log.info(f"Writing TEMP GDT file: {new_path} | Iteration: {nth_iteration}")
    # The info is computed before the header key is added to the dictionary.
    temp_gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(temp_gene_dict)
    temp_gene_dict["gdt_header"] = [
        "version 0.0.2",
        f"TEMP - {nth_iteration}",
        "Automagically generated by AN_missing_gene_dict.ipynb | TEMP using NCBI gene description",
    ]
    gdt.gene_dict_impl.write_gdt_file(temp_gene_dict, new_path, overwrite=True)

2025-06-06 17:02:32,963 - INFO - Writing TEMP GDT file: /home/brenodupin/matheus/gdt/sandbox/fungi_mt_model/misc/gdt/TEMP_01.gdt | Iteration: 1


In [None]:
# Write the updated gene_dict to the next file of the most_recent_gdt_file series.
if gene_dict:
    # nth_iteration is rebound here to the number of the new gene_dict file.
    new_path, nth_iteration = increment_gdt_file(most_recent_gdt_file)
    log.info(f"Writing gene_dict file: {new_path} | Iteration: {nth_iteration}")
    gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(gene_dict)
    # NOTE(review): the header below says "Data added from TEMP <n>", but the
    # number it prints is the gene_dict file iteration computed just above,
    # not the iteration of a TEMP_##.gdt file. The two can differ - confirm
    # which number is intended here.
    gene_dict["gdt_header"].append(
        f"{datetime.now().strftime('%Y-%m-%d %H:%M')} - Data added from TEMP {nth_iteration:02}"
    )
    gdt.gene_dict_impl.write_gdt_file(gene_dict, new_path, overwrite=True)

2025-06-06 17:02:37,767 - INFO - Writing gene_dict file: /home/brenodupin/matheus/gdt/sandbox/fungi_mt_model/misc/gdt/fungi_mt_model_pilot_01.gdt | Iteration: 1


### TEMP using NCBI Symbol

In [18]:
# Accession numbers listed in the TEMP Symbol seed file. For every data line,
# the accession number is the text between "#dx" and the first ":" after it.
ANS_Symbol = set()

with open(MISC_DIR / "seed_TEMP_Symbol.txt", "r") as f:
    for raw_line in f:
        line = raw_line.strip()

        # blank lines, "#" and "[" lines, and lines carrying "#gd" hold no accession number
        skip_line = not line or line.startswith(("#", "[")) or "#gd" in line
        if skip_line:
            continue

        after_dx = line.split("#dx", 1)[1]
        accession = after_dx.strip().split(":", 1)[0]
        ANS_Symbol.add(accession)

In [25]:
# Make sure the most_recent_gdt_file variable now points to <name>_pilot_xx.gdt,
# then re-run the second and third cells of the notebook before this one.
# Load the GDT file, or start from an empty dictionary when none is configured.
if not most_recent_gdt_file:
    gene_dict = {}
    print("No GDT file found, starting with an empty gene_dict.")
else:
    gene_dict = gdt.gene_dict_impl.create_gene_dict(
        most_recent_gdt_file, max_an_sources=0
    )
    print(f"Loaded gene_dict from {most_recent_gdt_file}\nHeader:")
    for header_line in gene_dict["gdt_header"]:
        print(header_line)
    print("\nGDT Info:")
    for info_line in gene_dict["gdt_info"]:
        print(info_line)

# Entries that cannot be resolved against gene_dict are collected here.
temp_symbol_gene_dict = {}

Loaded gene_dict from /home/brenodupin/matheus/gdt/sandbox/fungi_mt_model/misc/gdt/fungi_mt_model_pilot_01.gdt
Header:
version 0.0.2
Fungi_mt
2025-04-09 17:54 - Conversion from gdt to gdt2
2025-06-05 18:11 - Stripped GDT version from original GDT file Fungi_mt.gdt
2025-06-06 18:07 - Data added from TEMP 01

GDT Info:
Gene dictionary length: 765
Label: 56
GeneDescription: 531
GeneGenerics: 0
GeneDbxref: 234


In [26]:
# Build the TEMP Symbol dictionary: for every accession number of the seed
# file, the features that are not in gene_dict yet are resolved through the
# NCBI gene symbol of their GeneID.
temp_symbol_gene_dict = {}
temp_count = 0
errors = []  # (accession number, reason) pairs, reported after the loop
log.info(" ---- [Starting TEMP process] ----")
# ANS_Symbol is a set, whose iteration order is not stable between runs; the
# TEMP labels are numbered in processing order, so iterate in sorted order to
# get the same labels on every run.
for AN in sorted(ANS_Symbol):
    log.debug(f"-- [Processing: {AN}] --")

    an_path = DATA_DIR / f"{AN}{gff_suffix}"
    if not an_path.exists():
        log.error(f"Error: {AN} does not exist (an_path: {an_path})")
        errors.append((AN, "File not found"))
        continue

    df = gdt.gff3_utils.load_gff3(an_path, query_string=global_query_string)
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    # getting the gene_id and if it is in the gene_dict
    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    df["in_gene_dict"] = df["gene_id"].isin(gene_dict)

    # GeneID taken from the Dbxref attribute; NaN when RE_DBXREF does not match
    df["dbxref"] = df["attributes"].str.extract(RE_DBXREF, expand=False)

    df_missing = df[~df["in_gene_dict"]].copy()
    # number of features to resolve through NCBI, before the merge below
    n_missing = len(df_missing)

    # Nothing to resolve: do not send an empty id list to NCBI.
    if df_missing.empty:
        log.debug(f"All features are already in gene_dict in {AN}. Skipping.")
        continue

    # A feature without GeneID cannot be queried (and NaN would break the
    # join below): record the accession number as an error and move on.
    missing_geneid = df_missing["dbxref"].isna()
    if missing_geneid.any():
        log.error(
            f"Error: {AN} has {int(missing_geneid.sum())} feature(s) without a GeneID Dbxref: "
            f"{df_missing.loc[missing_geneid, 'gene_id'].tolist()}"
        )
        errors.append((AN, "Missing GeneID Dbxref"))
        continue

    # search NCBI; the request itself is inside the try so that a failed
    # request is recorded in `errors` like a failed parse, not raised
    try:
        with Entrez.esummary(
            db="gene", id=",".join(df_missing["dbxref"])
        ) as search_handle:
            search_results = Entrez.read(search_handle)["DocumentSummarySet"]["DocumentSummary"]  # type: ignore
    except Exception as ex:
        log.error(f"{ex} in Entrez.read for {AN}")
        errors.append((AN, "Entrez.read"))
        continue

    mr_check = n_missing == len(search_results)
    log.trace(f"\tm: {n_missing} | r: {len(search_results)} | m/r check: {mr_check}")  # type: ignore

    # merge with search_results; the explicit columns keep the merge key
    # present even when NCBI returned no summaries
    temp_df = pd.DataFrame(
        [
            {
                "dbxref": x.attributes["uid"],
                "other_aliases": x.get("OtherAliases", "no_other_aliases"),
                "desc": x.get("Description", "no_description"),
                "gene_symbol": x.get("Name", "no_gene_symbol"),
            }
            for x in search_results
        ],
        columns=["dbxref", "other_aliases", "desc", "gene_symbol"],
    )

    df_missing = df_missing.merge(temp_df, on="dbxref", how="left", copy=False)

    # check if df_missing len is equal to search_results, and equal to the original df
    if (len(df_missing) != len(temp_df)) or (len(df_missing) != n_missing):
        log.warning(
            f"{AN} m/r check: {mr_check} | df_missing len {len(df_missing)} | temp_df len {len(temp_df)} | df len {n_missing}"
        )
        log.warning(
            "This is not expected, but can be caused by fragmented genes that have the same dbxref/gene_id. Please check the log file for more details in TRACE level."
        )

    # process the data
    gene_dict, temp_symbol_gene_dict, temp_count = data_process(
        df_missing,
        AN,
        gene_dict,
        temp_symbol_gene_dict,
        organelle_type,
        temp_count,
        log,
        temp_name="temp_symbol",
        use_NCBI_symbol=True,
        c_text="ncbi_symbol",
    )


log.info(" ---- [Finished] ----")
if errors:
    log.warning(f"Errors: {len(errors)}")
    for an, msg in errors:
        log.warning(f"{an} - {msg}")

2025-06-06 18:07:56,407 - INFO -  ---- [Starting TEMP process] ----
2025-06-06 18:07:56,410 - DEBUG - -- [Processing: NC_031514.1] --
2025-06-06 18:07:57,078 - DEBUG - gene_id: gene-BK479_gp13 | dbxref: 29991894 | s: 16614 | att: ID=gene-BK479_gp13;Dbxref=GeneID:29991894;Name=BK479_gp13;description=Orf1;gbkey=Gene;gene_biotype=protein_coding;locus_tag=BK479_gp13
2025-06-06 18:07:57,080 - DEBUG - 	[2nd F]Label not in temp_symbol, new label |MT-TEMP-1| adding: gene-BK479_gp13 #dx NC_031514.1:29991894 #c ncbi_symbol: BK479_gp13 | ncbi_desc: Orf1
2025-06-06 18:07:57,080 - DEBUG - gene_id: gene-BK479_gp11 | dbxref: 29991899 | s: 22059 | att: ID=gene-BK479_gp11;Dbxref=GeneID:29991899;Name=BK479_gp11;description=Orf2;gbkey=Gene;gene_biotype=protein_coding;locus_tag=BK479_gp11
2025-06-06 18:07:57,081 - DEBUG - 	[2nd F]Label not in temp_symbol, new label |MT-TEMP-2| adding: gene-BK479_gp11 #dx NC_031514.1:29991899 #c ncbi_symbol: BK479_gp11 | ncbi_desc: Orf2
2025-06-06 18:07:57,081 - DEBUG - --

In [27]:
# Write the TEMP Symbol dictionary to the next TEMP_Symbol_##.gdt file of GDT_dir.
# Nothing is written (and symbol_iteration is not set) when the dictionary is empty.
if temp_symbol_gene_dict:
    temp_path = get_most_recent_gdt(GDT_dir, prefix="TEMP_Symbol_")
    new_path, symbol_iteration = increment_gdt_file(temp_path)
    log.info(
        f"Writing TEMP Symbol GDT file: {new_path} | Iteration: {symbol_iteration}"
    )
    # The info is computed before the header key is added to the dictionary.
    temp_symbol_gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(
        temp_symbol_gene_dict
    )
    temp_symbol_gene_dict["gdt_header"] = [
        "version 0.0.2",
        f"TEMP Symbol - {symbol_iteration}",
        "Automagically generated by AN_missing_gene_dict.ipynb | TEMP Symbol using NCBI gene symbol",
    ]
    gdt.gene_dict_impl.write_gdt_file(temp_symbol_gene_dict, new_path, overwrite=True)

2025-06-06 18:08:03,339 - INFO - Writing TEMP Symbol GDT file: /home/brenodupin/matheus/gdt/sandbox/fungi_mt_model/misc/gdt/TEMP_Symbol_01.gdt | Iteration: 1


In [28]:
# Write the updated gene_dict to the next file of the most_recent_gdt_file series.
# symbol_iteration comes from the cell that writes the TEMP Symbol file, so
# that cell must have written a file in this session.
if gene_dict:
    new_path, nth_iteration = increment_gdt_file(most_recent_gdt_file)
    # The file written here is the gene_dict file, not a TEMP file.
    log.info(
        f"Writing gene_dict file: {new_path} | pilot itr: {nth_iteration} | Symbol itr: {symbol_iteration}"
    )
    gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(gene_dict)
    gene_dict["gdt_header"].append(
        f"{datetime.now().strftime('%Y-%m-%d %H:%M')} - Data added from TEMP Symbol {symbol_iteration}"
    )
    gdt.gene_dict_impl.write_gdt_file(gene_dict, new_path, overwrite=True)

2025-06-06 18:08:26,440 - INFO - Writing TEMP GDT file: /home/brenodupin/matheus/gdt/sandbox/fungi_mt_model/misc/gdt/fungi_mt_model_pilot_02.gdt | pilot itr: 2 | Symbol itr: 1


### Genes Discard using dbxref

In [29]:
# Text put in front of the feature type of every discarded GFF3 line.
remove_string = "discard-"
# Name of the file, inside MISC_DIR, that lists the features to discard.
genes_to_remove = "genome_features_to_remove.txt"

In [36]:
# Parse the removal list into {accession number: set of gene IDs to discard}.
# A data line is read as "<gene_id> #dx <accession>:<...> [#c <comment>]".
remove_gene_ids = {}
with open(MISC_DIR / genes_to_remove, "r") as f:
    for line in f:
        # Strip first, so that indented comment lines and whitespace-only
        # lines are skipped as well.
        line = line.strip()
        if not line or line.startswith(("#", "[")) or "#gd" in line:
            continue

        entry = line.split("#c", 1)[0]
        gene_id, separator, an = entry.partition("#dx")
        if not separator:
            raise ValueError(
                f"Invalid line in {genes_to_remove}, '#dx' not found: {line!r}"
            )
        gene_id = gene_id.strip()
        an = an.split(":", 1)[0].strip()

        remove_gene_ids.setdefault(an, set()).add(gene_id)

In [38]:
# Rewrite, in place, the GFF3 file of every accession number of the removal
# list: the type column of each listed feature gets the remove_string prefix.
for an, gene_ids in remove_gene_ids.items():
    an_path = DATA_DIR / f"{an}{gff_suffix}"
    with open(an_path, "r") as f:
        lines = f.readlines()

    # The leading "#" lines are the header; the bound check covers files made
    # only of "#" lines (or empty files).
    headers, index = [], 0
    while index < len(lines) and lines[index].startswith("#"):
        headers.append(lines[index].strip())
        index += 1

    # IDs are escaped because they may hold regex metacharacters such as ".";
    # the ID value must end at ";" or at the end of the attributes column.
    escaped_ids = "|".join(re.escape(x) for x in gene_ids)
    pattern = re.compile(rf"ID=(?:{escaped_ids})(?:;|$)")
    contents = []

    for line in lines[index:]:
        if not (line := line.strip()):
            continue
        fields = line.split("\t")

        # "#" lines found after the header and rows without the 9 GFF3
        # columns are kept unchanged.
        if line.startswith("#") or len(fields) < 9:
            contents.append(line)
            continue

        # fields[2] is the feature type, fields[8] is the attributes column;
        # a feature that already carries the prefix is not prefixed again
        if pattern.search(fields[8]) and remove_string not in fields[2]:
            fields[2] = remove_string + fields[2]

        contents.append("\t".join(fields))

    with open(an_path, "w") as f:
        f.write("\n".join(headers))
        f.write("\n")
        f.write("\n".join(contents))
        f.write("\n\n")

    print(f"{an} Done!")

NC_031514.1 Done!
NC_044675.1 Done!
