In [None]:
# imports and functions
import re
import gdt

import pandas as pd

from pathlib import Path
from datetime import datetime

# Identifier of the feature itself (the ID= attribute of the GFF attributes column)
RE_ID = re.compile(r"ID=([^;]+)")
# Identifier of the parent feature (the Parent= attribute)
RE_parent = re.compile(r"Parent=([^;]+)")

# Descriptive GFF attributes. Every pattern captures the attribute value,
# i.e. everything after '<attribute>=' up to (not including) the next ';'.
RE_name = re.compile(r"Name=([^;]+)")
RE_product = re.compile(r"product=([^;]+)")
RE_description = re.compile(r"description=([^;]+)")
RE_gene = re.compile(r"gene=([^;]+)")
RE_gene_synonym = re.compile(r"gene_synonym=([^;]+)")
RE_note = re.compile(r"Note=([^;]+)")

# Attributes extracted from the GFF files. Each entry is resolved to the
# module-level variable RE_<entry>, so the two names must match exactly.
features_name = ["name", "product", "description", "gene", "gene_synonym", "note"]

# How to extract an additional attribute:
#   1. Define a compiled regex named RE_<feature_name>. The usual shape is
#          RE_<feature_name> = re.compile(r"<attribute_name>=([^;]+)")
#      but any regex with one capture group holding the wanted value works.
#   2. Append <feature_name> (without the RE_ prefix) to features_name.
#
# Example for 'locus_tag':
#   RE_locus_tag = re.compile(r"locus_tag=([^;]+)")
#   features_name.append("locus_tag")
#
# If no variable RE_<feature_name> exists, only a warning is printed and the
# feature is left out of re_features.

# feature name -> compiled regex, in the order given by features_name
re_features = {}
for name in features_name:
    regex_var = f"RE_{name}"
    if regex_var not in globals():
        print(f"Warning: No regex found for '{name}' (expected variable: RE_{name})")
        continue
    re_features[name] = globals()[regex_var]


def increment_gdt_file(path):
    """
    Build the path of the next GDT iteration, together with its number.

    The file stem is split on "_" and its last token is read as the iteration
    counter. A stem ending in "stripped" is treated as iteration 0 of a
    "pilot" series, so its successor ends in "_pilot_01".

    Examples:
        fungi-ncbi_pilot_03.gdt -> (fungi-ncbi_pilot_04.gdt, 4)
        fungi-ncbi_stripped.gdt -> (fungi-ncbi_pilot_01.gdt, 1)

    Arguments:
        path (Path): Path of the current GDT file.
    Returns:
        tuple: (Path of the next GDT file in the same directory, new iteration number).
    Raises:
        ValueError: If the last "_"-separated token of the stem is not an integer.
    """
    *head, counter = path.stem.split("_")
    if counter == "stripped":
        head.append("pilot")
        counter = "0"

    try:
        number = int(counter) + 1
    except ValueError:
        raise ValueError(
            f"Invalid GDT file name: {path.name}. Expected format: <preferred_name>_##.gdt, where ## is a number."
        )

    # the counter is zero-padded to at least two digits
    new_stem = "_".join([*head, f"{number:02d}"])
    return path.parent / f"{new_stem}{path.suffix}", number


def get_most_recent_gdt(dir_path, prefix="TEMP_"):
    """
    Return the last GDT file of a series found in a directory.

    Files matching "<prefix>*.gdt" are ordered by their stem with
    gdt.gene_dict_impl.natural_sort and the last one is returned.

    Arguments:
        dir_path (Path): Directory to search for GDT files.
        prefix (str): Prefix of the GDT files to search for.
    Returns:
        Path: The last matching file, or "<dir_path>/<prefix>00.gdt" when
        nothing matches (that path is only built, not created).
    """
    candidates = [*dir_path.glob(f"{prefix}*.gdt")]
    if candidates:
        ordered = gdt.gene_dict_impl.natural_sort(candidates, key=lambda x: x.stem)
        return ordered[-1]
    return dir_path / f"{prefix}00.gdt"

In [None]:
# Notebook-wide configuration: every global used by the cells below is set here.
# Adjust these values to match your local setup.
# most_recent_gdt_filename must name the most recent GDT file inside misc/gdt,
# OR the stripped GDT file used in the filter command, if applicable.

DATA_DIR = "../sandbox/fungi_mt_model"
most_recent_gdt_filename = "fungi_mt_model_pilot_03.gdt"
global_query_string = gdt.gff3_utils.QS_GENE_TRNA_RRNA
remove_orfs = True
organelle_type = "MT"
gff_suffix = ".gff3"

print(f"Chosen feature query string: '{global_query_string}'")


# Validate the data directory before deriving any other path from it
DATA_DIR = Path(DATA_DIR).resolve()
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"Path {DATA_DIR} is not a directory.")

MISC_DIR = DATA_DIR / "misc"
GDT_DIR = MISC_DIR / "gdt"
GDT_DIR.mkdir(mode=0o777, parents=True, exist_ok=True)

# List of accession numbers (ANs) to process, one per line
AN_missing_dbxref = MISC_DIR / "AN_missing_dbxref.txt"
if not AN_missing_dbxref.is_file():
    raise FileNotFoundError(
        f"Missing {AN_missing_dbxref}, did you run geneDict filter?"
    )

# The variable is only absent when its assignment above has been removed or
# commented out (and no earlier run left it in the kernel namespace).
if "most_recent_gdt_filename" not in globals():
    print(
        "Warning: 'most_recent_gdt_filename' variable not set.\n\n"
        "You should have a GDT file from AN_missing_gene_dict.ipynb:\n"
        "• Set the most_recent_gdt_filename variable\n"
        "• Re-run this cell\n\n"
        "If you intend to run this without a GDT file (e.g., because your GFF files "
        "don't have dbxrefs and AN_missing_dbxref.ipynb isn't needed), this warning can be ignored."
    )
    # To keep the rest of the notebook simple, an execution without
    # most_recent_gdt_filename behaves like one with it, but starts from an
    # empty GDT file.
    gdt_path = GDT_DIR / "pilot_00.gdt"
    gdt.gene_dict_impl.create_empty_gdt(gdt_path)
else:
    gdt_path = GDT_DIR / most_recent_gdt_filename
    if not gdt_path.is_file():
        print(
            f"Not found {gdt_path.name}, does it exist in misc/gdt?\nGDTs in {GDT_DIR}:"
        )
        # show what is available so the file name can be corrected
        for gdt_file in sorted(GDT_DIR.glob("*.gdt")):
            print(f" - {gdt_file.name}")
        raise FileNotFoundError(
            f"Most recent GDT file {gdt_path.name} does not exist in {GDT_DIR}."
        )

Chosen feature query string: 'type == ['gene', 'tRNA', 'rRNA']'


In [None]:
# Create the notebook logger. The log file lives in MISC_DIR; the console and
# file levels are passed as "DEBUG" and "TRACE" respectively.
# The first value returned by logger_creater is not used in this notebook.
_, log = gdt.logger_setup.logger_creater(
    log_file=MISC_DIR / "02_missing_dbxref.log",
    console_level="DEBUG",
    file_level="TRACE",
)
log.debug("Running from notebook AN_missing_dbxref")

2025-06-06 18:21:24,072 - DEBUG - Logger setup complete. Logging to /home/brenodupin/matheus/gdt/sandbox/fungi_mt_model/misc/02_missing_dbxref.log
2025-06-06 18:21:24,079 - DEBUG - Running from notebook AN_missing_dbxref


In [8]:
# Read the accession numbers, one per line, ignoring blank lines
with open(AN_missing_dbxref, "r") as f:
    ANs = [stripped for stripped in map(str.strip, f) if stripped]
print(f"len(ANs): {len(ANs)}")

len(ANs): 2


In [None]:
# Load the GDT file (even if empty) and log its header and summary
gene_dict = gdt.gene_dict_impl.create_gene_dict(gdt_path, max_an_sources=0)
log.info(f"Loaded gene_dict from {gdt_path}")

for section_title, section_key in (("Header:", "gdt_header"), ("GDT Info:", "gdt_info")):
    log.info(section_title)
    for entry in gene_dict[section_key]:
        log.info(f"\t{entry}")

temp_gene_dict = {}

Loaded gene_dict from /home/brenodupin/matheus/gdt/sandbox/fungi_mt_model/misc/gdt/fungi_mt_model_pilot_03.gdt
Header:
version 0.0.2
Fungi_mt
2025-04-09 17:54 - Conversion from gdt to gdt2
2025-06-05 18:11 - Stripped GDT version from original GDT file Fungi_mt.gdt
2025-06-06 18:07 - Data added from TEMP 01
2025-06-06 18:08 - Data added from TEMP Symbol 1
automated insertion from missing_dbxref_feature_name

GDT Info:
Gene dictionary length: 862
Label: 56
GeneDescription: 535
GeneGenerics: 93
GeneDbxref: 234


### Deeper investigation using other gff attributes, primarily gene=

In [None]:
# For every accession number (AN), collect the features whose ID is not yet a
# key of gene_dict, together with the descriptive attributes (features_name)
# that can be used to name them. The result is written to features_info.tsv.
temp_list = []
to_drop = ["source", "type", "start", "end", "score", "strand", "phase", "attributes"]

for AN in ANs:
    an_path = DATA_DIR / f"{AN}{gff_suffix}"
    df = gdt.gff3_utils.load_gff3(
        an_path, query_string=global_query_string, usecols=gdt.GFF3_COLUMNS
    )
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    # .copy(): the columns assigned below must land on an independent frame,
    # not on a filtered slice (which triggers SettingWithCopyWarning).
    df = df[~df["gene_id"].isin(gene_dict)].copy()

    for name, pattern in re_features.items():
        df[name] = df["attributes"].str.extract(pattern, expand=False)

    # rows where none of the descriptive attributes could be extracted
    no_attribute = df[features_name].isna().all(axis=1)
    if no_attribute.any():
        log.warning(f"{AN} has row(s) with no identifiable atribute.")
        log.warning(
            "Please modify this script to add a new possible identifiable attribute or just remove the AN from your dataset"
        )
        log.debug(df[no_attribute])

    temp_list.extend(df.to_dict("records"))

features_info_df = pd.DataFrame(temp_list)
if features_info_df.empty:
    # Without this guard the column lookups below fail with an opaque KeyError.
    raise ValueError(
        "No feature with a gene_id missing from gene_dict was found in the listed ANs; "
        "there is nothing to write to features_info.tsv."
    )
features_info_df = features_info_df.drop(columns=to_drop, errors="ignore")

# attribute columns that are empty for every row; dropped after feature_name is built
drop_cols = [col for col in features_name if features_info_df[col].isna().all()]

# feature_name is the first non-null attribute in this priority order.
# NOTE: a feature added to features_name only takes part in the naming if it
# is also added to this list.
feature_priority = ["gene", "product", "description", "name", "note", "gene_synonym"]
feature_name = features_info_df[feature_priority[0]]
for col in feature_priority[1:]:
    feature_name = feature_name.fillna(features_info_df[col])
features_info_df["feature_name"] = feature_name

features_info_df = features_info_df.drop(columns=drop_cols)
features_info_df = features_info_df.sort_values(by="feature_name")
log.debug(f"Features info df, writing to {MISC_DIR / 'features_info.tsv'}")
features_info_df.to_csv(MISC_DIR / "features_info.tsv", sep="\t", index=False)

2025-06-06 18:21:39,106 - DEBUG - missing_dbxref: creation of features_info_df


In [11]:
# When True, every name written to feature_name.txt gets a " #gd MANUAL" suffix
add_gdt_compliance = True
# Text added after that suffix as " #c <comment>"; an empty value omits it
comment = "Manual from missing_dbxref_names_raw"

In [12]:
# Suffix appended to each name written to feature_name.txt:
# empty, " #gd MANUAL", or " #gd MANUAL #c <comment>"
gdt_str = ""
if add_gdt_compliance:
    gdt_str = " #gd MANUAL"
    if comment:
        gdt_str += " #c " + comment

# One row per distinct feature_name, flagged with whether it is already a key of gene_dict
new_df = pd.DataFrame(
    {"feature_name": features_info_df["feature_name"].unique()}
).assign(in_gene_dict=lambda frame: frame["feature_name"].isin(gene_dict))

In [None]:
# Names not yet in gene_dict, ordered with natural_sort and written one per line
missing_mask = ~new_df["in_gene_dict"]
unique_names = gdt.gene_dict_impl.natural_sort(new_df.loc[missing_mask, "feature_name"])
with open(MISC_DIR / "feature_name.txt", "w+") as f1:
    f1.writelines(f"{name}{gdt_str}\n" for name in unique_names)

The user must now review `feature_name.txt`.  

Features that can be easily identified must be added to the current  
version of the GDT, and features that need deeper investigation should be  
copied to a new file named `features_unk.txt`.  
After editing the GDT, reload `gene_dict` so the checks below see the new entries.
  
The script will then try to automatically add to gene_dict the gene_ids whose   
feature names __are not in `features_unk.txt`__.

In [16]:
# Reload the features table from disk and collect the names listed in features_unk.txt
features_info_df = pd.read_csv(MISC_DIR / "features_info.tsv", sep="\t")

names_unk = set()
with open(MISC_DIR / "features_unk.txt", "r") as f1:
    for raw_line in f1:
        entry = raw_line.strip()
        if entry:
            # keep only the text before an optional "#gd" annotation
            names_unk.add(entry.partition("#gd")[0].strip())

In [None]:
# Names the user treated as identifiable (not listed in features_unk.txt)
# must all be keys of gene_dict by now; stop otherwise.
all_names = set(features_info_df["feature_name"].unique()) - names_unk
names_not_in_dict = {name for name in all_names if name not in gene_dict}

if names_not_in_dict:
    log.debug(f"Warning: {len(names_not_in_dict)} name(s) not in gene_dict!")
    log.debug(
        "These names are not in feature_unk, so you marked them as identifiable. Please identify them or add them feature_unk."
    )
    log.debug(
        "It could also be that you forgot to reload the gene_dict with the changes you made above."
    )
    for missing_name in names_not_in_dict:
        log.debug(missing_name)
    raise ValueError(f"Error: {len(names_not_in_dict)} names not in gene_dict!")

In [20]:
# Stored as c= on every GeneGeneric entry inserted automatically further down.
# Note: this rebinds the `comment` variable used earlier for feature_name.txt.
comment = "automated insertion from missing_dbxref_feature_name"

In [21]:
# gene_ids whose feature_name is not listed in features_unk.txt
is_identified = ~features_info_df["feature_name"].isin(names_unk)
unique_gene_ids = features_info_df.loc[is_identified, "gene_id"].unique()

In [22]:
# Inspection only: display the gene_ids selected for automatic insertion
unique_gene_ids

array(['gene-atp6', 'gene-atp8', 'gene-atp9', 'gene-cob', 'gene-cox1',
       'gene-cox2', 'gene-cox3', 'rna-AFUA_m0250', 'rna-lsu', 'gene-lsu',
       'gene-rpm1', 'rna-rpm1', 'rna-AFUA_m0120', 'rna-ssu', 'gene-ssu',
       'rna-AFUA_m0330', 'gene-tRNA-Ala', 'rna-AFUA_m0470',
       'rna-tRNA-Ala', 'gene-tRNA-Arg', 'rna-tRNA-Arg', 'rna-AFUA_m0500',
       'gene-tRNA-Arg-2', 'rna-AFUA_m0070', 'rna-tRNA-Arg-2',
       'gene-tRNA-Asn', 'rna-tRNA-Asn', 'rna-AFUA_m0080',
       'rna-AFUA_m0030', 'gene-tRNA-Asp', 'rna-AFUA_m0190',
       'rna-tRNA-Asp', 'rna-AFUA_m0020', 'gene-tRNA-Cys', 'rna-tRNA-Cys',
       'gene-tRNA-Gln', 'rna-AFUA_m0360', 'rna-tRNA-Gln', 'gene-tRNA-Glu',
       'rna-AFUA_m0280', 'rna-tRNA-Glu', 'rna-AFUA_m0180',
       'rna-AFUA_m0170', 'gene-tRNA-Gly', 'rna-tRNA-Gly', 'rna-tRNA-His',
       'rna-AFUA_m0410', 'gene-tRNA-His', 'rna-AFUA_m0050',
       'rna-AFUA_m0380', 'gene-tRNA-Ile', 'rna-AFUA_m0100',
       'rna-tRNA-Ile', 'rna-AFUA_m0220', 'rna-tRNA-Leu-2',
       

In [None]:
# Insert into gene_dict every gene_id whose feature_name is not listed in
# features_unk.txt, reusing the label gene_dict already holds for that name.
known_mask = ~features_info_df["feature_name"].isin(names_unk)
unique_gene_ids = features_info_df.loc[known_mask, "gene_id"].unique()

for gene_id in unique_gene_ids:
    gene_rows = features_info_df[features_info_df["gene_id"] == gene_id]
    gene_feature_names = gene_rows["feature_name"].unique()

    # Sanity check: several feature_names for one gene_id are only accepted
    # when they all resolve to the same label.
    if gene_rows["feature_name"].nunique() != 1:
        log.warning(f"{gene_id} has multiple feature_names: {gene_feature_names}")
        log.debug("\tChecking if they have the same label in gene_dict...")

        labels = {gene_dict[feat].label for feat in gene_feature_names}
        if len(labels) == 1:
            log.debug(f"\tAll feature_names have the same label: {labels.pop()}")
        else:
            log.error(f"\tError: {gene_id} has multiple labels: {labels}")
            raise ValueError(
                f"Error: {gene_id} has multiple labels: {labels}. "
                "Please edit features_info.tsv to resolve this issue."
            )

    first_feature_name = gene_rows["feature_name"].iloc[0]
    label = gene_dict[first_feature_name].label
    an_sources = gene_rows["seqid"].unique().tolist()
    log.debug(
        f"Adding {gene_id} with label '{label}', an_sources: {an_sources}, comment: {comment}"
    )
    gene_dict[gene_id] = gdt.gene_dict_impl.GeneGeneric(
        label=label, an_sources=an_sources, c=comment
    )

gene_id: gene-atp6, label: MT-ATP6, an_sources: ['HE983613.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: gene-atp8, label: MT-ATP8, an_sources: ['HE983613.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: gene-atp9, label: MT-ATP9, an_sources: ['HE983613.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: gene-cob, label: MT-COB, an_sources: ['HE983613.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: gene-cox1, label: MT-CO1, an_sources: ['HE983613.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: gene-cox2, label: MT-CO2, an_sources: ['HE983613.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: gene-cox3, label: MT-CO3, an_sources: ['HE983613.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: rna-AFUA_m0250, label: MT-RNR2, an_sources: ['JQ346808.1'], comment: automated insertion from missing_dbxref_featu

In [None]:
# Write the updated gene_dict as the next GDT iteration.
# gdt_path itself is left unchanged by this cell.
if gene_dict:
    new_path, nth_iteration = increment_gdt_file(gdt_path)
    log.info(f"Writing gene_dict file: {new_path} | Iteration: {nth_iteration}")

    gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(gene_dict)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    gene_dict["gdt_header"].append(
        f"{timestamp} - Data added from missing_dbxref_feature_name"
    )
    gdt.gene_dict_impl.write_gdt_file(gene_dict, new_path, overwrite=True)

2025-06-06 18:26:43,718 - INFO - Writing gene_dict file: /home/brenodupin/matheus/gdt/sandbox/fungi_mt_model/misc/gdt/fungi_mt_model_pilot_03.gdt | Iteration: 3


#### features_unk.txt

In [33]:
# Re-read features_unk.txt: one name per non-blank line, with anything from
# an optional "#gd" annotation onwards removed.
names_unk = set()
with open(MISC_DIR / "features_unk.txt", "r") as f1:
    stripped_lines = (raw.strip() for raw in f1)
    names_unk.update(
        entry.split("#gd", 1)[0].strip() for entry in stripped_lines if entry
    )

In [34]:
# Rows of features_info.tsv whose feature_name is listed in features_unk.txt
features_info_df = pd.read_csv(MISC_DIR / "features_info.tsv", sep="\t")
unk_mask = features_info_df["feature_name"].isin(names_unk)
features_unk_df = features_info_df.loc[unk_mask].copy().reset_index(drop=True)

In [36]:
# Map each seqid to the list of gene_id values it has in features_unk_df
unk_dict = features_unk_df.groupby("seqid")["gene_id"].agg(list).to_dict()

In [None]:
# Second pass over the gene_ids listed in features_unk.txt. For each one, look
# in the full GFF3 (all feature types) for child features (Parent=<gene_id>)
# and use the child's feature_name to decide where the gene_id goes:
#   * feature_name already in gene_dict -> gene_id inserted into gene_dict
#   * otherwise                         -> gene_id inserted into temp_unk under
#                                          a "<organelle_type>-TEMP-<n>" label
#   * no child at all                   -> temp_unk, "<organelle_type>-UNKNOWN"
temp_unk = {}
label_count = 0
change_gene_dict = False  # set when at least one entry is added to gene_dict
log.debug(
    "missing_dbxref: matching probable 'child feature + parent gene' pair (on the an original gff3, using all the features)"
)
for an, gene_ids in unk_dict.items():
    log.debug(f"AN: {an}| gene_ids: {gene_ids}")
    an_path = DATA_DIR / f"{an}{gff_suffix}"

    # no query string: every feature type is loaded so children can be found
    df = gdt.gff3_utils.load_gff3(an_path, usecols=gdt.GFF3_COLUMNS)
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    # "parent" is not part of features_name, so the loop below does not create
    # this column; extract it explicitly, otherwise df["parent"] raises KeyError.
    df["parent"] = df["attributes"].str.extract(RE_parent, expand=False)

    for name, pattern in re_features.items():
        df[name] = df["attributes"].str.extract(pattern, expand=False)

    # first non-null attribute, in this priority order
    df["feature_name"] = (
        df["gene"]
        .fillna(df["product"])
        .fillna(df["description"])
        .fillna(df["name"])
        .fillna(df["note"])
        .fillna(df["gene_synonym"])
    )

    for gene_id in gene_ids:
        candidates = df[df["parent"] == gene_id]

        # No child feature: nothing to infer a name from
        if candidates.empty:
            log.debug(f" {gene_id} | no candidate with parent={gene_id} found")
            log.debug("  adding it to UNKNOWN label\n")
            own_rows = df[df["gene_id"] == gene_id]
            own_attributes = own_rows["attributes"].iloc[0] if not own_rows.empty else "N/A"
            temp_unk[gene_id] = gdt.gene_dict_impl.GeneGeneric(
                label=f"{organelle_type}-UNKNOWN",
                an_sources=[an],
                c=f"unknown gene_id from {an}{gff_suffix} | a: {own_attributes}",
            )
            continue

        # Children disagree on the name: report it, the first one is used
        if candidates["feature_name"].nunique() > 1:
            log.debug(
                f" {gene_id} | more than one canditate found, but with feature_name conflict, chosing the first one."
            )
            log.debug(f"  feature_names: {candidates['feature_name'].unique()}")

        # NOTE(review): this is NaN when the first child has none of the
        # extracted attributes; such a gene_id ends up under a fresh TEMP label.
        feature_name = candidates["feature_name"].iloc[0]
        log.debug(f"  chosen feature_name: {feature_name}")
        for x in candidates.itertuples():
            log.debug(f"\tt: {x.type} | fn: {x.feature_name} | a: {x.attributes}")

        insertion_comment = (
            f"insertion from missing_dbxref_unk_problems, source: {feature_name} | "
            f"type: {candidates['type'].iloc[0]}"
        )

        # feature_name already known: the gene_id goes straight into gene_dict
        if feature_name in gene_dict:
            change_gene_dict = True
            label = gene_dict[feature_name].label
            log.debug(f"  feature name in gene_dict, label: {label}\n")
            gene_dict[gene_id] = gdt.gene_dict_impl.GeneGeneric(
                label=label,
                an_sources=[an],
                c=insertion_comment,
            )
            continue

        # feature_name unknown to gene_dict: reuse or create a TEMP label
        log.debug(f"  feature name not in gene_dict: {feature_name}")
        log.debug("  checking in temp_unk")

        if feature_name in temp_unk:
            label = temp_unk[feature_name].label
            log.debug(f"  found in temp_unk, label: {label}\n")
        else:
            label_count += 1
            label = f"{organelle_type}-TEMP-{label_count}"
            log.debug(f"  not found in temp_unk, new label: {label}\n")
            temp_unk[feature_name] = gdt.gene_dict_impl.GeneDescription(
                label=label, source="MANUAL", c=None
            )

        # Both cases carry a temp_unk label, so the gene_id belongs in temp_unk.
        # (Previously the "found" case wrote it into gene_dict without setting
        # change_gene_dict, mixing a TEMP label into the main dictionary.)
        temp_unk[gene_id] = gdt.gene_dict_impl.GeneGeneric(
            label=label,
            an_sources=[an],
            c=insertion_comment,
        )

2025-06-06 18:29:22,018 - DEBUG - missing_dbxref: matching probable 'child feature + parent gene' pair (2nd pass, on the an original gff3, using all the features)
2025-06-06 18:29:22,019 - DEBUG - AN: JQ346808.1| gene_ids: ['gene-AFUA_m0010', 'gene-AFUA_m0020', 'gene-AFUA_m0030', 'gene-AFUA_m0040', 'gene-AFUA_m0050', 'gene-AFUA_m0060', 'gene-AFUA_m0070', 'gene-AFUA_m0080', 'gene-AFUA_m0090', 'gene-AFUA_m0100', 'gene-AFUA_m0110', 'gene-AFUA_m0120', 'gene-AFUA_m0130', 'gene-AFUA_m0140', 'gene-AFUA_m0150', 'gene-AFUA_m0160', 'gene-AFUA_m0170', 'gene-AFUA_m0180', 'gene-AFUA_m0190', 'gene-AFUA_m0200', 'gene-AFUA_m0210', 'gene-AFUA_m0220', 'gene-AFUA_m0230', 'gene-AFUA_m0240', 'gene-AFUA_m0250', 'gene-AFUA_m0260', 'gene-AFUA_m0270', 'gene-AFUA_m0280', 'gene-AFUA_m0290', 'gene-AFUA_m0300', 'gene-AFUA_m0310', 'gene-AFUA_m0320', 'gene-AFUA_m0330', 'gene-AFUA_m0340', 'gene-AFUA_m0350', 'gene-AFUA_m0360', 'gene-AFUA_m0370', 'gene-AFUA_m0380', 'gene-AFUA_m0390', 'gene-AFUA_m0400', 'gene-AFUA_m0410

In [None]:
# Write temp_unk as the next "TEMP_Mapping_##.gdt" file in GDT_DIR
if temp_unk:
    temp_path = get_most_recent_gdt(GDT_DIR, prefix="TEMP_Mapping_")
    new_path, map_iteration = increment_gdt_file(temp_path)
    log.info(f"Writing TEMP Mapping GDT file: {new_path} | Iteration: {map_iteration}")

    # the summary is computed before the header key is added to temp_unk
    mapping_info = gdt.gene_dict_impl.get_gene_dict_info(temp_unk)
    temp_unk["gdt_info"] = mapping_info

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    mapping_header = ["version 0.0.2", f"TEMP Mapping - {map_iteration}"]
    mapping_header.append(f"{timestamp} - TEMP Mapping child features to parent genes")
    temp_unk["gdt_header"] = mapping_header

    gdt.gene_dict_impl.write_gdt_file(temp_unk, new_path, overwrite=True)

2025-06-06 18:29:47,285 - INFO - Writing TEMP Mapping GDT file: /home/brenodupin/matheus/gdt/sandbox/fungi_mt_model/misc/gdt/TEMP_Mapping_01.gdt | Iteration: 1


In [None]:
# Persist gene_dict when the child/parent matching added entries to it.
if change_gene_dict:
    log.debug("gene_dict changed, incrementing gdt file and writing it")
    gdt_path, _ = increment_gdt_file(gdt_path)
    # The earlier "Writing gene_dict file" cell writes the next iteration of the
    # same starting gdt_path without updating gdt_path, so that file name may
    # already be taken. Move on to the first free iteration instead of
    # colliding with it (write_gdt_file is called without overwrite here;
    # presumably it refuses to replace an existing file - TODO confirm).
    while gdt_path.exists():
        gdt_path, _ = increment_gdt_file(gdt_path)
    log.info(f"Writing GDT file: {gdt_path}")
    gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(gene_dict)
    gene_dict["gdt_header"].append(
        f"{datetime.now().strftime('%Y-%m-%d %H:%M')} - Data added from TEMP Mapping"
    )
    gdt.gene_dict_impl.write_gdt_file(gene_dict, gdt_path)

### Genes Discard using gene_id

In [None]:
# Prefix placed in front of the type column of every GFF line being discarded
# (despite the variable name, it is prepended, not appended)
append_string = "discard-"
# File inside MISC_DIR listing the gene_ids to discard
genes_to_remove = "genome_features_to_remove_2.txt"

In [None]:
# Parse the removal list into {accession number: {gene_id, ...}}.
# Expected entry format: "<gene_id> #gn <AN>", optionally followed by
# " #c <comment>". Blank lines, lines starting with "#" or "[", and lines
# containing "#gd" are ignored.
remove_gene_ids = {}
with open(MISC_DIR / genes_to_remove, "r") as f:
    for line_number, raw_line in enumerate(f, start=1):
        # strip first so indented comment/section lines are skipped as well
        line = raw_line.strip()
        if not line or line.startswith(("#", "[")) or "#gd" in line:
            continue

        entry = line.split("#c", 1)[0]
        gene_id, separator, an = entry.partition("#gn")
        gene_id = gene_id.strip()
        an = an.strip()
        if not separator or not gene_id or not an:
            # a malformed entry would otherwise fail with an unpacking error
            # or select a file named just "<gff_suffix>"
            raise ValueError(
                f"{genes_to_remove}, line {line_number}: expected '<gene_id> #gn <AN>', got {line!r}"
            )

        remove_gene_ids.setdefault(an, set()).add(gene_id)

In [56]:
remove_gene_ids

{'JQ346808.1': {'gene-AFUA_m0390'}}

In [None]:
# Mark the listed gene_ids as discarded directly in their GFF files: the type
# column (column 3) of every matching line gets append_string as a prefix.
# Lines are not deleted, and each GFF file is rewritten in place.
total_gene_ids = sum(len(ids) for ids in remove_gene_ids.values())
log.info(f"Removing {total_gene_ids} gene IDs from GFF files.")
for an, ids_to_discard in remove_gene_ids.items():
    log.trace(f"Processing {an} for removal of gene IDs {ids_to_discard}")
    an_path = DATA_DIR / f"{an}{gff_suffix}"
    with open(an_path, "r") as f:
        lines = f.readlines()

    # Leading block of "#" lines; the bound check covers files that contain
    # nothing but such lines (or nothing at all).
    index = 0
    while index < len(lines) and lines[index].startswith("#"):
        index += 1
    headers = [header.strip() for header in lines[:index]]

    # Match the ID attribute exactly: ids are escaped (they may contain regex
    # metacharacters), the match must start the attribute list or follow ';',
    # and the ID may be the last attribute (no trailing ';' required).
    alternatives = "|".join(re.escape(x) for x in sorted(ids_to_discard))
    pattern = re.compile(rf"(?:^|;)\s*ID=(?:{alternatives})\s*(?:;|$)")
    log.trace(f"Pattern for removal: {pattern.pattern}")
    contents = []

    for line in lines[index:]:
        if not (line := line.strip()):
            continue
        fields = line.split("\t")

        # fields[2] is the type column, fields[8] the attributes column.
        # Lines with fewer than 9 columns (e.g. '###' or other directive lines
        # inside the body) are copied through unchanged.
        if (
            len(fields) >= 9
            and pattern.search(fields[8])
            and append_string not in fields[2]
        ):
            fields[2] = append_string + fields[2]

        contents.append("\t".join(fields))

    with open(an_path, "w") as f:
        f.write("\n".join(headers))
        f.write("\n")
        f.write("\n".join(contents))
        f.write("\n\n")

log.info(f"Finished removing gene IDs from {len(remove_gene_ids)} GFF files.")

JQ346808.1 Done!
