### Imports and functions

In [None]:
# imports and functions
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import pandas as pd

import gdt

# Regex patterns for the structural GFF attributes: the feature's own ID and
# the ID of its parent feature.
RE_ID = re.compile(r"ID=([^;]+)")
RE_parent = re.compile(r"Parent=([^;]+)")

# Regex patterns for extracting GFF attributes
RE_gene = re.compile(r"gene=([^;]+)")
RE_product = re.compile(r"product=([^;]+)")
RE_description = re.compile(r"description=([^;]+)")
RE_name = re.compile(r"Name=([^;]+)")
RE_note = re.compile(r"Note=([^;]+)")
RE_gene_synonym = re.compile(r"gene_synonym=([^;]+)")

# Features to extract from GFF files (must match the regex names above without RE_ prefix)
# ORDER MATTERS: Listed from most descriptive/best for identification to least descriptive
# This order determines the priority for filling the 'best_feature' column - earlier entries take precedence
features_order = ["gene", "product", "description", "name", "note", "gene_synonym"]

# To add a new feature for extraction:
# 1. Create a regex pattern with the naming convention: RE_{feature_name}
#    Use any regex pattern that captures the desired value from GFF files.
#    Common pattern: RE_{feature_name} = re.compile(r"{attribute_name}=([^;]+)")
#    but you can customize the regex to capture exactly what you need.
#
# 2. Add the feature name (without RE_ prefix) to the features_order list
#    IMPORTANT: Place it in the appropriate position based on how descriptive/useful
#    it is for identification. More descriptive features should come first in the list.
#
# Example - to extract 'locus_tag' attributes:
# RE_locus_tag = re.compile(r"locus_tag=([^;]+)")  # or any custom regex
# features_order = ["locus_tag", "gene", "product", "description", "name", "note", "gene_synonym"]  # if locus_tag is most descriptive
# # OR
# features_order = ["gene", "product", "description", "locus_tag", "name", "note", "gene_synonym"]  # if locus_tag is moderately descriptive
# # OR
# features_order = ["gene", "product", "description", "name", "note", "gene_synonym", "locus_tag"] # if locus_tag is least descriptive
#
# CRITICAL: The regex variable name (after RE_) must exactly match the name
# you add to features_order for the extraction to work properly

# Map every feature name to its compiled regex by looking up the module-level
# variable RE_<name>. A missing variable only prints a warning here; the name is
# then absent from re_features, so no column is extracted for it.
re_features = {}
for name in features_order:
    try:
        re_features[name] = globals()[f"RE_{name}"]
    except KeyError:
        print(f"Warning: No regex found for '{name}' (expected variable: RE_{name})")


def increment_gdict_file(path):
    """Return the path of the next GDICT iteration and its iteration number.

    The last ``_``-separated token of the file stem is read as a counter,
    incremented by one and written back zero-padded to at least two digits.
    A stem ending in ``_stripped`` starts a new series: ``stripped`` is
    replaced by ``pilot`` and the returned iteration is 1.

    Example: fungi-ncbi_pilot_03.gdict -> fungi-ncbi_pilot_04.gdict

    Arguments:
        path (Path): Path of the current GDICT file. Only its name is
            inspected; the file system is not touched.

    Returns:
        tuple[Path, int]: The incremented path (same parent and suffix) and
        the new iteration number.

    Raises:
        ValueError: If the last token of the stem is not an integer.

    """
    plist = path.stem.split("_")
    if plist[-1] == "stripped":
        plist[-1] = "pilot"
        # keep the list homogeneous (str); "00" is incremented to 01 below
        plist.append("00")

    # Only the conversion can fail, so it is the only statement in the try body.
    try:
        number = int(plist[-1]) + 1
    except ValueError as err:
        raise ValueError(
            f"Invalid GDICT file name: {path.name}. Expected format: <preferred_name>_##.gdict, where ## is a number."
        ) from err

    plist[-1] = f"{number:02d}"
    return path.parent / f'{"_".join(plist)}{path.suffix}', number


def most_recent_gdict(dir_path, prefix):
    """Find the latest GDICT file for a given prefix.

    Arguments:
        dir_path (Path): Directory that is searched (non-recursively).
        prefix (str): Text the file name must contain; the glob used is
            "*<prefix>*.gdict".

    Returns:
        Path: The last match after ``gdt.natural_sort`` on the file stems.
        When nothing matches, ``<dir_path>/<prefix>_00.gdict`` is returned
        (the file is not created).

    """
    # maybe change to check for numbers after prefix?
    matches = [*dir_path.glob(f"*{prefix}*.gdict")]
    if matches:
        ordered = gdt.natural_sort(matches, key=lambda candidate: candidate.stem)
        return ordered[-1]
    return dir_path / f"{prefix}_00.gdict"

### Setup

#### A.

In [None]:
# Defines all the global variables used in the notebook.
# Change these variables to match your local setup.
# newest_gdict_file should be set to the name of the most recent GDICT file
# (looked up inside misc/gdt), OR the stripped GDICT file used in the filter
# command, if applicable.

DATA_DIR = "/home/msanital/Desktop/gdt/sandbox/STAR_example/metazoans_mit"
newest_gdict_file = "metazoans_mit_pilot_04.gdict"
global_query_string = gdt.QS_GENE_TRNA_RRNA
remove_orfs = True
in_folder = True
gct = "MIT"
gff_ext = ".gff3"
gff_suffix = ""

print(f"Chosen feature query string: '{global_query_string}'")


# just checking
DATA_DIR = Path(DATA_DIR).resolve()
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"Path {DATA_DIR} is not a directory.")

MISC_DIR = DATA_DIR / "misc"
GDT_DIR = MISC_DIR / "gdt"
# Create misc/gdt (and misc/) when missing; an existing directory is fine.
GDT_DIR.mkdir(mode=0o777, parents=True, exist_ok=True)

AN_missing_dbxref_GeneID = MISC_DIR / "AN_missing_dbxref_GeneID.txt"

if not AN_missing_dbxref_GeneID.is_file():
    raise FileNotFoundError(
        f"Missing {AN_missing_dbxref_GeneID}, did you run gdt-cli filter?"
    )

# The variable is assigned above, so this branch is taken unless that
# assignment is removed or commented out by the user.
if "newest_gdict_file" in globals():
    gdict_path = GDT_DIR / newest_gdict_file
    if not gdict_path.is_file():
        print(
            f"Not found {gdict_path.name}, does it exist in misc/gdt?\nGDICTs in {GDT_DIR}:"
        )
        for gdict_file in sorted(GDT_DIR.glob("*.gdict")):
            print(f" - {gdict_file.name}")
        raise FileNotFoundError(
            f"Most recent GDICT file {gdict_path.name} does not exist in {GDT_DIR}."
        )
else:
    print(
        "Warning: 'newest_gdict_file' variable not set.\n\n"
        "You should have a GDICT file from AN_missing_gene_dict.ipynb:\n"
        "• Set the newest_gdict_file variable\n"
        "• Re-run this cell\n\n"
        "If you intend to run this without a GDICT file (e.g., because your GFF files "
        "don't have dbxrefs and AN_missing_gene_dict.ipynb isnt't needed), this warning can be ignored."
    )
    # to simplify the code, an execution without newest_gdict_file is
    # basically the same as with one, but with an empty gdict file
    # NOTE(review): every other path in this notebook uses the '.gdict' suffix
    # (and the listing above globs '*.gdict'); confirm '.gdt' is intended here.
    gdict_path = GDT_DIR / "pilot_00.gdt"
    gdt.create_empty_gdict(gdict_path)

# in_folder selects which gdt.GFFPathBuilder strategy turns an accession
# number into a GFF path; both receive the same directory, suffix and extension.
if in_folder:
    gff_builder = gdt.GFFPathBuilder().use_folder_builder(
        DATA_DIR,
        gff_suffix,
        gff_ext,
    )
else:
    gff_builder = gdt.GFFPathBuilder().use_standard_builder(
        DATA_DIR,
        gff_suffix,
        gff_ext,
    )
print(f"Using GFF builder: {gff_builder}\n")

#### B.

In [None]:
# Notebook logger: console output at INFO, file output at TRACE, written to
# a log file inside misc/.
log_file = MISC_DIR / "01_missing_dbxref_GeneID.log"

logger_options = {
    "print_to_console": True,
    "console_level": "INFO",
    "save_to_file": True,
    "file_level": "TRACE",
    "log_file": log_file,
}
log = gdt.create_simple_logger(**logger_options)
log.info("Running from notebook AN_missing_dbxref_GeneID.ipynb")

### Features Extraction

#### A.

In [None]:
# Read the accession numbers (ANs), one per line; blank lines are skipped and
# surrounding whitespace is removed.
with open(AN_missing_dbxref_GeneID, "r") as f:
    ANs = [entry for entry in map(str.strip, f) if entry]
log.info(f"Found {len(ANs)} ANs in {AN_missing_dbxref_GeneID}")
log.trace(f"ANs: {ANs}")

#### B.

In [None]:
# Read the GDICT chosen in the Setup cell (it may be empty) and log its
# header lines followed by the summary produced by gdt.log_info.
gene_dict = gdt.read_gdict(gdict_path, lazy_info=False)
log.info(f"GeneDict loaded from {gdict_path.name}")
log.debug(f"path: {gdict_path}")

log.info("Header:")
for header_line in gene_dict.header:
    log.info(f"\t{header_line}")

log.info("GDT Info:")
gdt.log_info(log, gene_dict)

#### C.

In [None]:
# Build features_info.tsv: for every AN, load its GFF3, keep the features whose
# ID is not yet in gene_dict, extract the attributes listed in features_order
# and pick the first non-empty one as 'best_feature'.
temp_list = []
to_drop = ["source", "type", "start", "end", "score", "strand", "phase", "attributes"]
errors = []

log.info(" ---- [Starting 'Features Extraction'] ----")

for AN in ANs:
    log.debug(f"Processing AN: {AN}")
    an_path = gff_builder.build(AN)
    df = gdt.load_gff3(
        an_path, query_string=global_query_string, usecols=gdt.GFF3_COLUMNS
    )
    df = gdt.filter_orfs(df) if remove_orfs else df

    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    # .copy(): the columns added below must go to an independent frame, not to
    # a slice of the unfiltered one (avoids pandas' SettingWithCopyWarning).
    df = df[~df["gene_id"].isin(gene_dict)].copy()

    # Procedurally extract features based on the regex patterns defined
    for name, pattern in re_features.items():
        df[name] = df["attributes"].str.extract(pattern, expand=False)

    # Rows where none of the features_order attributes could be extracted.
    no_attribute = df[features_order].isna().all(axis=1)
    if no_attribute.any():
        log.error(
            f"{AN} has row(s) with no identifiable atribute, "
            "for now filling 'gene' column with 'no_identifiable_attribute'"
        )
        # Keep the offending rows as they were before the placeholder is set.
        errors.append((AN, df[no_attribute]))
        df.loc[no_attribute, "gene"] = "no_identifiable_attribute"

    temp_list.extend(df.to_dict("records"))

features_info_df = pd.DataFrame(temp_list)
features_info_df = features_info_df.drop(columns=to_drop, errors="ignore")

# Feature columns that are empty for every row are dropped after
# 'best_feature' has been computed.
drop_cols = [col for col in features_order if features_info_df[col].isna().all()]

# Procedurally fill 'best_feature' based on the order of features_order
features_info_df["best_feature"] = features_info_df[features_order[0]]
for col in features_order[1:]:
    features_info_df["best_feature"] = features_info_df["best_feature"].fillna(
        features_info_df[col]
    )

features_info_df = features_info_df.drop(columns=drop_cols)
features_info_df = features_info_df.sort_values(by="best_feature")
log.debug(f"Features info df, writing to {MISC_DIR / 'features_info.tsv'}")
features_info_df.to_csv(MISC_DIR / "features_info.tsv", sep="\t", index=False)

if errors:
    log.warning(f"Errors found in {len(errors)} ANs,")
    for an, error_rows in errors:
        log.warning(
            f"AN {an} has {len(error_rows)} row(s) with no identifiable attribute."
        )
        # remove columns that are all nan
        error_rows = error_rows.dropna(axis=1, how="all")
        log.warning(f"Rows: \n{error_rows.to_string(index=False)}\n")

    log.warning(
        "Found row(s) with no identifiable attribute. This is not a problem right now "
        "as identification may be possible during the TEMP Mapping step."
    )
    log.warning(
        "It's recommended to try to identify these rows (by editing its 'best_feature' column) "
        "in the features_info.tsv file, for better accuracy."
    )
    log.warning(
        "If you wish to try to identify these rows in the TEMP Mapping step, make sure to add "
        "'no_identifiable_attribute' from feature_names.txt to feature_unks.txt, in the next steps"
    )
    log.warning(
        "Another solution is add a new identifying variable to features_order. This option is "
        "documented in 'Imports and functions' section of this notebook (first cell)."
    )

#### D.

In [None]:
add_gdt_compliance = True
comment = "Manual from missing_dbxref_GeneID feature names"

# Suffix written after every name in feature_names.txt; empty when
# add_gdt_compliance is off.
gdt_str = ""
if add_gdt_compliance:
    gdt_str = f' #gd MANUAL{ " #c " + comment if comment else "" }'

# One row per distinct best_feature, flagged by membership in gene_dict.
new_df = pd.DataFrame({"best_feature": features_info_df["best_feature"].unique()})
new_df["in_gene_dict"] = new_df["best_feature"].isin(gene_dict)

# Only the names that gene_dict does not contain are written out.
not_in_dict = ~new_df["in_gene_dict"]
unique_names = gdt.natural_sort(new_df.loc[not_in_dict, "best_feature"])
with open(MISC_DIR / "feature_names.txt", "w+") as f1:
    f1.writelines(f"{name}{gdt_str}\n" for name in unique_names)

In [None]:
# Intentional stop so that "Run All" halts here: the cells below need manual
# work on feature_names.txt first. A bare `raise` outside an `except` block also
# ends in RuntimeError, but with the opaque "No active exception to reraise".
raise RuntimeError(
    "Manual step required: review misc/feature_names.txt, update the GDICT and "
    "create misc/feature_unks.txt (see the instructions below), then run the "
    "following cells."
)

The user must now review feature_names.txt.  

Features that can be easily identified must be added to the current  
version of the gdict, and features that need deeper investigation should be  
copied to a new file named 'feature_unks.txt'.
  
The notebook will then try to automatically add the gene_ids whose feature names  
__are not in 'feature_unks.txt'__ (and are therefore known) to gene_dict.

### Automated Insertion from features_info.tsv

#### A.

In [None]:
# Check if the names exist in the gene_dict
features_info_df = pd.read_csv(MISC_DIR / "features_info.tsv", sep="\t")

# Names the user marked as unknown: one per non-blank line of feature_unks.txt,
# with everything from the first '#gd' onwards removed.
names_unk = set()
with open(MISC_DIR / "feature_unks.txt", "r") as f1:
    for line in f1:
        line = line.strip()
        if not line:
            continue

        names_unk.add(line.split("#gd")[0].strip())

log.info(
    "Verifing that all feature_names.txt values (excluding those in feature_unks.txt) exist in the newest_gdict_file."
)
# Re-read the GDICT from disk so manual edits made since the last load are seen.
gene_dict = gdt.read_gdict(gdict_path, lazy_info=False)
log.info(f"GeneDict loaded from {gdict_path.name}")
log.debug(f"path: {gdict_path}")

log.info("Header:")
for x in gene_dict.header:
    log.info(f"\t{x}")

log.info("GDT Info:")
gdt.log_info(log, gene_dict)

# Every best_feature that is not flagged as unknown must be a key of gene_dict.
all_names = set(features_info_df["best_feature"].unique()) - names_unk
names_not_in_dict = {name for name in all_names if name not in gene_dict}

if names_not_in_dict:
    # Logged at ERROR (not DEBUG) so the explanation reaches the console,
    # which only shows INFO and above, right before the exception.
    log.error(f"Warning: {len(names_not_in_dict)} name(s) not in newest_gdict_file!")
    log.error(
        "These names are not in feature_unk, so you marked them as identifiable. Please identify them or add them feature_unk."
    )
    log.error(
        "It could also be that you forgot to update and/or reload the newest_gdict_file with the changes you made above."
    )
    # key=str keeps the ordering stable even if a non-string (e.g. NaN) slipped in
    for name in sorted(names_not_in_dict, key=str):
        log.error(name)
    raise ValueError(f"Error: {len(names_not_in_dict)} names not in newest_gdict_file!")

#### B.

In [None]:
# Insert every gene_id whose best_feature is known (not listed in
# feature_unks.txt) into gene_dict, reusing the label of that best_feature.
comment = "automated insertion from missing_dbxref_GeneID best feature"
known_rows = features_info_df[~features_info_df["best_feature"].isin(names_unk)]
unique_gene_ids = known_rows["gene_id"].unique()

errors = []

log.info(
    " ---- [Starting 'Automated insertion of gene_ids with known features from features_info.tsv'] ----"
)
for gene_id in unique_gene_ids:
    # All rows of this gene_id, taken from the full table.
    gene_rows = features_info_df[features_info_df["gene_id"] == gene_id]
    feature_column = gene_rows["best_feature"]

    # sanity check, are all feature_names the same?
    if feature_column.nunique() != 1:
        distinct_features = feature_column.unique()
        log.warning(f"{gene_id} has multiple best_features: {distinct_features}")
        log.debug("\tChecking if they have the same label in gene_dict...")

        labels = {gene_dict[feat].label for feat in distinct_features}
        if len(labels) != 1:
            # Conflicting labels: report later and leave this gene_id out.
            log.error(
                f"\nSkipping {gene_id} as it has multiple best_features with different labels."
            )
            errors.append((gene_id, distinct_features, labels))
            continue

        log.debug(f"\tAll best_features have the same label: {labels.pop()}")

    first_feature = feature_column.iloc[0]
    label = gene_dict[first_feature].label
    an_sources = gene_rows["seqid"].unique().tolist()
    log.debug(
        f"Adding {gene_id} (bf: {first_feature}) with label '{label}', an_sources: {an_sources}, comment: {comment}"
    )
    gene_dict[gene_id] = gdt.GeneGeneric(label=label, an_sources=an_sources, c=comment)
log.info(
    " ---- [Finished 'Automated insertion of gene_ids with known features from features_info.tsv'] ----"
)

if errors:
    log.warning(
        f"'Multiple best_features with different labels' error in {len(errors)} gene_ids:"
    )
    for gene_id, features, labels in errors:
        log.warning(f"{gene_id} has multiple best_features: {features}.")
        log.warning(f"That points to different labels: {labels}.\n")

    log.warning(
        "Please check the features_info.tsv file for these gene_id values and fix them."
    )

#### C.

In [None]:
# Save the updated gene_dict as the next numbered GDICT file; a header line
# records where the new data came from.
new_path, nth_iteration = increment_gdict_file(gdict_path)
log.info(f"Writing GeneDict to {new_path} | Iteration: {nth_iteration}")

header_entry = f"{gdt.time_now()} - Data added from 'Automated insertion of gene_ids with known features from features_info.tsv'"
gene_dict.header.append(header_entry)
gene_dict.to_gdict(new_path, overwrite=True)

log.info(f"{new_path.name} was created in misc/gdt!")
log.info(
    "You must now add it to newest_gdict_file in the Setup cell, and rerun the cell"
)

### TEMP Mapping

#### A.

In [None]:
# Reload the GDICT selected in the Setup cell and log its header and summary.
gene_dict = gdt.read_gdict(gdict_path, lazy_info=False)
log.info(f"GeneDict loaded from {gdict_path.name}")
log.debug(f"path: {gdict_path}")

log.info("Header:")
for header_line in gene_dict.header:
    log.info(f"\t{header_line}")

log.info("GDT Info:")
gdt.log_info(log, gene_dict)

# Unknown feature names: non-blank lines of feature_unks.txt, cut at the first
# '#gd' marker when one is present.
names_unk = set()
with open(MISC_DIR / "feature_unks.txt", "r") as f1:
    for raw_line in f1:
        entry = raw_line.strip()
        if entry:
            names_unk.add(entry.partition("#gd")[0].strip())

# Group the gene_ids whose best_feature is unknown by their seqid.
features_info_df = pd.read_csv(MISC_DIR / "features_info.tsv", sep="\t")
is_unknown = features_info_df["best_feature"].isin(names_unk)
features_unk_df = features_info_df[is_unknown].copy().reset_index(drop=True)
unk_dict = features_unk_df.groupby("seqid")["gene_id"].agg(list).to_dict()
log.info(
    f"Found {len(unk_dict)} seqids with unrecognized features in features_info.tsv."
)

#### B.

In [None]:
# For every gene_id with an unknown best_feature, look at the features that
# name it as Parent and use their best_feature instead:
#   * child feature known to gene_dict -> gene_id is added to gene_dict
#   * child feature unknown            -> gene_id gets a '<gct>-TEMP-n' label
#                                         in temp_unk (one label per feature)
#   * no child at all                  -> gene_id gets '<gct>-UNKNOWN' in temp_unk
temp_unk = gdt.GeneDict()
label_count = 0
change_gene_dict = False
log.info(
    " ---- [Starting 'AN_missing_dbxref_GeneID matching 'child + parent' best feature pair'] ----"
)
for an, gene_ids in unk_dict.items():
    log.debug(f"AN: {an}| gene_ids: {gene_ids}")
    an_path = gff_builder.build(an)

    # No query_string is passed here (unlike in Features Extraction) -
    # presumably so that child features of every type are loaded.
    df = gdt.load_gff3(an_path, usecols=gdt.GFF3_COLUMNS)
    df = gdt.filter_orfs(df) if remove_orfs else df

    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    df["parent"] = df["attributes"].str.extract(RE_parent, expand=False)

    for name, pattern in re_features.items():
        df[name] = df["attributes"].str.extract(pattern, expand=False)

    # already logged in the previous step
    no_attribute = df[features_order].isna().all(axis=1)
    if no_attribute.any():
        df.loc[no_attribute, "gene"] = "no_identifiable_attribute"

    # Procedurally fill 'best_feature' based on the order of features_order
    df["best_feature"] = df[features_order[0]]
    for col in features_order[1:]:
        df["best_feature"] = df["best_feature"].fillna(df[col])

    for gene_id in gene_ids:
        candidates = df[df["parent"] == gene_id]
        log.debug(f" gene_id: {gene_id} | number of candidates: {len(candidates)}")

        # Handle case where no candidates found
        if len(candidates) == 0:
            log.debug(f" no candidate with parent={gene_id} found")
            log.debug("  adding it to UNKNOWN label\n")
            # Attributes of the gene's own row, when that row was loaded.
            own_rows = df[df["gene_id"] == gene_id]
            own_attributes = (
                own_rows["attributes"].iloc[0] if not own_rows.empty else "N/A"
            )
            temp_unk[gene_id] = gdt.GeneGeneric(
                label=f"{gct}-UNKNOWN",
                an_sources=[an],
                c=f"unknown gene_id from {an}{gff_suffix} | a: {own_attributes}",
            )
            continue

        # Handle feature name conflicts | TODO check their gene_dict labels
        # IMPORTANT
        if candidates["best_feature"].nunique() > 1:
            log.warning(
                " more than one canditate found, but with best_feature conflict, chosing the first one."
            )
            # Series.unique() returns an array, whose method is tolist()
            # (to_list() only exists on Series/Index).
            log.warning(
                f"  best_features: {candidates['best_feature'].unique().tolist()}"
            )

        best_feature = candidates["best_feature"].iloc[0]
        log.debug(f"  chosen best_feature: {best_feature}")
        for x in candidates.itertuples():
            log.debug(f"\tt: {x.type} | bf: {x.best_feature} | a: {x.attributes}")

        insertion_comment = (
            f"insertion from missing_dbxref_GeneID feature mapping, source: {best_feature} | "
            f"type: {candidates['type'].iloc[0]}"
        )

        # Handle case where feature_name is in gene_dict
        if best_feature in gene_dict:
            change_gene_dict = True
            label = gene_dict[best_feature].label
            log.debug(f"  best_feature in gene_dict, label: {label}\n")
            gene_dict[gene_id] = gdt.GeneGeneric(
                label=label,
                an_sources=[an],
                c=insertion_comment,
            )
            continue

        # Handle case where feature_name is NOT in gene_dict
        log.debug(f"  best_feature not in gene_dict: {best_feature}")
        log.debug("  checking in temp_unk")

        if best_feature in temp_unk:
            label = temp_unk[best_feature].label
            log.debug(f"  found in temp_unk, label: {label}\n")
        else:
            label_count += 1
            label = f"{gct}-TEMP-{label_count}"
            log.debug(f"  not found in temp_unk, new label: {label}\n")
            temp_unk[best_feature] = gdt.GeneDescription(
                label=label, source="MANUAL", c=None
            )

        # A TEMP label only exists in temp_unk, so the gene_id is stored there in
        # both cases. (Previously a gene_id whose feature was already in temp_unk
        # was written to gene_dict, without flagging gene_dict as changed.)
        temp_unk[gene_id] = gdt.GeneGeneric(
            label=label,
            an_sources=[an],
            c=insertion_comment,
        )
log.info(
    " ---- [Finished 'AN_missing_dbxref_GeneID matching 'child + parent' best feature pair'] ----"
)

#### C.

In [None]:
# Write the TEMP mapping, if any entry was created, as the next numbered
# TEMP_Mapping GDICT file inside misc/gdt.
if temp_unk:
    new_path, map_iteration = increment_gdict_file(
        most_recent_gdict(GDT_DIR, prefix="TEMP_Mapping")
    )
    log.info(f"Writing TEMP Mapping to {new_path} | Iteration: {map_iteration}")

    # The header is replaced as a whole: version, mapping iteration, provenance.
    provenance = f"{gdt.time_now()} - Automatically generated from 'AN_missing_dbxref_GeneID matching 'child + parent' best feature pair'"
    temp_unk.header = ["version 0.0.2", f"TEMP_Mapping - {map_iteration}", provenance]
    temp_unk.to_gdict(new_path, overwrite=True)

#### D.

In [None]:
# Save gene_dict as the next numbered GDICT file, but only when the TEMP
# Mapping step added entries to it.
if change_gene_dict:
    new_path, nth_iteration = increment_gdict_file(gdict_path)
    log.info(f"Writing GeneDict to {new_path} | pilot itr: {nth_iteration}")

    header_entry = f"{gdt.time_now()} - Data added from 'AN_missing_dbxref_GeneID matching 'child + parent' best feature pair'"
    gene_dict.header.append(header_entry)
    # NOTE(review): the other to_gdict calls in this notebook pass
    # overwrite=True; confirm that relying on the default here is intended.
    gene_dict.to_gdict(new_path)

    log.info(f"{new_path.name} was created in misc/gdt!")
    log.info(
        "You must now add it to newest_gdict_file in the Setup cell, and rerun the cell"
    )

### Gene exclusion from to_exclude_2.txt

In [None]:
# Prefix added to the GFF3 'type' column of excluded features, and the file
# (inside misc/) that lists them.
append_string = "discard-"
genes_to_exclude = "to_exclude_2.txt"

# Maps AN -> set of gene IDs to exclude. Expected entry format:
#   <gene_id> #gn <AN> [#c <comment>]
# Blank lines, lines starting with '#' or '[', and lines containing '#gd'
# are skipped.
exclude_gene_ids = defaultdict(set)
with open(MISC_DIR / genes_to_exclude, "r") as f:
    for line_number, raw_line in enumerate(f, start=1):
        # Test the stripped text so indented comment/section lines are skipped too.
        line = raw_line.strip()
        if not line or line.startswith(("#", "[")) or "#gd" in line:
            continue

        # Drop the optional comment, then split gene id and AN at the first '#gn'.
        gene_id, marker, an = line.split("#c", 1)[0].partition("#gn")
        gene_id, an = gene_id.strip(), an.strip()
        if not marker or not gene_id or not an:
            raise ValueError(
                f"Invalid entry in {genes_to_exclude} (line {line_number}): {line!r}. "
                "Expected format: <gene_id> #gn <AN> [#c <comment>]"
            )
        exclude_gene_ids[an].add(gene_id)

In [None]:
# Rewrite, in place, the GFF3 file of every AN that has gene IDs to exclude:
# features whose attributes contain 'ID=<gene_id>;' get append_string prefixed
# to their type column. All other lines are written back stripped but
# otherwise unchanged.
log.info(
    " ---- [Starting 'AN_missing_dbxref_GeneID excluding gene IDs from GFF3s'] ----"
)
for an, gene_ids in exclude_gene_ids.items():
    log.info(f"Processing {an} for excluding {len(gene_ids)} gene IDs")
    log.trace(f" excluding gene IDs: {gene_ids}")
    an_path = gff_builder.build(an)
    with open(an_path, "r") as f:
        lines = f.readlines()

    # The leading run of '#' lines is the header. The bound check keeps a file
    # that is empty or made only of '#' lines from raising IndexError.
    index = 0
    while index < len(lines) and lines[index].startswith("#"):
        index += 1
    headers = [header.strip() for header in lines[:index]]

    pattern = re.compile("|".join(re.escape(f"ID={x};") for x in gene_ids))
    log.trace(f"Pattern for exclusion: {pattern.pattern}")
    contents = []

    for line in lines[index:]:
        if not (line := line.strip()):
            continue
        fields = line.split("\t")

        # fields[2] is type, fields[8] is attributes. Lines with fewer than nine
        # columns (e.g. '###' or '##FASTA' directives and sequence data after
        # the header) have no attributes column and are copied through as is.
        if len(fields) >= 9 and pattern.search(fields[8]):
            if append_string not in fields[2]:
                fields[2] = append_string + fields[2]

        contents.append("\t".join(fields))

    with open(an_path, "w") as f:
        f.write("\n".join(headers))
        f.write("\n")
        f.write("\n".join(contents))
        f.write("\n\n")

log.info(
    " ---- [Finished 'AN_missing_dbxref_GeneID excluding gene IDs from GFF3s'] ----"
)