In [2]:
import pandas as pd
import numpy as np
import os
import re

from pathlib import Path
import gdt

from Bio import Entrez

\# TODO  
Rather than manually tracking iterations with an 'nth_iteration' variable as implemented in 'AN_missing_gene_dict',  
this script employs an automated versioning approach. The system expects the most recent GDT file to follow  
the naming convention <preferred_name>\_pilot\_##.gdt (e.g., fungi-ncbi_pilot_03.gdt). When modifications are made  
to the gene dictionary, the program automatically increments the two-digit suffix to generate the next  
version (e.g., fungi-ncbi_pilot_04.gdt), ensuring seamless version control without manual intervention.

In [None]:
def increment_gdt_file(path: Path) -> tuple[Path, int]:
    """
    Build the path of the next GDT version by bumping the numeric suffix by 1.

    The stem of ``path`` is split on "_" and its last piece is read as the
    current version number. The result keeps the same directory, prefix and
    extension, with the new number zero-padded to at least two digits.
    Example: fungi-ncbi_pilot_03.gdt -> fungi-ncbi_pilot_04.gdt

    Args:
        path: Path of the current GDT file. The file is neither read nor created.

    Returns:
        A tuple ``(new_path, number)``: the incremented path and the new
        version number.

    Raises:
        ValueError: If the last "_"-separated piece of the stem is not made
            only of the digits 0-9.
    """
    plist = path.stem.split("_")
    suffix = plist[-1]
    # int() alone also accepts signed values ("-1", "+3"), which would silently
    # produce names such as "<name>_00.gdt"; only plain digits are a valid version.
    if not (suffix.isascii() and suffix.isdigit()):
        raise ValueError(
            f"Invalid GDT file name: {path.name}. Expected format: <preferred_name>_##.gdt, where ## is a number."
        )
    number = int(suffix) + 1
    plist[-1] = f"{number:02d}"
    return path.parent / f'{"_".join(plist)}{path.suffix}', number

In [None]:
# Defines all the global variables used in the script.
# Change these variables to match your local setup.
# The most_recent_gdt_file variable should be set to the path of the most recent GDT file,
# OR the stripped GDT file used in filter command, if applicable.

# Directory where the "<AN><gff_suffix>" files are looked up; the log file and the
# "missing_dbxref" folder are also created inside it.
DATA_DIR = "../test/Test_group16"
# Text file read line by line; every non-blank line is taken as one accession number (AN).
AN_missing_dbxref = "../test/Test_group16/AN_missing_dbxref"
# most_recent_gdt_file = "../test/Test_group16/Test_group16.gdt"
# When True, gdt.gff3_utils.filter_orfs is applied to every GFF3 table right after loading.
remove_orfs = True
# NOTE(review): organelle_type is not referenced by any cell visible in this notebook - confirm it is still needed.
organelle_type = "MT"
# Appended to each AN to build the name of its GFF3 file.
gff_suffix = ".gff3"

# Query string passed as query_string to gdt.gff3_utils.load_gff3 in the cells below.
global_query_string = gdt.gff3_utils.QS_GENE_TRNA_RRNA
print(f"Chosen feature query string: {global_query_string}")

Chosen feature query string: type == ["gene", "tRNA", "rRNA"]


In [5]:
# Path of the most recent GDT file (or of the stripped GDT file). It must be filled in:
# the validation cell below raises FileNotFoundError when it does not point to a file.
most_recent_gdt_file = ""

In [None]:
# Turn the three configured locations into absolute paths, then stop early
# if any of them is missing.
DATA_DIR, AN_missing_dbxref, most_recent_gdt_file = (
    Path(DATA_DIR).resolve(),
    Path(AN_missing_dbxref).resolve(),
    Path(most_recent_gdt_file).resolve(),
)

if not DATA_DIR.is_dir():
    msg = f"Data directory {DATA_DIR} does not exist or is not a directory."
    raise FileNotFoundError(msg)

if not AN_missing_dbxref.is_file():
    msg = f"AN missing dbxref {AN_missing_dbxref} does not exist or is not a file."
    raise FileNotFoundError(msg)

if not most_recent_gdt_file.is_file():
    # Hint printed before failing: a pilot GDT file is expected to exist at this point
    print(
        "It's assumed that this script is run after AN_missing_dbxref.ipynb, so you should have a pilot GDT file."
    )
    msg = f"Most recent GDT file {most_recent_gdt_file} does not exist or is not a file."
    raise FileNotFoundError(msg)

If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable.
Otherwise, ignore this message.


In [7]:
# Create the notebook logger. logger_creater returns a pair; only its second item is
# kept (as `logger`). The log file is placed inside DATA_DIR.
_, logger = gdt.logger_setup.logger_creater(
    log_file=DATA_DIR / "0_test_3.log", console_level="DEBUG", file_level="TRACE"
)
logger.debug("Running from notebook AN_missing_dbxref")

2025-05-23 15:07:28,351 - DEBUG - Logger setup complete. Logging to /home/brenodupin/matheus/gdt/test/Test_group16/0_test_3.log
2025-05-23 15:07:28,362 - DEBUG - Running from notebook AN_missing_dbxref


In [7]:
# Read the accession numbers (ANs): one per line, surrounding whitespace removed,
# blank lines ignored.
with open(AN_missing_dbxref, "r") as f:
    ANs = [an for an in map(str.strip, f) if an]
print(f"len(ANs): {len(ANs)}")

len(ANs): 200


In [8]:
# Folder inside DATA_DIR where the cells below write and read their text/TSV files;
# created on first run, left untouched if it already exists.
missing_dbxref_path = DATA_DIR / "missing_dbxref"
missing_dbxref_path.mkdir(exist_ok=True)

In [None]:
# Load the GDT file
# A Path object is always truthy, so a plain `if most_recent_gdt_file:` can never
# reach the fallback once the value has been converted to a Path. Test for the
# file itself instead: without a GDT file the run starts from an empty dictionary.
if Path(most_recent_gdt_file).is_file():
    gene_dict = gdt.gene_dict.create_gene_dict(most_recent_gdt_file, max_an_sources=0)
else:
    gene_dict = {}

temp_gene_dict = {}

In [None]:
# For every AN, load its GFF3 and record the (gene_id, seqid) pairs whose
# gene_id is not a key of gene_dict.
temp_list = []
for AN in ANs:
    an_path = DATA_DIR / f"{AN}{gff_suffix}"
    if not an_path.exists():
        logger.error(f"Error: {AN} does not exist (an_path: {an_path})")
        raise FileNotFoundError(f"File {an_path} does not exist.")

    df = gdt.gff3_utils.load_gff3(
        an_path,
        query_string=global_query_string,
        usecols=["seqid", "start", "end", "type", "attributes"],
    )
    if remove_orfs:
        df = gdt.gff3_utils.filter_orfs(df)

    # gene_id = first ';'-separated attribute with its "ID=" text removed
    first_attribute = df["attributes"].str.split(";").str[0]
    df["gene_id"] = first_attribute.str.replace("ID=", "", regex=False)
    df["in_gene_dict"] = df["gene_id"].isin(gene_dict)
    df_missing = df.loc[~df["in_gene_dict"]].copy()

    temp_list += df_missing[["gene_id", "seqid"]].to_dict("records")

In [11]:
# Group the ANs (seqid) of each missing gene_id into a list.
# The column names are passed explicitly so that an empty temp_list (nothing is
# missing) yields an empty result instead of failing with KeyError: 'gene_id'.
agg_dbxref = (
    pd.DataFrame(temp_list, columns=["gene_id", "seqid"])
    .groupby("gene_id")["seqid"]
    .agg(list)
    .sort_index()
)  # Sort by gene_id

In [12]:
# Write here anything you want to add to the missing_dbxref file, or leave it empty
# (with an empty string, no " #c ..." part is appended to the lines of compiled.txt).
comment = "manual insertion from missing_dbxref_compiled"

In [14]:
# compiled.txt gets one line per gene_id: "<gene_id> #gn <AN> <AN> ...",
# followed by " #c <comment>" when a comment is set.
comment_part = " #c " + comment if comment else ""
with open(missing_dbxref_path / "compiled.txt", "w+") as f1:
    for gene_id, seqid in agg_dbxref.items():
        an_field = " ".join(seqid)
        f1.write(f"{gene_id} #gn {an_field}{comment_part}\n")

After manual parsing of compiled.txt,  
create problems.txt (keeping the line format of compiled.txt), with names that  
are not readily identifiable or that need deeper investigation.

The names that are easily identifiable should be added to the most  
recent _pilot.gdt, and this gdt should then be loaded above, before  
the next part of the pipeline.

### Deeper investigation using other gff attributes, primarily 'Name='

In [14]:
# Collect every AN listed in problems.txt. Expected line format:
#   <gene_id> #gn <AN> <AN> ... [#c <comment>]
# Lines starting with '#' and blank lines are skipped.
an_with_no_dbxref = set()
with open(missing_dbxref_path / "problems.txt", "r") as f:
    for line_number, line in enumerate(f, start=1):
        if line.startswith("#") or not line.strip():
            continue
        # Drop the trailing comment, if any
        if "#c" in line:
            line = line.split("#c")[0].strip()

        # The ANs are the whitespace-separated tokens after the '#gn' marker
        fields = line.split("#gn")
        if len(fields) < 2:
            # Report the offending line instead of failing with a bare IndexError
            raise ValueError(
                f"problems.txt line {line_number} has no '#gn' marker: {line.strip()!r}"
            )
        ans = fields[1].strip().split()
        # Add each AN to the set
        an_with_no_dbxref.update(ans)

In [None]:
# One pre-compiled pattern per GFF3 attribute key. Each captures the value of its key:
# every character after "<key>=" up to (not including) the next ';'.
# NOTE(review): the patterns are not anchored to the start of an attribute, so a key
# can also match at the end of a longer key (e.g. "gene=" inside "xgene=") - confirm
# that this cannot happen in the input files.
RE_NAME = re.compile(r"Name=([^;]+)")
RE_PRODUCT = re.compile(r"product=([^;]+)")
RE_DESCRIPTION = re.compile(r"description=([^;]+)")
RE_PARENT = re.compile(r"Parent=([^;]+)")
RE_GENE = re.compile(r"gene=([^;]+)")
RE_GENE_SYNONYM = re.compile(r"gene_synonym=([^;]+)")
RE_NOTE = re.compile(r"Note=([^;]+)")

In [None]:
# Build features_info_df: for every AN listed in problems.txt, keep the features whose
# gene_id is not a key of gene_dict and extract the attributes that may identify them.
temp_list = []
logger.debug("missing_dbxref: creation of features_info_df")
identifier_columns = ["name", "product", "description", "gene", "gene_synonym", "note"]
for AN in an_with_no_dbxref:
    an_path = DATA_DIR / f"{AN}{gff_suffix}"
    df = gdt.gff3_utils.load_gff3(
        an_path, query_string=global_query_string, usecols=gdt.GFF3_COLUMNS
    )  # TODO change query_string!
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    # gene_id = attributes without their first 3 characters, cut at the first ';'
    df["gene_id"] = df["attributes"].str[3:].str.partition(";", expand=False).str[0]
    # .copy() makes the filtered rows an independent frame, so the columns added
    # below are not written onto a slice (avoids pandas' SettingWithCopyWarning).
    df = df[~df["gene_id"].isin(gene_dict)].copy()

    df["name"] = df["attributes"].str.extract(RE_NAME, expand=False)
    df["product"] = df["attributes"].str.extract(RE_PRODUCT, expand=False)
    df["description"] = df["attributes"].str.extract(RE_DESCRIPTION, expand=False)
    df["gene"] = df["attributes"].str.extract(RE_GENE, expand=False)
    df["parent"] = df["attributes"].str.extract(RE_PARENT, expand=False)
    df["gene_synonym"] = df["attributes"].str.extract(RE_GENE_SYNONYM, expand=False)
    df["note"] = df["attributes"].str.extract(RE_NOTE, expand=False)

    # Rows for which none of the identifying attributes could be extracted
    # ("parent" is deliberately not part of this list).
    no_identifier = df[identifier_columns].isna().all(axis=1)
    if no_identifier.any():
        logger.debug(f"Warning: {AN} has a row with no identifiable atribute.")
        logger.debug(
            "Please modify this script to add a new possible identifiable attribute or just remove the AN from the list."
        )
        logger.debug(df[no_identifier])

    temp_list.extend(df.to_dict("records"))

features_info_df = pd.DataFrame(temp_list)
features_info_df = features_info_df.drop(
    columns=["source", "type", "start", "end", "score", "strand", "phase", "attributes"]
)

# Optional columns that are empty for every row; they are dropped only after
# feature_name has been built, because feature_name still reads them.
dc = [
    col
    for col in ["product", "description", "gene", "gene_synonym"]
    if features_info_df[col].isna().all()
]

# feature_name = first non-missing value, in this priority order:
# gene, product, description, name, note, gene_synonym
features_info_df["feature_name"] = (
    features_info_df["gene"]
    .fillna(features_info_df["product"])
    .fillna(features_info_df["description"])
    .fillna(features_info_df["name"])
    .fillna(features_info_df["note"])
    .fillna(features_info_df["gene_synonym"])
)

features_info_df = features_info_df.drop(columns=dc)
features_info_df = features_info_df.sort_values(by="feature_name")

In [20]:
# add_gdt_compliance: when True, every name written to feature_name.txt is followed by
# " #gd MANUAL" (plus " #c <comment>" when comment is not empty); when False, nothing is appended.
add_gdt_compliance = True
comment = "Manual from missing_dbxref_names_raw"

In [21]:
# Suffix appended to every name written to feature_name.txt (empty when
# GDT compliance is switched off).
gdt_str = ""
if add_gdt_compliance:
    gd_comment = " #c " + comment if comment else ""
    gdt_str = f" #gd MANUAL{gd_comment}"

# Two columns: each distinct feature_name, and whether it is already a key of gene_dict
new_df = pd.DataFrame(
    {"feature_name": features_info_df["feature_name"].unique()}
).assign(in_gene_dict=lambda frame: frame["feature_name"].isin(gene_dict))

In [22]:
# NOTE(review): unique_names is not read by any cell visible in this notebook.
unique_names = features_info_df["name"].dropna().unique()

# feature_name.txt: one line per feature name that is not yet a key of gene_dict
with open(missing_dbxref_path / "feature_name.txt", "w+") as f1:
    names_to_review = new_df.loc[~new_df["in_gene_dict"], "feature_name"]
    for name in names_to_review:
        f1.write(f"{name}{gdt_str}\n")

# Full table, saved so that later cells can reload it from disk
features_info_tsv = missing_dbxref_path / "features_info.tsv"
features_info_df.to_csv(features_info_tsv, sep="\t", index=False)

The user must now parse feature_name.txt  

Features that can be easily identified must be added to the current  
version of the gdt, and features that need deeper investigation should be  
copied to a new file named 'feature_unk.txt'
  
The program will now try to automatically add the gene_ids with feature name  
that __is not__ in 'feature_unk.txt'.

In [None]:
# Reload the feature table and read the names the user moved to feature_unk.txt
features_info_df = pd.read_csv(missing_dbxref_path / "features_info.tsv", sep="\t")

# Each non-blank line contributes the text that precedes its "#gd" marker
# (the whole line when there is no marker), stripped of surrounding whitespace.
with open(missing_dbxref_path / "feature_unk.txt", "r") as f1:
    names_unk = {
        stripped.split("#gd")[0].strip() for stripped in map(str.strip, f1) if stripped
    }

In [None]:
# Every feature name that was NOT moved to feature_unk.txt must already be a key
# of gene_dict; otherwise list the offenders and stop.
all_names = set(features_info_df["feature_name"].unique()) - names_unk
names_not_in_dict = {name for name in all_names if name not in gene_dict}

if names_not_in_dict:
    logger.debug(f"Warning: {len(names_not_in_dict)} names not in gene_dict!")
    logger.debug(
        "These names are not in feature_unk, so you marked them as identifiable. Please identify them or add them feature_unk."
    )
    logger.debug(
        "It could also be that you forgot to reload the gene_dict with the changes that you made."
    )
    for name in names_not_in_dict:
        logger.debug(name)
    raise ValueError(f"Error: {len(names_not_in_dict)} names not in gene_dict!")

In [None]:
# Comment stored (as c=...) on every gene_dict entry inserted by the next cell
comment = "automated insertion from missing_dbxref_feature_name"

In [None]:
# For every feature whose name was NOT moved to feature_unk.txt, register its
# gene_id in gene_dict under the label already assigned to its feature_name.
for r in features_info_df[
    ~features_info_df["feature_name"].isin(names_unk)
].itertuples():
    # sanity check
    if r.feature_name not in gene_dict:
        raise ValueError(
            f"Error: {r.feature_name} not in gene_dict! how? did you run the step above without error?"
        )

    # Key on the row's own gene_id. The bare name `gene_id` is only a leftover
    # global from an earlier loop, so using it made every row overwrite one key.
    gene_dict[r.gene_id] = gdt.gene_dict.GeneGeneric(
        label=gene_dict[r.feature_name].label, an_sources=r.seqid, c=comment
    )

In [None]:
# TODO change header with number from increment_gdt_file
# Write gene_dict to the next GDT version. most_recent_gdt_file is rebound to the
# new path, so running this cell again bumps the version number once more.
most_recent_gdt_file, _ = increment_gdt_file(most_recent_gdt_file)
gdt.gene_dict.write_gdt_file(gene_dict, most_recent_gdt_file)

#### missing_dbxref_names_unk.txt

In [None]:
def print_df_rows(cds_trna):
    """Print one tab-indented summary line per row of ``cds_trna``.

    Each line shows the row's seqid, gene_id, parent and feature_name, so the
    frame must provide those four columns.
    """
    for record in cds_trna.itertuples():
        pieces = (
            f"\tan: {record.seqid}",
            f" gene_id: {record.gene_id}",
            f" p: {record.parent}",
            f" fn: {record.feature_name}",
        )
        print("|".join(pieces))


def print_df_rows_logger(df, logger):
    """Send one tab-indented summary line per row of ``df`` to ``logger.debug``.

    Each message shows the row's seqid, gene_id, parent and feature_name, so the
    frame must provide those four columns.
    """
    for record in df.itertuples():
        message = (
            f"\tan: {record.seqid}| gene_id: {record.gene_id}"
            f"| p: {record.parent}| fn: {record.feature_name}"
        )
        logger.debug(message)

In [11]:
# Names listed in feature_unk.txt: for each non-blank line, keep the text that
# precedes its "#gd" marker (the whole line when there is no marker).
with open(missing_dbxref_path / "feature_unk.txt", "r") as f1:
    names_unk = {
        stripped.split("#gd")[0].strip() for stripped in map(str.strip, f1) if stripped
    }

In [None]:
# Reload the feature table and split it by whether feature_name is listed in names_unk
features_info_df = pd.read_csv(missing_dbxref_path / "features_info.tsv", sep="\t")
is_unk = features_info_df["feature_name"].isin(names_unk)
features_know_df = features_info_df[~is_unk]
# Independent copy with a fresh 0..n-1 index; the cells below write into it by index
features_unk_df = features_info_df[is_unk].copy().reset_index(drop=True)

In [46]:
# Start the column as object dtype holding only missing values. It is later filled
# with feature-name strings, and writing strings into a float64 (plain np.nan)
# column is a deprecated incompatible-dtype upcast in recent pandas versions.
# The values are still NaN, so isna()/notna()/dropna() behave as before.
features_unk_df["feature_child"] = pd.Series(
    np.nan, index=features_unk_df.index, dtype="object"
)

In [None]:
# 1st pass: for every unknown feature, look among the known features of the same
# AN for rows whose parent equals its gene_id, and copy the feature_name of the
# first match into feature_child.
logger.debug(
    "missing_dbxref: matching probable 'child feature + parent gene' pair (1st pass, matching rows of known features of features_info.tsv)"
)
for row in features_unk_df.itertuples():
    same_an = features_know_df["seqid"] == row.seqid
    is_child = features_know_df["parent"] == row.gene_id
    candidates = features_know_df[same_an & is_child]
    n_candidates = len(candidates)

    if n_candidates == 0:
        logger.warning(
            f"no feature* found in {row.seqid} that have parent equals to {row.gene_id}"
        )
        continue

    first_name = candidates.iloc[0]["feature_name"]
    if n_candidates > 1:
        # Ambiguous: log all candidates, then fall back to the first one
        logger.warning(
            f"more than one feature* found in {row.seqid} that have parent equals to {row.gene_id}"
        )

        logger.debug("Please check the candidates:")
        print_df_rows_logger(candidates, logger)
        logger.debug("Chosing the first one for feature_child")
    else:
        logger.debug(f"Found perfect candidate for {row.gene_id}: {first_name}")

    features_unk_df.at[row.Index, "feature_child"] = first_name

logger.info(
    "features* are those selected in the global_query_string variable, set at the beginning of the script"
)

W: no feature* found in AP024424.1 that have parent equals to gene-ACHE_n90001S
Found candidate for gene-ACHE_r90001S:
Found candidate for gene-ACHE_t90001S:
Found candidate for gene-ACHE_t90002S:
Found candidate for gene-ACHE_t90003S:
Found candidate for gene-ACHE_t90004S:
Found candidate for gene-ACHE_t90005S:
Found candidate for gene-ACHE_t90006S:
Found candidate for gene-ACHE_t90007S:
Found candidate for gene-ACHE_t90008S:
Found candidate for gene-ACHE_t90009S:
Found candidate for gene-ACHE_t90010S:
Found candidate for gene-ACHE_t90011S:
Found candidate for gene-ACHE_t90012S:
Found candidate for gene-ACHE_t90013S:
Found candidate for gene-ACHE_t90014S:
Found candidate for gene-ACHE_t90015S:
Found candidate for gene-ACHE_t90016S:
Found candidate for gene-ACHE_t90017S:
Found candidate for gene-ACHE_t90018S:
Found candidate for gene-ACHE_t90019S:
Found candidate for gene-ACHE_t90020S:
Found candidate for gene-ACHE_t90021S:
Found candidate for gene-ACHE_t90022S:
Found candidate for gen

In [44]:
# Distinct feature names stored in feature_child (missing values dropped).
# The bare name on the last line displays the array as the cell output.
feature_child_uniques = features_unk_df["feature_child"].dropna().unique()
feature_child_uniques

array(['12S ribosomal RNA', 'tRNA-Cys', 'tRNA-Arg', 'tRNA-Asn',
       'tRNA-Tyr', 'tRNA-Lys', 'tRNA-Gly', 'tRNA-Asp', 'tRNA-Ser',
       'tRNA-Trp', 'tRNA-Ile', 'tRNA-Pro', 'tRNA-Thr', 'tRNA-Glu',
       'tRNA-Val', 'tRNA-Met', 'tRNA-Leu', 'tRNA-Ala', 'tRNA-Phe',
       'tRNA-Gln', 'tRNA-His', 'small subunit ribosomal RNA', 'tRNA-Sec',
       'large subunit ribosomal RNA', '12S RNA', '16S RNA', 'rrnS',
       'rrnL', '23S ribosomal RNA', '16S ribosomal RNA'], dtype=object)

In [None]:
# Every feature_child name is expected to be a key of gene_dict already;
# log an error for each one that is not (nothing is raised here).
logger.debug("Checking if feature_childs are in gene_dict")
missing_children = [child for child in feature_child_uniques if child not in gene_dict]
for feature_child in missing_children:
    logger.error(
        f"{feature_child} not in gene_dict! how? did you run the step above without error?"
    )

Error: 12S ribosomal RNA not in gene_dict! how? did you run the step above without error?
Error: tRNA-Cys not in gene_dict! how? did you run the step above without error?
Error: tRNA-Arg not in gene_dict! how? did you run the step above without error?
Error: tRNA-Asn not in gene_dict! how? did you run the step above without error?
Error: tRNA-Tyr not in gene_dict! how? did you run the step above without error?
Error: tRNA-Lys not in gene_dict! how? did you run the step above without error?
Error: tRNA-Gly not in gene_dict! how? did you run the step above without error?
Error: tRNA-Asp not in gene_dict! how? did you run the step above without error?
Error: tRNA-Ser not in gene_dict! how? did you run the step above without error?
Error: tRNA-Trp not in gene_dict! how? did you run the step above without error?
Error: tRNA-Ile not in gene_dict! how? did you run the step above without error?
Error: tRNA-Pro not in gene_dict! how? did you run the step above without error?
Error: tRNA-Thr not

In [None]:
# Comment stored (as c=...) on every gene_dict entry inserted by the next cell
comment = "automated insertion from missing_dbxref_feature_child"

In [None]:
# For every unknown feature that received a feature_child, register its gene_id
# in gene_dict under the label already assigned to that feature_child name.
for row in features_unk_df[features_unk_df["feature_child"].notna()].itertuples():
    # sanity check
    if row.feature_child not in gene_dict:
        raise ValueError(
            f"Error: {row.feature_child} not in gene_dict! how? did you run the step above without error?"
        )

    # Read everything from the current `row`. The names `r` and `gene_id` are only
    # leftover globals from earlier cells, so using them inserted stale data
    # (or raised NameError on a fresh kernel).
    gene_dict[row.gene_id] = gdt.gene_dict.GeneGeneric(
        label=gene_dict[row.feature_child].label, an_sources=row.seqid, c=comment
    )

In [None]:
# TODO change header with number from increment_gdt_file
# Write gene_dict to the next GDT version. most_recent_gdt_file is rebound to the
# new path, so running this cell again bumps the version number once more.
most_recent_gdt_file, _ = increment_gdt_file(most_recent_gdt_file)
gdt.gene_dict.write_gdt_file(gene_dict, most_recent_gdt_file)

#### unk_problems

In [56]:
# unk_problems.txt: one "<AN> | <gene_id>" line per unknown feature that still
# has no feature_child.
with open(missing_dbxref_path / "unk_problems.txt", "w+") as f1:
    unresolved = features_unk_df[features_unk_df["feature_child"].isna()]
    for row in unresolved.itertuples():
        f1.write(f"{row.seqid} | {row.gene_id}\n")

In [57]:
# Read unk_problems.txt back into {AN: [gene_id, ...]}, keeping file order.
unk_dict = {}
with open(missing_dbxref_path / "unk_problems.txt", "r") as f1:
    for line in f1:
        line = line.strip()
        if not line:
            continue

        # "<AN> | <gene_id>": first field is the AN, second the gene_id
        parts = line.split("|")
        an = parts[0].strip()
        gene_id = parts[1].strip()
        unk_dict.setdefault(an, []).append(gene_id)

In [None]:
# 2nd pass: for every still-unknown gene_id, look in the AN's own GFF3 for features
# whose Parent is that gene_id. When they all share one product and that product is
# a key of gene_dict, insert the gene_id under the product's label; otherwise record
# the gene_id in really_unk_dict ({AN: [gene_id, ...]}).
really_unk_dict = {}
change_gene_dict = False
logger.debug(
    "missing_dbxref: matching probable 'child feature + parent gene' pair (2nd pass, on the an original gff3, using all the features)"
)
for an in unk_dict:
    gene_ids = unk_dict[an]
    logger.debug(f"AN: {an}| gene_ids: {gene_ids}")
    an_path = DATA_DIR / f"{an}{gff_suffix}"

    # No query_string is passed here, unlike the earlier loads
    df = gdt.gff3_utils.load_gff3(an_path, usecols=gdt.GFF3_COLUMNS)
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    df["parent"] = df["attributes"].str.extract(RE_PARENT, expand=False)
    df["product"] = df["attributes"].str.extract(RE_PRODUCT, expand=False)

    for gene_id in gene_ids:
        candidates = df[df["parent"] == gene_id]
        if len(candidates) >= 1 and len(candidates["product"].unique()) == 1:
            product = candidates["product"].unique()[0]
            logger.debug(f" {gene_id} | possible product: {product}")
            for x in candidates.itertuples():
                logger.debug(f"\tt: {x.type} | p: {x.product} | a: {x.attributes}")

            if product in gene_dict:
                change_gene_dict = True
                label = gene_dict[product].label
                logger.debug(
                    f"\tlabel: {label}, gene_dict[product]: {gene_dict[product]}\n"
                )
                gene_dict[gene_id] = gdt.gene_dict.GeneGeneric(
                    label=label,
                    an_sources=an,
                    c="automated insertion from missing_dbxref_unk_problems",
                )

            else:
                logger.debug(f"\tproduct not in gene_dict: {product}\n")
                really_unk_dict.setdefault(an, []).append(gene_id)
        else:
            logger.debug(
                f" {gene_id} | no features with parent {gene_id} found OR more than one features found, but with product conflict"
            )

            if len(candidates) >= 1:
                logger.debug(" candidates:")
                for x in candidates.itertuples():
                    logger.debug(f"\t t: {x.type} | p: {x.product} | a: {x.attributes}")

            # really_unk_dict is keyed by AN. Testing `gene_id in really_unk_dict`
            # was never true, so the list of this AN was replaced on every hit and
            # earlier gene_ids were lost. setdefault appends to the AN's list.
            really_unk_dict.setdefault(an, []).append(gene_id)

AN: AP024424.1| gene_ids: ['gene-ACHE_n90001S']
 gene-ACHE_n90001S | possible product: ribonuclease P RNA
	t: RNase_P_RNA | p: ribonuclease P RNA | a: ID=rna-ACHE_n90001S;Parent=gene-ACHE_n90001S;gbkey=ncRNA;locus_tag=ACHE_n90001S;product=ribonuclease P RNA
	product not in gene_dict: ribonuclease P RNA

AN: JQ346808.1| gene_ids: ['gene-AFUA_m0010', 'gene-AFUA_m0040', 'gene-AFUA_m0060', 'gene-AFUA_m0090', 'gene-AFUA_m0110', 'gene-AFUA_m0140', 'gene-AFUA_m0150', 'gene-AFUA_m0260', 'gene-AFUA_m0420', 'gene-AFUA_m0460', 'gene-AFUA_m0480', 'gene-AFUA_m0490', 'gene-AFUA_m0510', 'gene-AFUA_m0520', 'gene-AFUA_m0530']
 gene-AFUA_m0010 | possible product: cytochrome b
	t: CDS | p: cytochrome b | a: ID=cds-AFE02850.1;Parent=gene-AFUA_m0010;Dbxref=NCBI_GP:AFE02850.1;Name=AFE02850.1;gbkey=CDS;locus_tag=AFUA_m0010;product=cytochrome b;protein_id=AFE02850.1;transl_table=4
	product not in gene_dict: cytochrome b

 gene-AFUA_m0040 | possible product: NADH dehydrogenase subunit 1
	t: CDS | p: NADH dehyd

In [None]:
# TODO change header with number from increment_gdt_file
# Only write a new GDT version when the 2nd pass actually inserted something
# (change_gene_dict is set to True next to each insertion).
if change_gene_dict:
    logger.debug("gene_dict changed, incrementing gdt file and writing it")
    most_recent_gdt_file, _ = increment_gdt_file(most_recent_gdt_file)
    gdt.gene_dict.write_gdt_file(gene_dict, most_recent_gdt_file)

In [None]:
# Write unk_problems_summary.txt: for every AN that still has unresolved gene_ids,
# list each of them with the child features found in the AN's GFF3 and the line(s)
# the user may add to the gene dictionary once the gene is identified.
with open(missing_dbxref_path / "unk_problems_summary.txt", "w+") as f1:
    for an in really_unk_dict:
        # unk_dict[an] also holds gene_ids that the 2nd pass resolved and inserted
        # into gene_dict; report only the ones that are still absent from it.
        gene_ids = [gid for gid in unk_dict[an] if gid not in gene_dict]
        f1.write(f"AN: {an} | gene_ids: {gene_ids}\n")
        an_path = DATA_DIR / f"{an}{gff_suffix}"
        df = gdt.gff3_utils.load_gff3(an_path, usecols=gdt.GFF3_COLUMNS)
        df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

        df["gene_id"] = df["attributes"].str[3:].str.partition(";", expand=False).str[0]
        df["parent"] = df["attributes"].str.extract(RE_PARENT, expand=False)
        df["product"] = df["attributes"].str.extract(RE_PRODUCT, expand=False)

        for gene_id in gene_ids:
            candidates = df[df["parent"] == gene_id]
            gene = df[df["gene_id"] == gene_id]

            if len(candidates) >= 1 and len(candidates["product"].unique()) == 1:
                product = candidates["product"].unique()[0]
                # Guard against a gene row missing from this load, so that an
                # IndexError cannot leave a half-written summary file.
                gene_att = gene.iloc[0]["attributes"] if len(gene) else "n/a"
                f1.write(
                    f" {gene_id} | possible product: {product} | gene att: {gene_att}\n"
                )
                for x in candidates.itertuples():
                    f1.write(f"\tt: {x.type} | p: {x.product} | a: {x.attributes}\n")
                f1.write(
                    "\n\tif you can identify the product, please add the following lines to gene_dict:\n"
                )
                f1.write(
                    f"\t{product} #gd MANUAL #c manual insertion from missing_dbxref_unk_problems\n"
                )
                f1.write(
                    f"\t{gene_id} #gn {an} #c MANUAL from missing_dbxref_unk_problems\n\n"
                )

            else:
                f1.write(
                    f" {gene_id} | no features with Parent={gene_id} found OR more than one features found, but with conflicting products\n"
                )

                if len(candidates) >= 1:
                    # Each entry ends with a newline; without it the header and
                    # all candidates were glued together on a single line.
                    f1.write(" candidates:\n")
                    for x in candidates.itertuples():
                        f1.write(f"\tt: {x.type} | p: {x.product} | a: {x.attributes}\n")

                f1.write(
                    f"\n\tyou should problably look up this gene_id in its gff3 ({an}{gff_suffix}) to try to identify it\n"
                )
                f1.write(
                    "\tif you can't, probably discard it, by adding 'discard_' in its type column in the gff3 file\n\n"
                )
                f1.write(
                    "\tif you can identify it, please add the following line to gene_dict:\n"
                )
                f1.write(
                    f"\t{gene_id} #gn {an} #c MANUAL from missing_dbxref_unk_problems\n"
                )

logger.debug(
    "More details about gene_ids with problems written to file 'unk_problems_summary.txt' in the missing_dbxref folder"
)

More details about gene_ids with problems written to file 'unk_problems_summary.txt' in the missing_dbxref folder
