In [1]:
import pandas as pd
import numpy as np
import os
import re

from pathlib import Path
import gdt

from Bio import Entrez

RE_ID = re.compile(r"ID=([^;]+)")
RE_NAME = re.compile(r"Name=([^;]+)")
RE_PRODUCT = re.compile(r"product=([^;]+)")
RE_DESCRIPTION = re.compile(r"description=([^;]+)")
RE_PARENT = re.compile(r"Parent=([^;]+)")
RE_GENE = re.compile(r"gene=([^;]+)")
RE_GENE_SYNONYM = re.compile(r"gene_synonym=([^;]+)")
RE_NOTE = re.compile(r"Note=([^;]+)")


def increment_gdt_file(path: Path) -> tuple[Path, int]:
    """
    Build the path of the next GDT version.

    The stem is split on "_" and its last piece is read as the version
    counter; the counter is bumped by one and written back zero-padded to at
    least two digits.
    Example: fungi-ncbi_pilot_03.gdt -> fungi-ncbi_pilot_04.gdt

    Arguments:
        path (Path): Path of the current GDT file.
    Returns:
        tuple[Path, int]: The sibling path with the incremented counter and
        the new counter value.
    Raises:
        ValueError: If the last "_"-separated piece of the stem is not an integer.
    """
    *leading_parts, counter_text = path.stem.split("_")
    try:
        number = int(counter_text) + 1
    except ValueError:
        raise ValueError(
            f"Invalid GDT file name: {path.name}. Expected format: <preferred_name>_##.gdt, where ## is a number."
        )
    new_stem = "_".join([*leading_parts, f"{number:02d}"])
    return path.parent / f"{new_stem}{path.suffix}", number


def get_most_recent_gdt(dir_path: Path, prefix="TEMP_") -> Path:
    """
    Pick the newest "<prefix>*.gdt" file inside a directory.

    Arguments:
        dir_path (Path): Directory to search for GDT files.
        prefix (str): File-name prefix; files are matched with the glob "<prefix>*.gdt".
    Returns:
        Path: The last match after natural-sorting the matches by stem, or
        "<dir_path>/<prefix>00.gdt" (not created on disk) when nothing matches.
    """
    matches = [*dir_path.glob(f"{prefix}*.gdt")]
    if matches:
        ordered = gdt.gene_dict_impl.natural_sort(matches, key=lambda p: p.stem)
        return ordered[-1]
    # No file yet: hand back a "version 00" placeholder path.
    return dir_path / f"{prefix}00.gdt"

\# TODO  
Rather than manually tracking iterations with an 'nth_iteration' variable as implemented in 'AN_missing_gene_dict',  
this script employs an automated versioning approach. The system expects the most recent GDT file to follow  
the naming convention <preferred_name>\_pilot\_##.gdt (e.g., fungi-ncbi_pilot_03.gdt). When modifications are made  
to the gene dictionary, the program automatically increments the two-digit suffix to generate the next  
version (e.g., fungi-ncbi_pilot_04.gdt), ensuring seamless version control without manual intervention.

In [2]:
# Defines all the global variables used in the script.
# Change these variables to match your local setup.
# The most_recent_gdt_file variable should be set to the path of the most recent GDT file,
# OR the stripped GDT file used in filter command, if applicable.
# NOTE: most_recent_gdt_file is later joined onto DATA_DIR/misc/gdt, so a bare
# file name is looked up inside that folder.

# Folder holding the per-accession GFF3 files (read as DATA_DIR / f"{AN}{gff_suffix}").
DATA_DIR = "../sandbox/algae_pt_test"
# Text file listing one accession number (AN) per line.
AN_missing_dbxref = "../sandbox/algae_pt_test/AN_missing_dbxref"
most_recent_gdt_file = "algae_pt_pilot_03.gdt"
# When True, every loaded GFF3 frame is passed through gdt.gff3_utils.filter_orfs.
remove_orfs = True
# Prefix for the generated "<organelle_type>-TEMP-<n>" and "<organelle_type>-UNKNOWN" labels.
# NOTE(review): the data set is named algae_pt and the loaded dictionary uses
# "PT-" labels, so "MT" here looks inconsistent — confirm.
organelle_type = "MT"
gff_suffix = ".gff3"

# Feature filter handed to gdt.gff3_utils.load_gff3 when building features_info_df.
global_query_string = gdt.gff3_utils.QS_GENE_TRNA_RRNA
print(f"Chosen feature query string: '{global_query_string}'")

Chosen feature query string: 'type == ["gene", "tRNA", "rRNA"]'


In [3]:
# Resolve the configured paths and fail early when something is missing.
DATA_DIR = Path(DATA_DIR).resolve()
AN_missing_dbxref = Path(AN_missing_dbxref).resolve()

# is_dir()/is_file() already return False for missing paths, so one call covers
# both "does not exist" and "exists but is the wrong kind of entry".
if not DATA_DIR.is_dir():
    raise FileNotFoundError(
        f"Data directory {DATA_DIR} does not exist or is not a directory."
    )

if not AN_missing_dbxref.is_file():
    raise FileNotFoundError(
        f"AN missing dbxref {AN_missing_dbxref} does not exist or is not a file."
    )

MISC_DIR = DATA_DIR / "misc"
MISC_DIR.mkdir(exist_ok=True)
GDT_dir = MISC_DIR / "gdt"
GDT_dir.mkdir(exist_ok=True)

# Test the configured value BEFORE joining it onto GDT_dir: a Path object is
# always truthy, so checking after the join could never detect an unset value.
if not most_recent_gdt_file:
    print(
        "If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable."
    )
    print("Otherwise, ignore this message.")
else:
    most_recent_gdt_file = (GDT_dir / most_recent_gdt_file).resolve()
    if not most_recent_gdt_file.is_file():
        raise FileNotFoundError(
            f"Most recent GDT file {most_recent_gdt_file} does not exist or is not a file."
        )

In [4]:
# Create the notebook logger; only the second element of the returned pair is kept.
# Log records go to MISC_DIR / "05_test.log" (TRACE level) and the console (DEBUG level).
_, logger = gdt.logger_setup.logger_creater(
    log_file=MISC_DIR / "05_test.log", console_level="DEBUG", file_level="TRACE"
)
logger.debug("Running from notebook AN_missing_dbxref")

2025-06-04 17:53:56,796 - DEBUG - Logger setup complete. Logging to /home/brenodupin/matheus/gdt/sandbox/algae_pt_test/misc/05_test.log
2025-06-04 17:53:56,797 - DEBUG - Running from notebook AN_missing_dbxref


In [6]:
# Read the accession numbers, one per line, dropping blank lines and
# surrounding whitespace.
with open(AN_missing_dbxref, "r") as an_file:
    ANs = [entry for entry in map(str.strip, an_file) if entry]
print(f"len(ANs): {len(ANs)}")

len(ANs): 63


In [5]:
# Load the GDT file (or start from an empty dictionary when none is configured)
if not most_recent_gdt_file:
    gene_dict = {}
    print("No GDT file found, starting with an empty gene_dict.")
else:
    gene_dict = gdt.gene_dict_impl.create_gene_dict(
        most_recent_gdt_file, max_an_sources=0
    )
    print(f"Loaded gene_dict from {most_recent_gdt_file}\nHeader:")
    for header_line in gene_dict["gdt_header"]:
        print(header_line)
    print("\nGDT Info:")
    for info_line in gene_dict["gdt_info"]:
        print(info_line)

temp_gene_dict = {}

Loaded gene_dict from /home/brenodupin/matheus/gdt/sandbox/algae_pt_test/misc/gdt/algae_pt_pilot_03.gdt
Header:
version 0.0.2
GDT - 1
Data added from TEMP 01 (old version)
Data added from TEMP 01
Data added from TEMP Symbol 1
automated insertion from missing_dbxref_feature_name

GDT Info:
Gene dictionary length: 27444
Label: 245
GeneDescription: 1709
GeneGenerics: 1287
GeneDbxref: 24448


### Deeper investigation using other gff attributes, primarily gene=

In [None]:
# Collect, for every accession, the features whose ID is not yet in gene_dict,
# together with every attribute that could help to name them.
temp_list = []
identifying_cols = ["name", "product", "description", "gene", "gene_synonym", "note"]
logger.debug("missing_dbxref: creation of features_info_df")
for AN in ANs:
    an_path = DATA_DIR / f"{AN}{gff_suffix}"
    df = gdt.gff3_utils.load_gff3(
        an_path, query_string=global_query_string, usecols=gdt.GFF3_COLUMNS
    )
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    df = df.assign(gene_id=df["attributes"].str.extract(RE_ID, expand=False))
    # .copy(): the boolean filter returns a slice of the loaded frame; the new
    # columns below must be written to an independent frame, otherwise pandas
    # raises SettingWithCopyWarning and the assignment may not stick.
    df = df[~df["gene_id"].isin(gene_dict)].copy()

    df["name"] = df["attributes"].str.extract(RE_NAME, expand=False)
    df["product"] = df["attributes"].str.extract(RE_PRODUCT, expand=False)
    df["description"] = df["attributes"].str.extract(RE_DESCRIPTION, expand=False)
    df["gene"] = df["attributes"].str.extract(RE_GENE, expand=False)
    df["parent"] = df["attributes"].str.extract(RE_PARENT, expand=False)
    df["gene_synonym"] = df["attributes"].str.extract(RE_GENE_SYNONYM, expand=False)
    df["note"] = df["attributes"].str.extract(RE_NOTE, expand=False)

    # Rows for which none of the naming attributes could be extracted.
    unidentified = df[identifying_cols].isna().all(axis=1)
    if unidentified.any():
        logger.debug(f"Warning: {AN} has row(s) with no identifiable atribute.")
        logger.debug(
            "Please modify this script to add a new possible identifiable attribute or just remove the AN from the list."
        )
        logger.debug(df[unidentified])

    temp_list.extend(df.to_dict("records"))

features_info_df = pd.DataFrame(temp_list)
features_info_df = features_info_df.drop(
    columns=["source", "type", "start", "end", "score", "strand", "phase", "attributes"]
)

# Naming columns that are empty for the whole data set; dropped further down,
# after feature_name has been derived from them.
dc = [col for col in identifying_cols if features_info_df[col].isna().all()]

# feature_name = first non-null value, in this priority order:
# gene > product > description > name > note > gene_synonym
features_info_df["feature_name"] = (
    features_info_df["gene"]
    .fillna(features_info_df["product"])
    .fillna(features_info_df["description"])
    .fillna(features_info_df["name"])
    .fillna(features_info_df["note"])
    .fillna(features_info_df["gene_synonym"])
)

features_info_df = features_info_df.drop(columns=dc)
features_info_df = features_info_df.sort_values(by="feature_name")

2025-06-03 17:05:23,888 - DEBUG - missing_dbxref: creation of features_info_df


In [None]:
# When True, each exported feature name gets a " #gd MANUAL" suffix (see the
# next cell); a non-empty `comment` is appended after it as " #c <comment>".
add_gdt_compliance = True
comment = "Manual from missing_dbxref_names_raw"

In [17]:
# Suffix appended to every exported feature name.
if add_gdt_compliance:
    comment_suffix = " #c " + comment if comment else ""
    gdt_str = " #gd MANUAL" + comment_suffix
else:
    gdt_str = ""

# One row per distinct feature name, flagged by whether it is already a
# gene_dict key.
distinct_feature_names = features_info_df["feature_name"].unique()
new_df = pd.DataFrame({"feature_name": distinct_feature_names})
new_df["in_gene_dict"] = new_df["feature_name"].isin(gene_dict)

#### Check feature name, where both df and gene_dict use .lower()

In [20]:
# Independent copy of the feature names that are not gene_dict keys, so that
# helper columns can be added without touching new_df.
check_df = new_df[~new_df["in_gene_dict"]].copy()

In [21]:
# Lower-case only the feature name and test it against the gene_dict keys as stored.
check_df["lower_feature_name"] = check_df["feature_name"].str.lower()
check_df["in_gene_dict_lower"] = check_df["lower_feature_name"].isin(gene_dict)

In [22]:
# Feature names that match a gene_dict key once the feature name is lower-cased.
check_df[check_df["in_gene_dict_lower"]]

Unnamed: 0,feature_name,in_gene_dict,lower_feature_name,in_gene_dict_lower
248,Ycf4,False,ycf4,True


In [24]:
# Case-insensitive comparison: both the feature name and every gene_dict key
# are lower-cased before matching.
check_df["in_gene_dict_both_lower"] = check_df["lower_feature_name"].isin(
    [x.lower() for x in gene_dict]
)

In [26]:
# Feature names that differ from an existing gene_dict key only by letter case.
check_df[check_df["in_gene_dict_both_lower"]]

Unnamed: 0,feature_name,in_gene_dict,lower_feature_name,in_gene_dict_lower,in_gene_dict_both_lower
248,Ycf4,False,ycf4,True,True
382,ftsH,False,ftsh,False,True
434,rnpB,False,rnpb,False,True
470,rps2b,False,rps2b,False,True


In [27]:
# Export the feature names still missing from gene_dict (natural-sorted, one
# per line, each followed by gdt_str) and persist the full feature table.
not_in_dict = ~new_df["in_gene_dict"]
unique_names = gdt.gene_dict_impl.natural_sort(new_df.loc[not_in_dict, "feature_name"])
with open(MISC_DIR / "feature_name.txt", "w+") as f1:
    f1.writelines(f"{name}{gdt_str}\n" for name in unique_names)

features_info_df.to_csv(MISC_DIR / "features_info.tsv", sep="\t", index=False)

The user must now review feature_name.txt.  

Features that can be easily identified must be added to the current  
version of the GDT file, and features that need deeper investigation should be  
copied to a new file named 'features_unk.txt'.
  
The program will now try to automatically add the gene_ids whose feature name  
__is not__ in 'features_unk.txt'.

In [22]:
# Reload the exported feature table and read the names the user parked in
# features_unk.txt (text from "#gd" onwards is ignored on each line).
features_info_df = pd.read_csv(MISC_DIR / "features_info.tsv", sep="\t")

names_unk = set()
with open(MISC_DIR / "features_unk.txt", "r") as f1:
    for entry in map(str.strip, f1):
        if entry:
            # split() returns the whole entry when "#gd" is absent
            names_unk.add(entry.split("#gd")[0].strip())

In [23]:
# Every feature name that was NOT parked in features_unk.txt must already be
# a gene_dict key; otherwise stop here.
all_names = set(features_info_df["feature_name"].unique()) - names_unk
names_not_in_dict = {name for name in all_names if name not in gene_dict}

if names_not_in_dict:
    logger.debug(f"Warning: {len(names_not_in_dict)} names not in gene_dict!")
    logger.debug(
        "These names are not in feature_unk, so you marked them as identifiable. Please identify them or add them feature_unk."
    )
    logger.debug(
        "It could also be that you forgot to reload the gene_dict with the changes that you made."
    )
    for name in names_not_in_dict:
        logger.debug(name)
    raise ValueError(f"Error: {len(names_not_in_dict)} names not in gene_dict!")

In [24]:
# Comment stored (as `c=`) on every entry inserted by the loop further down.
comment = "automated insertion from missing_dbxref_feature_name"

In [25]:
# Distinct gene_ids of the rows whose feature name is NOT listed in names_unk.
unique_gene_ids = features_info_df[~features_info_df["feature_name"].isin(names_unk)][
    "gene_id"
].unique()

In [26]:
# Quick look at the gene_ids that will be inserted automatically.
unique_gene_ids

array(['rna-MK792750.1:98144..100523', 'rna-KX756229.1:163184..164681',
       'rna-COCOBI_pt-1500', ..., 'gene-CH29B_p089', 'gene-ycf62',
       'gene-ycf66'], shape=(1287,), dtype=object)

In [29]:
# Insert every gene_id whose feature name is already a gene_dict key, reusing
# the label stored for that feature name.
is_known_row = ~features_info_df["feature_name"].isin(names_unk)
unique_gene_ids = features_info_df.loc[is_known_row, "gene_id"].unique()

for gene_id in unique_gene_ids:
    df = features_info_df[features_info_df["gene_id"] == gene_id]
    feature_names = df["feature_name"].unique()

    # sanity check, are all feature_names the same?
    if df["feature_name"].nunique() != 1:
        print(f"Warning: {gene_id} has multiple feature_names: {feature_names}")
        print("\tChecking if they have the same label in gene_dict...")

        labels = {gene_dict[feat].label for feat in feature_names}
        if len(labels) != 1:
            # conflicting labels: report and leave this gene_id out
            print(f"\tError: {gene_id} has multiple labels: {labels}")
            continue
        print(f"\tAll feature_names have the same label: {labels.pop()}")

    label = gene_dict[df["feature_name"].iloc[0]].label
    an_sources = df["seqid"].unique().tolist()
    print(
        f"gene_id: {gene_id}, label: {label}, an_sources: {an_sources}, comment: {comment}"
    )
    gene_dict[gene_id] = gdt.gene_dict_impl.GeneGeneric(
        label=label, an_sources=an_sources, c=comment
    )

gene_id: rna-MK792750.1:98144..100523, label: PT-RNR2, an_sources: ['MK792750.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: rna-KX756229.1:163184..164681, label: PT-RNR2, an_sources: ['KX756229.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: rna-COCOBI_pt-1500, label: PT-RNR2, an_sources: ['AP025008.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: rna-MK995333.1:51311..53021, label: PT-RNR2, an_sources: ['MK995333.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: rna-cvarcp_00059, label: PT-RNR2, an_sources: ['KP271969.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: rna-MK995333.1:110499..111776, label: PT-RNR2, an_sources: ['MK995333.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id: rna-KX756229.1:77940..79437, label: PT-RNR2, an_sources: ['KX756229.1'], comment: automated insertion from missing_dbxref_feature_name
gene_id

In [30]:
# Write the updated dictionary to the next version of the GDT file.
# NOTE(review): most_recent_gdt_file is not advanced to new_path here, while
# the later write cells increment most_recent_gdt_file again; they would
# therefore target the same file name as this cell — confirm this is intended.
if gene_dict:
    new_path, nth_iteration = increment_gdt_file(most_recent_gdt_file)
    logger.info(f"Writing gene_dict file: {new_path} | Iteration: {nth_iteration}")
    # Refresh the summary entry so it reflects the entries inserted above.
    gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(gene_dict)
    gene_dict["gdt_header"].append(
        "automated insertion from missing_dbxref_feature_name"
    )
    gdt.gene_dict_impl.write_gdt_file(gene_dict, new_path, overwrite=True)

2025-06-03 19:24:41,246 - INFO - Writing gene_dict file: /home/brenodupin/matheus/gdt/sandbox/algae_pt_test/misc/gdt/algae_pt_pilot_03.gdt | Iteration: 3


#### missing_dbxref_names_unk.txt

In [31]:
def print_df_rows(cds_trna):
    """
    Print one tab-indented summary line per row of a feature table.

    Arguments:
        cds_trna (pd.DataFrame): Frame with the columns seqid, gene_id,
            parent and feature_name.
    """
    for record in cds_trna.itertuples(index=False):
        seqid, gene_id = record.seqid, record.gene_id
        parent, feature_name = record.parent, record.feature_name
        print(f"\tan: {seqid}| gene_id: {gene_id}| p: {parent}| fn: {feature_name}")


def print_df_rows_logger(df, logger):
    """
    Send one tab-indented summary line per row of a feature table to
    logger.debug.

    Arguments:
        df (pd.DataFrame): Frame with the columns seqid, gene_id, parent and
            feature_name.
        logger: Object exposing a debug(message) method.
    """
    for record in df.itertuples(index=False):
        seqid, gene_id = record.seqid, record.gene_id
        parent, feature_name = record.parent, record.feature_name
        logger.debug(
            f"\tan: {seqid}| gene_id: {gene_id}| p: {parent}| fn: {feature_name}"
        )

In [32]:
# Read the names parked in features_unk.txt (text from "#gd" onwards is
# ignored on each line).
names_unk = set()
with open(MISC_DIR / "features_unk.txt", "r") as f1:
    for entry in map(str.strip, f1):
        if entry:
            # split() returns the whole entry when "#gd" is absent
            names_unk.add(entry.split("#gd")[0].strip())

In [34]:
# Reload the feature table and split it by whether the feature name was
# parked in features_unk.txt.
features_info_df = pd.read_csv(MISC_DIR / "features_info.tsv", sep="\t")
is_unk_name = features_info_df["feature_name"].isin(names_unk)
features_know_df = features_info_df[~is_unk_name]
# independent copy with a fresh 0..n-1 index
features_unk_df = features_info_df[is_unk_name].copy().reset_index(drop=True)

In [41]:
# Start with an all-NaN column; the matching loop below fills it in per row.
features_unk_df["feature_child"] = np.nan
# make feature_child a object type
# (the loop below stores feature-name strings in it via .at)
features_unk_df["feature_child"] = features_unk_df["feature_child"].astype(object)

In [42]:
# For every unknown gene, look among the known features of the same accession
# for a row whose Parent is that gene; its feature name becomes feature_child.
logger.debug(
    "missing_dbxref: matching probable 'child feature + parent gene' pair (1st pass, matching rows of known features of features_info.tsv)"
)
for row in features_unk_df.itertuples():
    # check if there's a row in features_info_df with the same seqid and gene_id == parent
    same_accession = features_know_df["seqid"] == row.seqid
    is_child_of_row = features_know_df["parent"] == row.gene_id
    candidates = features_know_df[same_accession & is_child_of_row]
    n_candidates = len(candidates)

    if n_candidates == 0:
        logger.warning(
            f"no feature* found in {row.seqid} that have parent equals to {row.gene_id}"
        )
        continue

    first_child_name = candidates.iloc[0]["feature_name"]
    if n_candidates > 1:
        # ambiguous: report every candidate, then keep the first one
        logger.warning(
            f"more than one feature* found in {row.seqid} that have parent equals to {row.gene_id}"
        )

        logger.debug("Please check the candidates:")
        print_df_rows_logger(candidates, logger)
        logger.debug("Chosing the first one for feature_child")
    else:
        logger.debug(
            f'Found perfect candidate for {row.gene_id}: {first_child_name}'
        )
    features_unk_df.at[row.Index, "feature_child"] = first_child_name

logger.info(
    "features* are those selected in the global_query_string variable, set at the beginning of the script"
)

2025-06-03 19:29:20,325 - DEBUG - missing_dbxref: matching probable 'child feature + parent gene' pair (1st pass, matching rows of known features of features_info.tsv)
2025-06-03 19:29:20,372 - DEBUG - Found perfect candidate for gene-COCOBI_pt-0280: tRNA-Asp
2025-06-03 19:29:20,392 - DEBUG - Found perfect candidate for gene-COCOBI_pt-0380: tRNA-Met
2025-06-03 19:29:20,454 - DEBUG - Found perfect candidate for gene-COCOBI_pt-0710: tRNA-Phe
2025-06-03 19:29:20,458 - DEBUG - Found perfect candidate for gene-COCOBI_pt-0740: tRNA-Thr
2025-06-03 19:29:20,459 - DEBUG - Found perfect candidate for gene-COCOBI_pt-0750: tRNA-Gln
2025-06-03 19:29:20,463 - DEBUG - Found perfect candidate for gene-COCOBI_pt-0780: tRNA-Lys
2025-06-03 19:29:20,466 - DEBUG - Found perfect candidate for gene-COCOBI_pt-0790: tRNA-His
2025-06-03 19:29:20,472 - DEBUG - Found perfect candidate for gene-COCOBI_pt-0810: tRNA-Gly
2025-06-03 19:29:20,509 - DEBUG - Found perfect candidate for gene-COCOBI_pt-1010: 23S ribosomal

In [40]:
# Column overview; the non-null count of feature_child is the number of
# unknown genes that received a child-feature name.
features_unk_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363 entries, 0 to 362
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   seqid          363 non-null    object
 1   gene_id        363 non-null    object
 2   name           363 non-null    object
 3   product        0 non-null      object
 4   gene           7 non-null      object
 5   parent         0 non-null      object
 6   note           0 non-null      object
 7   feature_name   363 non-null    object
 8   feature_child  65 non-null     object
dtypes: object(9)
memory usage: 25.6+ KB


In [44]:
# Distinct child-feature names assigned by the first pass (NaN excluded).
feature_child_uniques = features_unk_df["feature_child"].dropna().unique()
feature_child_uniques

array(['12S ribosomal RNA', 'tRNA-Cys', 'tRNA-Arg', 'tRNA-Asn',
       'tRNA-Tyr', 'tRNA-Lys', 'tRNA-Gly', 'tRNA-Asp', 'tRNA-Ser',
       'tRNA-Trp', 'tRNA-Ile', 'tRNA-Pro', 'tRNA-Thr', 'tRNA-Glu',
       'tRNA-Val', 'tRNA-Met', 'tRNA-Leu', 'tRNA-Ala', 'tRNA-Phe',
       'tRNA-Gln', 'tRNA-His', 'small subunit ribosomal RNA', 'tRNA-Sec',
       'large subunit ribosomal RNA', '12S RNA', '16S RNA', 'rrnS',
       'rrnL', '23S ribosomal RNA', '16S ribosomal RNA'], dtype=object)

In [None]:
# check if they exist in the gene_dict, they should if the script is correct
logger.debug("Checking if feature_childs are in gene_dict")
missing_children = (
    child for child in feature_child_uniques if child not in gene_dict
)
for feature_child in missing_children:
    logger.error(
        f"{feature_child} not in gene_dict! how? did you run the step above without error?"
    )

Error: 12S ribosomal RNA not in gene_dict! how? did you run the step above without error?
Error: tRNA-Cys not in gene_dict! how? did you run the step above without error?
Error: tRNA-Arg not in gene_dict! how? did you run the step above without error?
Error: tRNA-Asn not in gene_dict! how? did you run the step above without error?
Error: tRNA-Tyr not in gene_dict! how? did you run the step above without error?
Error: tRNA-Lys not in gene_dict! how? did you run the step above without error?
Error: tRNA-Gly not in gene_dict! how? did you run the step above without error?
Error: tRNA-Asp not in gene_dict! how? did you run the step above without error?
Error: tRNA-Ser not in gene_dict! how? did you run the step above without error?
Error: tRNA-Trp not in gene_dict! how? did you run the step above without error?
Error: tRNA-Ile not in gene_dict! how? did you run the step above without error?
Error: tRNA-Pro not in gene_dict! how? did you run the step above without error?
Error: tRNA-Thr not

In [None]:
# Comment stored (as `c=`) on every entry inserted from a feature_child match.
comment = "automated insertion from missing_dbxref_feature_child"

In [None]:
# Insert every unknown gene that received a feature_child, reusing the label
# that gene_dict stores for the child feature name.
for row in features_unk_df[features_unk_df["feature_child"].notna()].itertuples():
    # sanity check
    if row.feature_child not in gene_dict:
        raise ValueError(
            f"Error: {row.feature_child} not in gene_dict! how? did you run the step above without error?"
        )

    # Key by this row's own gene_id (not a leftover global), and wrap the
    # accession in a list, as every other GeneGeneric insertion in this
    # notebook does.
    gene_dict[row.gene_id] = gdt.gene_dict_impl.GeneGeneric(
        label=gene_dict[row.feature_child].label, an_sources=[row.seqid], c=comment
    )

In [None]:
# TODO change header with number from increment_gdt_file
# Advance the version pointer and write the dictionary to the new file.
most_recent_gdt_file, _ = increment_gdt_file(most_recent_gdt_file)
# Refresh the summary entry so it reflects the entries inserted above, as the
# other write cells in this notebook do before writing.
gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(gene_dict)
# write_gdt_file is reached through gdt.gene_dict_impl everywhere else in
# this notebook.
gdt.gene_dict_impl.write_gdt_file(gene_dict, most_recent_gdt_file)

#### unk_problems

In [6]:
# Read the names parked in features_unk.txt (text from "#gd" onwards is
# ignored on each line).
names_unk = set()
with open(MISC_DIR / "features_unk.txt", "r") as f1:
    for entry in map(str.strip, f1):
        if entry:
            # split() returns the whole entry when "#gd" is absent
            names_unk.add(entry.split("#gd")[0].strip())

In [7]:
# Reload the exported feature table and keep only the rows whose feature name
# is listed in names_unk (independent copy, index renumbered from 0).
features_info_df = pd.read_csv(MISC_DIR / "features_info.tsv", sep="\t")
features_unk_df = (
    features_info_df[features_info_df["feature_name"].isin(names_unk)]
    .copy()
    .reset_index(drop=True)
)

In [8]:
# Preview of the still-unknown features.
features_unk_df

Unnamed: 0,seqid,gene_id,name,product,gene,parent,note,feature_name
0,AP025008.1,gene-COCOBI_pt-0010,COCOBI_pt-0010,,,,,COCOBI_pt-0010
1,AP025008.1,gene-COCOBI_pt-0020,COCOBI_pt-0020,,,,,COCOBI_pt-0020
2,AP025008.1,gene-COCOBI_pt-0030,COCOBI_pt-0030,,,,,COCOBI_pt-0030
3,AP025008.1,gene-COCOBI_pt-0040,COCOBI_pt-0040,,,,,COCOBI_pt-0040
4,AP025008.1,gene-COCOBI_pt-0050,COCOBI_pt-0050,,,,,COCOBI_pt-0050
...,...,...,...,...,...,...,...,...
358,LC523992.1,gene-intronic orf281-2,intronic orf281,,intronic orf281,,,intronic orf281
359,LC523992.1,gene-intronic orf281,intronic orf281,,intronic orf281,,,intronic orf281
360,LC523992.1,gene-intronic orf307,intronic orf307,,intronic orf307,,,intronic orf307
361,KX756229.1,gene-ltrA,ltrA,,ltrA,,,ltrA


In [9]:
# Map each accession (seqid) to the list of its unknown gene_ids.
unk_dict = features_unk_df.groupby("seqid")["gene_id"].agg(list).to_dict()

In [11]:
# Inspect the accession -> unknown gene_ids mapping.
unk_dict

{'AP012291.1': ['gene-I-CreI'],
 'AP014542.1': ['gene-I-CreI'],
 'AP018038.1': ['gene-Rsub_pt095'],
 'AP025008.1': ['gene-COCOBI_pt-0010',
  'gene-COCOBI_pt-0020',
  'gene-COCOBI_pt-0030',
  'gene-COCOBI_pt-0040',
  'gene-COCOBI_pt-0050',
  'gene-COCOBI_pt-0060',
  'gene-COCOBI_pt-0070',
  'gene-COCOBI_pt-0080',
  'gene-COCOBI_pt-0090',
  'gene-COCOBI_pt-0100',
  'gene-COCOBI_pt-0110',
  'gene-COCOBI_pt-0120',
  'gene-COCOBI_pt-0130',
  'gene-COCOBI_pt-0140',
  'gene-COCOBI_pt-0150',
  'gene-COCOBI_pt-0160',
  'gene-COCOBI_pt-0170',
  'gene-COCOBI_pt-0180',
  'gene-COCOBI_pt-0190',
  'gene-COCOBI_pt-0200',
  'gene-COCOBI_pt-0210',
  'gene-COCOBI_pt-0220',
  'gene-COCOBI_pt-0230',
  'gene-COCOBI_pt-0240',
  'gene-COCOBI_pt-0250',
  'gene-COCOBI_pt-0260',
  'gene-COCOBI_pt-0270',
  'gene-COCOBI_pt-0280',
  'gene-COCOBI_pt-0290',
  'gene-COCOBI_pt-0300',
  'gene-COCOBI_pt-0310',
  'gene-COCOBI_pt-0320',
  'gene-COCOBI_pt-0330',
  'gene-COCOBI_pt-0340',
  'gene-COCOBI_pt-0350',
  'gene-COC

In [12]:
# Second pass: for every still-unknown gene, reload the full GFF3 of its
# accession (all feature types) and look for features whose Parent is that
# gene.
#   * child name already in gene_dict -> insert the gene into gene_dict
#   * child name unknown              -> park it in temp_unk under a TEMP label
#   * no child at all                 -> park it in temp_unk as UNKNOWN
# NOTE(review): temp_unk is keyed by gene_id only, so a gene_id that occurs in
# several accessions keeps just the last accession — confirm this is acceptable.
temp_unk = {}
label_count = 0
change_gene_dict = False
logger.debug(
    "missing_dbxref: matching probable 'child feature + parent gene' pair (2nd pass, on the an original gff3, using all the features)"
)
for an, gene_ids in unk_dict.items():
    logger.debug(f"AN: {an}| gene_ids: {gene_ids}")
    an_path = DATA_DIR / f"{an}{gff_suffix}"

    df = gdt.gff3_utils.load_gff3(an_path, usecols=gdt.GFF3_COLUMNS)
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
    df["name"] = df["attributes"].str.extract(RE_NAME, expand=False)
    df["product"] = df["attributes"].str.extract(RE_PRODUCT, expand=False)
    df["description"] = df["attributes"].str.extract(RE_DESCRIPTION, expand=False)
    df["gene"] = df["attributes"].str.extract(RE_GENE, expand=False)
    df["parent"] = df["attributes"].str.extract(RE_PARENT, expand=False)
    df["gene_synonym"] = df["attributes"].str.extract(RE_GENE_SYNONYM, expand=False)
    df["note"] = df["attributes"].str.extract(RE_NOTE, expand=False)

    # first non-null of: gene > product > description > name > note > gene_synonym
    df["feature_name"] = (
        df["gene"]
        .fillna(df["product"])
        .fillna(df["description"])
        .fillna(df["name"])
        .fillna(df["note"])
        .fillna(df["gene_synonym"])
    )

    for gene_id in gene_ids:
        candidates = df[df["parent"] == gene_id]

        if candidates.empty:
            logger.debug(f" {gene_id} | no features with parent {gene_id} found")
            logger.debug(" adding it to UNKNOWN label ")
            gene_rows = df[df["gene_id"] == gene_id]
            gene_attributes = (
                gene_rows["attributes"].iloc[0] if not gene_rows.empty else "N/A"
            )
            temp_unk[gene_id] = gdt.gene_dict_impl.GeneGeneric(
                label=f"{organelle_type}-UNKNOWN",
                an_sources=[an],
                c=f"unknown gene_id from {an}{gff_suffix} | a: {gene_attributes}",
            )
            continue

        if candidates["feature_name"].nunique() > 1:
            logger.debug(
                f" {gene_id} | more than one feature with parent {gene_id} found, but with feature_name conflict"
            )
            logger.debug(
                f" {gene_id} | feature_names: {candidates['feature_name'].unique()}"
            )
            logger.debug(
                f" {gene_id} | choosing the first one: {candidates['feature_name'].iloc[0]}"
            )

        feature_name = candidates["feature_name"].iloc[0]
        child_type = candidates["type"].iloc[0]
        logger.debug(f" {gene_id} | possible feature_name: {feature_name}")
        for x in candidates.itertuples():
            logger.debug(f"\tt: {x.type} | p: {x.feature_name} | a: {x.attributes}")

        insertion_comment = f"insertion from missing_dbxref_unk_problems, source: {feature_name} | type: {child_type}"

        if feature_name in gene_dict:
            change_gene_dict = True
            label = gene_dict[feature_name].label
            logger.debug(
                f"\tlabel: {label}, gene_dict[feature_name]: {gene_dict[feature_name]}"
            )
            logger.debug(
                f"\t{gene_id} | inserting from missing_dbxref_unk_problems, source: {feature_name} | type: {child_type}\n"
            )
            gene_dict[gene_id] = gdt.gene_dict_impl.GeneGeneric(
                label=label,
                an_sources=[an],
                c=insertion_comment,
            )
            continue

        logger.debug(f"\tproduct not in gene_dict: {feature_name}\n")
        logger.debug("\tchecking in temp_unk")

        if feature_name in temp_unk:
            logger.debug(
                f"\t{gene_id} | found in temp_unk, inserting gene_id in there"
            )
            label = temp_unk[feature_name].label
        else:
            logger.debug(f"\t{gene_id} | not found in temp_unk, creating new entry")
            label_count += 1
            label = f"{organelle_type}-TEMP-{label_count}"
            temp_unk[feature_name] = gdt.gene_dict_impl.GeneDescription(
                label=label, source="MANUAL", c=None
            )

        # Both temp branches park the gene in temp_unk. The "found" branch
        # used to write to gene_dict instead, which contradicted its own log
        # message and put a TEMP-labelled gene into the main dictionary
        # without its description entry.
        temp_unk[gene_id] = gdt.gene_dict_impl.GeneGeneric(
            label=label,
            an_sources=[an],
            c=insertion_comment,
        )

2025-06-04 17:56:22,789 - DEBUG - missing_dbxref: matching probable 'child feature + parent gene' pair (2nd pass, on the an original gff3, using all the features)
2025-06-04 17:56:22,799 - DEBUG - AN: AP012291.1| gene_ids: ['gene-I-CreI']
2025-06-04 17:56:22,813 - DEBUG -  gene-I-CreI | possible feature_name: I-CreI
2025-06-04 17:56:22,824 - DEBUG - 	t: CDS | p: I-CreI | a: ID=cds-BBN51359.1;Parent=gene-I-CreI;Dbxref=NCBI_GP:BBN51359.1;Name=BBN51359.1;Note=in rrnL gene (intron)%3B~tag:HaboC_001;gbkey=CDS;gene=I-CreI;product=LAGLIDADG homing endonuclease;protein_id=BBN51359.1;transl_table=11
2025-06-04 17:56:22,829 - DEBUG - 	product not in gene_dict: I-CreI

2025-06-04 17:56:22,833 - DEBUG - 	checking in temp_unk
2025-06-04 17:56:22,833 - DEBUG - 	gene-I-CreI | not found in temp_unk, creating new entry
2025-06-04 17:56:22,838 - DEBUG - AN: AP014542.1| gene_ids: ['gene-I-CreI']
2025-06-04 17:56:22,858 - DEBUG -  gene-I-CreI | possible feature_name: I-CreI
2025-06-04 17:56:22,860 - DEBUG

In [13]:
# Write the parked (TEMP / UNKNOWN) entries to the next "TEMP_Mapping_##.gdt"
# file inside GDT_dir.
if temp_unk:
    # Latest existing mapping file, or the "..._00" placeholder when none exists.
    temp_path = get_most_recent_gdt(GDT_dir, prefix="TEMP_Mapping_")
    new_path, map_iteration = increment_gdt_file(temp_path)
    logger.info(
        f"Writing TEMP Mapping GDT file: {new_path} | Iteration: {map_iteration}"
    )
    # gdt_info is computed before gdt_header is added to temp_unk.
    temp_unk["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(temp_unk)
    temp_unk["gdt_header"] = [
        "version 0.0.2",
        f"TEMP Mapping - {map_iteration}",
        "Automagically generated by AN_missing_dbxref.ipynb | TEMP Mapping child features to parent genes",
    ]
    gdt.gene_dict_impl.write_gdt_file(temp_unk, new_path, overwrite=True)

2025-06-04 17:56:27,835 - INFO - Writing TEMP Mapping GDT file: /home/brenodupin/matheus/gdt/sandbox/algae_pt_test/misc/gdt/TEMP_Mapping_03.gdt | Iteration: 3


In [None]:
# TODO change header with number from increment_gdt_file
# Only write a new GDT version when the second pass actually inserted
# something into gene_dict.
if change_gene_dict:
    logger.debug("gene_dict changed, incrementing gdt file and writing it")
    most_recent_gdt_file, _ = increment_gdt_file(most_recent_gdt_file)
    # Refresh the summary entry so it reflects the new entries, as the other
    # write cells in this notebook do before writing.
    gene_dict["gdt_info"] = gdt.gene_dict_impl.get_gene_dict_info(gene_dict)
    # write_gdt_file is reached through gdt.gene_dict_impl everywhere else in
    # this notebook.
    gdt.gene_dict_impl.write_gdt_file(gene_dict, most_recent_gdt_file)

In [None]:
# Write a human-readable report about the gene_ids that are still unresolved,
# with copy-paste suggestions for the GDT file.
#
# NOTE(review): this cell used `missing_dbxref_path` and `really_unk_dict`,
# neither of which is defined anywhere in this notebook. The report is now
# written to MISC_DIR, and the "really unknown" ids are rebuilt as the
# unk_dict entries that are still missing from gene_dict — confirm that this
# matches the original intent.
summary_path = MISC_DIR / "unk_problems_summary.txt"
with open(summary_path, "w") as f1:
    for an, an_gene_ids in unk_dict.items():
        gene_ids = [gid for gid in an_gene_ids if gid not in gene_dict]
        if not gene_ids:
            continue

        f1.write(f"AN: {an} | gene_ids: {gene_ids}\n")
        an_path = DATA_DIR / f"{an}{gff_suffix}"
        df = gdt.gff3_utils.load_gff3(an_path, usecols=gdt.GFF3_COLUMNS)
        df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

        # Same ID extraction as the rest of the notebook (RE_ID) instead of
        # slicing off the first three characters of the attributes string.
        df["gene_id"] = df["attributes"].str.extract(RE_ID, expand=False)
        df["parent"] = df["attributes"].str.extract(RE_PARENT, expand=False)
        df["product"] = df["attributes"].str.extract(RE_PRODUCT, expand=False)

        for gene_id in gene_ids:
            candidates = df[df["parent"] == gene_id]
            gene = df[df["gene_id"] == gene_id]
            products = candidates["product"].unique()

            if len(candidates) >= 1 and len(products) == 1:
                product = products[0]
                # The gene row itself may be absent from df (e.g. filtered out).
                gene_att = gene.iloc[0]["attributes"] if not gene.empty else "N/A"
                f1.write(
                    f" {gene_id} | possible product: {product} | gene att: {gene_att}\n"
                )
                for x in candidates.itertuples():
                    f1.write(f"\tt: {x.type} | p: {x.product} | a: {x.attributes}\n")
                f1.write(
                    "\n\tif you can identify the product, please add the following lines to gene_dict:\n"
                )
                f1.write(
                    f"\t{product} #gd MANUAL #c manual insertion from missing_dbxref_unk_problems\n"
                )
                f1.write(
                    f"\t{gene_id} #gn {an} #c MANUAL from missing_dbxref_unk_problems\n\n"
                )

            else:
                f1.write(
                    f" {gene_id} | no features with Parent={gene_id} found OR more than one features found, but with conflicting products\n"
                )

                if len(candidates) >= 1:
                    # one candidate per line (the newlines were missing before)
                    f1.write(" candidates:\n")
                    for x in candidates.itertuples():
                        f1.write(
                            f"\tt: {x.type} | p: {x.product} | a: {x.attributes}\n"
                        )

                f1.write(
                    f"\n\tyou should problably look up this gene_id in its gff3 ({an}{gff_suffix}) to try to identify it\n"
                )
                f1.write(
                    "\tif you can't, probably discard it, by adding 'discard_' in its type column in the gff3 file\n\n"
                )
                f1.write(
                    "\tif you can identify it, please add the following line to gene_dict:\n"
                )
                f1.write(
                    f"\t{gene_id} #gn {an} #c MANUAL from missing_dbxref_unk_problems\n"
                )

logger.debug(
    f"More details about gene_ids with problems written to file '{summary_path}'"
)

More details about gene_ids with problems written to file 'unk_problems_summary.txt' in the missing_dbxref folder


In [None]:
# Unknown features that carry a Parent attribute.
features_unk_df[~features_unk_df["parent"].isna()]

Unnamed: 0,seqid,gene_id,name,product,gene,parent,note,feature_name


### Genes Discard using dbxref

In [14]:
# Prefix prepended to the GFF3 type column of every feature to be discarded.
remove_string = "discard-"
# File (looked up in MISC_DIR / "gdt") listing the gene_ids to discard.
genes_to_remove = "genome_features_to_remove_2.txt"

In [15]:
# NOTE(review): this replaces the genes_to_remove value set in the cell above
# — confirm which file is meant to be used.
genes_to_remove = "TEMP_Mapping_01.gdt"

In [16]:
# Build {accession: {gene_id, ...}} from the "<gene_id> #gn <AN> [#c ...]"
# lines of the removal file. Blank lines, lines starting with "#" or "[" and
# lines containing "#gd" are skipped.
remove_gene_ids = {}
with open(MISC_DIR / "gdt" / genes_to_remove, "r") as f:
    for line in f:
        skip_line = (
            not line.strip() or line.startswith(("#", "[")) or "#gd" in line
        )
        if skip_line:
            continue

        # drop the trailing "#c ..." comment, then split "<gene_id> #gn <AN>"
        entry = line.split("#c", 1)[0]
        gene_id, an = (part.strip() for part in entry.split("#gn", 1))

        if not (an and gene_id):
            raise ValueError(f"Error: {line} - wtf (TODO, remove)?")

        remove_gene_ids.setdefault(an, set()).add(gene_id)

In [18]:
# NOTE(review): this hard-coded mapping replaces whatever the previous cell
# parsed from the removal file — confirm it is not a leftover from testing.
remove_gene_ids = {"AP012291.1": {"gene-I-CreI"}}
remove_gene_ids

{'AP012291.1': {'gene-I-CreI'}}

In [20]:
# Mark the selected genes as discarded by prefixing their GFF3 type column
# with remove_string. Each GFF3 file is rewritten IN PLACE.
for an, gene_ids in remove_gene_ids.items():
    if not gene_ids:
        # An empty alternation would compile to a pattern that matches every
        # line and discard the whole file.
        print(f"{an} skipped, no gene_ids to discard")
        continue

    an_path = DATA_DIR / f"{an}{gff_suffix}"
    with open(an_path, "r") as f:
        lines = f.readlines()

    # Leading '#' lines are copied through untouched; the bounds check keeps a
    # header-only (or empty) file from raising IndexError.
    index = 0
    while index < len(lines) and lines[index].startswith("#"):
        index += 1
    headers = [header.strip() for header in lines[:index]]

    # re.escape: gene IDs can contain regex metacharacters such as '.', '+'
    # or '('. The ID attribute must start the attributes column or follow a
    # ';', and may be terminated by ';' or by the end of the column.
    alternatives = "|".join(re.escape(gene_id) for gene_id in sorted(gene_ids))
    pattern = re.compile(rf"(?:^|;)ID=(?:{alternatives})(?:;|$)")
    contents = []

    for line in lines[index:]:
        if not (line := line.strip()):
            continue
        fields = line.split("\t")

        # fields[2] is the type column, fields[8] the attributes column; lines
        # with fewer than nine columns (e.g. "###" separators) pass through.
        if (
            len(fields) >= 9
            and pattern.search(fields[8])
            and remove_string not in fields[2]
        ):
            fields[2] = remove_string + fields[2]

        contents.append("\t".join(fields))

    with open(an_path, "w") as f:
        f.write("\n".join(headers))
        f.write("\n")
        f.write("\n".join(contents))
        f.write("\n\n")

    print(f"{an} Done!")

AP012291.1 Done!
