In [35]:
import pandas as pd
import numpy as np
import os
import re

from pathlib import Path
import gdt

from Bio import Entrez

def increment_gdt_file(path: Path) -> tuple[Path, int]:
    """
    Build the path of the next GDT file in a numbered series.

    The file stem is split on "_" and its last piece is read as an integer,
    incremented by one and written back zero-padded to at least two digits.
    Example: fungi-ncbi_pilot_03.gdt -> fungi-ncbi_pilot_04.gdt

    Arguments:
        path (Path): Path to the current GDT file.
    Returns:
        tuple[Path, int]: The incremented path (same parent and suffix) and the new number.
    Raises:
        ValueError: If the last "_"-separated piece of the stem is not an integer.
    """
    # str.split always yields at least one item, so this unpacking cannot fail.
    *leading_parts, counter_text = path.stem.split("_")
    try:
        next_number = int(counter_text) + 1
    except ValueError:
        raise ValueError(
            f"Invalid GDT file name: {path.name}. Expected format: <preferred_name>_##.gdt, where ## is a number."
        )
    next_stem = "_".join([*leading_parts, f"{next_number:02d}"])
    return path.parent / f"{next_stem}{path.suffix}", next_number

def get_most_recent_gdt(dir_path: Path, prefix="TEMP_") -> Path:
    """
    Get the most recent (highest numbered) gdt file of a series in the directory.

    Only files named "<prefix><number>.gdt" are considered. A plain
    "<prefix>*.gdt" glob would also pick up other series that merely share the
    prefix (e.g. prefix "TEMP_" would match "TEMP_Symbol_01.gdt"), and the
    caller would then increment the wrong series.

    Arguments:
        dir_path (Path): Directory to search for GDT files.
        prefix (str): Prefix of the GDT files to search for. It must be
            immediately followed by the iteration number.
    Returns:
        Path: Path to the GDT file with the highest number, or
            "<dir_path>/<prefix>00.gdt" (which may not exist) when the
            directory holds no file of the series.
    """
    numbered_stem = re.compile(re.escape(prefix) + r"(\d+)")

    candidates = []
    for candidate in dir_path.glob(f"{prefix}*.gdt"):
        match = numbered_stem.fullmatch(candidate.stem)
        if match:
            # The stem is kept as a tie-breaker so "TEMP_1" vs "TEMP_01" is deterministic.
            candidates.append((int(match.group(1)), candidate.stem, candidate))

    if not candidates:
        return dir_path / f"{prefix}00.gdt"
    return max(candidates, key=lambda item: item[:2])[2]

In [7]:
# Defines all the global variables used in the notebook.
# Change these variables to match your local setup.
# The most_recent_gdt_file variable should be set to the name of the most recent GDT file
# (it is looked up inside <DATA_DIR>/misc/gdt by the validation cell),
# OR the stripped GDT file used in filter command, if applicable.

DATA_DIR = "../sandbox/algae_pt_test"
AN_missing_gene_dict = "../sandbox/algae_pt_test/AN_missing_gene_dict"
most_recent_gdt_file = "GDT_pilot_01.gdt"
remove_orfs = True
organelle_type = "PT"
gff_suffix = ".gff3"

# NCBI credentials come from the environment so that no secret is stored in the
# notebook. Export NCBI_API_KEY (and optionally NCBI_EMAIL) before starting the kernel.
# Without an API key Entrez.api_key stays None and the queries are sent without a key.
# NOTE(review): the API key that used to be hardcoded here is exposed in the
# notebook history and should be revoked.
Entrez.email = os.environ.get("NCBI_EMAIL", 'dupin@alunos.utfpr.edu.br')
Entrez.api_key = os.environ.get("NCBI_API_KEY")

In [None]:
# Check that every configured path exists and has the expected type,
# then create the misc/ and misc/gdt/ working directories.
DATA_DIR = Path(DATA_DIR).resolve()
AN_missing_gene_dict = Path(AN_missing_gene_dict).resolve()

# is_dir()/is_file() are False for missing paths AND for paths of the wrong type,
# so a single check covers both cases named in the error message.
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"Data directory {DATA_DIR} does not exist or is not a directory.")

if not AN_missing_gene_dict.is_file():
    raise FileNotFoundError(f"AN missing gene dictionary {AN_missing_gene_dict} does not exist or is not a file.")

MISC_DIR = DATA_DIR / "misc"
MISC_DIR.mkdir(exist_ok=True)
GDT_dir = MISC_DIR / "gdt"
GDT_dir.mkdir(exist_ok=True)

# Test for "not set" BEFORE joining with GDT_dir: a Path object is always truthy,
# and GDT_dir / "" is just GDT_dir, which would pass an existence check.
if not most_recent_gdt_file:
    print(f"If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable.")
    print(f"Otherwise, ignore this message.")
else:
    # Joining an already absolute path returns it unchanged, so re-running this cell is safe.
    most_recent_gdt_file = (GDT_dir / most_recent_gdt_file).resolve()
    if not most_recent_gdt_file.is_file():
        raise FileNotFoundError(f"Most recent GDT file {most_recent_gdt_file} does not exist or is not a file.")

In [5]:
# Create the notebook logger; the log file lives in MISC_DIR.
# Only the second item returned by logger_creater is used here.
_, logger = gdt.logger_setup.logger_creater(
    log_file=MISC_DIR / "01_test.log",
    console_level="DEBUG",
    file_level="TRACE",
)
logger.debug("Running from notebook AN_missing_gene_dict")

2025-05-30 15:04:57,404 - DEBUG - Logger setup complete. Logging to /home/brenodupin/matheus/gdt/sandbox/algae_pt_test/misc/01_test.log
2025-05-30 15:04:57,416 - DEBUG - Running from notebook AN_missing_gene_dict


In [6]:
# Read the accession numbers, one per line, dropping blank lines and
# surrounding whitespace.
with open(AN_missing_gene_dict, "r") as f:
    ANs = [entry for raw_line in f if (entry := raw_line.strip())]
print(f"len(ANs): {len(ANs)}")

len(ANs): 176


In [50]:
def data_process(df_missing, AN, gene_dict, temp_gene_dict,
                 organelle_type, temp_count, logger, use_NCBI_symbol=False,
                 use_gene=False, temp_name='temp_desc', c_text='ncbi_desc', gn_tag='NCBI'):
    """
    Process the data in the dataframe and update the gene_dict and corresponding temp_gene_dict.

    For every row a lookup key is chosen (gff gene, NCBI symbol or NCBI description):
      * key found in gene_dict      -> the row's gene_id is added to gene_dict with that label;
      * key found in temp_gene_dict -> the row's gene_id is added to temp_gene_dict with that label;
      * key unknown                 -> a new '<organelle_type>-TEMP-<n>' label is created in
                                       temp_gene_dict for the key, and the gene_id is added under it.
    Rows whose lookup key is missing (NaN/None, e.g. no NCBI record was merged in)
    are skipped with a warning instead of being stored under a NaN key.

    Args:
        df_missing (pd.DataFrame): DataFrame containing the missing genes. Rows must expose
            gene_id, dbxref, start, attributes, other_aliases, desc and gene_symbol
            (plus gene when use_gene is True).
        AN (str): The accession number.
        gene_dict (dict): Dictionary containing gene information. Updated in place.
        temp_gene_dict (dict): Temporary dictionary for gene information. Updated in place.
        organelle_type (str): Type of organelle, used as the prefix of new TEMP labels.
        temp_count (int): Counter for temporary labels.
        logger: Logger object for logging messages (debug, trace and warning are used).
        use_NCBI_symbol (bool): Flag to indicate whether to use NCBI gene symbol or NCBI description. Default is False, which means use NCBI description.
        use_gene (bool): Flag to indicate whether to use gff gene or not. Default is False, which means use what was set with use_NCBI_symbol.
        temp_name (str): Name for the temporary dictionary, only used in log messages. Default is 'temp_desc'.
        c_text (str): Prefix of the comment stored with each GeneDbxref. Default is 'ncbi_desc'.
        gn_tag (str): Source stored in newly created GeneDescription entries. Default is 'NCBI'.
    Returns:
        tuple: Updated gene_dict, temp_gene_dict, and temp_count.
    """
    for row in df_missing.itertuples():
        check_var = row.gene_symbol if use_NCBI_symbol else row.desc
        check_var = row.gene if use_gene else check_var
        check_desc = f'{check_var} | ncbi_desc: {row.desc}' if use_NCBI_symbol else check_var

        logger.debug(f'gene_id: {row.gene_id} | dbxref: {row.dbxref} | s: {row.start} | att: {row.attributes}')
        logger.trace(f'\tname: {row.other_aliases} | desc: {row.desc} | gene_symbol: {row.gene_symbol}')
        logger.trace(f'\tcheck_var: {check_var} | use_symbol: {use_NCBI_symbol} | use_gene: {use_gene}')

        # A NaN key never compares equal to itself, so every such row would get its
        # own TEMP label and a NaN key would end up in the dictionary.
        if pd.isna(check_var):
            logger.warning(f'\tSkipping {row.gene_id} #dx {AN}:{row.dbxref} - no value to look up (check_var is missing)')
            continue

        if check_var in gene_dict:
            gene_label = gene_dict[check_var].label
            target_dict = gene_dict
            logger.debug(f'\t[1st T]Label in gene_dict, L: |{gene_label}| adding: {row.gene_id} #dx {AN}:{row.dbxref} #c {c_text}: {check_desc}')
        else:
            logger.trace(f'\t[1st F]Label not found gene_dict | checking {temp_name} | Label: {check_var}')
            target_dict = temp_gene_dict

            if check_var in temp_gene_dict:
                gene_label = temp_gene_dict[check_var].label
                logger.debug(f'\t[2nd T]Label in {temp_name}, L: |{gene_label}| adding: {row.gene_id} #dx {AN}:{row.dbxref} #c {c_text}: {check_desc}')
            else:
                temp_count += 1
                gene_label = f'{organelle_type}-TEMP-{temp_count}'
                logger.debug(f'\t[2nd F]Label not in {temp_name}, new label |{gene_label}| adding: {row.gene_id} #dx {AN}:{row.dbxref} #c {c_text}: {check_desc}')
                # Register the key itself first so later rows with the same key reuse the label.
                temp_gene_dict[check_var] = gdt.gene_dict_impl.GeneDescription(
                    label=gene_label,
                    source=gn_tag,
                    c=None)

        # The gene_id entry is built the same way in all three cases; only the
        # label and the destination dictionary differ.
        target_dict[row.gene_id] = gdt.gene_dict_impl.GeneDbxref(
            label=gene_label,
            an_source=AN,
            dbxref=row.dbxref,
            c=f'{c_text}: {check_desc}')

    return gene_dict, temp_gene_dict, temp_count

### TEMP using NCBI description

In [9]:
# Load the GDT file (or start from scratch when none is configured) and show
# its header and summary lines.
if not most_recent_gdt_file:
    gene_dict = {}
    print("No GDT file found, starting with an empty gene_dict.")
else:
    gene_dict = gdt.gene_dict_impl.create_gene_dict(most_recent_gdt_file, max_an_sources=0)
    print(f"Loaded gene_dict from {most_recent_gdt_file}\nHeader:")
    for header_line in gene_dict['gdt_header']:
        print(header_line)
    print("\nGDT Info:")
    for info_line in gene_dict['gdt_info']:
        print(info_line)

# Fresh temporary dictionary for the TEMP process.
temp_gene_dict = {}

Loaded gene_dict from /home/brenodupin/matheus/gdt/sandbox/algae_pt_test/misc/gdt/GDT_pilot_01.gdt
Header:
version 0.0.2
Green_Algae_pt
2025-04-09 18:56 - Conversion from gdt to gdt2
2025-05-28 16:24 - Stripped GDT version from original GDT file Green_Algae_pt.gdt
Data added from TEMP 01

GDT Info:
Gene dictionary length: 26114
Label: 247
GeneDescription: 1702
GeneGenerics: 0
GeneDbxref: 24412


In [9]:
# Patterns applied to the GFF3 attributes column (";"-separated key=value pairs).
RE_ID = re.compile(r'ID=([^;]+)')
# Anchored to the start of an attribute so that "pseudogene=..." is not read as "gene=...".
RE_GENE = re.compile(r'(?:^|;)gene=([^;]+)')
# Dbxref can be a comma-separated list (e.g. "Dbxref=GeneID:123,Genbank:YP_1"), so the
# capture stops at "," as well as ";" and yields only the GeneID value.
# NOTE(review): when a Dbxref lists several GeneIDs only the first one is captured.
RE_DBXREF = re.compile(r'Dbxref=GeneID:([^;,]+)')
# Fallback: GeneID anywhere in the attributes (e.g. not first in the Dbxref list).
RE_DBXREF2 = re.compile(r'GeneID:([^;,]+)')

In [None]:
# TEMP process keyed by NCBI description.
# For every accession number: load its GFF3, find the features whose ID is not in
# gene_dict, label the ones that carry a gff "gene" attribute directly, and look the
# remaining ones up in NCBI Gene (by GeneID) to label them by description.
temp_gene_dict = {}
temp_count = 0
errors = []
logger.info(' ---- [Starting TEMP process] ----')
for AN in ANs:
    logger.debug(f'-- [Processing: {AN}] --')

    an_path = DATA_DIR / f'{AN}{gff_suffix}'
    if not an_path.exists():
        logger.error(f'Error: {AN} does not exist (an_path: {an_path})')
        errors.append((AN, 'File not found'))
        continue

    df = gdt.gff3_utils.load_gff3(an_path, query_string=gdt.gff3_utils.QS_GENE_TRNA_RRNA)
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    # getting the gene_id and if it is in the gene_dict
    df['gene_id'] = df['attributes'].str.extract(RE_ID, expand=False)
    df['gene'] = df['attributes'].str.extract(RE_GENE, expand=False)
    df['in_gene_dict'] = df['gene_id'].isin(gene_dict)
    df['has_gene'] = df['gene'].notna()

    # placeholder for adding parent dbxref to child genes
    df['dbxref'] = df['attributes'].str.extract(RE_DBXREF, expand=False)
    # TODO, deal with multiple GeneID in dbxref

    df_missing = df[~df['in_gene_dict'] & ~df['has_gene']].copy()
    df_gene = df[~df['in_gene_dict'] & df['has_gene']].copy()
    # Row count before the NCBI merge, used by the consistency check further down.
    n_missing_before_merge = len(df_missing)

    if not df_gene.empty:
        logger.debug(f'Found {len(df_gene)} genes in {AN} that are not in the gene_dict, but have a gene attribute.')
        # data_process logs these NCBI columns for every row; there is no NCBI data on this path.
        df_gene[['other_aliases', 'desc', 'gene_symbol']] = np.nan
        gene_dict, temp_gene_dict, temp_count = data_process(
            df_gene, AN, gene_dict, temp_gene_dict, organelle_type,
            temp_count, logger, use_gene=True, temp_name='temp_gene', c_text='gff_gene', gn_tag='gff_gene')
    else:
        logger.debug(f'No genes found in {AN} that are not in the gene_dict, but have a gene attribute.')

    if df_missing.empty:
        logger.debug(f'No missing genes in {AN} that are not in the gene_dict and do not have a gene attribute.')
        continue

    # two step method to extract dbxref, first try to get the full dbxref,
    # if not all genes are numeric and not NaN, fallback to GeneID,
    # check again if all genes are numeric and not NaN.
    # if not, record the error and skip this accession
    if df_missing['dbxref'].isna().any() or not df_missing['dbxref'].str.isnumeric().all():
        logger.warning(f'Error in {AN} - dbxref is not numeric or contains NaN')
        logger.debug('\ttrying only "GeneID:"')

        df_missing['dbxref'] = df_missing['attributes'].str.extract(RE_DBXREF2, expand=False)
        if df_missing['dbxref'].isna().any() or not df_missing['dbxref'].str.isnumeric().all():
            logger.error(f'\tError in {AN} - dbxref is not numeric or contains NaN x2')
            errors.append((AN, 'NaN or not numeric'))
            continue
        logger.debug('\tSuccess! - continuing')

    # search NCBI
    # The request itself is inside the try: a network/HTTP failure for one accession
    # is recorded in `errors` instead of aborting the whole run.
    try:
        with Entrez.esummary(db="gene", id=",".join(df_missing['dbxref'])) as search_handle:
            search_results = Entrez.read(search_handle)['DocumentSummarySet']['DocumentSummary'] # type: ignore
    except Exception as ex:
        logger.error(f'{ex} in Entrez.esummary/Entrez.read for {AN}')
        errors.append((AN, 'Entrez.read'))
        continue

    # An empty result would give a DataFrame without a 'dbxref' column and break the merge.
    if not search_results:
        logger.error(f'No NCBI records returned for {AN}')
        errors.append((AN, 'Entrez empty result'))
        continue

    mr_check = len(df_missing) == len(search_results)
    logger.trace(f"\tm: {len(df_missing)} | r: {len(search_results)} | m/r check: {mr_check}") # type: ignore

    # merge with search_results
    temp_df = pd.DataFrame([{
        'dbxref': x.attributes['uid'],
        'other_aliases': x.get('OtherAliases', 'no_other_aliases'),
        'desc': x.get('Description', 'no_description'),
        'gene_symbol': x.get('Name', 'no_gene_symbol')
        } for x in search_results])

    df_missing = df_missing.merge(temp_df, on='dbxref', how='left')

    # check if df_missing len is equal to search_results, and equal to the original df
    if (len(df_missing) != len(temp_df)) or (len(df_missing) != n_missing_before_merge):
        logger.warning(f"{AN} m/r check: {mr_check} | df_missing len {len(df_missing)} | temp_df len {len(temp_df)} | df len {n_missing_before_merge}")
        logger.warning(f'This is not expected, but can be caused by fragmented genes that have the same dbxref/gene_id. Please check the log file for more details in TRACE level.')

    # process the data
    gene_dict, temp_gene_dict, temp_count = data_process(df_missing, AN, gene_dict,
                                                         temp_gene_dict, organelle_type,
                                                         temp_count, logger)


logger.info(f' ---- [Finished] ----')
if errors:
    logger.warning(f'Errors: {len(errors)}')
    for an, msg in errors:
        logger.warning(f'{an} - {msg}')

In [None]:
# Write the TEMP dictionary to the next file of the "TEMP_" series in GDT_dir.
# Nothing is written when the TEMP process produced no entries.
if temp_gene_dict:
    temp_path = get_most_recent_gdt(GDT_dir, prefix="TEMP_")
    # new_path and nth_iteration are notebook globals; the GDT-writing cells reassign them.
    new_path, nth_iteration = increment_gdt_file(temp_path)
    logger.info(f'Writing TEMP GDT file: {new_path} | Iteration: {nth_iteration}')
    # gdt_info is computed before gdt_header is stored in the dictionary.
    temp_gene_dict['gdt_info'] = gdt.gene_dict_impl.get_gene_dict_info(temp_gene_dict)
    temp_gene_dict['gdt_header'] = ['version 0.0.2', f'TEMP - {nth_iteration}', 'Automagically generated by AN_missing_gene_dict.ipynb | TEMP using NCBI gene description']
    gdt.gene_dict_impl.write_gdt_file(temp_gene_dict, new_path, overwrite=True)

In [None]:
# Write the updated gene_dict as the next file of the main GDT series
# (the file name is derived from most_recent_gdt_file, not from the TEMP series).
# NOTE(review): re-running this cell appends another header line and rewrites the file.
if gene_dict:
    new_path, nth_iteration = increment_gdt_file(most_recent_gdt_file)
    # This cell writes the main GDT file, so the message no longer calls it a TEMP file.
    logger.info(f'Writing GDT file: {new_path} | Iteration: {nth_iteration}')
    gene_dict['gdt_info'] = gdt.gene_dict_impl.get_gene_dict_info(gene_dict)
    gene_dict['gdt_header'].append(f'Data added from TEMP {nth_iteration:02}')
    gdt.gene_dict_impl.write_gdt_file(gene_dict, new_path, overwrite=True)

### TEMP using NCBI Symbol

In [51]:
# Collect the accession numbers referenced in the seed file.
# Each data line is expected to contain "#dx <AN>:<dbxref>"; blank lines, lines starting
# with "#" or "[", and lines containing "#gd" are ignored.
ANS_Symbol = set()

with open(MISC_DIR / "seed_TEMP_Symbol.txt", "r") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()

        if not line or line.startswith('#') or line.startswith('[') or '#gd' in line:
            continue

        before_dx, marker, after_dx = line.partition('#dx')
        if not marker:
            # Fail with the offending line instead of a bare IndexError.
            raise ValueError(f"seed_TEMP_Symbol.txt line {line_no}: expected a '#dx <AN>:<dbxref>' field, got: {line}")

        ANS_Symbol.add(after_dx.strip().split(':', 1)[0])

In [52]:
# If you are running this right after the TEMP process, most_recent_gdt_file must be
# updated to point to the GDT_pilot file created by that process.
# NOTE(review): this relies on nth_iteration still holding the value set by the cell
# that wrote the GDT_pilot file; on a fresh kernel nth_iteration is undefined.
most_recent_gdt_file = GDT_dir / f'GDT_pilot_{nth_iteration:02}.gdt'

In [5]:

# Reload gene_dict from the current GDT file (empty dict when none is set)
# and start a fresh TEMP Symbol dictionary.
gene_dict = (
    gdt.gene_dict_impl.create_gene_dict(most_recent_gdt_file, max_an_sources=0)
    if most_recent_gdt_file
    else {}
)

temp_symbol_gene_dict = {}

In [None]:
# TEMP process keyed by NCBI gene symbol.
# For every accession number from the seed file: load its GFF3, find the features whose
# ID is not in gene_dict, look them up in NCBI Gene (by GeneID) and label them by symbol.
temp_symbol_gene_dict = {}
temp_count = 0
errors = []
logger.info(' ---- [Starting TEMP process] ----')
# ANS_Symbol is a set: iterate in sorted order so the TEMP-<n> numbering is the same
# on every run (set iteration order of strings changes between kernel sessions).
for AN in sorted(ANS_Symbol):
    logger.debug(f'-- [Processing: {AN}] --')

    an_path = DATA_DIR / f'{AN}{gff_suffix}'
    if not an_path.exists():
        logger.error(f'Error: {AN} does not exist (an_path: {an_path})')
        errors.append((AN, 'File not found'))
        continue

    df = gdt.gff3_utils.load_gff3(an_path, query_string=gdt.gff3_utils.QS_GENE_TRNA_RRNA)
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    # getting the gene_id and if it is in the gene_dict
    df['gene_id'] = df['attributes'].str.extract(RE_ID, expand=False)
    df['in_gene_dict'] = df['gene_id'].isin(gene_dict)
    df_missing = df[~df['in_gene_dict']].copy()
    # Row count before the NCBI merge, used by the consistency check further down.
    n_missing_before_merge = len(df_missing)

    # two step method to extract dbxref, first try to get the full dbxref,
    # if not all genes are numeric and not NaN, fallback to GeneID,
    # check again if all genes are numeric and not NaN.
    # if not, record the error and skip this accession
    df_missing['dbxref'] = df_missing['attributes'].str.extract(RE_DBXREF, expand=False)
    if df_missing['dbxref'].isna().any() or not df_missing['dbxref'].str.isnumeric().all():
        logger.warning(f'Error in {AN} - dbxref is not numeric or contains NaN')
        logger.debug('\ttrying only "GeneID:"')

        df_missing['dbxref'] = df_missing['attributes'].str.extract(RE_DBXREF2, expand=False)
        if df_missing['dbxref'].isna().any() or not df_missing['dbxref'].str.isnumeric().all():
            logger.error(f'\tError in {AN} - dbxref is not numeric or contains NaN x2')
            errors.append((AN, 'NaN or not numeric'))
            continue
        logger.debug('\tSuccess! - continuing')

    # search NCBI
    # The request itself is inside the try: a network/HTTP failure for one accession
    # is recorded in `errors` instead of aborting the whole run.
    try:
        with Entrez.esummary(db="gene", id=",".join(df_missing['dbxref'])) as search_handle:
            search_results = Entrez.read(search_handle)['DocumentSummarySet']['DocumentSummary'] # type: ignore
    except Exception as ex:
        logger.error(f'{ex} in Entrez.esummary/Entrez.read for {AN}')
        errors.append((AN, 'Entrez.read'))
        continue

    # An empty result would give a DataFrame without a 'dbxref' column and break the merge.
    if not search_results:
        logger.error(f'No NCBI records returned for {AN}')
        errors.append((AN, 'Entrez empty result'))
        continue

    mr_check = len(df_missing) == len(search_results)
    logger.trace(f"\tm: {len(df_missing)} | r: {len(search_results)} | m/r check: {mr_check}") # type: ignore

    # merge with search_results
    temp_df = pd.DataFrame([{
        'dbxref': x.attributes['uid'],
        'other_aliases': x.get('OtherAliases', 'no_other_aliases'),
        'desc': x.get('Description', 'no_description'),
        'gene_symbol': x.get('Name', 'no_gene_symbol')
        } for x in search_results])

    df_missing = df_missing.merge(temp_df, on='dbxref', how='left')

    # check if df_missing len is equal to search_results, and equal to the original df
    if (len(df_missing) != len(temp_df)) or (len(df_missing) != n_missing_before_merge):
        logger.warning(f'{AN} m/r check: {mr_check} | df_missing len {len(df_missing)} | temp_df len {len(temp_df)} | df len {n_missing_before_merge}')
        logger.warning(f'This is not expected, but can be caused by fragmented genes that have the same dbxref/gene_id. Please check the log file for more details in TRACE level.')

    # process the data
    gene_dict, temp_symbol_gene_dict, temp_count = data_process(df_missing, AN, gene_dict,
                                                         temp_symbol_gene_dict, organelle_type,
                                                         temp_count, logger, temp_name='temp_symbol',
                                                         use_NCBI_symbol=True, c_text='ncbi_symbol')


logger.info(f' ---- [Finished] ----')
if errors:
    logger.warning(f'Errors: {len(errors)}')
    for an, msg in errors:
        logger.warning(f'{an} - {msg}')

In [None]:
# Write the TEMP Symbol dictionary to the next file of the "TEMP_Symbol_" series in GDT_dir.
# Nothing is written when the TEMP Symbol process produced no entries.
if temp_symbol_gene_dict:
    temp_path = get_most_recent_gdt(GDT_dir, prefix="TEMP_Symbol_")
    # new_path and nth_iteration are notebook globals; the GDT-writing cells reassign them.
    new_path, nth_iteration = increment_gdt_file(temp_path)
    logger.info(f'Writing TEMP Symbol GDT file: {new_path} | Iteration: {nth_iteration}')
    # gdt_info is computed before gdt_header is stored in the dictionary.
    temp_symbol_gene_dict['gdt_info'] = gdt.gene_dict_impl.get_gene_dict_info(temp_symbol_gene_dict)
    temp_symbol_gene_dict['gdt_header'] = ['version 0.0.2', f'TEMP Symbol - {nth_iteration}', 'Automagically generated by AN_missing_gene_dict.ipynb | TEMP Symbol using NCBI gene symbol']
    gdt.gene_dict_impl.write_gdt_file(temp_symbol_gene_dict, new_path, overwrite=True)

In [None]:
# Write the updated gene_dict as the next file of the main GDT series
# (the file name is derived from most_recent_gdt_file, not from the TEMP Symbol series).
# NOTE(review): re-running this cell appends another header line and rewrites the file.
if gene_dict:
    new_path, nth_iteration = increment_gdt_file(most_recent_gdt_file)
    # This cell writes the main GDT file, so the message no longer calls it a TEMP file.
    logger.info(f'Writing GDT file: {new_path} | Iteration: {nth_iteration}')
    gene_dict['gdt_info'] = gdt.gene_dict_impl.get_gene_dict_info(gene_dict)
    gene_dict['gdt_header'].append(f'Data added from TEMP {nth_iteration:02}')
    gdt.gene_dict_impl.write_gdt_file(gene_dict, new_path, overwrite=True)

### Genes Discard using dbxref

In [None]:
# Settings for the discard step.
# remove_string: prefix prepended to the GFF type column of every feature to discard.
remove_string = 'discard-'
# genes_to_remove: path of the text file listing the genes to discard, one per line,
# in the form "<gene_id> #dx <AN>:<dbxref> #c <comment>".
genes_to_remove = "gene_to_remove"

gene-J2C28_mgp19 #dx NC_053825.1:63373456 #c ncbi_desc: hypothetical protein
gene-J2C28_mgp16 #dx NC_053825.1:63373460 #c ncbi_desc: hypothetical protein

In [None]:
# Parse the genes_to_remove file into {accession number: [dbxref, ...]}.
# Each non-blank line must look like "<gene_id> #dx <AN>:<dbxref> #c <comment>".
remove_dbxref = {}
with open(genes_to_remove, "r") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue

        # Drop the trailing "#c ..." comment first, then take what follows "#dx".
        before_dx, marker, after_dx = line.split("#c", 1)[0].partition("#dx")
        if not marker:
            raise ValueError(f"Error: line {line_no}: {line.strip()} - missing '#dx <AN>:<dbxref>' field")

        parts = after_dx.strip().split(":")
        if len(parts) != 2:
            raise ValueError(f"Error: line {line_no}: {line.strip()} - expected exactly one ':' in '<AN>:<dbxref>'")
        an, dbxref = parts

        if not an or not dbxref:
            raise ValueError(f"Error: {line} - AN and dbxref are empty after split, why?")

        remove_dbxref.setdefault(an, []).append(dbxref)

In [None]:
# Display the parsed {accession number: [dbxref, ...]} mapping for a visual check.
remove_dbxref

In [None]:
# Mark the listed genes as discarded directly in each GFF file: every feature whose
# attributes mention one of the GeneIDs gets remove_string prepended to its type column.
# WARNING: the GFF files are rewritten in place.
for an in remove_dbxref.keys():
    an_path = DATA_DIR / f'{an}{gff_suffix}'
    with open(an_path, 'r') as f:
        lines = f.readlines()

    # Leading "#" lines form the header; the bound check covers files that
    # contain nothing but "#" lines (or nothing at all).
    headers, index = [], 0
    while index < len(lines) and lines[index].startswith('#'):
        headers.append(lines[index].strip())
        index += 1

    # IDs are escaped before being placed in the regex. The ID must be followed by
    # "," or ";" or the end of the attributes, so GeneID:12 does not match GeneID:123.
    gene_ids = '|'.join(re.escape(x) for x in remove_dbxref[an])
    pattern = re.compile(rf'GeneID:(?:{gene_ids})(?:[,;]|$)')
    contents = []

    for line in lines[index:]:
        if not (line := line.strip()): continue
        fields = line.split('\t')

        # Directive/comment lines in the body (e.g. "###") and malformed rows do not
        # have the 9 GFF columns: keep them unchanged.
        if len(fields) < 9:
            contents.append(line)
            continue

        # The "not in" test keeps re-runs from stacking the prefix.
        if pattern.search(fields[8]) and remove_string not in fields[2]:
            fields[2] = remove_string + fields[2]

        contents.append('\t'.join(fields))

    with open(an_path, 'w') as f:
        f.write('\n'.join(headers))
        f.write('\n')
        f.write('\n'.join(contents))
        f.write('\n\n')

    print("Done!")

In [10]:
# Write gene_dict through gdt.write_gdt_file_sorted, replacing any existing file.
gdt.write_gdt_file_sorted(
    gene_dict,
    GDT_dir / 'pilot_01_sorted.gdt',
    overwrite=True,
)