In [1]:
import pandas as pd
import numpy as np
import os

from pathlib import Path
import gdt

from Bio import Entrez

In [2]:
# Iteration number of this run. It is checked during config validation (a GDT file
# is required when it is greater than 1) and used to number the output .gdt files.
nth_iteration = 1

In [3]:
# Defines all the global variables used in the script.
# Change these variables to match your local setup.
# The most_recent_gdt_file variable should be set to the path of the most recent GDT file,
# OR the stripped GDT file used in filter command, if applicable.

# Directory holding one "<AN><gff_suffix>" file per accession number.
DATA_DIR = "../test/Test_group16"
# Text file listing the accession numbers to process, one per line.
AN_missing_gene_dict = "../test/Test_group16/AN_missing_gene_dict"
#most_recent_gdt_file = "../test/Test_group16/Test_group16.gdt"
# When True, the loaded genes are passed through gdt.gff3_utils.filter_orfs.
remove_orfs = True
# Prefix of the temporary labels ("<organelle_type>-TEMP-<n>").
organelle_type = "MT"
gff_suffix = ".gff3"

Entrez.email = 'dupin@alunos.utfpr.edu.br'
# Credentials must not live in the notebook: the API key is read from the
# NCBI_API_KEY environment variable. When the variable is unset the key stays
# None and the requests are sent without an API key.
# NOTE(review): a key was previously committed here in plain text; it should be
# revoked in the NCBI account settings.
Entrez.api_key = os.environ.get("NCBI_API_KEY")

In [4]:
# Path to the most recent (or stripped) GDT file. An empty string means
# "no GDT file": later cells then start from an empty gene_dict.
most_recent_gdt_file = ""

In [5]:
# Check that every configured path exists and is of the expected kind.
DATA_DIR = Path(DATA_DIR).resolve()
AN_missing_gene_dict = Path(AN_missing_gene_dict).resolve()

# is_dir()/is_file() return False for missing paths too, so a single check covers
# both "does not exist" and "exists but is the wrong kind of entry".
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"Data directory {DATA_DIR} does not exist or is not a directory.")

if not AN_missing_gene_dict.is_file():
    raise FileNotFoundError(f"AN missing gene dictionary {AN_missing_gene_dict} does not exist or is not a file.")

if not most_recent_gdt_file:
    # No GDT file configured: only acceptable on the first iteration.
    if nth_iteration > 1:
        raise FileNotFoundError(
            f"most_recent_gdt_file is not set, but a GDT file is required when nth_iteration > 1 "
            f"(nth_iteration = {nth_iteration})."
        )
    print("If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable.")
    print("Otherwise, ignore this message.")
else:
    most_recent_gdt_file = Path(most_recent_gdt_file).resolve()
    if not most_recent_gdt_file.is_file():
        raise FileNotFoundError(f"Most recent GDT file {most_recent_gdt_file} does not exist or is not a file.")

If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable.
Otherwise, ignore this message.


In [6]:
# Create the logger used by all later cells. logger_creater returns two values;
# only the second one (the logger) is kept. Messages go to "test.log" and the
# console level is set to DEBUG.
_, logger = gdt.logger_setup.logger_creater(log_file="test.log", console_level="DEBUG")
logger.info("Running from notebook AN_missing_gene_dict")

2025-05-06 19:32:00,032 - DEBUG - Logger setup complete. Logging to /home/brenodupin/matheus/gdt/notebooks/test.log
2025-05-06 19:32:00,036 - INFO - Running from notebook AN_missing_gene_dict


In [7]:
# Start from the configured GDT file when there is one; otherwise begin with an
# empty gene dictionary.
gene_dict = (
    gdt.gene_dict.create_gene_dict(most_recent_gdt_file, max_an_sources=0)
    if most_recent_gdt_file
    else {}
)

# Holds the temporary labels created later in the notebook.
temp_gene_dict = dict()

In [8]:
# Read the accession numbers: one per line, surrounding whitespace removed,
# blank lines dropped.
with open(AN_missing_gene_dict, "r") as f:
    ANs = [entry for entry in map(str.strip, f) if entry]
print(f"len(ANs): {len(ANs)}")

len(ANs): 5


In [9]:
def data_process(df_missing, AN, gene_dict, temp_gene_dict, organelle_type, temp_count, logger):
    """Assign a label to every gene in ``df_missing`` based on its NCBI description.

    For each row, the description (``row.desc``) is looked up:

    1. in ``gene_dict``: the gene id is added to ``gene_dict`` with the label found;
    2. otherwise in ``temp_gene_dict``: the gene id is added to ``temp_gene_dict``
       with the label found;
    3. otherwise a new ``"<organelle_type>-TEMP-<n>"`` label is created, and both
       the description and the gene id are added to ``temp_gene_dict``.

    Rows without a description (NaN/None, e.g. a gene that got no match in the
    left merge with the NCBI results) are skipped with a warning, so that a
    missing value is never used as a dictionary key.

    Args:
        df_missing: DataFrame whose rows expose ``gene_id``, ``dbxref``, ``start``,
            ``attributes``, ``other_aliases``, ``desc`` and ``gene_symbol``.
        AN: accession number the rows belong to; stored as ``an_source``.
        gene_dict: existing gene dictionary; updated in place.
        temp_gene_dict: dictionary of temporary labels; updated in place.
        organelle_type: prefix used for new temporary labels.
        temp_count: number of temporary labels created so far.
        logger: logger receiving the debug/warning messages.

    Returns:
        The tuple ``(gene_dict, temp_gene_dict, temp_count)`` with the updated
        dictionaries and counter.
    """
    for row in df_missing.itertuples():
        logger.debug(f'gene_id: {row.gene_id} | dbxref: {row.dbxref} | s: {row.start} | att: {row.attributes}')
        logger.debug(f'\tname: {row.other_aliases} | desc: {row.desc} | gene_symbol: {row.gene_symbol}')

        # A missing description cannot be matched to, or turned into, a label.
        if pd.isna(row.desc):
            logger.warning(f'\tNo NCBI description for {row.gene_id} # {AN}:{row.dbxref} | skipping')
            continue

        if row.desc in gene_dict:
            gene_label = gene_dict[row.desc].label
            target_dict = gene_dict
            logger.debug(f'\t[1st T]Label in gene_dict, adding: {row.gene_id} # {AN}:{row.dbxref} | Label: {gene_label}')

        else:
            logger.debug(f'\t[1st F]Label not found gene_dict | checking temp_gene_dict | Label: {row.desc}')
            target_dict = temp_gene_dict

            if row.desc in temp_gene_dict:
                gene_label = temp_gene_dict[row.desc].label
                logger.debug(f'\t[2nd T]Label in temp_gene_dict, adding: {row.gene_id} # {AN}:{row.dbxref} | Label: {gene_label}')
            else:
                logger.debug(f'\t[2nd F]Label not in temp_gene_dict, creating new label | Label: {row.desc}')
                temp_count += 1
                gene_label = f'{organelle_type}-TEMP-{temp_count}'

                logger.debug(f'\tNew Label {gene_label} to {row.desc} # {AN}:{row.dbxref}')
                temp_gene_dict[row.desc] = gdt.gene_dict.GeneDescription(
                    label=gene_label,
                    source="NCBI",
                    c=None)

        # Single place where the gene id -> label entry is created, whichever
        # dictionary it ends up in.
        target_dict[row.gene_id] = gdt.gene_dict.GeneDbxref(
            label=gene_label,
            an_source=AN,
            dbxref=row.dbxref,
            c=f'ncbi_desc: {row.desc}')

    return gene_dict, temp_gene_dict, temp_count

In [10]:
# Main TEMP pass: for every accession number, find the genes that are not yet in
# gene_dict, fetch their NCBI gene summaries and label them through data_process.
# Accession numbers that cannot be processed are collected in `errors` and
# reported at the end instead of aborting the run.
temp_gene_dict = {}
temp_count = 0
errors = []
logger.info('Starting TEMP process')
for AN in ANs:
    logger.debug(f'-- [Processing: {AN}] --')
    
    an_path = DATA_DIR / f'{AN}{gff_suffix}'
    if not an_path.exists():
        logger.error(f'Error: {AN} does not exist (an_path: {an_path})')
        errors.append((AN, 'File not found'))
        continue
    
    df = gdt.gff3_utils.load_gff3(an_path, query_string='type == "gene"')
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    # getting the gene_id and if it is in the gene_dict
    df['gene_id'] = df['attributes'].str.split(';').str[0].str.replace('ID=', '', regex=False)
    df['in_gene_dict'] = df['gene_id'].isin(gene_dict)
    df_missing = df[~df['in_gene_dict']].copy()

    # Nothing to look up for this AN: skip it rather than sending an empty id
    # list to NCBI.
    if df_missing.empty:
        logger.debug(f'\tNo missing genes in {AN} - skipping')
        continue

    # two step method to extract dbxref, first try to get the full dbxref,
    # if not all genes are numeric and not NaN, fallback to GeneID,
    # check again if all genes are numeric and not NaN.
    # if not, raise an error
    df_missing['dbxref'] = df_missing['attributes'].str.extract(r'Dbxref=GeneID:([^;]*)(?:;|$)')
    if df_missing['dbxref'].isna().any() or not df_missing['dbxref'].str.isnumeric().all():
        logger.warning(f'Error in {AN} - dbxref is not numeric or contains NaN')
        logger.debug('\ttrying only "GeneID:"')
        
        df_missing['dbxref'] = df_missing['attributes'].str.extract(r'GeneID:([^;]*)(?:;|$)')
        if df_missing['dbxref'].isna().any() or not df_missing['dbxref'].str.isnumeric().all():
            logger.error(f'\tError in {AN} - dbxref is not numeric or contains NaN x2')
            errors.append((AN, 'NaN or not numeric'))
            continue
        else:
            logger.debug('\tSuccess! - continuing')

    # search NCBI
    # The request itself is inside the try block as well, so that a failure while
    # contacting NCBI is recorded for this AN instead of stopping the whole loop.
    try:
        with Entrez.esummary(db="gene", id=",".join(df_missing['dbxref'])) as search_handle:
            search_results = Entrez.read(search_handle)['DocumentSummarySet']['DocumentSummary'] # type: ignore
    except Exception as ex:
        logger.error(f'{ex} in Entrez.read for {AN}')
        errors.append((AN, 'Entrez.read'))
        continue
    
    logger.debug(f"\tm: {len(df_missing)} | r: {len(search_results)} | m/r check: {len(df_missing) == len(search_results)}") # type: ignore
    
    # merge with search_results
    temp_df = pd.DataFrame([{
        'dbxref': x.attributes['uid'],
        'other_aliases': x.get('OtherAliases', 'no_other_aliases'),
        'desc': x.get('Description', 'no_description'),
        'gene_symbol': x.get('Name', 'no_gene_symbol')
        } for x in search_results])
    
    df_missing = df_missing.merge(
        temp_df, # type: ignore
        on='dbxref',
        how='left',
        copy=False
    )

    # process the data
    gene_dict, temp_gene_dict, temp_count = data_process(df_missing, AN, gene_dict,
                                                         temp_gene_dict, organelle_type,
                                                         temp_count, logger)
logger.info(f' -- [Finished] --')
if errors:
    logger.warning(f'Errors: {len(errors)}')
    for an, msg in errors:
        logger.warning(f'{an} - {msg}')

2025-05-06 19:32:00,124 - INFO - Starting TEMP process
2025-05-06 19:32:00,125 - DEBUG - -- [Processing: NC_001329.3] --
2025-05-06 19:32:00,674 - DEBUG - 	m: 43 | r: 43 | m/r check: True
2025-05-06 19:32:00,679 - DEBUG - gene_id: gene-PoanfMt57 | dbxref: 802418 | s: 600 | att: ID=gene-PoanfMt57;Dbxref=GeneID:802418;Name=trnI(gau);gbkey=Gene;gene=trnI(gau);gene_biotype=tRNA;locus_tag=PoanfMt57
2025-05-06 19:32:00,679 - DEBUG - 	name: PoanfMt57 | desc: tRNA-Ile | gene_symbol: trnI(gau)
2025-05-06 19:32:00,679 - DEBUG - 	[1st F]Label not found gene_dict | checking temp_gene_dict | Label: tRNA-Ile
2025-05-06 19:32:00,680 - DEBUG - 	[2nd F]Label not in temp_gene_dict, creating new label | Label: tRNA-Ile
2025-05-06 19:32:00,681 - DEBUG - 	New Label MT-TEMP-1 to tRNA-Ile # NC_001329.3:802418
2025-05-06 19:32:00,682 - DEBUG - gene_id: gene-PoanfMt58 | dbxref: 802426 | s: 832 | att: ID=gene-PoanfMt58;Dbxref=GeneID:802426;Name=trnS(uga);gbkey=Gene;gene=trnS(uga);gene_biotype=tRNA;locus_tag=Poa

In [11]:
# Write the temporary labels to "TEMP_<nn>.gdt", but only when some were created.
if temp_gene_dict:
    # The info is computed before the 'info'/'header' entries are added.
    temp_info = gdt.gene_dict.get_gene_dict_info(temp_gene_dict)
    temp_gene_dict['info'] = temp_info
    temp_gene_dict['header'] = [
        'version 0.0.2',
        f'TEMP - {nth_iteration}',
        'Automatically generated by gdt.py',
    ]
    temp_gdt_name = f'TEMP_{nth_iteration:02}.gdt'
    gdt.gene_dict.write_gdt_file(temp_gene_dict, temp_gdt_name, overwrite=True)

In [None]:
# Write the updated gene dictionary to "GDT_pilot_<nn>.gdt" when it is not empty.
if gene_dict:
    # The info is computed before the 'info'/'header' entries are added.
    gdt_info = gdt.gene_dict.get_gene_dict_info(gene_dict)
    gene_dict['info'] = gdt_info
    gene_dict['header'] = [
        'version 0.0.2',
        f'GDT - {nth_iteration}',
        'Automatically generated by gdt.py',
    ]
    gdt_name = f'GDT_pilot_{nth_iteration:02}.gdt'
    gdt.gene_dict.write_gdt_file(gene_dict, gdt_name, overwrite=True)