In [34]:
import pandas as pd
import numpy as np
import os

from pathlib import Path
import gdt

from Bio import Entrez

In [35]:
# Iteration number of this run: values above 1 require most_recent_gdt_file to be set,
# and the number is embedded in the headers and file names of the GDT files written below.
nth_iteration = 1

In [36]:
# Defines all the global variables used in the script.
# Change these variables to match your local setup.
# The most_recent_gdt_file variable should be set to the path of the most recent GDT file,
# OR the stripped GDT file used in filter command, if applicable.

DATA_DIR = "../test/Test_group16"
AN_missing_gene_dict = "../test/Test_group16/AN_missing_gene_dict"
#most_recent_gdt_file = "../test/Test_group16/Test_group16.gdt"
remove_orfs = True
organelle_type = "MT"
gff_suffix = ".gff3"

# NCBI credentials come from the environment so that no secret is stored in the
# notebook (or in its version history). Export NCBI_API_KEY before starting the
# kernel; when it is absent the key stays None, i.e. Entrez is used without a key.
# NCBI_EMAIL is optional and falls back to the maintainer address.
# NOTE: the key that used to be hardcoded here must be considered leaked and rotated.
Entrez.email = os.environ.get('NCBI_EMAIL', 'dupin@alunos.utfpr.edu.br')
Entrez.api_key = os.environ.get('NCBI_API_KEY')

In [37]:
# Path to the most recent (or stripped) GDT file. Leaving it empty is only accepted
# when nth_iteration is 1; in that case the run starts from an empty gene_dict.
most_recent_gdt_file = ""

In [38]:
# Check that every configured path exists and is of the expected kind
# before any processing starts.
DATA_DIR = Path(DATA_DIR).resolve()
AN_missing_gene_dict = Path(AN_missing_gene_dict).resolve()

# is_dir()/is_file() already return False for a missing path, so a single call
# covers both "does not exist" and "exists but is the wrong kind of entry".
# (The previous `not exists() and not is_dir()` let a wrong-kind path through.)
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"Data directory {DATA_DIR} does not exist or is not a directory.")

if not AN_missing_gene_dict.is_file():
    raise FileNotFoundError(f"AN missing gene dictionary {AN_missing_gene_dict} does not exist or is not a file.")

if not most_recent_gdt_file:
    # An empty value is only acceptable on the first iteration.
    if nth_iteration > 1:
        raise FileNotFoundError(
            f"most_recent_gdt_file is not set, but it is required when nth_iteration > 1 "
            f"(nth_iteration = {nth_iteration})."
        )
    print("If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable.")
    print("Otherwise, ignore this message.")
else:
    most_recent_gdt_file = Path(most_recent_gdt_file).resolve()
    if not most_recent_gdt_file.is_file():
        raise FileNotFoundError(f"Most recent GDT file {most_recent_gdt_file} does not exist or is not a file.")

If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable.
Otherwise, ignore this message.


In [39]:
# Create the logger used by the rest of the notebook; only the second value
# returned by logger_creater is kept.
# NOTE(review): console_level / file_level presumably set the minimum level of the
# console and "test.log" handlers respectively — confirm in gdt.logger_setup.
_, logger = gdt.logger_setup.logger_creater(log_file="test.log", console_level="DEBUG", file_level="TRACE")
logger.debug("Running from notebook AN_missing_gene_dict")

2025-05-07 17:17:45,669 - DEBUG - Logger setup complete. Logging to /home/brenodupin/matheus/gdt/notebooks/test.log
2025-05-07 17:17:45,674 - DEBUG - Running from notebook AN_missing_gene_dict


In [40]:
# Seed gene_dict from the most recent GDT file when one is configured;
# otherwise start from an empty dictionary.
gene_dict = (
    gdt.gene_dict.create_gene_dict(most_recent_gdt_file, max_an_sources=0)
    if most_recent_gdt_file
    else {}
)

# Receives the entries that get a temporary label.
temp_gene_dict = {}

In [41]:
# Read the AN list: one entry per line, surrounding whitespace removed,
# blank lines dropped.
with open(AN_missing_gene_dict, "r") as handle:
    stripped_lines = (raw_line.strip() for raw_line in handle)
    ANs = [entry for entry in stripped_lines if entry]
print(f"len(ANs): {len(ANs)}")

len(ANs): 211


In [42]:
def data_process(df_missing, AN, gene_dict, temp_gene_dict, organelle_type, temp_count, logger):
    """Label every row of ``df_missing`` and register it in one of the two dictionaries.

    For each row (read through ``itertuples``; the attributes ``gene_id``,
    ``dbxref``, ``start``, ``attributes``, ``other_aliases``, ``desc`` and
    ``gene_symbol`` are accessed):

    * if ``desc`` is a key of ``gene_dict``, the label stored there is reused and
      a ``GeneDbxref`` is stored in ``gene_dict`` under ``gene_id``;
    * otherwise, if ``desc`` is a key of ``temp_gene_dict``, that label is reused
      and a ``GeneDbxref`` is stored in ``temp_gene_dict`` under ``gene_id``;
    * otherwise ``temp_count`` is incremented, a new label
      ``{organelle_type}-TEMP-{temp_count}`` is created, and ``temp_gene_dict``
      receives a ``GeneDescription`` under ``desc`` plus a ``GeneDbxref`` under
      ``gene_id``.

    Args:
        df_missing: Frame-like object providing ``itertuples()``.
        AN: Identifier stored as ``an_source`` of every created ``GeneDbxref``.
        gene_dict: Dictionary of already known labels; modified in place.
        temp_gene_dict: Dictionary of temporary labels; modified in place.
        organelle_type: Prefix of newly created temporary labels.
        temp_count: Number of temporary labels created so far.
        logger: Logger providing ``trace`` and ``debug``.

    Returns:
        The tuple ``(gene_dict, temp_gene_dict, temp_count)`` with the updated counter.
    """
    def _dbxref_entry(row, label):
        # Every branch stores the same kind of entry for the gene id itself.
        return gdt.gene_dict.GeneDbxref(
            label=label,
            an_source=AN,
            dbxref=row.dbxref,
            c=f'ncbi_desc: {row.desc}')

    for row in df_missing.itertuples():
        logger.trace(f'gene_id: {row.gene_id} | dbxref: {row.dbxref} | s: {row.start} | att: {row.attributes}')
        logger.trace(f'\tname: {row.other_aliases} | desc: {row.desc} | gene_symbol: {row.gene_symbol}')

        # 1st lookup: description already has a label in gene_dict.
        if row.desc in gene_dict:
            gene_label = gene_dict[row.desc].label
            logger.debug(f'[1st T]Label in gene_dict, L: |{gene_label}| adding: {row.gene_id} #gn {AN}:{row.dbxref} #c ncbi_desc: {row.desc}')
            gene_dict[row.gene_id] = _dbxref_entry(row, gene_label)
            continue

        logger.trace(f'[1st F]Label not found gene_dict | checking temp_gene_dict | Label: {row.desc}')

        # 2nd lookup: description already received a temporary label.
        if row.desc in temp_gene_dict:
            gene_label = temp_gene_dict[row.desc].label
            logger.debug(f'[2nd T]Label in temp_gene_dict, L: |{gene_label}| adding: {row.gene_id} #gn {AN}:{row.dbxref} #c ncbi_desc: {row.desc}')
            temp_gene_dict[row.gene_id] = _dbxref_entry(row, gene_label)
            continue

        # Unknown description: create a fresh temporary label for it.
        temp_count += 1
        label = f'{organelle_type}-TEMP-{temp_count}'
        logger.debug(f'[2nd F]Label not in temp_gene_dict, new label |{label}| adding: {row.gene_id} #gn {AN}:{row.dbxref} #c ncbi_desc: {row.desc}')
        temp_gene_dict[row.desc] = gdt.gene_dict.GeneDescription(
            label=label,
            source="NCBI",
            c=None)
        temp_gene_dict[row.gene_id] = _dbxref_entry(row, label)

    return gene_dict, temp_gene_dict, temp_count

In [None]:
# Main TEMP labelling loop.
# For every AN: load its GFF3 file, keep the genes whose id is not yet a key of
# gene_dict, fetch their NCBI gene summaries and hand them to data_process.
# ANs that cannot be processed are collected in `errors` and reported at the end
# instead of aborting the whole run.
temp_gene_dict = {}
temp_count = 0
errors = []
logger.info(' ---- [Starting TEMP process] ----')
for AN in ANs:
    logger.debug(f'-- [Processing: {AN}] --')

    an_path = DATA_DIR / f'{AN}{gff_suffix}'
    if not an_path.exists():
        logger.error(f'Error: {AN} does not exist (an_path: {an_path})')
        errors.append((AN, 'File not found'))
        continue

    df = gdt.gff3_utils.load_gff3(an_path, query_string='type == "gene"')
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    # getting the gene_id and if it is in the gene_dict
    df['gene_id'] = df['attributes'].str.split(';').str[0].str.replace('ID=', '', regex=False)
    df['in_gene_dict'] = df['gene_id'].isin(gene_dict)
    df_missing = df[~df['in_gene_dict']].copy()
    # Row count before the merge below; used afterwards to detect duplicated rows.
    n_missing = len(df_missing)

    # Every gene is already known: there is nothing to ask NCBI for, and an
    # esummary request with an empty id list would only produce an error.
    if df_missing.empty:
        logger.debug(f'\tNo genes missing from gene_dict in {AN} - skipping')
        continue

    # two step method to extract dbxref, first try to get the full dbxref,
    # if not all genes are numeric and not NaN, fallback to GeneID,
    # check again if all genes are numeric and not NaN.
    # if not, record the error and move on to the next AN
    df_missing['dbxref'] = df_missing['attributes'].str.extract(r'Dbxref=GeneID:([^;]*)(?:;|$)')
    if df_missing['dbxref'].isna().any() or not df_missing['dbxref'].str.isnumeric().all():
        logger.warning(f'Error in {AN} - dbxref is not numeric or contains NaN')
        logger.debug('\ttrying only "GeneID:"')

        df_missing['dbxref'] = df_missing['attributes'].str.extract(r'GeneID:([^;]*)(?:;|$)')
        if not df_missing['dbxref'].str.isnumeric().all() or df_missing['dbxref'].isna().any():
            logger.error(f'\tError in {AN} - dbxref is not numeric or contains NaN x2')
            errors.append((AN, 'NaN or not numeric'))
            continue
        else:
            logger.debug('\tSuccess! - continuing')

    # search NCBI
    # The request itself is inside the try as well, so a failure while contacting
    # NCBI is recorded for this AN instead of stopping the loop.
    try:
        with Entrez.esummary(db="gene", id=",".join(df_missing['dbxref'])) as search_handle:
            search_results = Entrez.read(search_handle)['DocumentSummarySet']['DocumentSummary'] # type: ignore
    except Exception as ex:
        logger.error(f'{ex} in Entrez request for {AN}')
        errors.append((AN, 'Entrez request'))
        continue

    mr_check = n_missing == len(search_results)
    logger.trace(f"\tm: {n_missing} | r: {len(search_results)} | m/r check: {mr_check}") # type: ignore

    # merge with search_results
    temp_df = pd.DataFrame([{
        'dbxref': x.attributes['uid'],
        'other_aliases': x.get('OtherAliases', 'no_other_aliases'),
        'desc': x.get('Description', 'no_description'),
        'gene_symbol': x.get('Name', 'no_gene_symbol')
        } for x in search_results])

    # `copy=False` was dropped: it never affected the result and the keyword is
    # deprecated in recent pandas versions.
    df_missing = df_missing.merge(temp_df, on='dbxref', how='left')

    # check if df_missing len is equal to search_results, and equal to the original df
    if (len(df_missing) != len(temp_df)) or (len(df_missing) != n_missing):
        logger.warning(f'{AN} m/r check: {mr_check} | df_missing len {len(df_missing)} | temp_df len {len(temp_df)} | df len {n_missing}')
        logger.warning('This is not expected, but can be caused by fragmented genes that have the same dbxref/gene_id. Please check the log file for more details in TRACE level.')

    # process the data
    gene_dict, temp_gene_dict, temp_count = data_process(df_missing, AN, gene_dict,
                                                         temp_gene_dict, organelle_type,
                                                         temp_count, logger)


logger.info(' ---- [Finished] ----')
if errors:
    logger.warning(f'Errors: {len(errors)}')
    for an, msg in errors:
        logger.warning(f'{an} - {msg}')

In [44]:
# Write the temporary labels of this iteration to TEMP_<nn>.gdt, if any were created.
if temp_gene_dict:
    temp_gene_dict.update(
        info=gdt.gene_dict.get_gene_dict_info(temp_gene_dict),
        header=['version 0.0.2', f'TEMP - {nth_iteration}', 'Automatically generated by gdt.py'],
    )
    gdt.gene_dict.write_gdt_file(temp_gene_dict, f'TEMP_{nth_iteration:02}.gdt', overwrite=True)

In [45]:
# Write the (possibly extended) gene_dict to GDT_pilot_<nn>.gdt, unless it is empty.
if gene_dict:
    pilot_info = gdt.gene_dict.get_gene_dict_info(gene_dict)
    pilot_header = ['version 0.0.2', f'GDT - {nth_iteration}', 'Automatically generated by gdt.py']
    gene_dict['info'] = pilot_info
    gene_dict['header'] = pilot_header
    gdt.gene_dict.write_gdt_file(gene_dict, f'GDT_pilot_{nth_iteration:02}.gdt', overwrite=True)

In [46]:
# Debugging aid: df_missing holds whatever the most recent pass of the processing
# loop left in it (the merged frame of the last AN that reached the merge step).
df_missing

Unnamed: 0,type,start,end,attributes,gene_id,in_gene_dict,dbxref,other_aliases,desc,gene_symbol
0,gene,1,5454,ID=gene-QQQ02_mgp01;Dbxref=GeneID:81489573;Nam...,gene-QQQ02_mgp01,False,81489573,QQQ02_mgp01,cytochrome c oxidase subunit 1,COX1
1,gene,5836,5908,ID=gene-QQQ02_mgt01;Dbxref=GeneID:81489529;Nam...,gene-QQQ02_mgt01,False,81489529,QQQ02_mgt01,tRNA-Met,trnM(cat)
2,gene,5950,6035,ID=gene-QQQ02_mgt02;Dbxref=GeneID:81489530;Nam...,gene-QQQ02_mgt02,False,81489530,QQQ02_mgt02,tRNA-Ser,trnS2(tga)
3,gene,6496,7860,ID=gene-QQQ02_mgp20;Dbxref=GeneID:81489531;Nam...,gene-QQQ02_mgp20,False,81489531,QQQ02_mgp20,NADH dehydrogenase subunit 4,ND4
4,gene,7994,8066,ID=gene-QQQ02_mgt03;Dbxref=GeneID:81489532;Nam...,gene-QQQ02_mgt03,False,81489532,QQQ02_mgt03,tRNA-Trp,trnW(tca)
5,gene,8164,8236,ID=gene-QQQ02_mgt04;Dbxref=GeneID:81489533;Nam...,gene-QQQ02_mgt04,False,81489533,QQQ02_mgt04,tRNA-Asp,trnD(gtc)
6,gene,9606,9677,ID=gene-QQQ02_mgt05;Dbxref=GeneID:81489534;Nam...,gene-QQQ02_mgt05,False,81489534,QQQ02_mgt05,tRNA-Gly,trnG(tcc)
7,gene,9767,9988,ID=gene-QQQ02_mgp19;Dbxref=GeneID:81489535;Nam...,gene-QQQ02_mgp19,False,81489535,QQQ02_mgp19,ATP synthase F0 subunit 9,ATP9
8,gene,10391,11452,ID=gene-QQQ02_mgp18;Dbxref=GeneID:81489536;Nam...,gene-QQQ02_mgp18,False,81489536,QQQ02_mgp18,ribosomal protein S3,rps3
9,gene,12306,12372,ID=gene-QQQ02_mgt06;Dbxref=GeneID:81489537;Nam...,gene-QQQ02_mgt06,False,81489537,QQQ02_mgt06,tRNA-Ala,trnA


In [47]:
# Sanity check on the leftover loop state: the NCBI summaries (temp_df) and the
# missing genes (df_missing) should cover exactly the same set of dbxref ids.
set(temp_df['dbxref']) == set(df_missing['dbxref'])

True

In [48]:
# Debugging aid: look for two specific GeneIDs among the NCBI summaries left over
# from the last processed AN.
temp_df[temp_df['dbxref'].isin(['11539217', '11539216'])]

Unnamed: 0,dbxref,other_aliases,desc,gene_symbol


In [49]:
# Debugging aid: rows of df_missing_OG carrying the two GeneIDs under investigation.
# NOTE(review): df_missing_OG is not assigned in any cell visible here, so this cell
# presumably relies on leftover kernel state and would fail after Restart & Run All —
# define it explicitly or remove the cell.
df_missing_OG[df_missing_OG['dbxref'].isin(['11539217', '11539216'])]

Unnamed: 0,type,start,end,attributes,gene_id,in_gene_dict,dbxref
4,gene,14892,19393,ID=gene-GigaM_p03;Dbxref=GeneID:11539216;Name=...,gene-GigaM_p03,False,11539216
18,gene,46763,52011,ID=gene-GigaM_p03;Dbxref=GeneID:11539216;Name=...,gene-GigaM_p03,False,11539216
20,gene,53747,55242,ID=gene-GigaM_r01;Dbxref=GeneID:11539217;Name=...,gene-GigaM_r01,False,11539217
30,gene,79551,80293,ID=gene-GigaM_r01;Dbxref=GeneID:11539217;Name=...,gene-GigaM_r01,False,11539217


In [50]:
# Debugging aid: look for the same two GeneIDs in the df_missing left over from the
# last processed AN.
df_missing[df_missing['dbxref'].isin(['11539217', '11539216'])]

Unnamed: 0,type,start,end,attributes,gene_id,in_gene_dict,dbxref,other_aliases,desc,gene_symbol


In [51]:
# Debugging aid: show the entry stored for one specific gene id; raises KeyError
# when that gene id is not a key of temp_gene_dict.
temp_gene_dict['gene-GigaM_p03']

GeneDbxref(label='MT-TEMP-30', c='ncbi_desc: cytochrome c oxidase subunit 1', an_source='NC_016684.1', dbxref='11539216')