In [1]:
import pandas as pd
import numpy as np
import os
import re

from pathlib import Path
import gdt

from Bio import Entrez

In [2]:
# Iteration counter for this pipeline run. When it is greater than 1, the validation
# cell below refuses to continue unless most_recent_gdt_file is set.
nth_iteration = 1

In [3]:
# Defines all the global variables used in the script.
# Change these variables to match your local setup.
# The most_recent_gdt_file variable should be set to the path of the most recent GDT file,
# OR the stripped GDT file used in filter command, if applicable.

# Directory holding the '<AN><gff_suffix>' annotation files; outputs are written below it.
DATA_DIR = "../test/Test_group16"
# Text file listing one accession number (AN) per line.
AN_missing_dbxref = "../test/Test_group16/AN_missing_dbxref"
#most_recent_gdt_file = "../test/Test_group16/Test_group16.gdt"
# When True, every loaded GFF3 table is passed through gdt.gff3_utils.filter_orfs.
remove_orfs = True
organelle_type = "MT"
# Suffix appended to an accession number to build its GFF3 file name.
gff_suffix = ".gff3"

Entrez.email = 'dupin@alunos.utfpr.edu.br'
# Credentials must not live in the notebook: export NCBI_API_KEY in the environment
# before starting the kernel. When the variable is unset this is None, which is
# Biopython's own default (requests are then sent without an API key).
# NOTE(review): the key that used to be hardcoded here should be revoked.
Entrez.api_key = os.environ.get('NCBI_API_KEY')

In [4]:
# Path to the most recent (or stripped) GDT file. An empty string means "no GDT file yet":
# later cells then start from an empty gene_dict instead of loading one.
most_recent_gdt_file = ""

In [5]:
# Check that every configured path exists and is of the expected kind.
# Paths are resolved to absolute form so later log messages are unambiguous.
DATA_DIR = Path(DATA_DIR).resolve()
AN_missing_dbxref = Path(AN_missing_dbxref).resolve()

# is_dir() / is_file() return False for paths that do not exist, so one call covers
# both "missing" and "wrong kind". The previous `not exists() and not is_dir()` test
# let an existing path of the wrong kind slip through.
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"Data directory {DATA_DIR} does not exist or is not a directory.")

if not AN_missing_dbxref.is_file():
    raise FileNotFoundError(f"AN missing dbxref {AN_missing_dbxref} does not exist or is not a file.")

if not most_recent_gdt_file:
    # Only the first iteration may run without a GDT file.
    if nth_iteration > 1:
        raise FileNotFoundError(f"Most recent GDT file {most_recent_gdt_file} does not exist or is not a file.")
    else:
        print("If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable.")
        print("Otherwise, ignore this message.")
else:
    most_recent_gdt_file = Path(most_recent_gdt_file).resolve()
    if not most_recent_gdt_file.is_file():
        raise FileNotFoundError(f"Most recent GDT file {most_recent_gdt_file} does not exist or is not a file.")

If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable.
Otherwise, ignore this message.


In [6]:
# Create the notebook logger; the log file is written inside DATA_DIR.
log_path = DATA_DIR / '0_test_3.log'
_, logger = gdt.logger_setup.logger_creater(
    log_file=log_path,
    console_level="DEBUG",
    file_level="TRACE",
)
logger.debug("Running from notebook AN_missing_dbxref")

2025-05-22 16:53:23,402 - DEBUG - Logger setup complete. Logging to /home/brenodupin/matheus/gdt/test/Test_group16/0_test_3.log
2025-05-22 16:53:23,403 - DEBUG - Running from notebook AN_missing_dbxref


In [7]:
# Read the accession numbers, one per line, ignoring blank lines and surrounding whitespace.
with open(AN_missing_dbxref, "r") as an_file:
    ANs = []
    for raw_line in an_file:
        accession = raw_line.strip()
        if accession:
            ANs.append(accession)
print(f"len(ANs): {len(ANs)}")

len(ANs): 200


In [8]:
# Directory where this notebook writes and reads its missing_dbxref_* files.
missing_dbxref_path = DATA_DIR / "missing_dbxref"
# exist_ok=True makes re-running this cell harmless. Parent directories are not
# created, so DATA_DIR itself must already exist.
missing_dbxref_path.mkdir(exist_ok=True)

In [9]:
# Load the GDT file when one is configured; otherwise start from an empty mapping.
gene_dict = (
    gdt.gene_dict.create_gene_dict(most_recent_gdt_file, max_an_sources=0)
    if most_recent_gdt_file
    else {}
)

temp_gene_dict = {}

In [10]:
# For every accession number, collect the (gene_id, seqid) pairs whose gene_id is
# not a key of gene_dict. The records are aggregated per gene_id in the next cell.
gff_columns = ['seqid', 'start', 'end', 'type', 'attributes']
temp_list = []
for AN in ANs:
    an_path = DATA_DIR / f'{AN}{gff_suffix}'
    if not an_path.exists():
        logger.error(f'Error: {AN} does not exist (an_path: {an_path})')
        continue

    features = gdt.gff3_utils.load_gff3(
        an_path,
        query_string=gdt.gff3_utils.QS_GENE_TRNA_RRNA,
        usecols=gff_columns,
    )
    if remove_orfs:
        features = gdt.gff3_utils.filter_orfs(features)

    # gene_id is the first ';'-separated attribute with its 'ID=' prefix removed.
    first_attribute = features['attributes'].str.split(';').str[0]
    features['gene_id'] = first_attribute.str.replace('ID=', '', regex=False)
    features['in_gene_dict'] = features['gene_id'].isin(gene_dict)

    unknown = features.loc[~features['in_gene_dict'], ['gene_id', 'seqid']]
    temp_list.extend(unknown.to_dict('records'))

In [11]:
# Group the collected records into one list of seqids per gene_id.
# Passing the columns explicitly keeps this working when temp_list is empty (every
# gene_id is already in gene_dict): without them the frame has no 'gene_id' column
# and groupby raises KeyError.
agg_dbxref = (pd.DataFrame(temp_list, columns=['gene_id', 'seqid'])
                .groupby('gene_id')['seqid']
                .agg(list)
                .sort_index())  # Sort by gene_id

In [12]:
# Write here anything you want to add to the missing_dbxref file, or leave it empty.
# A non-empty value is appended to every line of missing_dbxref_compiled.txt after
# a ' #c ' marker; an empty string appends nothing.
comment = "manual insertion from missing_dbxref_compiled"

In [13]:
# One line per gene_id: '<gene_id> #gn <AN> <AN> ...' followed by the optional ' #c <comment>'.
comment_suffix = " #c " + comment if comment else ""
with open(missing_dbxref_path / 'missing_dbxref_compiled.txt', 'w+') as f1:
    f1.writelines(
        f'{gene_id} #gn {" ".join(seqid)}{comment_suffix}\n'
        for gene_id, seqid in agg_dbxref.items()
    )

After manually parsing missing_dbxref_compiled.txt,  
create missing_dbxref_problems.txt with the names that  
are not readily identifiable or that need deeper investigation.

The names that are easily identifiable should be added to the most  
recent _pilot.gdt, and this GDT should then be loaded above, before  
the next part of the pipeline.

### Deeper investigation using other gff attributes, primarily 'Name='

In [14]:
# Collect every accession number listed in missing_dbxref_problems.txt.
an_with_no_dbxref = set()
with open(missing_dbxref_path / 'missing_dbxref_problems.txt', 'r') as f:
    for line in f:
        # Skip comment lines and blank lines.
        if line.startswith('#') or not line.strip():
            continue
        # Drop the optional trailing '#c' comment, then take the whitespace-separated
        # accession numbers that follow the '#gn' marker.
        entry = line.split('#c')[0]
        an_with_no_dbxref.update(entry.split('#gn')[1].split())

In [15]:
# Display the accession numbers collected from missing_dbxref_problems.txt.
an_with_no_dbxref

{'AP012272.1',
 'AP024424.1',
 'AP024451.1',
 'AP024468.1',
 'AP028211.1',
 'AP028246.1',
 'AP028247.1',
 'JQ346808.1',
 'KC832409.1',
 'LC545447.1',
 'LC602355.1',
 'LC604067.1',
 'LC612919.1',
 'LC659289.1',
 'MH725795.1'}

In [31]:
# For every accession number flagged in missing_dbxref_problems.txt, derive a
# human-readable name for each feature whose gene_id is not in gene_dict.
# The name is taken from the first attribute present, tried in this order:
# Name=, product=, description=, gene_synonym=. 'source_gid' records which one
# supplied the name ('stil_nan' when none did).
temp_list = []
for AN in an_with_no_dbxref:
    an_path = DATA_DIR / f'{AN}{gff_suffix}'
    # Same guard as the first scan over ANs: report a missing file and move on.
    if not an_path.exists():
        logger.error(f'Error: {AN} does not exist (an_path: {an_path})')
        continue

    df = gdt.gff3_utils.load_gff3(an_path, query_string=gdt.gff3_utils.QS_GENE_TRNA_RRNA, usecols=gdt.GFF3_COLUMNS) # TODO change query_string!
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    # gene_id: the text between the leading 'ID=' (first three characters) and the first ';'.
    df['gene_id'] = df['attributes'].str[3:].str.partition(';', expand=False).str[0]
    # .copy() so the column assignments below act on an independent frame rather
    # than on a slice of the loaded one (avoids SettingWithCopyWarning).
    df = df[~df['gene_id'].isin(gene_dict)].copy()

    df['source_gid'] = 'name'
    # expand=False returns a Series, matching the fallback extractions below.
    df['name'] = df['attributes'].str.extract(r'Name=([^;]*)(?:;|$)', expand=False)

    # Rows still without a name are relabelled before each fallback is tried, so
    # source_gid always names the last attribute attempted for that row.
    df['source_gid'] = np.where(df['name'].isna(), 'product', df['source_gid'])
    df['name'] = df['name'].fillna(df['attributes'].str.extract(r'product=([^;]*)(?:;|$)', expand=False))

    # if 'name' is NaN, try to extract 'description='
    # change source to 'description' if 'description=' is found
    df['source_gid'] = np.where(df['name'].isna(), 'description', df['source_gid'])
    df['name'] = df['name'].fillna(df['attributes'].str.extract(r'description=([^;]*)(?:;|$)', expand=False))

    # if 'name' is still NaN, print a warning and fall back to 'gene_synonym='
    if df['name'].isna().any():
        print(f'Warning: {AN} has NaN names')
        df['source_gid'] = np.where(df['name'].isna(), 'gene_synonym', df['source_gid'])
        df['name'] = df['name'].fillna(df['attributes'].str.extract(r'gene_synonym=([^;]*)(?:;|$)', expand=False))

        # Anything left has none of the four attributes; show those rows for manual inspection.
        df['source_gid'] = np.where(df['name'].isna(), 'stil_nan', df['source_gid'])
        print(df[df['name'].isna()])

    temp_list.extend(df[['name', 'gene_id', 'seqid', 'source_gid']].to_dict('records'))

# Explicit columns keep sort_values working when no record was collected at all.
gene_ans_name_df = pd.DataFrame(temp_list, columns=['name', 'gene_id', 'seqid', 'source_gid'])
gene_ans_name_df = gene_ans_name_df.sort_values(by='name')

In [32]:
# Display the name table built in the previous cell (sorted by 'name').
gene_ans_name_df

Unnamed: 0,name,gene_id,seqid,source_gid
416,12S RNA,rna-Aspvir_mt0010,LC604067.1,product
770,12S ribosomal RNA,rna-ACHE_r90001S,AP024424.1,product
955,12S ribosomal RNA,rna-APUU_r90001S,AP024451.1,product
440,16S RNA,rna-Aspvir_mt0023,LC604067.1,product
1066,16S ribosomal RNA,rna-C0995_000083,MH725795.1,product
...,...,...,...,...
133,trnW(cca),gene-EMPS_mt21,LC659289.1,name
709,trnY,gene-CcaverHIS002_mit130,AP028211.1,name
642,trnY,gene-CspHIS471_mit130,AP028246.1,name
23,trnY,gene-CspeluHIS016_mit230,AP028247.1,name


In [34]:
# When True, every line written to missing_dbxref_names_raw.txt gets a ' #gd MANUAL'
# suffix, followed by ' #c <comment>' if comment is non-empty. When False, the lines
# contain the bare name only.
add_gdt_compliance = True
comment = 'Manual from missing_dbxref_names_raw'

In [36]:
# Suffix appended to each name line; stays empty unless GDT compliance is requested.
gdt_str = ""
if add_gdt_compliance:
    gdt_str = f' #gd MANUAL{ " #c " + comment if comment else "" }'

# Text file: one line per distinct name, for manual curation.
unique_names = gene_ans_name_df['name'].unique()
with open(missing_dbxref_path / 'missing_dbxref_names_raw.txt', 'w+') as f1:
    f1.writelines(f'{name}{gdt_str}\n' for name in unique_names)

# TSV file: the full table (name, gene_id, seqid, source_gid), reloaded by a later cell.
gene_ans_name_df.to_csv(
    missing_dbxref_path / 'missing_dbxref_names_raw.tsv',
    sep='\t',
    index=False,
)

The user must now parse missing_dbxref_names_raw.txt into two files:  
missing_dbxref_names_clean.txt  
missing_dbxref_names_unk.txt  
  
missing_dbxref_names_clean.txt should contain all easily identifiable gene names.  
All of these names __must__ also be in your current GDT version, because the next step  
will automatically add all gene_ids that have names inside missing_dbxref_names_clean.txt.  
  
missing_dbxref_names_unk.txt should contain all the names that you don't know or are not certain about.  
The next step for these names is matching features using CDS, then extracting information about these features  
to better classify the original gene.

In [52]:
# missing_dbxref_names_clean.txt

# Collect the curated names, one per non-blank line, keeping only the text before
# an optional '#gd' annotation. Membership in gene_dict is checked in a later cell.
names = set()
with open(missing_dbxref_path / 'missing_dbxref_names_clean.txt', 'r') as f1:
    for raw_line in f1:
        entry = raw_line.strip()
        if entry:
            names.add(entry.split('#gd')[0].strip())

In [None]:
# Stored as the `c` field of every GeneGeneric entry created in the next cell.
comment = "automated insertion from missing_dbxref_names_clean"

In [None]:
# Reload the raw name table and keep only the rows whose name was curated into
# missing_dbxref_names_clean.txt.
df_names_clean = pd.read_csv(missing_dbxref_path / 'missing_dbxref_names_raw.tsv', sep='\t')
df_names_clean = df_names_clean[df_names_clean['name'].isin(names)]

# Register each remaining gene_id in gene_dict under the label of its curated name.
for r in df_names_clean.itertuples():
    gene_id = r.gene_id
    seqid = r.seqid
    name = r.name

    if gene_id in gene_dict:
        logger.debug(f'Gene {gene_id} already exists in gene_dict')
        continue

    # The label is copied from gene_dict[name], so the name itself must be a key of
    # gene_dict. (The previous test, `name not in gene_id`, was a substring check on
    # the gene_id string and did not look at gene_dict at all.)
    if name not in gene_dict:
        raise AttributeError(f'Gene name {name} not in gene_dict! Did you reload the newer version of gene_dict? If yes, then not all names from missing_dbxref_names_clean.txt have been added to gene_dict.')

    # NOTE(review): seqid is a single accession string here — confirm whether
    # GeneGeneric.an_sources expects a string or a list of strings.
    gene_dict[gene_id] = gdt.gene_dict.GeneGeneric(
                label=gene_dict[name].label,
                an_sources=seqid,
                c=comment)

In [None]:
# TODO save new version of gene_dict

In [None]:
# missing_dbxref_names_unk.txt

In [None]:
def print_df_rows(cds_trna):
    """Print one tab-indented summary line per row of ``cds_trna``.

    Each line shows the row's ``start``, ``end`` and ``attributes`` values, so
    the frame must have columns with those names. Nothing is returned.
    """
    for feature in cds_trna.itertuples():
        start, end, attributes = feature.start, feature.end, feature.attributes
        print(f'\ts: {start}| e: {end} | {attributes}')

In [None]:
# Parse missing_dbxref_names_unk.txt into:
#   temp_gene_an: gene_id -> list of accession numbers
#   an_set:       every accession number seen
# Each non-blank line is read as '<prefix> | <gene_id> # <AN> [<AN> ...]'.
# NOTE(review): this layout differs from the '#gn' / '#gd' markers used by the other
# files in this notebook — confirm the format of missing_dbxref_names_unk.txt.
temp_gene_an = {}
an_set = set()
# The file lives next to the other missing_dbxref_* files; the previous
# f'{folder}_dbxref_names_unk.txt' relied on a `folder` variable that no cell defines.
with open(missing_dbxref_path / 'missing_dbxref_names_unk.txt', 'r') as f:
    for line in f:
        # Skip blank lines, as the other parsers in this notebook do.
        if not line.strip():
            continue
        raw = line.strip().split('|')[1].strip()
        gene_id, seqid = raw.split(' # ')
        # split() handles both a single accession number and a whitespace-separated list.
        seqid = seqid.split()
        gene_id = gene_id.strip()

        an_set.update(seqid)
        temp_gene_an[gene_id] = seqid

In [None]:
# Invert temp_gene_an (gene_id -> accession numbers) into
# dict_an_gene (accession number -> gene_ids).
# A single pass replaces the nested scan over an_set x temp_gene_an, and the key
# order now follows the file order instead of set iteration order.
dict_an_gene = {}
for gene_id, seqid in temp_gene_an.items():
    # dict.fromkeys drops repeated accession numbers within one entry, so a gene_id
    # is listed at most once per accession number, as before.
    for an in dict.fromkeys(seqid):
        dict_an_gene.setdefault(an, []).append(gene_id)

In [None]:
# Display the accession number -> gene_id list mapping built in the previous cell.
dict_an_gene
