In [1]:
import pandas as pd
import numpy as np
import os
import re

from pathlib import Path
import gdt

from Bio import Entrez

In [2]:
# Pass number of the pipeline for this run. When it is greater than 1, the
# validation cell below raises if most_recent_gdt_file is left empty.
nth_iteration = 1

In [3]:
# Defines all the global variables used in the script.
# Change these variables to match your local setup.
# The most_recent_gdt_file variable should be set to the path of the most recent GDT file,
# OR the stripped GDT file used in filter command, if applicable.

DATA_DIR = "../test/Test_group16"
AN_missing_dbxref = "../test/Test_group16/AN_missing_dbxref"
#most_recent_gdt_file = "../test/Test_group16/Test_group16.gdt"
remove_orfs = True
organelle_type = "MT"
gff_suffix = ".gff3"

# NCBI credentials come from the environment so that no secret is stored in the
# notebook. Export NCBI_API_KEY (and optionally NCBI_EMAIL) before starting the
# kernel. Without NCBI_API_KEY the key stays None, i.e. no key is configured.
Entrez.email = os.environ.get("NCBI_EMAIL", 'dupin@alunos.utfpr.edu.br')
Entrez.api_key = os.environ.get("NCBI_API_KEY")

In [4]:
# Path to the most recent (or stripped) GDT file. An empty string means no GDT
# file is supplied, in which case the loading cell starts from an empty gene_dict.
most_recent_gdt_file = ""

In [5]:
# Validate the configured paths and turn them into absolute Path objects.
DATA_DIR = Path(DATA_DIR).resolve()
AN_missing_dbxref = Path(AN_missing_dbxref).resolve()

# is_dir() / is_file() return False for a missing path as well as for a path of
# the wrong kind, so one check covers both failure modes named in the messages.
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"Data directory {DATA_DIR} does not exist or is not a directory.")

if not AN_missing_dbxref.is_file():
    raise FileNotFoundError(f"AN missing dbxref {AN_missing_dbxref} does not exist or is not a file.")

if not most_recent_gdt_file:
    # No GDT file configured: acceptable on the first iteration only.
    if nth_iteration > 1:
        raise FileNotFoundError(f"Most recent GDT file {most_recent_gdt_file} does not exist or is not a file.")
    else:
        print("If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable.")
        print("Otherwise, ignore this message.")
else:
    most_recent_gdt_file = Path(most_recent_gdt_file).resolve()
    if not most_recent_gdt_file.is_file():
        raise FileNotFoundError(f"Most recent GDT file {most_recent_gdt_file} does not exist or is not a file.")

If you set up a stripped GDT file, please set the path to it in the most_recent_gdt_file variable.
Otherwise, ignore this message.


In [6]:
# Set up the notebook logger (console level "DEBUG", file level "TRACE"),
# writing the log file inside DATA_DIR.
log_path = DATA_DIR / '0_test_3.log'
_, logger = gdt.logger_setup.logger_creater(
    log_file=log_path,
    console_level="DEBUG",
    file_level="TRACE",
)
logger.debug("Running from notebook AN_missing_dbxref")

2025-05-22 19:21:13,004 - DEBUG - Logger setup complete. Logging to /home/brenodupin/matheus/gdt/test/Test_group16/0_test_3.log
2025-05-22 19:21:13,012 - DEBUG - Running from notebook AN_missing_dbxref


In [7]:
# Read the accession numbers: one per line, surrounding whitespace removed,
# blank lines ignored.
with open(AN_missing_dbxref, "r") as handle:
    stripped_lines = (raw.strip() for raw in handle)
    ANs = [entry for entry in stripped_lines if entry]
print(f"len(ANs): {len(ANs)}")

len(ANs): 200


In [8]:
# Directory (inside DATA_DIR) that receives the files written by this notebook.
missing_dbxref_path = DATA_DIR.joinpath("missing_dbxref")
missing_dbxref_path.mkdir(exist_ok=True)

In [9]:
# Load the GDT file when one is configured; otherwise start from an empty dict.
gene_dict = (
    gdt.gene_dict.create_gene_dict(most_recent_gdt_file, max_an_sources=0)
    if most_recent_gdt_file
    else {}
)

temp_gene_dict = {}

In [10]:
# For every accession, record the (gene_id, seqid) pairs of the features whose
# ID is not a key of gene_dict.
temp_list = []
wanted_cols = ['seqid', 'start', 'end', 'type', 'attributes']
for AN in ANs:
    an_path = DATA_DIR / f'{AN}{gff_suffix}'
    if an_path.exists():
        df = gdt.gff3_utils.load_gff3(
            an_path,
            query_string=gdt.gff3_utils.QS_GENE_TRNA_RRNA,
            usecols=wanted_cols,
        )
        if remove_orfs:
            df = gdt.gff3_utils.filter_orfs(df)

        # The gene id is the first ';'-separated attribute with 'ID=' removed.
        first_attribute = df['attributes'].str.split(';').str[0]
        df['gene_id'] = first_attribute.str.replace('ID=', '', regex=False)
        df['in_gene_dict'] = df['gene_id'].isin(gene_dict)
        df_missing = df[~df['in_gene_dict']].copy()

        temp_list.extend(df_missing[['gene_id', 'seqid']].to_dict('records'))
    else:
        # Missing GFF3 file: log it and move on to the next accession.
        logger.error(f'Error: {AN} does not exist (an_path: {an_path})')

In [11]:
# Group the accession numbers (seqid) by gene_id. The columns are declared
# explicitly so that an empty temp_list yields an empty result instead of
# raising KeyError: 'gene_id' in groupby.
agg_dbxref = (pd.DataFrame(temp_list, columns=['gene_id', 'seqid'])
                .groupby('gene_id')['seqid']
                .agg(list)
                .sort_index())  # Sort by gene_id

In [12]:
# Write here anything you want to add to the missing_dbxref file, or leave it empty.
# A non-empty value is appended after ' #c ' on every line of compiled.txt.
comment = "manual insertion from missing_dbxref_compiled"

In [14]:
# Write compiled.txt, one line per gene_id:
#   <gene_id> #gn <AN> <AN> ...[ #c <comment>]
comment_suffix = " #c " + comment if comment else ""
compiled_file = missing_dbxref_path / 'compiled.txt'
with open(compiled_file, 'w+') as f1:
    for gene_id, seqid in agg_dbxref.items():
        joined_ans = " ".join(seqid)
        f1.write(f'{gene_id} #gn {joined_ans}{comment_suffix}\n')

After manually parsing compiled.txt,  
create problems.txt with the names that  
are not readily identifiable or that need deeper investigation.

The names that are easily identifiable should be added to the most  
recent _pilot.gdt, and this gdt should then be loaded above, before  
the next part of the pipeline.

### Deeper investigation using other gff attributes, primarily 'Name='

In [15]:
# Collect every accession number (AN) listed in problems.txt.
# Each data line is expected to look like the lines written to compiled.txt:
#   <gene_id> #gn <AN> <AN> ...[ #c <comment>]
an_with_no_dbxref = set()
with open(missing_dbxref_path / 'problems.txt', 'r') as f:
    for line_number, line in enumerate(f, start=1):
        # Skip comment lines and blank lines
        if line.startswith('#') or not line.strip():
            continue
        # Drop the optional trailing ' #c <comment>' part
        if '#c' in line:
            line = line.split('#c')[0].strip()

        # The ANs are the whitespace-separated tokens after the '#gn' marker
        fields = line.split('#gn')
        if len(fields) < 2:
            raise ValueError(
                f"problems.txt line {line_number} has no '#gn' marker: {line.strip()!r}"
            )
        ans = fields[1].strip().split()
        # Add each AN to the set
        an_with_no_dbxref.update(ans)

In [16]:
# Display the accession numbers collected from problems.txt
an_with_no_dbxref

{'AP012272.1',
 'AP017979.1',
 'AP017980.1',
 'AP017981.1',
 'AP024424.1',
 'AP024451.1',
 'AP024468.1',
 'AY376688.1',
 'D31785.1',
 'JQ346808.1',
 'KC832409.1',
 'KY245891.1',
 'LC532387.1',
 'LC545447.1',
 'LC602355.1',
 'LC604067.1',
 'LC612919.1',
 'LK392300.1',
 'MH725795.1'}

In [17]:
def _attribute_regex(key):
    """Compile a pattern whose single group captures the value of attribute `key`.

    The key has to start the attributes string or directly follow a ';'
    (optional whitespace allowed), so a key that merely ends with `key`
    (e.g. 'pseudogene=' when looking for 'gene=') is not matched. The value
    runs up to the next ';' or the end of the string.
    """
    return re.compile(rf'(?:^|;)\s*{re.escape(key)}=([^;]*)(?:;|$)')


# One pattern per GFF3 attribute that may carry a usable feature name.
RE_NAME = _attribute_regex('Name')
RE_PRODUCT = _attribute_regex('product')
RE_DESCRIPTION = _attribute_regex('description')
RE_GENE = _attribute_regex('gene')
RE_GENE_SYNONYM = _attribute_regex('gene_synonym')

In [None]:
# For every accession listed in problems.txt, gather the descriptive attributes
# of the features whose ID is still missing from gene_dict.
attribute_patterns = {
    'name': RE_NAME,
    'product': RE_PRODUCT,
    'description': RE_DESCRIPTION,
    'gene': RE_GENE,
    'gene_synonym': RE_GENE_SYNONYM,
}
attribute_cols = list(attribute_patterns)

# Own accumulator, so the temp_list consumed by the agg_dbxref cell is not overwritten.
feature_records = []
# Sorted so that warnings and record order do not depend on set iteration order.
for AN in sorted(an_with_no_dbxref):
    an_path = DATA_DIR / f'{AN}{gff_suffix}'
    if not an_path.exists():
        # Same handling as the first loading loop: log and skip the accession.
        logger.error(f'Error: {AN} does not exist (an_path: {an_path})')
        continue

    df = gdt.gff3_utils.load_gff3(an_path, query_string=gdt.gff3_utils.QS_GENE_TRNA_RRNA, usecols=gdt.GFF3_COLUMNS) # TODO change query_string!
    df = gdt.gff3_utils.filter_orfs(df) if remove_orfs else df

    # gene_id: the text between the leading 'ID=' (3 characters) and the first ';'.
    gene_ids = df['attributes'].str[3:].str.partition(';', expand=False).str[0]
    df = df.assign(gene_id=gene_ids)
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write into a slice (SettingWithCopyWarning).
    df = df[~df['gene_id'].isin(gene_dict)].copy()

    for col, pattern in attribute_patterns.items():
        df[col] = df['attributes'].str.extract(pattern, expand=False)

    # Rows where none of the candidate attributes is present.
    unidentified = df[attribute_cols].isna().all(axis=1)
    if unidentified.any():
        print(f'Warning: {AN} has a row with no identifiable atribute.')
        print('Please modify this script to add a new possible identifiable attribute or just remove the AN from the list.')
        print(df[unidentified])

    feature_records.extend(df.to_dict('records'))

features_info_df = pd.DataFrame(feature_records)
features_info_df = features_info_df.drop(columns=['source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'])

# Attribute columns that are empty for every row; dropped after feature_name is
# built. 'name' is not in this list, so it is always kept.
dc = [col for col in ['product', 'description', 'gene', 'gene_synonym'] if features_info_df[col].isna().all()]

# feature_name: first non-null value in the order
# gene, product, description, name, gene_synonym.
features_info_df['feature_name'] = (features_info_df['gene']
                                     .fillna(features_info_df['product'])
                                     .fillna(features_info_df['description'])
                                     .fillna(features_info_df['name'])
                                     .fillna(features_info_df['gene_synonym']))

features_info_df = features_info_df.drop(columns=dc)


features_info_df = features_info_df.sort_values(by='feature_name')

In [19]:
# When True, every name written to feature_name.txt gets a ' #gd MANUAL' suffix,
# followed by ' #c <comment>' if comment is non-empty.
add_gdt_compliance = True
# Rebinds the comment variable that was used earlier for compiled.txt.
comment = 'Manual from missing_dbxref_names_raw'

In [23]:
# FIXME(review): this rebinds gene_dict -- built in the "Load the GDT file" cell --
# to a plain set of four names. The later insertion cell reads
# gene_dict[r.feature_name].label and assigns gene_dict[...] = ..., neither of
# which a set supports. This looks like a leftover debugging stub (its execution
# count is out of sequence); confirm and remove it before a Restart & Run All.
gene_dict = {'12S RNA', '12S ribosomal RNA', '15S rRNA', '16S RNA'}

In [None]:
# Suffix appended to every name written to feature_name.txt (empty when
# add_gdt_compliance is off).
gdt_str = ""
if add_gdt_compliance:
    comment_part = " #c " + comment if comment else ""
    gdt_str = f' #gd MANUAL{comment_part}'

# One row per distinct feature_name, flagged by whether it is in gene_dict.
distinct_names = features_info_df['feature_name'].unique()
new_df = pd.DataFrame({'feature_name': distinct_names})
new_df['in_gene_dict'] = new_df['feature_name'].isin(gene_dict)

In [None]:
# Distinct non-null values of the 'name' column (not referenced again in the
# cells shown in this notebook).
unique_names = features_info_df['name'].dropna().unique()

# Names that are not in gene_dict go to feature_name.txt, one per line, each
# followed by gdt_str.
pending_names = new_df.loc[~new_df['in_gene_dict'], 'feature_name']
feature_name_file = missing_dbxref_path / 'feature_name.txt'
with open(feature_name_file, 'w+') as f1:
    for name in pending_names:
        f1.write(f'{name}{gdt_str}\n')

# Save the full per-feature table; a later cell reads it back.
features_info_file = missing_dbxref_path / 'features_info.tsv'
features_info_df.to_csv(features_info_file, sep='\t', index=False)

The user must now parse feature_name.txt  

Features that are easily identifiable must be added to the current  
version of the gdt, and features that need deeper investigation should be  
copied to a new file named 'feature_unk.txt'
  
The program will now try to automatically add the gene_ids whose feature name  
__is not__ in 'feature_unk.txt'.

In [None]:
# Reload the per-feature table saved as features_info.tsv
features_info_df = pd.read_csv(missing_dbxref_path / 'features_info.tsv', sep='\t')

# Names listed in feature_unk.txt. Blank lines are skipped and everything from
# '#gd' onwards is discarded.
names_unk = set()
with open(missing_dbxref_path / 'feature_unk.txt', 'r') as f1:
    for raw_line in f1:
        entry = raw_line.strip()
        if entry:
            names_unk.add(entry.split('#gd')[0].strip())

In [None]:
# Every feature name that is not listed in feature_unk.txt must already be a
# key of gene_dict; collect the ones that are not and stop if any exist.
all_names = set(features_info_df['feature_name'].unique()) - names_unk
names_not_in_dict = {name for name in all_names if name not in gene_dict}

if names_not_in_dict:
    print(f'Warning: {len(names_not_in_dict)} names not in gene_dict!')
    print('These names are not in feature_unk, so you marked them as identifiable. Please identify them or add them feature_unk.')
    # Plain loop instead of a list comprehension used only for its side effect.
    # Sorted (by str, so a NaN entry cannot break the comparison) for stable output.
    for missing_name in sorted(names_not_in_dict, key=str):
        print(missing_name)
    raise ValueError(f'Error: {len(names_not_in_dict)} names not in gene_dict!')

In [None]:
# Comment passed as c= to the GeneGeneric entries created in the next cell.
comment = "automated insertion from missing_dbxref_names_clean"

In [None]:
# Register each gene_id whose feature name was identified (i.e. is not listed
# in feature_unk.txt) under the label of that feature name.
identifiable_rows = features_info_df[~features_info_df['feature_name'].isin(names_unk)]
for r in identifiable_rows.itertuples():
    # sanity check
    if r.feature_name not in gene_dict:
        raise ValueError(f'Error: {r.feature_name} not in gene_dict! how? did you run the step above without error?')

    # Key on this row's gene_id. The bare name `gene_id` was a stale global left
    # behind by the compiled.txt loop, so every row overwrote the same entry.
    # NOTE(review): an_sources receives a single seqid value here -- confirm
    # whether GeneGeneric expects a list instead.
    gene_dict[r.gene_id] = gdt.gene_dict.GeneGeneric(
                label=gene_dict[r.feature_name].label,
                an_sources=r.seqid,
                c=comment)

In [None]:
# TODO save new version of gene_dict

In [None]:
# missing_dbxref_names_unk.txt

In [None]:
def print_df_rows(cds_trna):
    """Print one tab-indented line per row showing its start, end and attributes."""
    for record in cds_trna.itertuples():
        begin, finish, attrs = record.start, record.end, record.attributes
        print(f'\ts: {begin}| e: {finish} | {attrs}')

In [None]:
# Names listed in missing_dbxref_names_unk.txt. Blank lines are skipped and
# everything from '#gd' onwards is discarded.
names_unk = set()
with open(missing_dbxref_path / 'missing_dbxref_names_unk.txt', 'r') as f1:
    for raw_line in f1:
        entry = raw_line.strip()
        if entry:
            names_unk.add(entry.split('#gd')[0].strip())

In [None]:
# Keep only the rows of missing_dbxref_names_raw.tsv whose 'name' is in names_unk.
# NOTE(review): no visible cell writes this .tsv file -- confirm that the file
# name is still current.
raw_names_file = missing_dbxref_path / 'missing_dbxref_names_raw.tsv'
df_names_unk = pd.read_csv(raw_names_file, sep='\t')
is_unknown = df_names_unk['name'].isin(names_unk)
df_names_unk = df_names_unk[is_unknown]

In [None]:
# Map each accession in an_set to the gene_ids whose seqid collection contains it.
# NOTE(review): an_set and temp_gene_an are not defined in any visible cell --
# confirm where they come from before running this on a fresh kernel.
dict_an_gene = {}
for an in an_set:
    for gene_id, seqid in temp_gene_an.items():
        if an not in seqid:
            continue
        dict_an_gene.setdefault(an, []).append(gene_id)

In [None]:
# Display the accession -> gene_id list mapping built in the previous cell
dict_an_gene
