In [1]:
"""
This script will parse the results of the reciprocal best hit (RBH) analysis done 
using the ncbi-blast+ suite.

Are we really looking for reciprocal best hits, or the most appropriate replacements 
for each new locus tag? I think the latter, so the best hits we need are actually 
unidirectional in a sense. - Avi 5/7/25

OLD - GCF_000195955.2
NEW - GCF_026185275.1

Avi Shah, May 6th 2025
W. Evan Johnson Lab

"""
import pandas as pd
from collections import defaultdict


In [5]:
# Load the tabular BLAST output (NEW assembly as query vs OLD assembly as subject).
# The file is read with the column names supplied here, assigned by position.
# NOTE(review): the column order presumably mirrors the -outfmt fields used when
# BLAST was run — confirm against the BLAST command.
blast_columns = ["query", "subject", "pident", "qcovs", "length", "evalue", "bitscore"]
blast_results = pd.read_csv(
    "GCF_026185275.1_ASM2618527v1_genomic_vs_GCF_000195955.2_ASM19595v2_genomic.tab",
    sep='\t',
    names=blast_columns,
)

# Keep only hits where both percent identity and query coverage are at least 97.
passes_identity = blast_results['pident'] >= 97
passes_coverage = blast_results['qcovs'] >= 97
filtered_results = blast_results[passes_identity & passes_coverage]

# Pick, for every query (new locus tag), the subject (old locus tag) of its
# highest-bitscore hit among the filtered results.
#
# The sort is done once, up front, with kind='stable': pandas' default
# 'quicksort' is not a stable sort, so hits that tie on bitscore could be
# ordered arbitrarily. With a stable sort, tied hits keep the order they have
# in the input table, i.e. the first-listed hit for a query wins the tie.
ranked_hits = filtered_results.sort_values('bitscore', ascending=False, kind='stable')

# groupby yields groups in sorted key order and preserves row order inside
# each group, so the first row of every group is that query's best hit and the
# resulting dict is ordered by query.
new_to_old_mapping = {
    query: hits['subject'].iloc[0]  # subject is the old locus tag
    for query, hits in ranked_hits.groupby('query')
}

# Pass 1: tally how many new locus tags were assigned to each old locus tag.
tag_tally = defaultdict(int)
for assigned_old_tag in new_to_old_mapping.values():
    tag_tally[assigned_old_tag] += 1
old_tag_counts = dict(tag_tally)

# Pass 2: build the final mapping. An old tag used exactly once is kept as is;
# an old tag shared by several new tags gets a running ".1", ".2", ... suffix,
# assigned in the iteration order of new_to_old_mapping.
numbered_mapping = {}
old_tag_current_number = {}
for new_tag, old_tag in new_to_old_mapping.items():
    if old_tag_counts[old_tag] <= 1:
        numbered_mapping[new_tag] = old_tag
        continue
    suffix = old_tag_current_number.get(old_tag, 0) + 1
    old_tag_current_number[old_tag] = suffix
    numbered_mapping[new_tag] = f"{old_tag}.{suffix}"

# Write the new -> (numbered) old locus tag table as a two-column TSV with a
# header row; rows follow the iteration order of numbered_mapping.
mapping_tsv_path = "GCF_026185275.1_ASM2618527v1_genomic_locus_tags_from_GCF_000195955.2_ASM19595v2_genomic.tsv"
with open(mapping_tsv_path, "w") as tsv_handle:
    tsv_handle.write("new_locus_tag\told_locus_tag\n")
    for new_locus_tag, assigned_tag in numbered_mapping.items():
        tsv_handle.write(f"{new_locus_tag}\t{assigned_tag}\n")

# Summary statistics for the mapping:
#   - how many new locus tags received an old locus tag,
#   - how many distinct old locus tags were used,
#   - how many old locus tags were assigned to more than one new locus tag.
total_mapped = len(new_to_old_mapping)
distinct_old_tags = len(set(new_to_old_mapping.values()))
duplicated_old_tags = sum(times_used > 1 for times_used in old_tag_counts.values())

print(f"Total new locus tags mapped: {total_mapped}")
print(f"Unique old locus tags used: {distinct_old_tags}")
print(f"Old locus tags with duplications: {duplicated_old_tags}")


Total new locus tags mapped: 3817
Unique old locus tags used: 3792
Old locus tags with duplications: 23
