In [1]:
"""
This script uses the best hits from the old reference, BLASTed
against the new reference, to determine unique Rv-number locus tag mappings
to the Alland reference.

OLD - GCF_000195955.2
NEW - GCF_026185275.1

Avi Shah, May 8th 2025
W. Evan Johnson Lab
"""

import pandas as pd


In [2]:

# Load the new-reference annotation; lines beginning with '#' are ignored
gtf_path = 'GCF_026185275.1_ASM2618527v1_genomic.gtf'
gtf_df = pd.read_csv(gtf_path, sep='\t', header=None, comment='#')

# The file is read without a header row, so label the nine GTF fields by hand
gtf_df.columns = [
    'seqname', 'source', 'feature', 'start', 'end',
    'score', 'strand', 'frame', 'attribute',
]

# Sanity check: preview the leading records, then list the column labels
print(gtf_df.head())
print(list(gtf_df.columns))


         seqname            source      feature  start   end score strand  \
0  NZ_CP110619.1            RefSeq         gene      1  1524     .      +   
1  NZ_CP110619.1  Protein Homology          CDS      1  1521     .      +   
2  NZ_CP110619.1  Protein Homology  start_codon      1     3     .      +   
3  NZ_CP110619.1  Protein Homology   stop_codon   1522  1524     .      +   
4  NZ_CP110619.1            RefSeq         gene   2052  3260     .      +   

  frame                                          attribute  
0     .  gene_id "M7V54_RS00005"; transcript_id ""; gbk...  
1     0  gene_id "M7V54_RS00005"; transcript_id "unassi...  
2     0  gene_id "M7V54_RS00005"; transcript_id "unassi...  
3     0  gene_id "M7V54_RS00005"; transcript_id "unassi...  
4     .  gene_id "M7V54_RS00010"; transcript_id ""; gbk...  
['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']


In [3]:
# Show the raw attribute text of the first three records in full
for attribute_text in gtf_df['attribute'].head(3):
    print(attribute_text)

gene_id "M7V54_RS00005"; transcript_id ""; gbkey "Gene"; gene "dnaA"; gene_biotype "protein_coding"; locus_tag "M7V54_RS00005"; 
gene_id "M7V54_RS00005"; transcript_id "unassigned_transcript_1"; Ontology_term "GO:0006275"; Ontology_term "GO:0003688"; Ontology_term "GO:0005524"; Ontology_term "GO:0043565"; db_xref "GenBank:WP_003400253.1"; gbkey "CDS"; gene "dnaA"; go_function "DNA replication origin binding|0003688||IEA"; go_function "ATP binding|0005524||IEA"; go_function "sequence-specific DNA binding|0043565||IEA"; go_process "regulation of DNA replication|0006275||IEA"; inference "COORDINATES: similar to AA sequence:RefSeq:NP_214515.1"; locus_tag "M7V54_RS00005"; product "chromosomal replication initiator protein DnaA"; protein_id "WP_003400253.1"; transl_table "11"; exon_number "1"; 
gene_id "M7V54_RS00005"; transcript_id "unassigned_transcript_1"; Ontology_term "GO:0006275"; Ontology_term "GO:0003688"; Ontology_term "GO:0005524"; Ontology_term "GO:0043565"; db_xref "GenBank:WP_00

In [5]:
import pandas as pd
import re

# Read the mapping TSV into a dictionary
def read_mapping(mapping_file):
    """Load the new -> old locus tag lookup table from a tab-separated file.

    Args:
        mapping_file: Path or file-like object for a TSV whose header row
            contains at least the columns ``new_locus_tag`` and
            ``old_locus_tag``.

    Returns:
        dict mapping each ``new_locus_tag`` string to its ``old_locus_tag``
        string. Rows in which either tag is missing are left out, so a tag
        without a counterpart is simply absent from the result.

    Raises:
        KeyError: If either required column is absent from the file.
    """
    # Read every column as text so tags are never coerced to numbers; the
    # lookup keys must be strings to match what the regex captures later.
    mapping_df = pd.read_csv(mapping_file, sep='\t', dtype=str)

    # A blank cell is parsed as NaN. Keeping such rows would put NaN into the
    # dict and later write the literal text "nan" into the GTF as a locus tag.
    complete_rows = mapping_df.dropna(subset=['new_locus_tag', 'old_locus_tag'])

    return dict(zip(complete_rows['new_locus_tag'], complete_rows['old_locus_tag']))

# Function to replace locus_tag in attribute strings
def replace_locus_tag(attr_str, mapping):
    """Rewrite ``locus_tag "..."`` values in a GTF attribute string.

    Args:
        attr_str: The attribute text of one GTF record.
        mapping: dict whose keys are the locus tag values to look for and
            whose values are the replacement tags.

    Returns:
        The attribute string with every ``locus_tag`` value that is a key of
        ``mapping`` swapped for its mapped value. Values not present in
        ``mapping`` and all other attributes are returned unchanged.
    """
    # The leading \b stops the pattern from matching inside longer keys that
    # merely end in "locus_tag" (e.g. old_locus_tag "..."), whose values must
    # not be rewritten.
    pattern = r'\blocus_tag "([^"]+)"'

    def replacer(match):
        tag = match.group(1)
        if tag in mapping:
            return f'locus_tag "{mapping[tag]}"'
        # Unknown tag: hand back the matched text exactly as it was
        return match.group(0)

    # Replace all occurrences
    return re.sub(pattern, replacer, attr_str)

# Main function to process GTF file
def process_gtf(gtf_file, mapping_file, output_file):
    """Copy a GTF file, rewriting locus_tag values through a mapping table.

    Each record's attribute column (the ninth field) is passed through
    ``replace_locus_tag``; the other eight fields are written back verbatim.
    Lines that start with ``#`` and blank lines are not copied to the output.

    Args:
        gtf_file: Filesystem path of the GTF file to read.
        mapping_file: Mapping TSV, forwarded to ``read_mapping``.
        output_file: Filesystem path of the GTF file to write.

    Returns:
        int: Number of feature records written.

    Raises:
        ValueError: If a record does not have exactly nine tab-separated
            fields.
    """
    n_fields = 9
    attribute_idx = n_fields - 1

    # Read mapping
    mapping_dict = read_mapping(mapping_file)

    # The file is parsed line by line rather than with
    # pd.read_csv(..., comment='#'): pandas treats '#' as a comment marker
    # anywhere in a line, so a '#' inside an attribute value would silently
    # cut the record short. Plain string handling also avoids any dtype/NA
    # inference that could alter a field on the round trip.
    updated_records = []
    with open(gtf_file, encoding='utf-8') as src:
        for line_no, raw_line in enumerate(src, start=1):
            record = raw_line.rstrip('\r\n')
            if not record.strip() or record.startswith('#'):
                continue

            fields = record.split('\t')
            if len(fields) != n_fields:
                raise ValueError(
                    f"{gtf_file}, line {line_no}: expected {n_fields} "
                    f"tab-separated fields, found {len(fields)}"
                )

            fields[attribute_idx] = replace_locus_tag(
                fields[attribute_idx], mapping_dict
            )
            updated_records.append('\t'.join(fields))

    # Write only after the whole input has been validated, so a malformed
    # record never leaves a half-written output file behind.
    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.writelines(f'{record}\n' for record in updated_records)

    return len(updated_records)


# Inputs/outputs for the relabelling run (gtf_path is set in the GTF-loading cell)
mapping_file = "intermed_GCF_026185275.1_ASM2618527v1_genomic_locus_tags_from_GCF_000195955.2_ASM19595v2_genomic.tsv"
outfile = "fixed_gtf_for_pam.gtf"

# Rewrite the locus tags and report how many records were handled
rows_processed = process_gtf(
    gtf_file=gtf_path,
    mapping_file=mapping_file,
    output_file=outfile,
)
print(f"Processed {rows_processed} rows. Output written to {outfile}")



Processed 16494 rows. Output written to fixed_gtf_for_pam.gtf
