# Code to map the transcript IDs (ENSEMBL ID to RefSeq ID and vice versa)

### 1) Import libraries

In [1]:
import gzip

### 2) Define the input files

In [2]:
# Input files used by the mapping step.
# Text file with RefSeq IDs to look up, read line by line.
refseq_id_file = "refseq_ids_list.txt"
# Text file with ENSEMBL IDs to look up, read line by line.
ensembl_id_file = "ensembl_ids_list.txt"
# Gzip-compressed, tab-separated cross-reference table.
mapping_id_file = "kgXref.txt.gz"


### 3) Code body

In [3]:
# Read in the RefSeq IDs and ENSEMBL IDs (one ID per line).
# Surrounding whitespace is trimmed and blank lines are dropped, so a stray
# empty line (e.g. a trailing newline at the end of the file) never turns
# into an empty-string ID that could match an empty key in the mapping.
with open(refseq_id_file, 'r') as f:
    refseq_ids = [entry for entry in map(str.strip, f) if entry]

with open(ensembl_id_file, 'r') as f:
    ensembl_ids = [entry for entry in map(str.strip, f) if entry]


# Read in the mapping file (gzip-compressed, tab-separated text).
# No header row is skipped: every non-blank line is treated as a data row.
with gzip.open(mapping_id_file, 'rt') as f:
    # Remove only the line terminator before splitting. A plain strip() would
    # also eat leading/trailing tabs, which drops empty edge columns and can
    # shift or shorten the row relative to the column indices used later.
    text_file = [
        line.rstrip('\r\n').split('\t')
        for line in f
        if line.strip()  # ignore completely blank lines
    ]

    
# Create dictionaries for the RefSeq and ENSEMBL IDs.
# Columns read from each mapping row: 0 = ENSEMBL ID, 1 = RefSeq ID,
# 4 = gene symbol.
#   refseq_dict:  RefSeq ID  -> (ENSEMBL ID, gene symbol)
#   ensembl_dict: ENSEMBL ID -> (RefSeq ID, gene symbol)
# NOTE: when the same ID appears in several rows, the last row wins.
refseq_dict = {}
ensembl_dict = {}
for mapped in text_file:
    # Rows without both ID columns cannot contribute a mapping.
    if len(mapped) < 2:
        continue
    ensembl_id, refseq_id = mapped[0], mapped[1]
    # Tolerate rows that stop before the gene-symbol column.
    gene_symbol = mapped[4] if len(mapped) > 4 else ''
    # Never register an empty ID as a lookup key.
    if refseq_id:
        refseq_dict[refseq_id] = (ensembl_id, gene_symbol)
    if ensembl_id:
        ensembl_dict[ensembl_id] = (refseq_id, gene_symbol)

    
# Look up every requested ID in the corresponding dictionary; IDs without an
# entry are silently skipped.
# refseq_matches holds (RefSeq ID, ENSEMBL ID, gene symbol) tuples.
refseq_matches = [
    (query, *refseq_dict[query])
    for query in refseq_ids
    if query in refseq_dict
]

# ensembl_matches holds (ENSEMBL ID, RefSeq ID, gene symbol) tuples.
ensembl_matches = [
    (query, *ensembl_dict[query])
    for query in ensembl_ids
    if query in ensembl_dict
]

        
# Emit a tab-separated table: header first, then RefSeq-based matches,
# then ENSEMBL-based matches.
print('RefSeq ID\tENSEMBL ID\tGene Symbol')
for refseq, ensembl, symbol in refseq_matches:
    print(refseq, ensembl, symbol, sep='\t')
# ensembl_matches tuples are stored ENSEMBL-first; swap the first two fields
# so the columns line up with the header.
for ensembl, refseq, symbol in ensembl_matches:
    print(refseq, ensembl, symbol, sep='\t')

RefSeq ID	ENSEMBL ID	Gene Symbol
NM_145160	ENST00000178640.10	MAP2K5
NM_004686	ENST00000180173.10	MTMR7
NM_012288	ENST00000182527.4	TRAM2
NM_025227	ENST00000170150.4	BPIFB2
NM_020695	ENST00000170168.9	REXO1
NM_031453	ENST00000181796.7	FAM107B
