In [1]:
import pandas as pd
import numpy as np

In [2]:
# Wild-type protein sequences, one string per protein, written in one-letter
# amino-acid codes. Every sequence ends with '*', which later cells count as an
# extra position after the last residue (the stop codon).
# NOTE(review): av / amac / cgre / pplu look like species abbreviations — confirm.

av_wt = 'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*'
amac_wt = 'MSKGEELFTGIVPVLIELDGDVHGHKFSVRGEGEGDADYGKLEIKFICTTGKLPVPWPTLVTTLSYGILCFARYPEHMKMNDFFKSAMPEGYIQERTIFFQDDGKYKTRGEVKFEGDTLVNRIELKGMDFKEDGNILGHKLEYNFNSHNVYIMPDKANNGLKVNFKIRHNIEGGGVQLADHYQTNVPLGDGPVLIPINHYLSCQTAISKDRNETRDHMVFLEFFSACGHTHGMDELYK*'
cgre_wt = 'MTALTEGAKLFEKEIPYITELEGDVEGMKFIIKGEGTGDATTGTIKAKYICTTGDLPVPWATILSSLSYGVFCFAKYPRHIADFFKSTQPDGYSQDRIISFDNDGQYDVKAKVTYENGTLYNRVTVKGTGFKSNGNILGMRVLYHSPPHAVYILPDRKNGGMKIEYNKAFDVMGGGHQMARHAQFNKPLGAWEEDYPLYHHLTVWTSFGKDPDDDETDHLTIVEVIKAVDLETYR*'
pplu_wt = 'MPAMKIECRITGTLNGVEFELVGGGEGTPEQGRMTNKMKSTKGALTFSPYLLSHVMGYGFYHFGTYPSGYENPFLHAINNGGYTNTRIEKYEDGGVLHVSFSYRYEAGRVIGDFKVVGTGFPEDSVIFTDKIIRSNATVEHLHPMGDNVLVGSFARTFSLRDGGYYSFVVDSHMHFKSAIHPSILQNGGPMFAFRRVEELHSNTELGIVEYQHAFKTPIAFA*'


# Gapped rows of the multiple-sequence alignment, copied from the fasta_aln file.
# '-' marks a gap; every other character (including the trailing '*') is a
# position of the corresponding wild-type sequence.
# Later cells index all four rows with range(len(av_aligned)), so the rows are
# expected to have the same length.
av_aligned   = 'MSK---GEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAM-PEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLP---DNHYLSTQSALSKDPNE-KRDHMVLLEFVTAAGITHGMDELYK*'
amac_aligned = 'MSK---GEELFTGIVPVLIELDGDVHGHKFSVRGEGEGDADYGKLEIKFICTTGKLPVPWPTLVTTLSYGILCFARYPEHMKMNDFFKSAM-PEGYIQERTIFFQDDGKYKTRGEVKFEGDTLVNRIELKGMDFKEDGNILGHKLEYNFNSHNVYIMPDKANNGLKVNFKIRHNIEGGGVQLADHYQTNVPLGDGPVLIP---INHYLSCQTAISKDRNE-TRDHMVFLEFFSACGHTHGMDELYK*'
cgre_aligned = 'MTALTEGAKLFEKEIPYITELEGDVEGMKFIIKGEGTGDATTGTIKAKYICTTGDLPVPWATILSSLSYGVFCFAKYPRHIA--DFFKSTQ-PDGYSQDRIISFDNDGQYDVKAKVTYENGTLYNRVTVKGTGFKSNGNILGMRVLYHSPPHAVYILPDRKNGGMKIEYNKAFDVMGGGHQMARHAQFNKPLGAWEEDYP---LYHHLTVWTSFGKDPDDDETDHLTIVEVIKAVDL--E---TYR*'
pplu_aligned = 'MPA-----------MKIECRITGTLNGVEFELVGGGEGTPEQGRMTNKMKSTKGALTFSPYLLSHVMGYGFYHFGTYPSGYE--NPFLHAINNGGYTNTRIEKYEDGGVLHVSFSYRYEAGRVIGDFKVVGTGFPEDSVIFTDKII-RSNATVEHLHPM-GDNVLVGSFARTFSLRDGGYYSFVVDSHMHFKSAIHPSILQNGGPMFAFRRVEELH-----SNTELGIVEYQHAFKTPIAFA----*'

# Lookup table keyed by the wild-type sequence string itself; each value is
# that protein's gapped alignment row.
prot_aligned = dict(zip(
    (av_wt, amac_wt, cgre_wt, pplu_wt),
    (av_aligned, amac_aligned, cgre_aligned, pplu_aligned),
))

In [3]:
def get_pseudo_aa_positions(wt, aligned):
    """Map native (ungapped) positions to their columns in the alignment.

    Parameters
    ----------
    wt : str
        Ungapped wild-type sequence. Only its length is used, to pre-fill
        the result with one key per native position.
    aligned : str
        The same sequence as it appears in the alignment, with '-' marking
        gap columns.

    Returns
    -------
    dict
        ``{native_position: aligned_position}``, both 0-based. The n-th
        non-gap character of ``aligned`` is assigned to native position n.
        Native positions that ``aligned`` never reaches keep the initial
        value 0.
    """
    native_to_column = dict.fromkeys(range(len(wt)), 0)
    # Alignment columns that hold a residue, in left-to-right order.
    residue_columns = (column for column, char in enumerate(aligned) if char != '-')
    for native_index, column in enumerate(residue_columns):
        native_to_column[native_index] = column
    return native_to_column

The variable "pseudopos_to_nativepos" is a dictionary linking each aligned ("pseudo") amino acid position to the corresponding original position in each native wild-type sequence. "pseudopos_to_nativeaa" does the same, but gives the wild-type amino acid found at each aligned position instead.

**pseudopos_to_nativepos** = {aligned_position : [av_native_pos, amac_native_pos, cgre_native_pos, pplu_native_pos]}

**pseudopos_to_nativeaa** = {aligned_position : [av_native_aa, amac_native_aa, cgre_native_aa, pplu_native_aa]}

In [4]:
# Alignment column -> native (ungapped) position in each wild-type.
# Each value is a 4-slot list ordered [av, amac, cgre, pplu]; a slot stays NaN
# when that protein has no residue mapped to the column (i.e. a gap).
pseudopos_to_nativepos = {x: [np.nan, np.nan, np.nan, np.nan] for x in range(len(av_aligned))}

for i, gene in enumerate([av_wt, amac_wt, cgre_wt, pplu_wt]):
    wt2pseudo = get_pseudo_aa_positions(gene, prot_aligned[gene])

    # Invert native->column once per protein instead of rebuilding and scanning
    # key/value lists for every column. setdefault keeps the FIRST native
    # position seen for a column, which is what list.index() would return.
    pseudo2wt = {}
    for native_pos, pseudo_pos in wt2pseudo.items():
        pseudo2wt.setdefault(pseudo_pos, native_pos)

    for x in pseudopos_to_nativepos:
        # Explicit membership test: a column with no residue is simply left as
        # NaN, and any unrelated error is no longer silently swallowed.
        if x in pseudo2wt:
            pseudopos_to_nativepos[x][i] = pseudo2wt[x]

In [5]:
# Alignment column -> the character each protein shows in that column, ordered
# [av, amac, cgre, pplu]. Gap columns therefore carry '-' for that protein.
pseudopos_to_nativeaa = {
    column: [row[column] for row in (av_aligned, amac_aligned, cgre_aligned, pplu_aligned)]
    for column in range(len(av_aligned))
}

The variable "nativepos_to_pseudopos" does the opposite, linking the wildtype position of amino acids with their aligned/"pseudo" positions.

**nativepos_to_pseudopos** = {native_position : [av_aligned_pos, amac_aligned_pos, cgre_aligned_pos, pplu_aligned_pos]}

In [6]:
# Native (ungapped) position -> alignment column for each protein.
# Each value is a 4-slot list ordered [av, amac, cgre, pplu]; a slot stays NaN
# when that protein is shorter than the native position being looked up.
wild_types = [av_wt, amac_wt, cgre_wt, pplu_wt]

# One key per position of the longest wild-type. For the four sequences defined
# in this notebook this is 239 (238 residues plus the trailing '*' stop marker).
max_native_len = max(len(wt) for wt in wild_types)
nativepos_to_pseudopos = {x: [np.nan, np.nan, np.nan, np.nan] for x in range(max_native_len)}

for i, gene in enumerate(wild_types):
    wt2pseudo = get_pseudo_aa_positions(gene, prot_aligned[gene])
    for x in nativepos_to_pseudopos:
        # Direct dict lookup; positions beyond this protein's length are left
        # as NaN explicitly rather than through a bare except.
        if x in wt2pseudo:
            nativepos_to_pseudopos[x][i] = wt2pseudo[x]