#### Predict epitopes based on residues that:
1. are surface-exposed
2. have nonsynonymous ("nonsyn") fixations (or near-fixations)
3. cluster with other surface-exposed residues that also have nonsynonymous fixations

In [133]:
import ast
import random
import pandas as pd
from Bio import SeqIO

In [2]:
def get_fixation_results(virus, subtype, gene, mutation_type):
    """
    Load the per-codon fixation counts that were saved for one virus and gene.

    The CSV is read from `results/`; `subtype` is part of the filename only when
    it is truthy. Returns a list of (codon, count) tuples in file order, where
    count is taken from the `{mutation_type}_fixations` column.
    """
    #viruses without a subtype have one fewer field in the filename
    if not subtype:
        csv_path = f'results/{virus}_{gene}_{mutation_type}_fixations.csv'
    else:
        csv_path = f'results/{virus}_{subtype}_{gene}_{mutation_type}_fixations.csv'

    #table listing the number of fixations (or near fixations) at each codon
    table = pd.read_csv(csv_path)

    codon_column = list(table['codon'])
    count_column = list(table[f'{mutation_type}_fixations'])

    #pair each codon with its fixation count
    return list(zip(codon_column, count_column))

In [113]:
def adjust_fixation_result_coordinates(virus, subtype, gene, mutation_type):
    """
    Re-label the fixation results with PDB-style chain IDs and residue numbers.

    Some pdb files list different subunits as different chains, while others list
    them as part of the same chain. This affects the chain ID and residue number.
    Codons up to the length of the chain-A subunit (read from the reference genbank
    file) are assigned to chain A with their number unchanged. Later codons are
    assigned to chain B and renumbered relative to the start of chain B, unless
    the virus is listed in `continue_A_nums_to_B`.

    Returns a list of (f'{chain}_{residue_number}', fixation_count) tuples, in the
    same order as the fixation results.

    Raises:
        KeyError: if no reference file / chain-A subunit is configured for the
            given virus, subtype and gene.
        ValueError: if the reference file has no CDS feature whose `gene`
            qualifier matches the chain-A subunit.
    """

    #get the list of codons and fixation counts at these codons
    fixation_counts_by_codon = get_fixation_results(virus, subtype, gene, mutation_type)
    #then adjust codon coordinates to be consistent with PDB chains and coordinates

    #will need to do this somewhat manually for each virus based on the pdb file
    reference_file_paths =  {'h3n2':{None:{'ha':f'../../../seasonal-flu/config/reference_h3n2_{gene}.gb'}},
                             'h1n1pdm':{None:{'ha':f'../../../seasonal-flu/config/reference_h1n1pdm_{gene}.gb'}},
                             'vic':{None:{'ha':f'../../../seasonal-flu/config/reference_vic_{gene}.gb'}},
                             'yam':{None:{'ha':f'../../../seasonal-flu/config/reference_yam_{gene}.gb'}},
                             'oc43':{'a':{'spike':f'../../../seasonal-cov/oc43/separate_lineages/config/oc43_{gene}_reference.gb'}},
                             '229e':{None:{'spike':f'../../../seasonal-cov/229e/config/229e_{gene}_reference.gb'}},
                             'nl63':{None:{'spike':f'../../../seasonal-cov/nl63/config/nl63_{gene}_reference.gb'}},
                             'measles':{None:{'h':f'../../../measles/config/measles_reference.gb'}},
                             'mumps':{None:{'hn':f'../../../mumps/config/mumps_reference.gb'}}}

    reference_file = reference_file_paths[virus][subtype][gene]

    #some pdb files have subunits listed as separate chains, some have them listed as one.
    #This maps each virus to the subunit that makes up chain A; for the flu HAs
    #(h3n2, h1n1pdm, vic, yam) the codons after HA1, i.e. HA2, end up on chain B
    chainA_subunit = {'h3n2':'ha1', 'h1n1pdm':'ha1', 'oc43':'spike', '229e':'spike', 'nl63':'spike',
                      'vic': 'ha1', 'yam': 'ha1', 'measles':'h', 'mumps':'hn'}
    #some have different chains, but coordinate numbers continue
    continue_A_nums_to_B = []

    #length (in amino acids) of the chain-A subunit; stays None if the reference
    #has no matching CDS so that case can be reported clearly below
    chainA_len = None
    chainA_gene = chainA_subunit[virus]
    for seq_record in SeqIO.parse(reference_file, "genbank"):
        for feature in seq_record.features:
            if feature.type != 'CDS' or 'gene' not in feature.qualifiers:
                continue
            if feature.qualifiers['gene'][0].lower() == chainA_gene:
                #if several features match, the last one wins
                chainA_len = len(feature.location.extract(seq_record.seq).translate())

    if chainA_len is None:
        raise ValueError(f"No CDS feature with gene '{chainA_gene}' found in {reference_file}")

    fixation_counts_adjusted_coordinates = []

    for codon_label, fixations in fixation_counts_by_codon:
        codon = int(codon_label)
        if codon <= chainA_len:
            #chainA coordinates should already be adjusted to the proper coordinates
            #when they were saved by epitopes-count-fixations.ipynb, so no offset is applied
            chain = 'A'
            adjusted_codon = codon
        else:
            #chainB coordinates restart after the end of chain A,
            #unless this pdb keeps counting from chain A
            chain = 'B'
            if virus in continue_A_nums_to_B:
                adjusted_codon = codon
            else:
                adjusted_codon = codon-chainA_len
        fixation_counts_adjusted_coordinates.append((f'{chain}_{adjusted_codon}', fixations))

    return fixation_counts_adjusted_coordinates
        

In [5]:
def get_exposure_results(virus, subtype, exposure_cutoff):
    """
    Read in the file listing all residues that have at least `exposure_cutoff`
    square angstroms exposed to the solvent.

    The file is looked up in `results/`; `subtype` is part of the filename only
    when it is truthy. Each non-blank line is parsed with `ast.literal_eval` and
    must be a sequence whose first two items are the chain and the residue number.

    Returns a list of unique f'{chain}_{residue}' strings, in order of first
    appearance in the file.
    """

    if subtype:
        path_to_residues = f'results/{virus}_{subtype}_surfaceresidues_{exposure_cutoff}angstrom.txt'
    else:
        path_to_residues = f'results/{virus}_surfaceresidues_{exposure_cutoff}angstrom.txt'

    #dict keys give de-duplication while keeping a deterministic (file) order
    exposed_residues = {}

    with open(path_to_residues) as f:
        for line in f:
            entry_text = line.strip()
            #tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not entry_text:
                continue
            #parse each line only once
            entry = ast.literal_eval(entry_text)
            chain = entry[0]
            residue = entry[1]
            exposed_residues[f'{chain}_{residue}'] = None

    return list(exposed_residues)

In [6]:
def get_union_exposure_and_fixation(virus, subtype, gene, mutation_type, exposure_cutoff):
    """
    Find residues that are exposed and have fixations.

    Despite the name, this is the intersection of the two sets: a residue is kept
    only if it has more than zero fixations AND is in the exposed-residue list.

    Returns a list of (chain, residue, fixations) tuples, in the order of the
    fixation results. `residue` is a string because it is split back out of the
    'chain_residue' label.
    """

    #fixations at each codon, labelled as 'chain_residue'
    fixations_counts_by_codon = adjust_fixation_result_coordinates(virus, subtype, gene, mutation_type)

    #exposed residues (at specified cutoff, in square angstroms);
    #a set so that each membership test below is constant-time
    exposed_residues = set(get_exposure_results(virus, subtype, exposure_cutoff))

    #initialize list to store exposed residues with fixations
    exposed_residues_w_fixations = []

    for chain_codon, fixations in fixations_counts_by_codon:
        #skip codons without fixations, and codons that are not exposed
        if fixations > 0 and chain_codon in exposed_residues:
            label_parts = chain_codon.split('_')
            exposed_residues_w_fixations.append((label_parts[0], label_parts[1], fixations))

    return exposed_residues_w_fixations
    

In [35]:
def save_exposed_residues_w_fixations(virus, subtype, gene, mutation_type, exposure_cutoff):
    """
    Write the exposed residues that have fixations to a text file in `results/`.

    One `(chain, residue)` tuple is written per line; the fixation counts are not
    saved. `subtype` is part of the filename only when it is truthy.
    """
    residues_with_counts = get_union_exposure_and_fixation(virus, subtype, gene, mutation_type, exposure_cutoff)

    #keep chain and residue number only, dropping the number of fixations
    chain_residue_pairs = [(entry[0], entry[1]) for entry in residues_with_counts]

    if not subtype:
        out_path = f'results/fixed_exposed_{virus}_{mutation_type}_{exposure_cutoff}.txt'
    else:
        out_path = f'results/fixed_exposed_{virus}_{subtype}_{mutation_type}_{exposure_cutoff}.txt'

    with open(out_path, 'w') as out_file:
        out_file.writelines(f'{pair}\n' for pair in chain_residue_pairs)
            
            
    

In [36]:
#h3n2 'ha': save exposed residues with nonsyn fixations, exposure cutoff 15 square angstroms
save_exposed_residues_w_fixations('h3n2', None, 'ha', 'nonsyn', 15)

In [37]:
#229e 'spike': save exposed residues with nonsyn fixations, exposure cutoff 15 square angstroms
save_exposed_residues_w_fixations('229e', None, 'spike', 'nonsyn', 15)

In [25]:
#oc43 (subtype 'a') 'spike': save exposed residues with nonsyn fixations, exposure cutoff 15 square angstroms
save_exposed_residues_w_fixations('oc43', 'a', 'spike', 'nonsyn', 15)

In [115]:
#nl63 'spike': save exposed residues with nonsyn fixations, exposure cutoff 20 square angstroms
save_exposed_residues_w_fixations('nl63', None, 'spike', 'nonsyn', 20)

In [63]:
#vic 'ha': save exposed residues with nonsyn fixations, exposure cutoff 20 square angstroms
save_exposed_residues_w_fixations('vic', None, 'ha', 'nonsyn', 20)

In [72]:
#h1n1pdm 'ha': save exposed residues with nonsyn fixations, exposure cutoff 15 square angstroms
save_exposed_residues_w_fixations('h1n1pdm', None, 'ha', 'nonsyn', 15)

In [79]:
#yam 'ha': save exposed residues with nonsyn fixations, exposure cutoff 15 square angstroms
save_exposed_residues_w_fixations('yam', None, 'ha', 'nonsyn', 15)

In [87]:
#measles 'h': save exposed residues with nonsyn fixations, exposure cutoff 20 square angstroms
save_exposed_residues_w_fixations('measles', None, 'h', 'nonsyn', 20)

In [100]:
#mumps 'hn': save exposed residues with nonsyn fixations, exposure cutoff 15 square angstroms
save_exposed_residues_w_fixations('mumps', None, 'hn', 'nonsyn', 15)

##### Then, in pymol run:

run /Users/katekistler/nextstrain/adaptive-evolution/adaptive_loci_results/fixations_per_site/saveClusters.py

##### Then:

saveClusters pdb=4fnk,virus=h3n2, subtype=None, gene=ha, mutation_type=nonsyn, exposure_cutoff=30, neighbor_cutoff=2

- Can use: `delete all` and then rerun saveClusters with new parameters

##### Then read in this info and make a .pml to color epitopes

In [125]:
def read_in_clusters(virus, subtype, mutation_type, exposure_cutoff, neighbor_cutoff):
    """
    Load the clusters of surface-exposed residues with fixations (or near
    fixations) that were found by running saveClusters.py in pymol.

    The file is looked up in `results/clusters/`; `subtype` is part of the
    filename only when it is truthy. Only the first line of the file is used: it
    is parsed as a Python literal and returned as-is.
    """
    if not subtype:
        clusters_path = f'results/clusters/{virus}_{mutation_type}_{exposure_cutoff}_{neighbor_cutoff}_clusters.txt'
    else:
        clusters_path = f'results/clusters/{virus}_{subtype}_{mutation_type}_{exposure_cutoff}_{neighbor_cutoff}_clusters.txt'

    with open(clusters_path) as clusters_file:
        #the whole cluster dictionary is stored on the first line
        first_line = clusters_file.readlines()[0]

    return ast.literal_eval(first_line)
    

In [116]:
#For each virus: chain A (and chain B, where present) mapped to the other chains
#in the pdb file that are identical to it in the trimer (or dimer)
multimerization_chains = {
    'h3n2': {'A': ['C', 'E'], 'B': ['D', 'F']},
    'h1n1pdm': {'A': ['C', 'E'], 'B': ['D', 'F']},
    'vic': {'A': ['C', 'E'], 'B': ['D', 'F']},
    'yam': {'A': ['C', 'E'], 'B': ['D', 'F']},
    'oc43': {'A': ['B', 'C']},
    '229e': {'A': ['B', 'C']},
    'nl63': {'A': ['B', 'C']},
    'measles': {'A': ['B']},
    'mumps': {'A': ['B']},
}

In [89]:
#extra pymol commands needed for particular pdb structures, keyed by virus;
#viruses that are not listed get no extra commands
pymol_settings = {
    'vic': ['set assembly, 1'],
    'measles': ['remove chain C', 'remove chain D'],
}

In [90]:
def write_pml_file_clusters(pdb_accession, virus, subtype, mutation_type,
                            exposure_cutoff, neighbor_cutoff, multimerize=True):
    """
    Write a pml file that colors every residue belonging to a cluster of surface
    residues with fixations, using one color for all clusters.

    The clusters are read with `read_in_clusters`. The script fetches
    `pdb_accession`, applies the shared display settings plus any commands listed
    for this virus in `pymol_settings`, then selects and colors each clustered
    residue. When `multimerize` is truthy, the same residue number is also colored
    on the chains listed for that chain in `multimerization_chains`.

    The file is written to `pml_colormaps/`; `subtype` is part of the filename
    only when it is truthy. Nothing is returned.
    """

    cluster_dict = read_in_clusters(virus, subtype, mutation_type, exposure_cutoff, neighbor_cutoff)

    text_lines = [f"fetch {pdb_accession}", "bg_color white", "color 0xfaf7fa", "show surface", "hide sticks",
                  "remove resname SO4", "remove solvent", "set seq_view, 1"]
    #add any specific pymol settings for this virus' pdb file
    text_lines.extend(pymol_settings.get(virus, []))

    #cluster numbers are not needed here because all clusters share one color
    for cluster_members in cluster_dict.values():
        for cluster_member in cluster_members:
            chain = cluster_member[0]
            residue = cluster_member[1]

            #color the residue on its own chain first, then on the identical chains
            chains_to_color = [chain]
            if multimerize:
                chains_to_color.extend(multimerization_chains[virus][chain])

            for chain_to_color in chains_to_color:
                #"select" with a single argument stores the selection as "sele"
                text_lines.append(f"select chain {chain_to_color} and resi {residue}")
                text_lines.append("color 0x855787, sele")

    if subtype:
        pml_filename = f'pml_colormaps/{virus}_{subtype}_{mutation_type}_{exposure_cutoff}_{neighbor_cutoff}_epitopes.pml'
    else:
        pml_filename = f'pml_colormaps/{virus}_{mutation_type}_{exposure_cutoff}_{neighbor_cutoff}_epitopes.pml'
    with open(pml_filename, 'w') as f:
        f.writelines(f'{line}\n' for line in text_lines)

In [41]:
#h3n2 on pdb 4fnk: nonsyn clusters from exposure cutoff 15, neighbor cutoff 8
write_pml_file_clusters('4fnk','h3n2', None, 'nonsyn', 15, 8)

In [44]:
#229e on pdb 6u7h: nonsyn clusters from exposure cutoff 15, neighbor cutoff 8
write_pml_file_clusters('6u7h','229e', None, 'nonsyn', 15, 8)

In [93]:
#h3n2 on pdb 4fnk: syn clusters from exposure cutoff 20, neighbor cutoff 4
#NOTE(review): no save_exposed_residues_w_fixations call with 'syn' is visible
#in these cells - confirm the matching cluster file exists
write_pml_file_clusters('4fnk','h3n2', None, 'syn', 20, 4)

In [32]:
#oc43 (subtype 'a') on pdb 6ohw: nonsyn clusters from exposure cutoff 15, neighbor cutoff 8
write_pml_file_clusters('6ohw','oc43', 'a', 'nonsyn', 15, 8)

In [121]:
#nl63 on pdb 5szs: nonsyn clusters from exposure cutoff 20, neighbor cutoff 6
write_pml_file_clusters('5szs','nl63', None, 'nonsyn', 20, 6)

In [70]:
#vic on pdb 4nrj: nonsyn clusters from exposure cutoff 15, neighbor cutoff 6
#NOTE(review): the save_exposed_residues_w_fixations call for 'vic' uses an
#exposure cutoff of 20, not 15 - confirm which cutoff is intended
write_pml_file_clusters('4nrj','vic', None, 'nonsyn', 15, 6)

In [77]:
#h1n1pdm on pdb 4m4y: nonsyn clusters from exposure cutoff 15, neighbor cutoff 8
write_pml_file_clusters('4m4y','h1n1pdm', None, 'nonsyn', 15, 8)

In [84]:
#yam on pdb 4nrj: nonsyn clusters from exposure cutoff 15, neighbor cutoff 8
#NOTE(review): same pdb accession as the 'vic' call, but pymol_settings has no
#'yam' entry (so no 'set assembly, 1') - confirm this is intended
write_pml_file_clusters('4nrj','yam', None, 'nonsyn', 15, 8)

In [96]:
#measles on pdb 3inb: nonsyn clusters from exposure cutoff 20, neighbor cutoff 8
write_pml_file_clusters('3inb','measles', None, 'nonsyn', 20, 8)

In [104]:
#mumps on pdb 6jjn: nonsyn clusters from exposure cutoff 15, neighbor cutoff 6
write_pml_file_clusters('6jjn','mumps', None, 'nonsyn', 15, 6)

In [None]:
run /Users/katekistler/nextstrain/adaptive-evolution/adaptive_loci_results/fixations_per_site/pml_colormaps/h3n2_nonsyn_30_3_epitopes.pml



In [143]:

def write_pml_file_color_epitopes(pdb_accession, virus, subtype, mutation_type,
                            exposure_cutoff, neighbor_cutoff, multimerize=True, seed=None):
    """
    Write a pml file that colors the structure according to clusters, giving each
    cluster its own color.

    The clusters are read with `read_in_clusters`. The script fetches
    `pdb_accession`, applies the shared display settings plus any commands listed
    for this virus in `pymol_settings`, then selects and colors each clustered
    residue. When `multimerize` is truthy, the same residue number is also colored
    on the chains listed for that chain in `multimerization_chains`.

    Colors come from a shuffled 16-color palette. Cluster numbers 1-16 each get a
    distinct color; any other integer cluster number wraps around the palette, so
    colors repeat once there are more than 16 clusters.

    `seed` is optional. With the default (None) the palette is shuffled with the
    module-level random generator, so the colors differ from run to run. Passing
    a seed makes the color assignment reproducible.

    The file is written to `pml_colormaps/`; `subtype` is part of the filename
    only when it is truthy. Nothing is returned.
    """

    cluster_dict = read_in_clusters(virus, subtype, mutation_type, exposure_cutoff, neighbor_cutoff)

    text_lines = [f"fetch {pdb_accession}", "bg_color white", "color 0xfaf7fa", "show surface", "hide sticks",
                  "remove resname SO4", "remove solvent", "set seq_view, 1"]
    #add any specific pymol settings for this virus' pdb file
    text_lines.extend(pymol_settings.get(virus, []))

    #colors for separate clusters
    colors_16 = ['f44336', 'e81e63','9c27b0','673ab7', '3f51b5',
                 '2196f3','03a9f4','00bcd4','009688', '4caf50',
                 '8bc34a', 'cddc39', 'ffeb3b', 'ffc107', 'ff9800', 'ff5722']
    if seed is None:
        random.shuffle(colors_16)
    else:
        #a private generator keeps the global random state untouched
        random.Random(seed).shuffle(colors_16)

    for cluster_num, cluster_members in cluster_dict.items():
        #cluster 1 takes the first shuffled color, cluster 2 the second, and so on;
        #the modulo avoids a KeyError for cluster numbers outside 1-16
        cluster_color = colors_16[(cluster_num - 1) % len(colors_16)]

        for cluster_member in cluster_members:
            chain = cluster_member[0]
            residue = cluster_member[1]

            #color the residue on its own chain first, then on the identical chains
            chains_to_color = [chain]
            if multimerize:
                chains_to_color.extend(multimerization_chains[virus][chain])

            for chain_to_color in chains_to_color:
                #"select" with a single argument stores the selection as "sele"
                text_lines.append(f"select chain {chain_to_color} and resi {residue}")
                text_lines.append(f"color 0x{cluster_color}, sele")

    if subtype:
        pml_filename = f'pml_colormaps/{virus}_{subtype}_{mutation_type}_{exposure_cutoff}_{neighbor_cutoff}_epitopes_separatecolors.pml'
    else:
        pml_filename = f'pml_colormaps/{virus}_{mutation_type}_{exposure_cutoff}_{neighbor_cutoff}_epitopes_separatecolors.pml'
    with open(pml_filename, 'w') as f:
        f.writelines(f'{line}\n' for line in text_lines)

In [149]:
#h3n2 on pdb 4fnk: separately-colored nonsyn clusters from exposure cutoff 15, neighbor cutoff 4
write_pml_file_color_epitopes('4fnk','h3n2', None, 'nonsyn', 15, 4)