#### Read in the results file (written by epitopes-count-fixations.ipynb) that lists a count of fixations (or near-fixations) at each site in the surface protein, and create a .pml file that colors a PDB protein structure by the number of fixations at each site (on a heatmap scale)

In [1]:
import glob
import json
import ast
import matplotlib
import pandas as pd
from Bio import SeqIO

In [2]:
def get_fixation_results(virus, subtype, gene, mutation_type):
    """
    Load the per-codon fixation counts for one virus/gene from the results directory.

    The file read is `results/{virus}_{subtype}_{gene}_{mutation_type}_fixations.csv`;
    the `{subtype}_` part is left out of the name when `subtype` is falsy (e.g. None).

    Returns a tuple `(codons, fixation_counts)` of two parallel lists, taken from the
    `codon` and `{mutation_type}_fixations` columns of that file.
    """
    #viruses that are not split into subtypes have no subtype in the file name
    prefix = f'{virus}_{subtype}' if subtype else f'{virus}'
    csv_path = f'results/{prefix}_{gene}_{mutation_type}_fixations.csv'

    #one row per codon, with the number of fixations (or near fixations) at that codon
    table = pd.read_csv(csv_path)
    count_column = f'{mutation_type}_fixations'

    return list(table['codon']), list(table[count_column])

In [3]:
def make_color_scale(virus, subtype, gene, mutation_type, determine_max):
    """
    Assign each codon a heatmap color ('Reds' colormap) according to its number of fixations.

    determine_max sets the top of the color scale:
        'from_h3n2': use a fixed maximum of 5, so that every virus shares the same scale
        'from_self': use the largest fixation count found for this virus/gene

    Returns a tuple of:
        codon_fixations_colors: list of (codon, fixation_count, hex_color) tuples, where
            hex_color is the hex color string with the leading '#' removed
        heatmap_numbers: the integers 0 through max_fixations, for the scale bar
        heatmap_hex_colors: the color of each entry in heatmap_numbers, formatted as '0x......'

    Raises ValueError if determine_max is not one of the two options above.
    """

    codons, fixation_counts = get_fixation_results(virus, subtype, gene, mutation_type)

    if determine_max == 'from_h3n2':
        #fixed maximum so all viruses have the same scale
        #NOTE(review): this option is named for H3N2, but the comment previously here credited
        #OC43 spike with the maximum -- confirm which virus the value 5 comes from
        max_fixations = 5
    elif determine_max == 'from_self':
        max_fixations = max(fixation_counts)
    else:
        #fail with a clear message rather than an unbound-variable error further down
        raise ValueError(f"determine_max must be 'from_h3n2' or 'from_self', got {determine_max!r}")

    #normalize all fixation counts so they're between 0 and max_fixations -> 0 and 1 for the color-coding
    norm = matplotlib.colors.Normalize(vmin=0.0, vmax=max_fixations)

    #make cmap. matplotlib.cm.get_cmap is deprecated and absent from recent matplotlib releases,
    #so use the colormap registry when it exists and only fall back for old versions
    try:
        cmap = matplotlib.colormaps['Reds']
    except AttributeError:
        cmap = matplotlib.cm.get_cmap('Reds')

    def hex_without_hash(count):
        #to_hex gives '#rrggbb'; drop the '#' because the .pml file uses a '0x' prefix instead
        return matplotlib.colors.to_hex(cmap(norm(count)))[1:]

    hex_colors = [hex_without_hash(x) for x in fixation_counts]

    #a list (not a bare zip iterator) so the result can be iterated more than once
    codon_fixations_colors = list(zip(codons, fixation_counts, hex_colors))

    #get heatmap scale bar info
    heatmap_numbers = list(range(0, max_fixations + 1))
    heatmap_hex_colors = [f'0x{hex_without_hash(x)}' for x in heatmap_numbers]

    return codon_fixations_colors, heatmap_numbers, heatmap_hex_colors

In [4]:
def make_multiple_chains(virus, subtype, gene, codon_fixations_colors):
    """
    Assign every codon to a PDB chain ('A' or 'B') and convert its position to that chain's numbering.

    Some pdb files list different subunits as different chains, while others list them as part of the same.
    This affects the chains and coordinates to color.

    The length of chain A is the length of the translated CDS, in the virus' reference genbank file,
    whose `gene` qualifier matches the chain A subunit for this virus. Codons up to and including that
    length are placed on chain A with their position unchanged; later codons are placed on chain B and
    renumbered from the start of chain B (unless the virus is listed in continue_A_nums_to_B).

    codon_fixations_colors: iterable of (codon, fixation_count, hex_color) entries

    Returns a list of (chain, adjusted_codon, codon, fixation_count, hex_color) tuples, in input order.

    Raises KeyError if there is no reference file configured for this virus/subtype/gene, and
    ValueError if the reference file has no CDS for the chain A subunit.
    """

    #will need to do this somewhat manually for each virus based on the pdb file
    reference_file_paths = {
        'h3n2': {None: {'ha': f'../../../seasonal-flu/config/reference_h3n2_{gene}.gb'}},
        'h1n1pdm': {None: {'ha': f'../../../seasonal-flu/config/reference_h1n1pdm_{gene}.gb'}},
        'yam': {None: {'ha': f'../../../seasonal-flu/config/reference_yam_{gene}.gb'}},
        'vic': {None: {'ha': f'../../../seasonal-flu/config/reference_vic_{gene}.gb'}},
        'oc43': {'a': {'spike': f'../../../seasonal-cov/oc43/separate_lineages/config/oc43_{gene}_reference.gb'}},
        '229e': {None: {'spike': f'../../../seasonal-cov/229e/config/229e_{gene}_reference.gb'}},
        'nl63': {None: {'spike': f'../../../seasonal-cov/nl63/config/nl63_{gene}_reference.gb'}},
        'measles': {None: {'h': f'../../../measles/config/measles_reference.gb'}},
        'mumps': {None: {'hn': f'../../../mumps/config/mumps_reference.gb'}},
    }

    reference_file = reference_file_paths[virus][subtype][gene]

    #some pdb files have subunits listed as separate chains, some have them listed as one.
    #this is the gene name (in the reference file) of the subunit that makes up chain A
    chainA_subunit = {'h3n2': 'ha1', 'h1n1pdm': 'ha1', 'oc43': 'spike', '229e': 'spike', 'nl63': 'spike',
                      'vic': 'ha1', 'yam': 'ha1', 'measles': 'h', 'mumps': 'hn'}
    chainA_gene = chainA_subunit[virus]
    #for the influenza viruses (h3n2, h1n1pdm, vic, yam) chain B is the ha2 subunit

    #some have different chains, but coordinate numbers continue
    continue_A_nums_to_B = []

    #if several CDS features match, the last one read determines the length
    chainA_len = None
    for seq_record in SeqIO.parse(reference_file, "genbank"):
        for feature in seq_record.features:
            if feature.type != 'CDS' or 'gene' not in feature.qualifiers:
                continue
            if feature.qualifiers['gene'][0].lower() == chainA_gene:
                chainA_len = len(feature.location.extract(seq_record.seq).translate())

    if chainA_len is None:
        #without this check the comparison below would fail on an undefined chain length
        raise ValueError(f"no CDS with gene '{chainA_gene}' found in {reference_file}")

    adjusted_codon_fixations_colors = []

    for entry in codon_fixations_colors:
        codon = int(entry[0])
        if codon <= chainA_len:
            #chainA coordinates should already be adjusted to the proper coordinates
            #when they were saved by epitopes-count-fixations.ipynb, so they are used as they are
            chain = 'A'
            adjusted_codon = codon
        else:
            #but need to change chainB coordinates
            chain = 'B'
            if virus in continue_A_nums_to_B:
                adjusted_codon = codon
            else:
                adjusted_codon = codon - chainA_len
        adjusted_codon_fixations_colors.append((chain, adjusted_codon, entry[0], entry[1], entry[2]))

    return adjusted_codon_fixations_colors
        

In [5]:
#For each virus: the chains in its pdb file that are identical copies of chain A or chain B
#in the trimer (or dimer). Every virus gets its own freshly built entry.
multimerization_chains = {
    #influenza HA: chains A and B each have two more copies
    **{flu: {'A': ['C', 'E'], 'B': ['D', 'F']} for flu in ('h3n2', 'h1n1pdm', 'vic', 'yam')},
    #seasonal coronavirus spike: only chain A is used, with two more copies
    **{cov: {'A': ['B', 'C']} for cov in ('oc43', '229e', 'nl63')},
    #measles H and mumps HN: only chain A is used, with one more copy
    **{dimer: {'A': ['B']} for dimer in ('measles', 'mumps')},
}

In [6]:
#extra pymol commands that are only needed for particular pdb structures, keyed by virus
pymol_settings = dict(
    vic=['set assembly, 1'],
    measles=['remove chain C', 'remove chain D'],
)

In [7]:
def write_pml_file(pml_filename, pdb_accession, virus, subtype, gene, mutation_type, 
                   determine_max='from_self', multimerize=True):
    """
    Write .pml file to color every residue in the given pdb structure according to the number of fixations

    pml_filename: path of the .pml file to write (an existing file is overwritten)
    pdb_accession: accession used in the `fetch` command and as the object for `ramp_new`
    virus, subtype, gene, mutation_type: select the fixation results and the reference file
    determine_max: passed to make_color_scale to choose the top of the color scale
    multimerize: if true, residues on the chains listed in multimerization_chains for this virus
        are colored the same as the corresponding residue on chain A or B

    The file holds, in order: general display commands and the scale bar, any commands listed for
    this virus in pymol_settings, then a select/color pair of commands per residue and chain.
    """

    codon_fixations_colors, heatmap_numbers, heatmap_hex_colors = make_color_scale(virus, subtype, gene, 
                                                                                   mutation_type, determine_max)

    adjusted_codon_fixations_colors = make_multiple_chains(virus, subtype, gene, codon_fixations_colors)

    text_lines = [f"fetch {pdb_accession}", "bg_color white", "color 0xD3D3D3", "show surface", "hide sticks",
                  "remove resname SO4", "remove solvent", "set seq_view, 1",  
                  f"ramp_new fixations,  {pdb_accession}, {heatmap_numbers},  color={heatmap_hex_colors}"]

    #add any specific pymol settings for this virus' pdb file
    #NOTE(review): these commands are written after `fetch`; confirm that settings such as
    #`set assembly, 1` still take effect on a structure that has already been loaded
    text_lines.extend(pymol_settings.get(virus, []))

    for chain, position, _codon, _fixation_count, hex_color in adjusted_codon_fixations_colors:
        chains_to_color = [chain]
        if multimerize:
            #also color the corresponding residues on other chains of the trimer
            chains_to_color = chains_to_color + multimerization_chains[virus][chain]

        for chain_id in chains_to_color:
            text_lines.append(f"select chain {chain_id} and resi {position}")
            text_lines.append(f"color 0x{hex_color}, sele")

    with open(pml_filename, 'w') as f:
        f.writelines(f"{line}\n" for line in text_lines)
        
    

In [40]:
#H3N2 HA nonsynonymous fixations, on pdb structure 4fnk
write_pml_file(
    pml_filename='pml_colormaps/h3n2_nonsyn_fixations_colormap.pml',
    pdb_accession='4fnk',
    virus='h3n2',
    subtype=None,
    gene='ha',
    mutation_type='nonsyn',
)

In [83]:
#229E spike nonsynonymous fixations, on pdb structure 6u7h
write_pml_file(
    pml_filename='pml_colormaps/229e_nonsyn_fixations_colormap.pml',
    pdb_accession='6u7h',
    virus='229e',
    subtype=None,
    gene='spike',
    mutation_type='nonsyn',
)

In [50]:
#OC43 (subtype 'a') spike nonsynonymous fixations, on pdb structure 6ohw
write_pml_file(
    pml_filename='pml_colormaps/oc43A_nonsyn_fixations_colormap.pml',
    pdb_accession='6ohw',
    virus='oc43',
    subtype='a',
    gene='spike',
    mutation_type='nonsyn',
)

In [72]:
#NL63 spike nonsynonymous fixations, on pdb structure 5szs
write_pml_file(
    pml_filename='pml_colormaps/nl63_nonsyn_fixations_colormap.pml',
    pdb_accession='5szs',
    virus='nl63',
    subtype=None,
    gene='spike',
    mutation_type='nonsyn',
)

In [9]:
#H1N1pdm HA nonsynonymous fixations, on pdb structure 4m4y
write_pml_file(
    pml_filename='pml_colormaps/h1n1pdm_nonsyn_fixations_colormap.pml',
    pdb_accession='4m4y',
    virus='h1n1pdm',
    subtype=None,
    gene='ha',
    mutation_type='nonsyn',
)

In [10]:
#Vic HA nonsynonymous fixations, on pdb structure 4nrj
write_pml_file(
    pml_filename='pml_colormaps/vic_nonsyn_fixations_colormap.pml',
    pdb_accession='4nrj',
    virus='vic',
    subtype=None,
    gene='ha',
    mutation_type='nonsyn',
)



In [49]:
#Yam HA nonsynonymous fixations, on pdb structure 4nrj
#NOTE(review): this is the same accession as the vic cell, and pymol_settings has an entry
#('set assembly, 1') for 'vic' but none for 'yam' -- confirm both are intended
write_pml_file(
    pml_filename='pml_colormaps/yam_nonsyn_fixations_colormap.pml',
    pdb_accession='4nrj',
    virus='yam',
    subtype=None,
    gene='ha',
    mutation_type='nonsyn',
)

In [60]:
#measles H nonsynonymous fixations, on pdb structure 3inb
write_pml_file(
    pml_filename='pml_colormaps/measles_nonsyn_fixations_colormap.pml',
    pdb_accession='3inb',
    virus='measles',
    subtype=None,
    gene='h',
    mutation_type='nonsyn',
)

In [67]:
#mumps HN nonsynonymous fixations, on pdb structure 6jjn
write_pml_file(
    pml_filename='pml_colormaps/mumps_nonsyn_fixations_colormap.pml',
    pdb_accession='6jjn',
    virus='mumps',
    subtype=None,
    gene='hn',
    mutation_type='nonsyn',
)

In [None]:
run /Users/katekistler/nextstrain/adaptive-evolution/adaptive_loci_results/fixations_per_site/pml_colormaps/nl63_nonsyn_fixations_colormap.pml


In [None]:
run /Users/katekistler/nextstrain/adaptive-evolution/adaptive_loci_results/fixations_per_site/findSurfaceResidues.py