In [1]:
import math
import json
import random
import ast
import re
import os
from os import path
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align import AlignInfo
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
from scipy import stats
from collections import Counter

## Count fixations at each site, using same time windows as adaptation analysis

In [2]:
def readin_virus_config(virus):
    config_json = f'config/adaptive_evo_config_{virus}.json'
    with open(config_json) as json_handle:
        configs = json.load(json_handle)
        
    return configs

In [3]:
def readin_data(saved_data_name):
    with open(saved_data_name) as json_handle:
        json_dict = json.load(json_handle)
        (aligned_codons_in_window, outgroup_codons, year_windows) = (json_dict['aligned_codons_in_window'], 
                                                                     json_dict['outgroup_codons'], 
                                                                     json_dict['year_windows'])
    return aligned_codons_in_window, outgroup_codons, year_windows

In [4]:
def get_alignment_outgroup_data(virus, subtype, gene, window, min_seqs, year_max, year_min):
    """
    If the alignment and outgroup data has already been organized for this 
    virus, subtype, gene, window, min_seqs, year_min, year_max combo, then read in the data. 
    Otherwise, calculate it and save it. Return aligned_codons_in_window, outgroup_codons, year_windows
    """
    
    filepath = 'adaptation_results/intermediates/'
    
    saved_data_name = f'{filepath}input_data_{virus}_{subtype}_{gene}_{window}_{min_seqs}_{year_min}_{year_max}.json'
    
    #prepare data and save it if this has not already been done
    if not path.exists(saved_data_name):
        print(f'need to run adaptation analysis for {virus} {gene}')
    
    #read in the prepared alignment and outgroup data
    (aligned_codons_in_window, outgroup_codons, year_windows) = readin_data(saved_data_name)
    
    #reformat alignment outgroup to flat list of codons
    outgroup_codons = [x[0] for x in outgroup_codons]

    return aligned_codons_in_window, outgroup_codons, year_windows

In [5]:
def walk_through_sites(outgroup_codons, alignment_seqs, midfreq_high):
    """
    Walk through each codon in each sequence of the alignment and determine whether 
    there was a synonymous mut, nonsynonymous mut or neither. 
    If there was a mutation, want to know its nucleotide position
    """

    
    #initialize arrays to count fixations at each codon 
    nonsyn_fixations_in_window = np.zeros(len(outgroup_codons))
    syn_fixations_in_window = np.zeros(len(outgroup_codons))
    
    #initialize arrays to count fixations at each nucleotide
    nonsyn_fixations_in_window_nt = np.zeros(len(outgroup_codons)*3)
    syn_fixations_in_window_nt = np.zeros(len(outgroup_codons)*3)
    
    #initialize dictionary to keep track of codons {pos:codon seq} that have fixed during this time window, 
    #so that outgroup_seq can be updated
    fixed_codons = {}
    
    #list of codons in alignment organized as [['ATG','ATG','ATG'],['TAG','TAG','TAG']]
    alignment_codons = [[] for x in outgroup_codons]
    for isolate_codons in alignment_seqs:
        for i in range(len(isolate_codons)):
            alignment_codons[i].append(str(isolate_codons[i]))  

    
    #walk through sequence codon by codon
    for i in range(len(outgroup_codons)):
        #only consider unambiguous sequencing
        if set('AGCT').intersection(set(str(outgroup_codons[i]))) == set(str(outgroup_codons[i])):
            #find fixations or near-fixations
            fixation_type, fixed_codon_seq, fixed_nts = find_mutation_fixations(i, outgroup_codons[i], 
                                                                     alignment_codons[i], midfreq_high)
            if fixed_codon_seq!=None:
                fixed_codons[i] = fixed_codon_seq
            if fixation_type== 'nonsynonymous':
                nonsyn_fixations_in_window[i]+=1
                nonsyn_fixations_in_window_nt[i:i+3] =fixed_nts
            elif fixation_type== 'synonymous':
                syn_fixations_in_window[i]+=1
                syn_fixations_in_window_nt[i:i+3] = fixed_nts
                
                    
    return (nonsyn_fixations_in_window, syn_fixations_in_window, fixed_codons, 
            nonsyn_fixations_in_window_nt, syn_fixations_in_window_nt)


In [6]:
def find_mutation_fixations(pos, outgroup_codon, alignment_codons, midfreq_high):
    """
    At a given codon position, find whether any mutations occurred, 
    and if so what frequency they are at in the population
    
    Classify mutations that reach between midfreq_high and 100% as fixations
    """ 

    #only consider unabiguous sequencing
    alignment_codons = [x for x in alignment_codons if set('AGCT').intersection(set(str(x))) == set(str(x))]

    #get a count of the different codon sequences observed at this position
    codon_seq_counts = Counter(alignment_codons)
    codon_seqs = list(codon_seq_counts.keys())
    codon_freqs = {c: (codon_seq_counts[c] / len(alignment_codons)) for c in codon_seq_counts}

    
    #all codons are the same in the alignment
    if len(codon_seqs)==1:
        #check if they are the same as the outgroup
        #or whether they are a fixed mutation
        if outgroup_codon==codon_seqs[0]:
            site_type = 'no_fixation'
        #or whether they are a fixed mutation
        elif outgroup_codon!=codon_seqs[0]:
            site_type = 'fixation'
            fixed_codon = codon_seqs[0]
            
    #if there are multiple codon sequences observed at this position, 
    #see if there are any mutations present at a frequency of midfreq_high or higher
    elif len(codon_seqs)!=1:
        #default is no fixation, can be overwritten if one codon is present at high freq and is a mutation
        site_type = 'no_fixation'
        for cod in codon_seqs:
            #check whether any of the codon sequences are present at high enough frequency
            if codon_freqs[cod] >= midfreq_high:
                #check if it is a mutation
                if cod != outgroup_codon:
                    site_type = 'fixation'
                    fixed_codon = cod


    # check if fixation was synonymous or nonsynonymous                
    if site_type == 'fixation':
        outgroup_aa = Seq(outgroup_codon).translate()
        alignment_codon_aa = Seq(fixed_codon).translate()
        if outgroup_aa == alignment_codon_aa:
            fixation_type = 'synonymous'
        elif outgroup_aa != alignment_codon_aa:
            fixation_type = 'nonsynonymous'
        #find which nt positions have fixed
        fixed_nts = [1 if outgroup_codon[i]!=fixed_codon[i] else 0 for i in range(len(outgroup_codon))]
    elif site_type == 'no_fixation':
        fixation_type = None
        fixed_codon = None
        fixed_nts = [0,0,0]
    
    return fixation_type, fixed_codon, fixed_nts

In [7]:
def count_codon_fixations(alignment_sequences, outgroup_codons, midfreq_high):
    """
    For each time window, find if any mutations have fixed 
    (or nearly fixed, meaning reach a frequency exceeding midfreq_high).
    Find whether those mutations are nonsynonymous or synonymous.
    Keep a count of fixations/near-fixations at each codon
    """
    


    #initiate arrays to record fixations (or near-fixations) at all codons in alignment
    #keep track of this for both nonsynonymous and synonymous mutations
    nonsynonymous_fixations = np.zeros(len(outgroup_codons))
    synonymous_fixations = np.zeros(len(outgroup_codons))
    
    nonsynonymous_fixations_nt = np.zeros(len(outgroup_codons)*3)
    synonymous_fixations_nt = np.zeros(len(outgroup_codons)*3)
    
    nonsyn_nt_fixations_by_year = {}
        

    for years, alignment_seqs in alignment_sequences.items():

  
        #look for fixations in each time window
        (nonsyn_fixations_in_window, syn_fixations_in_window, fixed_codons, 
            nonsyn_fixations_in_window_nt, syn_fixations_in_window_nt) = walk_through_sites(outgroup_codons, 
                                                                                                alignment_seqs, midfreq_high)
        nonsynonymous_fixations += nonsyn_fixations_in_window
        synonymous_fixations += syn_fixations_in_window
        
        nonsynonymous_fixations_nt +=nonsyn_fixations_in_window_nt
        synonymous_fixations_nt +=syn_fixations_in_window_nt
        
        #keep track of how many nt fixations there are in each year window (by window midpoint)
        window_midpoint = (int(years.split('-')[0])+int(years.split('-')[1]))/2
        nonsyn_nt_fixations_by_year[window_midpoint] = sum(nonsyn_fixations_in_window_nt)
        
        #update outgroup_seq at codons that have fixed mutations 
        for pos, seq in fixed_codons.items():
            outgroup_codons[pos] = seq
    

    return (nonsynonymous_fixations, synonymous_fixations, 
            nonsynonymous_fixations_nt, nonsyn_nt_fixations_by_year, nonsyn_nt_fixations_by_year)


In [27]:
def count_fixations(virus, subtype, gene, 
         window=5, min_seqs=3, year_max=None, year_min=None, midfreq_high=1.0):
    """
    Count the number of fixations at each nucleotide site
    
    Return the total length of the gene, the total number of sites with at least one nonsyn mutation, 
    and the total number of sites with multiple nonsyn mutations
    """
    #for viruses with under 12 years of data, the adaptation analysis uses smaller windows
    if virus in ['h1n1pdm', 'mumps']:
        window, min_seqs = 3,3
    
    (aligned_codons_in_window, outgroup_codons, year_windows) = get_alignment_outgroup_data(virus, subtype, gene, 
                                                                                            window, min_seqs, 
                                                                                            year_max, year_min)
    
    total_len_gene = len(outgroup_codons)*3
    
    (nonsynonymous_fixations, synonymous_fixations,
     nonsynonymous_fixations_nt, synonymous_fixations_nt, 
     nonsyn_nt_fixations_by_year) = count_codon_fixations(aligned_codons_in_window, 
                                                          outgroup_codons, midfreq_high)
    

    #tally the number of sites with fixations (any number)
    codons_w_nonsyn_fixations = np.count_nonzero(nonsynonymous_fixations)
    codons_w_syn_fixations = np.count_nonzero(synonymous_fixations)
    nt_sites_w_a_nonsyn_fixation = np.count_nonzero(nonsynonymous_fixations_nt)
    
    #tally the number of sites with more than one nonsynonymous fixations
    codons_w_multiple_nonsyn_fixations = len([x for x in nonsynonymous_fixations if x>=2])
    codons_w_multiple_syn_fixations = len([x for x in synonymous_fixations if x>=2])
    nts_w_multiple_nonsyn_fixations = len([x for x in nonsynonymous_fixations_nt if x>=2])
    nts_w_multiple_syn_fixations = len([x for x in synonymous_fixations_nt if x>=2])
    
    print(f'gene length: {total_len_gene}')
    print(f'Nt sites with at least one nonsyn fixation: {nt_sites_w_a_nonsyn_fixation}')
    print(f'Nt sites with only one nonsyn fixation: {nt_sites_w_a_nonsyn_fixation - nts_w_multiple_nonsyn_fixations}')
    print(f'Nt sites with multiple nonsyn fixation: {nts_w_multiple_nonsyn_fixations}')

In [12]:
count_fixations('229e', None, 'S1')


gene length: 1635
Nt sites with at least one nonsyn fixation: 9
Nt sites with only one nonsyn fixation: 9
Nt sites with multiple nonsyn fixation: 0


In [28]:
count_fixations('h3n2', None, 'ha1')

gene length: 987
Nt sites with at least one nonsyn fixation: 52
Nt sites with only one nonsyn fixation: 44
Nt sites with multiple nonsyn fixation: 8


In [39]:
count_fixations('oc43', 'a', 'S1')

gene length: 2259
Nt sites with at least one nonsyn fixation: 11
Nt sites with only one nonsyn fixation: 9
Nt sites with multiple nonsyn fixation: 2
