In [13]:
import baltic as bt
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib import cm
import seaborn as sns
%matplotlib inline

from Bio import SeqIO, Seq
from collections import defaultdict
import json
import math
from itertools import combinations, product, izip, permutations
import pandas as pd
import numpy as np
from pprint import pprint
from scipy.stats import linregress
from copy import deepcopy

# Configure seaborn globally: 'whitegrid' style with fonts scaled by 1.2.
sns.set(style='whitegrid', font_scale = 1.2)

In [44]:
# Directory the distance table is written to; the titer-model files read
# below live in this same directory.
out_path = '../titer_model/rarefaction_output/nhp_output/'

# Tree JSON and genotype reference table.
tree_path = '../titer_model/titered_output/titered_tree.json'
reference_path = '../data/reference/genotype_refs_pyke2016.csv'

# Substitution-model JSON and the list of mutation clusters, both in out_path.
titer_path = out_path + 'titered_nhp_substitution_model.json'
titer_cluster_path = out_path + 'colinear_mutations.txt'

In [45]:
json_translation={'height': 'div', 'name': 'strain'}
tree = bt.loadJSON(tree_path, json_translation)

genotype_strains_path = '../data/reference/strain_genotypes.tsv'
strain_genotypes = pd.read_csv(genotype_strains_path, sep='\t', index_col=None, header=None, names=['strain', 'genotype'])

strain_genotypes = { k['strain'] : k['genotype'] for idx, k in strain_genotypes.iterrows()}

for k in tree.leaves:
    if k.traits['strain'] in strain_genotypes:
        k.traits['genotype'] = strain_genotypes[k.traits['strain']]
        k.traits['serotype'] = strain_genotypes[k.traits['strain']].split('_')[0]
    else:
        print 'annotation not found for ', k.traits['strain']


Tree height: 0.329384
Tree length: 2.129878
strictly bifurcating tree
annotations present

Numbers of objects in tree: 93 (46 nodes and 47 leaves)



In [46]:
# Load the 'mutations' entry of the titer-model JSON: a dict keyed by mutation
# name whose values are later summed as antigenic distances.
# The file is opened in a `with` block so the handle is closed deterministically.
with open(titer_path, 'r') as titer_file:
    antigenic_mutations = json.load(titer_file)['mutations']

# Each line of the cluster file is one cluster: whitespace-separated mutation names.
with open(titer_cluster_path, 'r') as cluster_file:
    mutation_clusters = [line.strip().split() for line in cluster_file]

# Move the effect of any antigenic mutation that belongs to a cluster from
# `antigenic_mutations` to `antigenic_clusters` (keyed by the full cluster
# tuple), so that the effect is attributed to the cluster as a whole and is
# not counted a second time as a single mutation.
antigenic_clusters = {}
for cluster in mutation_clusters:
    # pop() removes the mutation from antigenic_mutations while collecting its effect.
    cluster_effects = [antigenic_mutations.pop(mut) for mut in cluster
                       if mut in antigenic_mutations]

    # At most one member of a cluster is expected to carry an effect.
    assert len(cluster_effects) <= 1, \
        'more than one antigenic mutation in cluster %s' % (cluster,)
    if cluster_effects:
        antigenic_clusters[tuple(cluster)] = cluster_effects[0]

In [47]:
# ofile = open('../titer_model/titered_output/titered_monovalent_output/antigenic_mutations.tsv', 'a')
# for mut, val in antigenic_mutations.items():
#     ofile.write(mut+'\t'+str(round(val,2))+'\n')
# for cluster, val in antigenic_clusters.items():
#     ofile.write(','.join(cluster)+'\t'+str(round(val,2))+'\n')

In [48]:
def get_mutations(seq1, seq2):
    '''
    Compare two aligned amino-acid sequences position by position and return
    the differences as a list of strings of the form 'E:<aa1><pos><aa2>',
    e.g. 'E:K2R', where <aa1> is the residue in seq1, <aa2> the residue in
    seq2 and <pos> the 1-based position.

    The 'E:' prefix is hard-coded. If the sequences differ in length, only the
    positions present in both are compared (the comparison stops at the end of
    the shorter one).
    '''
    # The builtin zip yields the same pairs as itertools.izip and is available
    # on both Python 2 and Python 3.
    return ['E:' + aa1 + str(pos + 1) + aa2
            for pos, (aa1, aa2) in enumerate(zip(seq1, seq2))
            if aa1 != aa2]

def get_single_mut_Dij(muts, antigenic_mutations):
    '''
    Add up the effects of all mutations in `muts` that have an entry in
    `antigenic_mutations` (mutation -> effect). Mutations without an entry
    contribute nothing; a mutation listed twice in `muts` is counted twice.
    Returns 0 when nothing matches.
    '''
    total = 0
    for mut in muts:
        if mut in antigenic_mutations:
            total = total + antigenic_mutations[mut]
    return total

def get_mut_cluster_Dij(muts, antigenic_clusters):
    '''
    Add up the effects of every cluster in `antigenic_clusters`
    (tuple of mutations -> effect) that shares at least one mutation with
    `muts`. A cluster contributes its effect once, no matter how many of its
    members are present. Returns 0.0 when no cluster matches.
    '''
    matched_effects = (effect
                       for members, effect in antigenic_clusters.items()
                       if any(member in muts for member in members))
    return sum(matched_effects, 0.)

def get_Dij(sera, virus,
            antigenic_mutations=antigenic_mutations,
            antigenic_clusters=antigenic_clusters):
    '''
    Antigenic distance between a serum strain and a virus strain, computed
    from the amino-acid differences between their sequences.

    sera, virus          tree objects whose traits['strain'] is a key of the
                         notebook-level `aa_seqs` dict
    antigenic_mutations  mutation -> effect
    antigenic_clusters   tuple of mutations -> effect

    Returns the summed single-mutation effects plus the summed cluster
    effects. The serum-potency and virus-avidity terms are disabled in this
    version and are not added.
    '''
    # aa_seqs is read from the notebook namespace at call time.
    differences = get_mutations(aa_seqs[sera.traits['strain']],
                                aa_seqs[virus.traits['strain']])

    single_effect = get_single_mut_Dij(differences, antigenic_mutations)
    cluster_effect = get_mut_cluster_Dij(differences, antigenic_clusters)
    return single_effect + cluster_effect

In [49]:
# Read the FASTA alignment into a dict keyed by record name. The file is
# opened in a `with` block so the handle is closed once parsing is done.
with open('../data/sequences/titered_strains_alignment.mfa', 'r') as alignment_file:
    seqs = {s.name : s for s in SeqIO.parse(alignment_file, 'fasta')}

# Translate each sequence to amino acids; gap characters ('-') are replaced
# with 'N' before translation.
aa_seqs = {k: Seq.translate(str(v.seq).replace('-', 'N')) for k,v in seqs.items()}

In [50]:
def get_clade_Dij(sera_clade, virus_clade, clade_trait='genotype',
                  tree=tree, aa_seqs=aa_seqs,
                  antigenic_mutations=antigenic_mutations, antigenic_clusters=antigenic_clusters):
    '''
    Mean antigenic distance between two clades: the average of get_Dij over
    every (serum tip, virus tip) pair, with serum tips drawn from `sera_clade`
    and virus tips from `virus_clade`.

    sera_clade, virus_clade  values of `clade_trait` identifying the clades
    clade_trait              name of the leaf trait used to assign tips to clades
    tree                     tree whose `leaves` are searched
    aa_seqs                  accepted for interface compatibility but not used
                             here; get_Dij reads the notebook-level `aa_seqs`
    antigenic_mutations,
    antigenic_clusters       passed through to get_Dij

    Leaves that lack `clade_trait` (e.g. strains for which no annotation was
    found) are skipped instead of raising KeyError.

    Raises ValueError if either clade has no tips, since the mean over zero
    pairs is undefined.
    '''
    def clade_tips(clade):
        # Unannotated leaves cannot belong to any clade.
        return [leaf for leaf in tree.leaves
                if clade_trait in leaf.traits and leaf.traits[clade_trait] == clade]

    sera_clade_tips = clade_tips(sera_clade)
    virus_clade_tips = clade_tips(virus_clade)

    n_pairs = len(sera_clade_tips) * len(virus_clade_tips)
    if n_pairs == 0:
        raise ValueError('no tip pairs for %s: serum clade %r has %d tips, virus clade %r has %d tips'
                         % (clade_trait, sera_clade, len(sera_clade_tips),
                            virus_clade, len(virus_clade_tips)))

    dTiter = 0.  # float accumulator keeps the final division a true division on Python 2
    for sera, virus in product(sera_clade_tips, virus_clade_tips):
        dTiter += get_Dij(sera, virus, antigenic_mutations, antigenic_clusters)

    return dTiter / n_pairs

In [51]:
# Every genotype label present on any tree object (leaf or internal node).
genotypes = {obj.traits['genotype'] for obj in tree.Objects if 'genotype' in obj.traits}

# genotype_distances[serum][virus] -> mean antigenic distance.
# Ordered pairs are used because the measure is asymmetrical: both
# (serum1, virus2) and (serum2, virus1) are evaluated.
genotype_distances = defaultdict(dict)
for serum, virus in permutations(genotypes, 2):
    genotype_distances[serum][virus] = get_clade_Dij(serum, virus, 'genotype')

# Distance to self is always 0.
for genotype in genotypes:
    genotype_distances[genotype][genotype] = 0.

In [52]:
def write_distances_to_tsv(distances, path):
    '''
    Write a nested distance dict to a tab-separated file.

    distances  dict of dicts: distances[serum][virus] -> numeric distance
    path       output file path; an existing file is overwritten

    One line is written per (serum, virus) pair, in dict iteration order:
    serum, virus and the distance formatted to two decimals.
    '''
    # The `with` block closes (and therefore flushes) the file even if a
    # write fails part-way through.
    with open(path, 'w') as ofile:
        for serum, distance_dict in distances.items():
            for virus, Dij in distance_dict.items():
                ofile.write(serum+'\t'+virus+'\t'+'%.2f'%Dij+'\n')
            

# Save the genotype-by-genotype distance table as genotype_Dij.tsv inside out_path.
write_distances_to_tsv(genotype_distances, out_path+'genotype_Dij.tsv')