In [225]:
import ast
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from Bio import SeqIO
from Bio.Seq import Seq
%matplotlib inline

In [210]:
# Load the full strain table (path is relative to the notebook's directory)
all_df = pd.read_csv('../dataframes/h3n2_ha_12y_hi.csv')
# Dict-like index over the alignment, looked up by FASTA record id
aligned_fasta = SeqIO.index("../results/aligned_who_h3n2_ha_12y_concat_hi.fasta", "fasta")
#filter data for only paired sequences
df = all_df[all_df['pair_id']!=0]
# `egg_df` is derived from `df`, so `df` has to be assigned first; the original
# ordering only worked when `df` was already left over in the kernel.
# NOTE(review): if egg_df was meant to include unpaired egg strains as well,
# filter `all_df` here instead -- confirm against the intended analysis.
egg_df = df[df['passage']=='egg']

In [128]:
def nt_start(aa_position, offset_nt=48):
    """Return the 0-based index of the first nucleotide of a codon.

    Parameters
    ----------
    aa_position : int
        Amino acid position; position 1 maps to index ``offset_nt``.
    offset_nt : int, optional
        Number of nucleotides that precede codon 1 in the sequence.
        Defaults to 48 (presumably the length of the leading signal-peptide
        region in these HA sequences -- TODO confirm).

    Returns
    -------
    int
        ``offset_nt + 3 * (aa_position - 1)``, usable as a slice start.
    """
    return offset_nt + 3 * (aa_position - 1)
def nt_end(aa_position, offset_nt=48):
    """Return the exclusive end index (0-based) of a codon.

    Parameters
    ----------
    aa_position : int
        Amino acid position; position 1 ends at index ``offset_nt + 3``.
    offset_nt : int, optional
        Number of nucleotides that precede codon 1 in the sequence.
        Defaults to 48 (presumably the length of the leading signal-peptide
        region in these HA sequences -- TODO confirm).

    Returns
    -------
    int
        ``offset_nt + 3 * aa_position``, usable as a slice stop.
    """
    return offset_nt + 3 * aa_position

In [223]:
def find_codon(aa_position, strain):
    """Return, as a string, the codon at `aa_position` for record `strain`.

    The record is looked up by id in `aligned_fasta`; the slice bounds come
    from `nt_start` / `nt_end`.
    """
    record_seq = aligned_fasta[strain].seq
    first_nt, stop_nt = nt_start(aa_position), nt_end(aa_position)
    return str(record_seq[first_nt:stop_nt])

In [181]:
def count_codons(aa_position):
    """Tally the codons found at one site in `egg_df`, per genotype.

    Parameters
    ----------
    aa_position : int
        Amino acid position. `egg_df` must have a column named
        ``str(aa_position)`` (the grouping key) and a ``strain`` column
        whose values are record ids in `aligned_fasta`.

    Returns
    -------
    dict
        ``{genotype: {codon: count}}``; within each genotype the codons
        appear in order of first occurrence.
    """
    codon_count_by_genotype = {}

    for aa_geno, strains in egg_df.groupby(str(aa_position)):
        # find_codon performs the same alignment slice the loop used to inline
        codons = [find_codon(aa_position, strain) for strain in strains['strain']]
        # Counter keeps first-occurrence order; dict() keeps the plain-dict output
        codon_count_by_genotype[aa_geno] = dict(Counter(codons))

    return codon_count_by_genotype


# Codon usage at site 219 among the strains in egg_df, grouped by genotype
count_codons(219)

{'F': {'TWT': 6, 'TTT': 23, 'THT': 1, 'TTC': 3},
 'S': {'TCT': 432,
  'TYT': 8,
  'TCC': 44,
  'TYC': 2,
  'TMT': 9,
  'THT': 5,
  'TCA': 1},
 'Y': {'TAT': 33, 'TMT': 1, 'TWT': 2}}

In [211]:
# Re-organize DF to one row per egg-passaged sequence, with boolean flags for
# whether an unpassaged and/or cell-passaged sequence shares its `source`.

# Egg rows carry the mutation list; the other two tables only contribute a flag.
sub_egg = df.loc[df['passage'] == 'egg', ['source', 'egg_muts']]
sub_u = df.loc[df['passage'] == 'unpassaged', ['source']].assign(unpassaged_pair=True)
sub_cell = df.loc[df['passage'] == 'cell', ['source']].assign(cell_pair=True)

# Inner joins on `source` keep only sources present on both sides.
pairs_u_df = sub_egg.merge(sub_u, on='source')
pairs_cell_df = sub_egg.merge(sub_cell, on='source')
pairs_cell_u_df = sub_u.merge(sub_cell, on='source')
# The outer join leaves NaN in a flag column when that kind of pair is absent.
pairs_df = pairs_u_df.merge(pairs_cell_df, how='outer', on=['source', 'egg_muts'])

In [251]:
# For egg-passaged strains with a non-egg-passaged paired sequence, find nucleotide mutations

def _describe_nt_changes(reference_codon, egg_codon, codon_start):
    """Concatenate one `<ref nt><index><egg nt>` label per differing codon position."""
    return ''.join(
        str(reference_codon[i]) + str(codon_start + i) + str(egg_codon[i])
        for i in range(len(reference_codon))
        if reference_codon[i] != egg_codon[i]
    )


def count_nt_muts(muts):
    """Count the nucleotide changes behind selected egg mutations.

    For every row of `pairs_df`, each entry of its `egg_muts` list that is in
    `muts` is compared, codon against codon, with the paired unpassaged and/or
    cell-passaged sequence from the same `source`.

    Parameters
    ----------
    muts : iterable of str
        Substitutions to track, e.g. ``'T160K'``. The first run of digits in
        each string is taken as the amino acid position.

    Returns
    -------
    tuple of (dict, dict)
        ``(nucleotide_mutation_count, starting_codon_count)``. The first maps
        each mutation to ``{nt_change_label: count}``; the second maps each
        mutation to ``{codon_before_egg_passage: count}``. A label is the
        concatenation of ``<ref nt><index><egg nt>`` for every differing
        position (``''`` when the codons are identical). The index is the
        same 0-based sequence index used to slice the alignment.
    """
    nucleotide_mutations = {m: [] for m in muts}
    starting_codons = {m: [] for m in muts}

    # (flag column in pairs_df, suffix appended to `source` to get the record id)
    pair_kinds = (('unpassaged_pair', ''), ('cell_pair', '-cell'))

    for _, pair in pairs_df.iterrows():
        # `egg_muts` is stored as the string form of a Python literal
        for egg_mut in ast.literal_eval(pair['egg_muts']):
            if egg_mut not in muts:
                continue

            source = str(pair['source'])
            egg_mut_pos = int(re.findall(r'\d+', egg_mut)[0])
            codon_start = nt_start(egg_mut_pos)
            egg_codon = find_codon(egg_mut_pos, source + '-egg')

            for flag_column, suffix in pair_kinds:
                # Compare against True rather than testing truthiness: the outer
                # merge that builds pairs_df leaves NaN (which is truthy) when a
                # pair of this kind does not exist.
                if pair[flag_column] == True:  # noqa: E712
                    reference_codon = find_codon(egg_mut_pos, source + suffix)
                    nucleotide_mutations[egg_mut].append(
                        _describe_nt_changes(reference_codon, egg_codon, codon_start)
                    )
                    starting_codons[egg_mut].append(reference_codon)

    # Counter preserves first-occurrence order; dict() keeps plain-dict output
    nucleotide_mutation_count = {
        mut: dict(Counter(labels)) for mut, labels in nucleotide_mutations.items()
    }
    starting_codon_count = {
        mut: dict(Counter(codons)) for mut, codons in starting_codons.items()
    }

    return (nucleotide_mutation_count, starting_codon_count)

In [252]:
# Amino acid substitutions to look for in each pair's `egg_muts` list
mutations = ['T160K', 'L194P', 'G186V', 'D225G', 'S219F', 'S219Y', 'T203I', 'H156R', 'H156Q', 'A138S', 'N246H']

# Returns (nucleotide change counts, pre-egg codon counts), each keyed by mutation
count_nt_muts(mutations)

({'T160K': {'C526A': 63, 'R525AC526A': 1, 'G526A': 1},
  'L194P': {'T628C': 118},
  'G186V': {'G604T': 88},
  'D225G': {'A721G': 32, 'A721R': 2},
  'S219F': {'C703T': 19, '': 1, 'C703W': 6},
  'S219Y': {'C703A': 16, 'C703W': 1},
  'T203I': {'C655T': 20},
  'H156R': {'A514G': 12},
  'H156Q': {'C515A': 17},
  'A138S': {'G459T': 25},
  'N246H': {'A783C': 10}},
 {'T160K': {'ACA': 63, 'RCA': 1, 'AGA': 1},
  'L194P': {'CTG': 117, 'CTA': 1},
  'G186V': {'GGT': 87, 'GGC': 1},
  'D225G': {'GAT': 34},
  'S219F': {'TCT': 25, 'TTC': 1},
  'S219Y': {'TCT': 17},
  'T203I': {'ACA': 20},
  'H156R': {'CAC': 12},
  'H156Q': {'CAC': 17},
  'A138S': {'GCT': 25},
  'N246H': {'AAC': 10}})

# Codon is almost always the same prior to egg-passaging (very little nt diversity)
## Check accessibility of mutations (i.e. can the amino acid mutation occur in any other way with just one nt mutation)


T160K (ACA->AAA): this is the only one-nt mutation from ACA to yield Lysine  
L194P (CTG->CCG): this is the only one-nt mutation from CTG to yield Proline  
G186V (GGT->GTT): this is the only one-nt mutation from GGT to yield Valine  
D225G (GAT->GGT): this is the only one-nt mutation from GAT to yield Glycine  
S219F (TCT->TTT): this is the only one-nt mutation from TCT to yield Phenylalanine  
S219Y (TCT->TAT): this is the only one-nt mutation from TCT to yield Tyrosine  
T203I (ACA->ATA): this is the only one-nt mutation from ACA to yield Isoleucine  
H156R (CAC->CGC): this is the only one-nt mutation from CAC to yield Arginine   
H156Q (CAC->CAA): !!! not the only route: CAC->CAG is also a one-nt change that yields Glutamine, but only CAC->CAA (C515A) is observed  
A138S (GCT->TCT): this is the only one-nt mutation from GCT to yield Serine  
N246H (AAC->CAC): this is the only one-nt mutation from AAC to yield Histidine  





In [208]:
# Print the full sequence stored in the alignment for A/Kansas/14/2017
print(str(aligned_fasta['A/Kansas/14/2017'].seq))

ATGAAGACTATCATTGCTTTGAGCTGCATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAACTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTCGTTGAACGAAACAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCATCCCTTAGATCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAAAAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGTGTTCACCACCCGGGTACGGACAAGGACCAAATCTCCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCAGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAGTGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCAAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCATGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAA

In [204]:
# Find the codon at a given HA site in the HongKong (plasmid and wildtype) and Perth HA plasmid sequences

hongkong_plasmid = Seq('ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAGCTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCCGTATGCTCAATCATCAGGAAGAATCATAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGAAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATAAGAAATGGAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA')

hongkong_wt = Seq('ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAGCTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACACATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGAAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATAAGAAATGGAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA')

perth_plasmid = Seq('ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCTCCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGGATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAGTATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAACTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTTCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA')



def print_codon(site):
    """Print the sequence found at amino acid `site` in each plasmid.

    The two HongKong lines show the codon alone (`nt_start(site)` to
    `nt_end(site)`); the Perth line shows the codon plus 10 nucleotides of
    context on either side.
    """
    codon_from, codon_to = nt_start(site), nt_end(site)

    # (label, sequence, number of flanking nt shown on each side)
    # NOTE(review): only Perth is printed with flanking context although its
    # label reads the same as the others -- confirm this is intentional.
    constructs = (
        ('HongKong plasmid at ', hongkong_plasmid, 0),
        ('HongKong wildtype plasmid at ', hongkong_wt, 0),
        ('Perth plasmid at ', perth_plasmid, 10),
    )

    for label, construct, flank in constructs:
        window = construct[codon_from - flank:codon_to + flank]
        print(label + str(site) + ' : ' + str(window))
    
# Show what each plasmid sequence carries at site 186
print_codon(186)



HongKong plasmid at 186 : GGT
HongKong wildtype plasmid at 186 : GGT
Perth plasmid at 186 : TCTCCACCCGGGTACGGACAAAG
