# Test mutation frequencies as targets

Test out mutation frequencies as an alternate target for the fitness model. First, load mutation frequencies calculated from a multiple sequence alignment (MSA) and the MSA itself. Prepare a table of mutations and their frequencies by strain.

Afterwards, load a table of strain (tip) attributes and use the tip timepoints, names, and frequencies to reweight the original mutation frequencies.

In [85]:
import Bio.AlignIO
from collections import defaultdict
import json
import pandas as pd

In [9]:
# Timepoint label used to select the build directory and to tag each mutation row.
timepoint = "2009-10-01"

In [73]:
# Gene names matching the prefixes of the mutation site identifiers (e.g. "HA1:144").
# NOTE(review): not referenced by the other cells shown here; confirm whether it is still needed.
genes = ["SigPep", "HA1", "HA2"]

In [10]:
# Directory prefix for the HA segment results of the selected timepoint;
# file names are appended directly to this string.
data_root = (
    "../results/builds/h3n2/20_viruses_per_month/sample_0/2000-10-01--2015-10-01/timepoints/%s/segments/ha/"
    % timepoint
)

In [17]:
# Parse the mutation frequencies JSON for this timepoint into a dictionary.
with open(data_root + "mutation_frequencies.json", "r") as fh:
    mutation_frequencies = json.loads(fh.read())

In [18]:
mutation_frequencies["HA1:106A"]

[0.824686,
 0.74245,
 0.400488,
 0.212494,
 0.451428,
 0.749903,
 0.997728,
 0.998199,
 0.990296,
 0.957388,
 0.954165,
 0.983087,
 0.984669,
 0.955469,
 0.990321,
 0.982236,
 0.9999,
 0.999892,
 0.99968]

In [19]:
mutation_frequencies["HA1:106V"]

[0.175314,
 0.25755,
 0.599512,
 0.787506,
 0.543099,
 0.247582,
 0.00224,
 0.001764,
 0.009434,
 0.041013,
 0.043119,
 0.015264,
 0.012998,
 0.035105,
 0.006788,
 0.011065,
 6.2e-05,
 6.7e-05,
 0.000199]

In [31]:
# Build one row per mutation using the last value of its frequency series.
mutation_records = []
for mutation, frequencies in mutation_frequencies.items():
    # Skip entries that are not mutations: the pivots and any key ending in "counts".
    if mutation == "pivots" or mutation.endswith("counts"):
        continue

    mutation_records.append({
        # The key is the site name followed by a one-letter residue.
        "site": mutation[:-1],
        "residue": mutation[-1],
        "frequency": frequencies[-1],
        "timepoint": timepoint,
    })

mut_df = pd.DataFrame(mutation_records)

In [32]:
mut_df.head(10)

Unnamed: 0,frequency,residue,site,timepoint
0,1.0,P,HA1:103,2009-10-01
1,0.0,Q,HA1:103,2009-10-01
2,0.0,S,HA1:103,2009-10-01
3,0.999879,D,HA1:104,2009-10-01
4,0.000121,E,HA1:104,2009-10-01
5,0.002354,H,HA1:105,2009-10-01
6,0.997646,Y,HA1:105,2009-10-01
7,0.99968,A,HA1:106,2009-10-01
8,3.3e-05,S,HA1:106,2009-10-01
9,8.8e-05,T,HA1:106,2009-10-01


In [33]:
(mut_df["residue"] == "r").sum()

0

In [34]:
mut_df["residue"].value_counts()

I    82
S    76
K    69
N    67
V    67
R    59
T    57
L    50
D    43
A    39
G    39
E    34
F    29
Q    27
M    25
H    23
Y    23
P    22
C    10
W     5
*     1
Name: residue, dtype: int64

In [36]:
mut_df["site"].value_counts()

HA1:140      6
HA2:32       6
HA1:188      6
HA1:278      6
HA1:144      6
HA1:92       6
HA1:173      5
HA1:229      5
SigPep:12    5
HA1:121      5
SigPep:9     5
HA1:3        5
HA1:304      5
SigPep:10    5
HA1:10       5
HA1:312      5
HA1:25       5
HA1:5        5
HA1:50       5
HA2:147      5
HA1:198      4
HA1:192      4
HA1:7        4
HA2:161      4
HA1:165      4
SigPep:2     4
HA2:97       4
HA1:145      4
SigPep:15    4
HA1:264      4
            ..
HA1:175      2
HA2:87       2
HA2:203      2
HA1:20       2
HA1:174      2
HA1:227      2
HA1:285      2
HA1:2        2
HA2:139      2
HA1:63       2
HA2:80       2
HA2:65       2
HA2:182      2
HA2:24       2
HA2:138      2
HA1:205      2
HA1:265      2
HA1:160      2
HA1:65       2
HA1:196      2
HA2:145      2
HA1:59       2
HA1:163      2
HA1:239      2
HA1:203      2
HA1:226      2
HA2:211      2
HA2:149      2
HA2:176      2
HA1:220      2
Name: site, Length: 303, dtype: int64

In [59]:
# Highest frequency observed for any residue at each site.
max_frequency_by_site = (
    mut_df
    .groupby("site")["frequency"]
    .max()
    .reset_index()
)

In [67]:
max_frequency_by_site[max_frequency_by_site["site"] == "HA2:181"]

Unnamed: 0,site,frequency
230,HA2:181,0.984476


In [68]:
# Keep only the sites where no single residue has a frequency of 0.95 or more.
sites_to_track = max_frequency_by_site[
    max_frequency_by_site["frequency"] < 0.95
]["site"].values

In [69]:
sites_to_track

array(['HA1:144', 'HA1:162', 'HA1:212', 'HA1:213', 'HA1:261', 'HA1:62'],
      dtype=object)

In [70]:
sites_to_track.shape

(6,)

In [71]:
# Take an independent copy of the rows for tracked sites so that columns
# added later do not modify mut_df.
muts_to_track_df = mut_df.loc[mut_df["site"].isin(sites_to_track)].copy()

In [72]:
muts_to_track_df

Unnamed: 0,frequency,residue,site,timepoint
71,6.2e-05,D,HA1:144,2009-10-01
72,1.1e-05,G,HA1:144,2009-10-01
73,6e-06,I,HA1:144,2009-10-01
74,0.620483,K,HA1:144,2009-10-01
75,0.379393,N,HA1:144,2009-10-01
76,4.4e-05,S,HA1:144,2009-10-01
108,0.913649,P,HA1:162,2009-10-01
109,0.07223,Q,HA1:162,2009-10-01
110,0.014121,S,HA1:162,2009-10-01
224,0.346921,A,HA1:212,2009-10-01


In [78]:
sites_to_track

array(['HA1:144', 'HA1:162', 'HA1:212', 'HA1:213', 'HA1:261', 'HA1:62'],
      dtype=object)

In [79]:
# Distinct gene names among the tracked sites, in sorted order.
genes_to_tracks = sorted({site.split(":")[0] for site in sites_to_track})

In [80]:
genes_to_tracks

['HA1']

In [86]:
# Map each gene name to the integer positions of its tracked sites.
positions_per_gene = defaultdict(list)

# Each site name has the form "<gene>:<position>".
for gene, position in (site.split(":") for site in sites_to_track):
    positions_per_gene[gene].append(int(position))

In [87]:
positions_per_gene

defaultdict(list, {'HA1': [144, 162, 212, 213, 261, 62]})

In [90]:
# Collect one record per (sample, tracked site) with the residue that sample
# carries at that site.
samples_and_sites = []

for gene in genes_to_tracks:
    # Read the full amino acid alignment for this gene, then release the file.
    with open(data_root + "aa-seq_%s.fasta" % gene, "r") as fh:
        alignment = Bio.AlignIO.read(fh, "fasta")

    for record in alignment:
        # Internal nodes (ids starting with "NODE") are not samples.
        if not record.id.startswith("NODE"):
            for position in positions_per_gene[gene]:
                # Site positions are shifted by one to index into the sequence.
                residue = record.seq[position - 1]
                samples_and_sites.append({
                    "site": "%s:%s" % (gene, position),
                    "residue": residue,
                    "sample": record.id,
                })

samples_and_sites_df = pd.DataFrame(samples_and_sites)

In [91]:
samples_and_sites_df.shape

(8100, 3)

In [95]:
samples_and_sites_df.head(10)

Unnamed: 0,residue,sample,site
0,N,A/Canterbury/96/2000,HA1:144
1,P,A/Canterbury/96/2000,HA1:162
2,T,A/Canterbury/96/2000,HA1:212
3,V,A/Canterbury/96/2000,HA1:213
4,R,A/Canterbury/96/2000,HA1:261
5,E,A/Canterbury/96/2000,HA1:62
6,N,A/Canterbury/99/2000,HA1:144
7,P,A/Canterbury/99/2000,HA1:162
8,T,A/Canterbury/99/2000,HA1:212
9,V,A/Canterbury/99/2000,HA1:213


In [112]:
# Count how many samples carry each residue at each tracked site.
number_of_samples_by_site_and_residue_df = (
    samples_and_sites_df
    .groupby(["site", "residue"])
    .count()
    .rename(columns={"sample": "number_of_samples"})
    .reset_index()
)

In [124]:
number_of_samples_by_site_and_residue_df

Unnamed: 0,site,residue,number_of_samples
0,HA1:144,D,133
1,HA1:144,G,1
2,HA1:144,I,2
3,HA1:144,K,96
4,HA1:144,N,1115
5,HA1:144,S,3
6,HA1:162,P,1338
7,HA1:162,Q,10
8,HA1:162,S,2
9,HA1:212,A,41


In [96]:
muts_to_track_df.head()

Unnamed: 0,frequency,residue,site,timepoint
71,6.2e-05,D,HA1:144,2009-10-01
72,1.1e-05,G,HA1:144,2009-10-01
73,6e-06,I,HA1:144,2009-10-01
74,0.620483,K,HA1:144,2009-10-01
75,0.379393,N,HA1:144,2009-10-01


In [118]:
# Attach the per-(site, residue) sample counts to the tracked mutations.
# The default inner join keeps only mutations that have a matching count.
muts_to_track_df = pd.merge(
    muts_to_track_df,
    number_of_samples_by_site_and_residue_df,
    on=["site", "residue"],
)

In [119]:
# Divide each mutation's frequency evenly among the samples that carry it.
muts_to_track_df["frequency_per_sample"] = muts_to_track_df["frequency"].div(
    muts_to_track_df["number_of_samples"]
)

In [120]:
muts_to_track_df.head()

Unnamed: 0,frequency,residue,site,timepoint,frequency_per_sample,number_of_samples
0,6.2e-05,D,HA1:144,2009-10-01,4.661654e-07,133
1,1.1e-05,G,HA1:144,2009-10-01,1.1e-05,1
2,6e-06,I,HA1:144,2009-10-01,3e-06,2
3,0.620483,K,HA1:144,2009-10-01,0.006463365,96
4,0.379393,N,HA1:144,2009-10-01,0.0003402628,1115


In [133]:
# Expand to one row per sample by matching each sample's residue at each
# tracked site to the corresponding mutation.
mut_freqs_per_sample = pd.merge(
    muts_to_track_df,
    samples_and_sites_df,
    on=["site", "residue"],
)

In [134]:
mut_freqs_per_sample.head()

Unnamed: 0,frequency,residue,site,timepoint,frequency_per_sample,number_of_samples,sample
0,6.2e-05,D,HA1:144,2009-10-01,4.661654e-07,133,A/NewYork/197/2003
1,6.2e-05,D,HA1:144,2009-10-01,4.661654e-07,133,A/NewYork/193/2003
2,6.2e-05,D,HA1:144,2009-10-01,4.661654e-07,133,A/Queensland/12/2001
3,6.2e-05,D,HA1:144,2009-10-01,4.661654e-07,133,A/Queensland/17/2001
4,6.2e-05,D,HA1:144,2009-10-01,4.661654e-07,133,A/NewYork/401/2001


In [135]:
# Sanity check: summing the per-sample frequencies of each mutation should
# give back its original frequency.
(
    mut_freqs_per_sample
    .groupby(["site", "residue"])
    .aggregate({"frequency": "first", "frequency_per_sample": "sum"})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,frequency,frequency_per_sample
site,residue,Unnamed: 2_level_1,Unnamed: 3_level_1
HA1:144,D,6.2e-05,6.2e-05
HA1:144,G,1.1e-05,1.1e-05
HA1:144,I,6e-06,6e-06
HA1:144,K,0.620483,0.620483
HA1:144,N,0.379393,0.379393
HA1:144,S,4.4e-05,4.4e-05
HA1:162,P,0.913649,0.913649
HA1:162,Q,0.07223,0.07223
HA1:162,S,0.014121,0.014121
HA1:212,A,0.346921,0.346921


In [136]:
# Replace the mutation-level frequency with the per-sample frequency and
# drop the helper count column.
mut_freqs_per_sample = (
    mut_freqs_per_sample
    .drop(columns=["frequency", "number_of_samples"])
    .rename(columns={"frequency_per_sample": "frequency"})
    .copy()
)

In [137]:
# Label each row with its mutation name: the site followed by the residue.
mut_freqs_per_sample["mutation"] = mut_freqs_per_sample["site"].str.cat(
    mut_freqs_per_sample["residue"]
)

In [138]:
mut_freqs_per_sample.head()

Unnamed: 0,residue,site,timepoint,frequency,sample,mutation
0,D,HA1:144,2009-10-01,4.661654e-07,A/NewYork/197/2003,HA1:144D
1,D,HA1:144,2009-10-01,4.661654e-07,A/NewYork/193/2003,HA1:144D
2,D,HA1:144,2009-10-01,4.661654e-07,A/Queensland/12/2001,HA1:144D
3,D,HA1:144,2009-10-01,4.661654e-07,A/Queensland/17/2001,HA1:144D
4,D,HA1:144,2009-10-01,4.661654e-07,A/NewYork/401/2001,HA1:144D
