# Test mutation frequencies as targets

Test out mutation frequencies as an alternate target for the fitness model. First, load mutation frequencies calculated from a multiple sequence alignment (MSA) and the MSA itself. Prepare a table of mutations and their frequencies by strain.

Afterwards, load a table of strain (tip) attributes and use the tip timepoints, names, and frequencies to reweight the original mutation frequencies.

In [85]:
import Bio.AlignIO
from collections import defaultdict
import json
import pandas as pd

In [9]:
# Timepoint to analyze; it selects the build directory and labels every row of the mutation table.
timepoint = "2009-10-01"

In [73]:
# Gene names that appear as site prefixes in the mutation table (e.g. "SigPep:12", "HA1:144", "HA2:32").
# NOTE(review): not referenced again in the visible cells; the alignment loop uses the
# genes derived from the tracked sites instead -- confirm whether this is still needed.
genes = ["SigPep", "HA1", "HA2"]

In [10]:
# Directory prefix (with trailing slash) for this timepoint's HA segment files;
# file names are appended to it by plain string concatenation.
data_root = "../results/builds/h3n2/20_viruses_per_month/sample_0/2000-10-01--2015-10-01/timepoints/%s/segments/ha/" % timepoint

In [17]:
# Load the mutation frequencies JSON for this timepoint.
# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the platform's locale-dependent default encoding.
with open(data_root + "mutation_frequencies.json", "r", encoding="utf-8") as fh:
    mutation_frequencies = json.load(fh)

In [18]:
# Spot-check one entry: each mutation key maps to a list of frequencies
# (presumably one value per pivot -- TODO confirm).
mutation_frequencies["HA1:106A"]

[0.824686,
 0.74245,
 0.400488,
 0.212494,
 0.451428,
 0.749903,
 0.997728,
 0.998199,
 0.990296,
 0.957388,
 0.954165,
 0.983087,
 0.984669,
 0.955469,
 0.990321,
 0.982236,
 0.9999,
 0.999892,
 0.99968]

In [19]:
# Compare with another residue at the same site (HA1:106) to see how the two trajectories relate.
mutation_frequencies["HA1:106V"]

[0.175314,
 0.25755,
 0.599512,
 0.787506,
 0.543099,
 0.247582,
 0.00224,
 0.001764,
 0.009434,
 0.041013,
 0.043119,
 0.015264,
 0.012998,
 0.035105,
 0.006788,
 0.011065,
 6.2e-05,
 6.7e-05,
 0.000199]

In [31]:
# Build one row per mutation key (e.g. "HA1:106A"): everything before the last
# character is the site, the last character is the residue, and the final entry
# of the frequency list is used as the frequency at this timepoint.
mutation_records = []
for mutation, frequencies in mutation_frequencies.items():
    # The "pivots" entry and any key ending in "counts" are skipped.
    if mutation == "pivots" or mutation.endswith("counts"):
        continue

    mutation_records.append({
        "site": mutation[:-1],
        "residue": mutation[-1],
        "frequency": frequencies[-1],
        "timepoint": timepoint,
    })

mut_df = pd.DataFrame(mutation_records)

In [32]:
# Preview the first rows of the mutation table.
mut_df.head(10)

Unnamed: 0,frequency,residue,site,timepoint
0,1.0,P,HA1:103,2009-10-01
1,0.0,Q,HA1:103,2009-10-01
2,0.0,S,HA1:103,2009-10-01
3,0.999879,D,HA1:104,2009-10-01
4,0.000121,E,HA1:104,2009-10-01
5,0.002354,H,HA1:105,2009-10-01
6,0.997646,Y,HA1:105,2009-10-01
7,0.99968,A,HA1:106,2009-10-01
8,3.3e-05,S,HA1:106,2009-10-01
9,8.8e-05,T,HA1:106,2009-10-01


In [33]:
# Count rows whose residue is a lowercase "r".
# NOTE(review): presumably a sanity check that no non-mutation key slipped
# through the filter used to build the table -- confirm what it guards against.
(mut_df["residue"] == "r").sum()

0

In [34]:
# Tally how many (site, residue) rows exist for each residue character.
mut_df["residue"].value_counts()

I    82
S    76
K    69
N    67
V    67
R    59
T    57
L    50
D    43
A    39
G    39
E    34
F    29
Q    27
M    25
H    23
Y    23
P    22
C    10
W     5
*     1
Name: residue, dtype: int64

In [36]:
# Number of rows per site, i.e. how many different residues are listed for each site.
mut_df["site"].value_counts()

HA1:140      6
HA2:32       6
HA1:188      6
HA1:278      6
HA1:144      6
HA1:92       6
HA1:173      5
HA1:229      5
SigPep:12    5
HA1:121      5
SigPep:9     5
HA1:3        5
HA1:304      5
SigPep:10    5
HA1:10       5
HA1:312      5
HA1:25       5
HA1:5        5
HA1:50       5
HA2:147      5
HA1:198      4
HA1:192      4
HA1:7        4
HA2:161      4
HA1:165      4
SigPep:2     4
HA2:97       4
HA1:145      4
SigPep:15    4
HA1:264      4
            ..
HA1:175      2
HA2:87       2
HA2:203      2
HA1:20       2
HA1:174      2
HA1:227      2
HA1:285      2
HA1:2        2
HA2:139      2
HA1:63       2
HA2:80       2
HA2:65       2
HA2:182      2
HA2:24       2
HA2:138      2
HA1:205      2
HA1:265      2
HA1:160      2
HA1:65       2
HA1:196      2
HA2:145      2
HA1:59       2
HA1:163      2
HA1:239      2
HA1:203      2
HA1:226      2
HA2:211      2
HA2:149      2
HA2:176      2
HA1:220      2
Name: site, Length: 303, dtype: int64

In [59]:
# Highest residue frequency at each site, flattened back to site/frequency columns.
max_frequency_by_site = (
    mut_df
    .groupby("site")["frequency"]
    .max()
    .reset_index()
)

In [67]:
# Spot-check the maximum residue frequency for a single site.
max_frequency_by_site[max_frequency_by_site["site"] == "HA2:181"]

Unnamed: 0,site,frequency
230,HA2:181,0.984476


In [68]:
# Track only sites where no single residue has reached 95% frequency.
is_below_threshold = max_frequency_by_site["frequency"] < 0.95
sites_to_track = max_frequency_by_site.loc[is_below_threshold, "site"].values

In [69]:
# Show the sites selected for tracking.
sites_to_track

array(['HA1:144', 'HA1:162', 'HA1:212', 'HA1:213', 'HA1:261', 'HA1:62'],
      dtype=object)

In [70]:
# Number of sites selected for tracking.
sites_to_track.shape

(6,)

In [153]:
# Keep the mutation rows for tracked sites only; copy so later edits cannot touch mut_df.
is_tracked_site = mut_df["site"].isin(sites_to_track)
muts_to_track_df = mut_df.loc[is_tracked_site].copy()

In [154]:
# Inspect the mutations (site, residue, frequency) at the tracked sites.
muts_to_track_df

Unnamed: 0,frequency,residue,site,timepoint
71,6.2e-05,D,HA1:144,2009-10-01
72,1.1e-05,G,HA1:144,2009-10-01
73,6e-06,I,HA1:144,2009-10-01
74,0.620483,K,HA1:144,2009-10-01
75,0.379393,N,HA1:144,2009-10-01
76,4.4e-05,S,HA1:144,2009-10-01
108,0.913649,P,HA1:162,2009-10-01
109,0.07223,Q,HA1:162,2009-10-01
110,0.014121,S,HA1:162,2009-10-01
224,0.346921,A,HA1:212,2009-10-01


In [155]:
# Re-display the tracked sites before splitting them into gene and position.
sites_to_track

array(['HA1:144', 'HA1:162', 'HA1:212', 'HA1:213', 'HA1:261', 'HA1:62'],
      dtype=object)

In [156]:
# Unique gene names (the part of each site label before the colon), in sorted order.
genes_to_tracks = sorted({site.split(":")[0] for site in sites_to_track})

In [157]:
# Genes that contain at least one tracked site.
genes_to_tracks

['HA1']

In [158]:
# Map each gene name to the integer positions of its tracked sites.
positions_per_gene = defaultdict(list)

for tracked_site in sites_to_track:
    # Site labels look like "<gene>:<position>".
    gene_name, site_position = tracked_site.split(":")
    positions_per_gene[gene_name].append(int(site_position))

In [159]:
# Inspect the tracked positions grouped by gene.
positions_per_gene

defaultdict(list, {'HA1': [144, 162, 212, 213, 261, 62]})

In [160]:
# Collect, for every non-NODE record in each gene's amino acid alignment, the
# residue it carries at each tracked position of that gene.
samples_and_sites = []

for gene in genes_to_tracks:
    # Read the whole alignment first; the records are processed after the file is closed.
    with open(data_root + "aa-seq_%s.fasta" % gene, "r") as fh:
        alignment = Bio.AlignIO.read(fh, "fasta")

    tracked_positions = positions_per_gene[gene]

    for record in alignment:
        # Records whose id starts with "NODE" are internal nodes, not samples.
        if record.id.startswith("NODE"):
            continue

        # Positions are 1-based, so shift by one to index into the sequence.
        samples_and_sites.extend(
            {
                "site": "%s:%s" % (gene, position),
                "residue": record.seq[position - 1],
                "sample": record.id
            }
            for position in tracked_positions
        )

samples_and_sites_df = pd.DataFrame(samples_and_sites)

In [161]:
# One row per (sample, tracked site) pair.
samples_and_sites_df.shape

(8100, 3)

In [162]:
# Preview the residue each sample carries at each tracked site.
samples_and_sites_df.head(10)

Unnamed: 0,residue,sample,site
0,N,A/Canterbury/96/2000,HA1:144
1,P,A/Canterbury/96/2000,HA1:162
2,T,A/Canterbury/96/2000,HA1:212
3,V,A/Canterbury/96/2000,HA1:213
4,R,A/Canterbury/96/2000,HA1:261
5,E,A/Canterbury/96/2000,HA1:62
6,N,A/Canterbury/99/2000,HA1:144
7,P,A/Canterbury/99/2000,HA1:162
8,T,A/Canterbury/99/2000,HA1:212
9,V,A/Canterbury/99/2000,HA1:213


In [163]:
# Count the samples carrying each residue at each tracked site.
number_of_samples_by_site_and_residue_df = (
    samples_and_sites_df
    .groupby(["site", "residue"])["sample"]
    .count()
    .reset_index(name="number_of_samples")
)

In [164]:
# Inspect the number of samples per site and residue.
number_of_samples_by_site_and_residue_df

Unnamed: 0,site,residue,number_of_samples
0,HA1:144,D,133
1,HA1:144,G,1
2,HA1:144,I,2
3,HA1:144,K,96
4,HA1:144,N,1115
5,HA1:144,S,3
6,HA1:162,P,1338
7,HA1:162,Q,10
8,HA1:162,S,2
9,HA1:212,A,41


In [165]:
# Preview the tracked mutations before joining them to samples.
muts_to_track_df.head()

Unnamed: 0,frequency,residue,site,timepoint
71,6.2e-05,D,HA1:144,2009-10-01
72,1.1e-05,G,HA1:144,2009-10-01
73,6e-06,I,HA1:144,2009-10-01
74,0.620483,K,HA1:144,2009-10-01
75,0.379393,N,HA1:144,2009-10-01


In [166]:
#muts_to_track_df = muts_to_track_df.merge(number_of_samples_by_site_and_residue_df, on=["site", "residue"])

In [167]:
#muts_to_track_df["frequency_per_sample"] = muts_to_track_df["frequency"] / muts_to_track_df["number_of_samples"]

In [168]:
# Confirm the tracked mutation table is unchanged (the per-sample frequency steps are commented out).
muts_to_track_df.head()

Unnamed: 0,frequency,residue,site,timepoint
71,6.2e-05,D,HA1:144,2009-10-01
72,1.1e-05,G,HA1:144,2009-10-01
73,6e-06,I,HA1:144,2009-10-01
74,0.620483,K,HA1:144,2009-10-01
75,0.379393,N,HA1:144,2009-10-01


In [169]:
# Attach each tracked mutation's frequency to every sample that carries that
# residue at that site (inner join on site and residue).
mut_freqs_per_sample = pd.merge(
    muts_to_track_df,
    samples_and_sites_df,
    on=["site", "residue"],
)

In [170]:
# Preview the mutation-by-sample table.
mut_freqs_per_sample.head()

Unnamed: 0,frequency,residue,site,timepoint,sample
0,6.2e-05,D,HA1:144,2009-10-01,A/NewYork/197/2003
1,6.2e-05,D,HA1:144,2009-10-01,A/NewYork/193/2003
2,6.2e-05,D,HA1:144,2009-10-01,A/Queensland/12/2001
3,6.2e-05,D,HA1:144,2009-10-01,A/Queensland/17/2001
4,6.2e-05,D,HA1:144,2009-10-01,A/NewYork/401/2001


In [172]:
#mut_freqs_per_sample.groupby(["site", "residue"]).aggregate({"frequency": "first", "frequency_per_sample": "sum"})

In [136]:
#mut_freqs_per_sample = mut_freqs_per_sample.drop(columns=["frequency", "number_of_samples"]).rename(columns={"frequency_per_sample": "frequency"}).copy()

In [173]:
# Rebuild the full mutation label (e.g. "HA1:144D") by appending the residue to the site.
site_labels = mut_freqs_per_sample["site"]
residue_labels = mut_freqs_per_sample["residue"]
mut_freqs_per_sample["mutation"] = site_labels + residue_labels

In [174]:
# Check the new mutation label column.
mut_freqs_per_sample.head()

Unnamed: 0,frequency,residue,site,timepoint,sample,mutation
0,6.2e-05,D,HA1:144,2009-10-01,A/NewYork/197/2003,HA1:144D
1,6.2e-05,D,HA1:144,2009-10-01,A/NewYork/193/2003,HA1:144D
2,6.2e-05,D,HA1:144,2009-10-01,A/Queensland/12/2001,HA1:144D
3,6.2e-05,D,HA1:144,2009-10-01,A/Queensland/17/2001,HA1:144D
4,6.2e-05,D,HA1:144,2009-10-01,A/NewYork/401/2001,HA1:144D


## Scale mutation frequencies by sample frequencies

Although the data frame above already pairs each sample with the frequency of the mutation it carries at each tracked site, we want a generic function that reweights the frequencies in these frames by any collection of per-sample weights (usually sample frequencies). For example, we will want to know the initial frequency of each mutation at a timepoint, calculated by scaling with the frequencies of the samples that carry that mutation. Then we want to calculate the rescaled frequencies for the same mutations after projecting the sample frequencies into the future.

Below is an attempt to design this generic function.

In [141]:
# Read the tip attributes for this timepoint, keeping only the strain name,
# timepoint, and per-strain frequency columns.
tip_attribute_columns = ["strain", "timepoint", "frequency"]
samples_df = pd.read_csv(
    data_root + "tip_attributes.tsv",
    sep="\t",
    usecols=tip_attribute_columns,
)

In [142]:
# Preview the per-strain frequencies.
samples_df.head()

Unnamed: 0,strain,timepoint,frequency
0,A/Alaska/4/2008,2009-10-01,0.0
1,A/Albania/250/2009,2009-10-01,0.000933
2,A/Anguilla/4711/2009,2009-10-01,0.00711
3,A/Argentina/305/2007,2009-10-01,0.0
4,A/Argentina/3689/2007,2009-10-01,0.0


In [143]:
# Number of strains (rows) with tip attributes at this timepoint.
samples_df.shape

(1350, 3)

In [145]:
# Sanity check: strain frequencies should sum to approximately 1 within each timepoint.
samples_df.groupby("timepoint")["frequency"].sum()

timepoint
2009-10-01    1.000008
Name: frequency, dtype: float64

In [175]:
# Recall the mutation-by-sample table that will be joined to the strain frequencies.
mut_freqs_per_sample.head()

Unnamed: 0,frequency,residue,site,timepoint,sample,mutation
0,6.2e-05,D,HA1:144,2009-10-01,A/NewYork/197/2003,HA1:144D
1,6.2e-05,D,HA1:144,2009-10-01,A/NewYork/193/2003,HA1:144D
2,6.2e-05,D,HA1:144,2009-10-01,A/Queensland/12/2001,HA1:144D
3,6.2e-05,D,HA1:144,2009-10-01,A/Queensland/17/2001,HA1:144D
4,6.2e-05,D,HA1:144,2009-10-01,A/NewYork/401/2001,HA1:144D


In [178]:
# Row count before the join, used to check that the join does not drop or duplicate rows.
mut_freqs_per_sample.shape

(8100, 6)

In [179]:
# Attach each sample's own frequency from the tip attributes, matching on
# timepoint and sample/strain name. Both tables have a "frequency" column, so
# the tip-attribute one is suffixed and becomes "frequency_strain".
mut_freqs_with_sample_freqs = pd.merge(
    mut_freqs_per_sample,
    samples_df,
    how="inner",
    left_on=["timepoint", "sample"],
    right_on=["timepoint", "strain"],
    suffixes=("", "_strain"),
)

In [184]:
# The row count should match the pre-join table if every sample has tip attributes.
mut_freqs_with_sample_freqs.shape

(8100, 9)

In [180]:
# Preview the joined table of mutation frequencies and strain frequencies.
mut_freqs_with_sample_freqs.head()

Unnamed: 0,frequency,residue,site,timepoint,sample,mutation,strain,frequency_strain
0,6.2e-05,D,HA1:144,2009-10-01,A/NewYork/197/2003,HA1:144D,A/NewYork/197/2003,0.0
1,0.913649,P,HA1:162,2009-10-01,A/NewYork/197/2003,HA1:162P,A/NewYork/197/2003,0.0
2,0.653044,T,HA1:212,2009-10-01,A/NewYork/197/2003,HA1:212T,A/NewYork/197/2003,0.0
3,0.656256,V,HA1:213,2009-10-01,A/NewYork/197/2003,HA1:213V,A/NewYork/197/2003,0.0
4,0.871628,R,HA1:261,2009-10-01,A/NewYork/197/2003,HA1:261R,A/NewYork/197/2003,0.0


In [181]:
# Weight each row's mutation frequency by the frequency of the strain carrying it.
# NOTE(review): summed by site, this product falls well short of 1 (roughly
# 0.51-0.84 in the recorded output), whereas the strain frequencies alone sum to
# ~1 per site -- confirm that this product is the intended reweighting.
mut_freqs_with_sample_freqs["scaled_frequency"] = (
    mut_freqs_with_sample_freqs["frequency_strain"] * mut_freqs_with_sample_freqs["frequency"]
)

In [183]:
# Sum the mutation frequencies over all sample rows per site. Each mutation's
# frequency is repeated once per sample carrying it, so these totals far exceed 1.
mut_freqs_with_sample_freqs.groupby(["timepoint", "site"])["frequency"].sum()

timepoint   site   
2009-10-01  HA1:144     482.597964
            HA1:162    1223.212904
            HA1:212     867.752304
            HA1:213     872.820096
            HA1:261    1140.635850
            HA1:62      532.492156
Name: frequency, dtype: float64

In [185]:
# Sanity check: the strain frequencies attached to each site's rows should total approximately 1.
mut_freqs_with_sample_freqs.groupby(["timepoint", "site"])["frequency_strain"].sum()

timepoint   site   
2009-10-01  HA1:144    1.000008
            HA1:162    1.000008
            HA1:212    1.000008
            HA1:213    1.000008
            HA1:261    1.000008
            HA1:62     1.000008
Name: frequency_strain, dtype: float64

In [182]:
# Per-site totals of the scaled (mutation frequency x strain frequency) values.
mut_freqs_with_sample_freqs.groupby(["timepoint", "site"])["scaled_frequency"].sum()

timepoint   site   
2009-10-01  HA1:144    0.508994
            HA1:162    0.844299
            HA1:212    0.577045
            HA1:213    0.580045
            HA1:261    0.769685
            HA1:62     0.507551
Name: scaled_frequency, dtype: float64

In [188]:
# Collapse to one row per distinct (site, residue, frequency) and confirm the
# original mutation frequencies sum to approximately 1 at each site.
mut_freqs_with_sample_freqs.loc[:, ["site", "residue", "frequency"]].drop_duplicates().groupby("site")["frequency"].sum()

site
HA1:144    0.999999
HA1:162    1.000000
HA1:212    1.000000
HA1:213    1.000000
HA1:261    1.000000
HA1:62     1.000000
Name: frequency, dtype: float64