# Inspect the state of the exponential growth model class

Fit an example model that uses the local branching index (LBI) as its only predictor, and inspect the state of the model class at each step of the model's execution. This notebook exercises the model interface and its underlying functionality.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys

%matplotlib inline

# "huddlej" is a personal matplotlib style sheet, not one that ships with
# matplotlib. Apply it only when it is installed and otherwise keep the
# defaults, so the notebook still runs from a fresh kernel on another machine
# instead of raising on an unknown style name.
if "huddlej" in plt.style.available:
    plt.style.use("huddlej")

In [2]:
pwd

'/Users/jlhudd/projects/nextstrain/flu-forecasting/analyses'

In [3]:
# Put the project's source directory at the front of the module search path so
# the imports in the next cell resolve to the local code. The path is relative,
# so this relies on the kernel's working directory being `analyses/` (as shown
# by the `pwd` output above).
sys.path.insert(0, "../src")

In [4]:
from fit_model import ExponentialGrowthModel
from forecast.metrics import mean_absolute_error, sum_of_squared_errors

## Load data

Define the coefficient values to explore (currently a single value) and the model parameters, then load the tip attributes and the final clade frequencies.

In [5]:
# Initial coefficient array with a single entry.
# NOTE(review): `coefficients` is rebound to the fitted `model.coef_` further
# down, before it is first passed to the model, so this value is not the one
# used in the prediction steps.
coefficients = np.array([0.9])

In [6]:
# Value handed to the model as `delta_time`.
# NOTE(review): presumably the forecast horizon in years, since the final
# timepoints loaded below fall one year after their initial timepoints —
# confirm against fit_model.py.
delta_time = 1.0
# Value handed to the model as `l1_lambda`.
# NOTE(review): presumably the L1 regularization strength, with 0.0 disabling
# the penalty — confirm against fit_model.py.
l1_lambda = 0.0

In [7]:
# Most recent timepoint to keep: both the tip attributes (X) and the observed
# clade frequencies (y) are filtered to timepoints on or before this date.
last_timepoint = pd.to_datetime("2014-10-01")

In [8]:
# Load tip attributes.
# Read the standardized tip attributes table (tab-separated), parsing the
# "timepoint" column as datetimes.
tip_attributes_path = "../results/builds/h3n2/20_viruses_per_month/sample_0/2000-10-01--2015-10-01/standardized_tip_attributes.tsv"
X = pd.read_csv(tip_attributes_path, sep="\t", parse_dates=["timepoint"])

In [9]:
# Restrict the tip attributes to the columns of interest and take a copy so
# later changes do not touch the frame that was read from disk.
selected_columns = [
    "strain",
    "clade_membership",
    "timepoint",
    "cTiterSub_x",
    "lbi",
    "ep",
    "ne",
    "delta_frequency",
    "frequency",
]
X = X[selected_columns].copy()

In [10]:
# Keep only rows whose timepoint is on or before `last_timepoint`.
X = X.loc[X["timepoint"].le(last_timepoint)]

In [11]:
X.head()

Unnamed: 0,strain,clade_membership,timepoint,cTiterSub_x,lbi,ep,ne,delta_frequency,frequency
0,A/Aichi/181/2004,e5a7ac4,2004-10-01,-0.877902,-0.778522,9,6,-1.066069,0.003256
1,A/Aichi/183/2004,e5a7ac4,2004-10-01,-0.877902,-0.728971,10,6,-1.066069,0.003087
2,A/Anhui/789/2004,66684e0,2004-10-01,-0.877902,-1.088123,11,8,-0.844699,0.003449
3,A/Argentina/126/2004,e5a7ac4,2004-10-01,-0.879079,-0.628908,10,7,-1.066069,0.004187
4,A/Argentina/26/2004,2f64328,2004-10-01,-1.702271,-1.184077,7,8,-2.323949,0.002362


In [12]:
X.shape

(6578, 9)

In [13]:
# Load final clade tip frequencies.
# Read the final clade tip frequencies table (tab-separated), parsing both
# timepoint columns as datetimes.
final_clade_frequencies_path = "../results/builds/h3n2/20_viruses_per_month/sample_0/2000-10-01--2015-10-01/final_clade_frequencies.tsv"
final_clade_frequencies = pd.read_csv(
    final_clade_frequencies_path,
    sep="\t",
    parse_dates=["initial_timepoint", "final_timepoint"],
)

In [14]:
final_clade_frequencies.head()

Unnamed: 0,initial_timepoint,clade_membership,final_timepoint,strain,frequency
0,2004-10-01,e5a7ac4,2005-10-01,A/RioGrandeDoSul/290/2005,0.002219
1,2004-10-01,e5a7ac4,2005-10-01,A/Thailand/44/2005,7.4e-05
2,2004-10-01,66684e0,2005-10-01,,0.0
3,2004-10-01,2f64328,2005-10-01,,0.0
4,2004-10-01,8f3fc76,2005-10-01,A/Hanoi/ISBM16/2005,0.000778


In [15]:
# Sum the final tip frequencies within each (initial timepoint, clade) pair to
# get one observed frequency per clade per timepoint.
clade_keys = ["initial_timepoint", "clade_membership"]
frequency_by_clade = final_clade_frequencies.groupby(clade_keys)["frequency"]
y = frequency_by_clade.sum().reset_index()

In [16]:
# Name the key column "timepoint" so it matches the column name used in X.
y = y.rename({"initial_timepoint": "timepoint"}, axis="columns")

In [17]:
# Keep only rows whose timepoint is on or before `last_timepoint`.
y = y.loc[y["timepoint"].le(last_timepoint)]

In [18]:
y.head()

Unnamed: 0,timepoint,clade_membership,frequency
0,2004-10-01,2f64328,0.0
1,2004-10-01,66684e0,0.0
2,2004-10-01,84dfd90,0.000227
3,2004-10-01,8f3fc76,0.018687
4,2004-10-01,dffb0cc,0.0


In [19]:
# Sanity check: total observed clade frequency at each timepoint. The sums
# shown below are all close to 1 (the lowest is about 0.965).
y.groupby("timepoint")["frequency"].sum()

timepoint
2004-10-01    0.964599
2005-04-01    0.998802
2005-10-01    1.000009
2006-04-01    0.999998
2006-10-01    0.999997
2007-04-01    1.000000
2007-10-01    1.000002
2008-04-01    0.999994
2008-10-01    1.000001
2009-04-01    0.999998
2009-10-01    0.999994
2010-04-01    0.999995
2010-10-01    0.999990
2011-04-01    0.999991
2011-10-01    0.991572
2012-04-01    0.999686
2012-10-01    0.995549
2013-04-01    0.999946
2013-10-01    1.000004
2014-04-01    1.000002
2014-10-01    0.999980
Name: frequency, dtype: float64

## Inspect LBI model

In [20]:
# Build an exponential growth model that uses LBI as its only predictor and
# sum of squared errors as its cost function.
predictors = ["lbi"]
model_settings = dict(
    predictors=predictors,
    delta_time=delta_time,
    l1_lambda=l1_lambda,
    cost_function=sum_of_squared_errors,
)
model = ExponentialGrowthModel(**model_settings)

In [21]:
# Fit the model to the tip attributes (X) and observed clade frequencies (y).
# The returned value matches the final "Current function value" reported in
# the optimizer output below.
training_error = model.fit(X, y)

Optimization terminated successfully.
         Current function value: 4.812626
         Iterations: 15
         Function evaluations: 30


In [22]:
training_error

4.812626405870529

In [23]:
model.coef_

array([0.90168873])

In [32]:
# Use the fitted coefficients for the manual steps that follow. This rebinds
# the `coefficients` array defined near the top of the notebook.
# NOTE(review): this cell's execution count (32) is out of sequence with its
# neighbours; on a fresh Restart & Run All it runs here, before any cell that
# reads `coefficients`.
coefficients = model.coef_

In [24]:
model.cost_function

<function forecast.metrics.sum_of_squared_errors(observed, estimated, **kwargs)>

In [25]:
model.predictors

['lbi']

In [26]:
model.l1_lambda

0.0

In [27]:
model.get_fitnesses?

[0;31mSignature:[0m [0mmodel[0m[0;34m.[0m[0mget_fitnesses[0m[0;34m([0m[0mcoefficients[0m[0;34m,[0m [0mpredictors[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Apply the coefficients to the predictors and sum them to get strain
fitnesses.

Parameters
----------
coefficients : ndarray or list
    coefficients for given predictors

predictors : ndarray
    predictor values per sample (p x n matrix for p predictors and n samples)

Returns
-------
ndarray :
    fitnesses per sample
[0;31mFile:[0m      ~/projects/nextstrain/flu-forecasting/src/fit_model.py
[0;31mType:[0m      method


In [31]:
model._fit??

[0;31mSignature:[0m [0mmodel[0m[0;34m.[0m[0m_fit[0m[0;34m([0m[0mcoefficients[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0my[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0m_fit[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mcoefficients[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0my[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""Calculate the error between observed and estimated values for the given[0m
[0;34m        parameters and data.[0m
[0;34m[0m
[0;34m        Parameters[0m
[0;34m        ----------[0m
[0;34m        coefficients : ndarray[0m
[0;34m            coefficients for each of the model's predictors[0m
[0;34m[0m
[0;34m        X : pandas.DataFrame[0m
[0;34m            standardized tip attributes by timepoint[0m
[0;34m[0m
[0;34m        y : pandas.DataFrame[0m
[0;34m            final clade frequencies at delta time in the future from each[0m
[0;34m            timepoint in the given tip attribu

In [33]:
# Estimate final clade frequencies from the tip attributes using the
# explicitly supplied coefficients. Per the `predict` docstring, omitting the
# argument falls back to the model's currently defined coefficients.
y_hat = model.predict(X, coefficients)

In [35]:
y_hat.head()

Unnamed: 0,timepoint,clade_membership,frequency
0,2004-10-01,2f64328,0.019963
1,2004-10-01,66684e0,0.001376
2,2004-10-01,84dfd90,0.004968
3,2004-10-01,8f3fc76,0.130853
4,2004-10-01,dffb0cc,0.004295


In [36]:
model.predict??

[0;31mSignature:[0m [0mmodel[0m[0;34m.[0m[0mpredict[0m[0;34m([0m[0mX[0m[0;34m,[0m [0mcoefficients[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mpredict[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0mcoefficients[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""Calculate the estimate final frequencies of all clades in the given tip[0m
[0;34m        attributes data frame using previously calculated beta coefficients.[0m
[0;34m[0m
[0;34m        Parameters[0m
[0;34m        ----------[0m
[0;34m        X : pandas.DataFrame[0m
[0;34m            standardized tip attributes by timepoint[0m
[0;34m[0m
[0;34m        coefficients : ndarray[0m
[0;34m            optional coefficients to use for each of the model's predictors[0m
[0;34m            instead of the model's currently defined coefficients[0m
[0;34m[0m
[0;34m        Return

In [40]:
# Walk through the projection one timepoint at a time, printing the first ten
# values of each intermediate array so the model's state can be inspected.
# One data frame of per-clade projected frequencies is collected per timepoint.
# NOTE(review): the loop rebinds the notebook-level name `predictors` (the list
# of predictor names passed to the model constructor) to a NumPy array, and its
# loop variables remain defined after the cell finishes.
estimated_frequencies = []
for timepoint, timepoint_df in X.groupby("timepoint"):
    # Select predictors from the timepoint as a 2-D array (one row per record,
    # one column per model predictor).
    predictors = timepoint_df.loc[:, model.predictors].values
    print("Predictors: ", predictors[:10])

    # Select frequencies from timepoint; these are the starting frequencies
    # that get projected forward.
    initial_frequencies = timepoint_df["frequency"].values
    print("Initial freqs: ", initial_frequencies[:10])

    # Calculate fitnesses by applying the coefficients to the predictor values.
    fitnesses = model.get_fitnesses(coefficients, predictors)
    print("Fitnesses: ", fitnesses[:10])

    # Project frequencies forward by the model's delta time using the fitnesses.
    projected_frequencies = model.project_frequencies(
        initial_frequencies,
        fitnesses,
        model.delta_time
    )
    print("Projected freqs: ", projected_frequencies[:10])

    # Sum the estimated frequencies by clade. The projected values are attached
    # to a copy of the key columns so `timepoint_df` itself is not modified.
    projected_timepoint_df = timepoint_df[["timepoint", "clade_membership"]].copy()
    projected_timepoint_df["frequency"] = projected_frequencies
    projected_clade_frequencies = projected_timepoint_df.groupby([
        "timepoint",
        "clade_membership"
    ])["frequency"].sum().reset_index()

    estimated_frequencies.append(projected_clade_frequencies)

Predictors:  [[-0.77852224]
 [-0.72897056]
 [-1.088123  ]
 [-0.6289078 ]
 [-1.18407725]
 [-0.6289078 ]
 [-1.18407725]
 [-1.18407725]
 [-1.11549014]
 [-1.13882758]]
Initial freqs:  [3.256e-03 3.087e-03 3.449e-03 4.187e-03 2.362e-03 4.187e-03 2.362e-03
 2.362e-03 4.400e-05 9.150e-04]
Fitnesses:  [-0.70198473 -0.65730454 -0.98114825 -0.56707907 -1.06766911 -0.56707907
 -1.06766911 -1.06766911 -1.00582489 -1.026868  ]
Projected freqs:  [1.20196626e-03 1.19165042e-03 9.63078970e-04 1.76888510e-03
 6.04885349e-04 1.76888510e-03 6.04885349e-04 6.04885349e-04
 1.19868329e-05 2.44080995e-04]
Predictors:  [[-0.9782014 ]
 [-0.67314721]
 [-0.15639282]
 [ 0.46354076]
 [-0.35151089]
 [ 0.05354171]
 [-0.39817519]
 [-0.68081329]
 [-0.10792415]
 [ 0.28781015]]
Initial freqs:  [8.1000e-05 4.4470e-03 3.3800e-04 6.3210e-03 5.4060e-03 1.2449e-02
 1.2399e-02 6.1380e-03 6.0000e-04 1.1127e-02]
Fitnesses:  [-0.88203318 -0.60696926 -0.14101765  0.41796948 -0.31695341  0.04827796
 -0.35903008 -0.61388167 -0.0973