In [1]:
import sys
sys.path.append('../')
import joblib

from scipy.signal import savgol_filter

from Modules.Utils.Imports import *
from Modules.Utils.DRUMSLasso import *
from Modules.Utils.GetLowestGPU import *
import Modules.Loaders.DataFormatter as DF
from Modules.Models.BuildBINNs import AdaMaskBINNCovasim
from Modules.Models.BuildBINNs import chi
from Modules.Utils.ModelWrapper import ModelWrapper

from Notebooks.utils import get_case_name
from queue import PriorityQueue

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Torch device used by every tensor in this notebook. The project helper
# GetLowestGPU is asked to choose among GPU indices 0-3.
# NOTE(review): the recorded output of this cell reads "Device set to cpu", so the
# helper can evidently resolve to the CPU as well — confirm its fallback rules.
device = torch.device(GetLowestGPU(pick_from=[0,1,2,3]))
# helper functions
def to_torch(x):
    """Turn a NumPy array into a float tensor placed on the notebook's `device`."""
    as_tensor = torch.from_numpy(x)
    as_float = as_tensor.float()
    return as_float.to(device)
def to_numpy(x):
    """Detach a tensor from the autograd graph, move it to the CPU and return it as a NumPy array."""
    detached = x.detach()
    on_cpu = detached.cpu()
    return on_cpu.numpy()

Device set to cpu


In [3]:
# instantiate BINN model parameters and path
path = '../Data/covasim_data/drums_data/'  # data folder handed to DF.load_covasim_data

# Simulation settings. They are passed to get_case_name / DF.load_covasim_data and so
# decide which data file and which saved model this notebook works with.
population = int(500e3)
test_prob = 0.1
trace_prob = 0.3
keep_d = True
retrain = False  # True: '_retrain' is appended to the weights file name before loading
dynamic = True
masking = 0  # 0: no suffix; 1/2/3 append '_maskingdem'/'_maskinguni'/'_maskingnorm' to case_name
multiple = True  # True: the data file name carries an extra '_<n_runs>' suffix
parallelb = True  # with multiple=True: the loaded data is treated as already normalized
n_runs = 64
chi_type = 'piecewise'

# model parameters
maskb = False  # selects the 'mask' vs 'no_mask' model folder and is forwarded to the BINN
masking_learned = False  # with maskb=True: 'learned_masking' vs 'observed_masking' folder

# Base name shared by the data file, the weights file and the saved coefficient files.
case_name = get_case_name(population, test_prob, trace_prob, keep_d, dynamic=dynamic, chi_type=chi_type)

In [4]:
# Masking scenarios 1-3 each add their own suffix to the case name; any other
# value (including 0) leaves the name unchanged.
masking_suffix = {1: '_maskingdem', 2: '_maskinguni', 3: '_maskingnorm'}.get(masking, '')
case_name = case_name + masking_suffix

# Multi-run datasets carry the number of runs at the end of their file name.
data_case_name = (case_name + '_' + str(n_runs)) if multiple else case_name
params = DF.load_covasim_data(path, population, test_prob, trace_prob, keep_d, data_case_name, plot=False)

In [5]:
if multiple:
    if parallelb:
        # multiple==True and parallelb==True: the data is a 2d array and already
        # normalized (parallel simulations store normalized data)
        data = params['data']
        avg_masking = params['avg_masking']
    else:
        # multiple==True and parallelb==False: the data is a list of runs and not
        # normalized, so average over the runs and divide by the population
        data = np.mean(params['data'], axis=0) / params['population']
        avg_masking = np.mean(params['avg_masking'], axis=0) / params['population']
else:
    # otherwise, the data is from a single simulation and is not normalized
    data = (params['data'] / params['population']).to_numpy()
    avg_masking = params['avg_masking'] / params['population']

# `params` is handed to the BINN constructor later on without the raw data.
# Because of this pop, the cell cannot be re-run without reloading `params`.
params.pop('data')

# time grid: one row per observation, as a column vector
N = len(data)
t_max = N - 1
t = np.arange(N)[:, np.newaxis]

tracing_array = params['tracing_array']

In [6]:
# Saved models are grouped by whether masking is modelled and, if it is,
# by whether the masking signal was learned or observed.
if not maskb:
    mask_subdir = '/no_mask'
elif masking_learned:
    mask_subdir = '/mask/learned_masking'
else:
    mask_subdir = '/mask/observed_masking'
model_path = '../models/covasim' + mask_subdir

In [7]:
# Catalogue of trained runs. Exactly one `model_folder` line should be left
# uncommented; it is appended to `model_path`, so the chosen run has to live in
# the folder selected by `maskb` / `masking_learned`.
#--------------------no masking----------------------#
# model_folder = '/2023-07-20_17-53-03' # no masking, 500e3 pop, dynamic piecewise, 64 avg., 800e3 epochs, lr=1e-5
# model_folder = '/2023-07-21_18-42-24' # no masking, 500e3 pop, dynamic piecewise, 64 avg., 1e6 epochs, lr=1e-6
model_folder = '/2023-07-22_10-20-01' # no masking, 500e3 pop, dynamic piecewise, 64 avg., 1e6 epochs, lr=5e-6
# model_folder = '/2023-07-23_00-48-24' # no masking, 500e3 pop, dynamic piecewise, 64 avg., 1e6 epochs, lr=9e-6
# model_folder = '/2023-07-23_15-17-23' # no masking, 500e3 pop, dynamic piecewise, 64 avg., 1e6 epochs, lr=9e-6
# model_folder = '/2023-07-24_23-10-16' # no masking, 500e3 pop, dynamic piecewise, 64 avg., 1e6 epochs, lr=1e-5

#------------------normal masking--------------------#
# NOTE(review): the next two entries assign `mydir` directly instead of `model_folder`;
# they look like leftovers from an older layout — confirm before re-enabling them.
# mydir = '../models/covasim/mask/2023-07-16_16-48-08' # norm masking, 500e3 pop, dynamic piecewise, keepd, 2048 avg., 800,000 epochs, lr=5e-5, deep eta, beta
# mydir = '../models/covasim/no_mask/2023-07-18_18-53-36' # no masking, 500e3 pop, dynamic piecewise, keepd, 64 avg., 500,000 epochs, lr=5e-5, deep eta
# model_folder = '/2023-07-20_18-13-01' # masking-norm, observed M, 500e3, dynamic piecewise, 64 avg., 800e3, lr=5e-5
# model_folder = '/2023-07-21_18-41-30' # masking-norm, observed M, 500e3, dynamic piecewise, 64 avg., 800e3, lr=5e-5
# model_folder = '/2023-07-21_21-48-16' # masking-norm, observed M, 1e6, dynamic piecewise, 64 avg., 800e3, lr=5e-5
# model_folder = '/2023-07-22_10-16-47' # masking-norm, observed M, 1e6, dynamic piecewise, 64 avg., 800e3, lr=5e-5
# model_folder = '/2023-07-22_12-30-47' # masking-norm, observed M, 1e6, dynamic piecewise, 64 avg., 800e3, lr=5e-5
# model_folder = '/2023-07-23_00-01-28' # masking-norm, observed M, 1e6, dynamic piecewise, 64 avg., 800e3, lr=5e-5
# model_folder = '/2023-07-23_00-27-07' # masking-norm, observed M, 1e6, dynamic piecewise, 64 avg., 800e3, lr=5e-5
# model_folder = '/2023-07-23_15-14-54' # masking-norm, observed M, 1e6, dynamic piecewise, 64 avg., 800e3, lr=4e-5
# model_folder = '/2023-07-27_22-37-14' # masking-norm, observed M, 1e6, dynamic piecewise, 2048 avg., 800e3, lr=4e-5

#---------------demographic masking------------------#
# model_folder = '/2023-07-20_22-20-10' # masking-dem, observed M, 500e3, dynamic piecewise, 64 avg., 600e3, lr=5e-5
# model_folder = '/2023-07-23_15-15-56' # masking-dem, observed M, 500e3, dynamic piecewise, 64 avg., 600e3, lr=3e-6
# model_folder = '/2023-07-24_23-09-21' # masking-dem, observed M, 500e3, dynamic piecewise, 64 avg., 600e3, lr=3e-6
# model_folder = '/2023-07-24_23-09-34' # masking-dem, observed M, 500e3, dynamic piecewise, 64 avg., 600e3, lr=4e-6
# model_folder = '/2023-07-25_20-44-25' # masking-dem, observed M, 500e3, dynamic piecewise, 64 avg., 700e3, lr=4e-6


# Full path of the selected run; `model_folder` already starts with '/'.
mydir = model_path + model_folder

In [8]:
# (lower, upper) bounds handed to the BINN constructor for each learned term
yita_lb, yita_ub = 0.0, 1.0
beta_lb, beta_ub = 0.0, 0.5
tau_lb, tau_ub = 0.05, 0.4

# *_deep flags, forwarded unchanged to the BINN constructor
eta_deep, beta_deep, tau_deep = True, True, False

In [9]:
# Keyword configuration of the network, collected in one place for readability
binn_kwargs = dict(
    t_max_real=t_max,
    tracing_array=tracing_array,
    yita_lb=yita_lb,
    yita_ub=yita_ub,
    beta_lb=beta_lb,
    beta_ub=beta_ub,
    tau_lb=tau_lb,
    tau_ub=tau_ub,
    chi_type=chi_type,
    eta_deep=eta_deep,
    beta_deep=beta_deep,
    tau_deep=tau_deep,
    maskb=maskb,
    masking_learned=masking_learned,
)
binn = AdaMaskBINNCovasim(params, **binn_kwargs).to(device)
parameters = binn.parameters()

# No optimizer or loss is supplied (both None): the wrapper is used here only to
# locate and load saved weights under <mydir>/<case_name>.
model = ModelWrapper(binn, None, None, save_name=os.path.join(mydir, case_name))

In [10]:
# load model weights
# model.save_name = '../Weights/'
# model.save_name += case_name

# Build the checkpoint suffix once and append it only if it is not already there.
# The original unconditional `+=` made this cell non-idempotent: a second run
# produced '..._best_val_best_val' and tried to load a file that does not exist.
weights_suffix = ('_retrain' if retrain else '') + '_best_val'
if not model.save_name.endswith(weights_suffix):
    model.save_name += weights_suffix
model.load(model.save_name + '_model', device=device)
save_path = model.save_folder

# grab initial condition (first row of the data, copied so later edits cannot alias it)
u0 = data[0, :].copy()

In [11]:
# grab value ranges
# Read the bounds back from the network held by the wrapper, replacing the
# notebook-level constants of the same names with the loaded model's own values.
trained_binn = model.model
yita_lb, yita_ub = trained_binn.yita_lb, trained_binn.yita_ub
beta_lb, beta_ub = trained_binn.beta_lb, trained_binn.beta_ub
tau_lb, tau_ub = trained_binn.tau_lb, trained_binn.tau_ub

In [12]:
# learned contact_rate function
def contact_rate(u):
    """Evaluate the BINN's `eta_func` on a NumPy input and return the result as a NumPy array."""
    return to_numpy(binn.eta_func(to_torch(u)))  # [:,[0,3,4]]

# learned effective tracing rate function
def beta(u):
    """Evaluate the BINN's `beta_func` on a NumPy input and return the result as a NumPy array."""
    return to_numpy(binn.beta_func(to_torch(u)))

# learned diagnosis of quarantined rate function
def tau(u):
    """Evaluate the BINN's `tau_func` on a NumPy input and return the result as a NumPy array."""
    return to_numpy(binn.tau_func(to_torch(u)))

def chi_func(t):
    """Evaluate the project's `chi` at `1 + t * t_max` for the configured `trace_prob` and `chi_type`.

    `t` is a NumPy array; it is converted with `to_torch` first, and whatever
    `chi` returns is passed back unchanged.
    """
    shifted_time = 1 + to_torch(t) * t_max
    return chi(shifted_time, trace_prob, chi_type)

In [13]:
# STEAYDQRFM when a masking scenario is active, otherwise the compartments alone
all_data = np.concatenate([data, avg_masking[:,None]], axis=1) if masking > 0 else data

# --- eta: inputs are S, A, Y (columns 0, 3, 4), plus M when the model uses masking
eta_parts = [data[:,[0,3,4]]]  #SAY
if maskb:
    eta_parts.append(avg_masking[:,None])  #SAYM
eta_input = np.concatenate(eta_parts, axis=1)
eta0 = contact_rate(eta_input) # eta(S,A,Y,M)
# first output column, rescaled into [yita_lb, yita_ub] and kept as a column vector
eta_values = yita_lb + (yita_ub - yita_lb) * eta0[:, 0][:, None]

# --- beta: inputs are the row-wise sum S + A + Y and chi(t)
chi_t = to_numpy(chi_func(t))
say_total = np.sum(data[:,[0,3,4]], axis=1)[:,None]
beta_input = np.concatenate([say_total, chi_t], axis=1)
beta_values = beta(beta_input)

# --- tau: inputs are A and Y (columns 3, 4), output rescaled into [tau_lb, tau_ub]
tau_input = data[:,[3,4]]
tau0 = tau(tau_input)
tau_values = tau_lb + (tau_ub - tau_lb) * tau0

### Helper Functions
- `get_best_num_comps`: Pops entries from a min-priority queue (ordered by MSE) and returns the next lowest-MSE equation that has at most $K$ features.
- `determine_deg`: Chooses the polynomial degree of the features to run LASSO on, given a user-specified minimum degree, maximum degree, and improvement threshold.

In [14]:
def get_best_num_comps(num_comps=-1, pq=None):
    """Pop and return the next suitable LASSO result from a priority queue.

    Queue entries are tuples whose last element is a result dict holding an
    'Equation' string. The number of components of an equation is taken to be
    the number of '+'-separated pieces minus one.

    Parameters
    ----------
    num_comps : int, default -1
        Largest number of components allowed. -1 means "no limit": the first
        entry popped is returned as is.
    pq : queue.PriorityQueue
        Queue to consume. Entries that are popped and rejected are NOT put back.

    Returns
    -------
    dict
        The result dict of the first popped entry that satisfies the limit.

    Raises
    ------
    ValueError
        If no queue is supplied.
    Exception
        If the queue is empty on entry, or runs out before a match is found.
    """
    if pq is None:
        # Previously this surfaced as an opaque AttributeError on `None.empty()`.
        raise ValueError('A PriorityQueue must be supplied via `pq`')
    if pq.empty():
        raise Exception('The Priority Queue given is empty')

    bl_dict = pq.get()[-1]
    if num_comps == -1:
        return bl_dict

    terms = bl_dict['Equation'].split('+')
    # NOTE(review): the 'f ' test only fires when one '+'-separated piece is exactly
    # 'f ' — confirm which DRUMS_Lasso output this is meant to recognise.
    if 'f ' in terms or len(terms) - 1 <= num_comps:
        return bl_dict

    # Keep consuming the queue until an equation fits the component budget.
    while not pq.empty():
        cand = pq.get()[-1]
        if len(cand['Equation'].split('+')) - 1 <= num_comps:
            return cand
    raise Exception('No equation with that number of components was found')
    
def determine_deg(X_dict, y_values, min_deg, max_deg, imp_threshold):
    """Choose a polynomial degree between `min_deg` and `max_deg`.

    For every degree in the range, `DRUMS_Lasso` is fitted with `alphas=[0]` and an
    intercept, and its 'MSE' entry is read. The MSE at `min_deg` is the first
    reference. A higher degree is accepted, and becomes the new reference, when
    (reference MSE) / (its MSE) is at least `imp_threshold`.

    Returns the last accepted degree (`min_deg` if none was accepted).
    """
    chosen_deg = min_deg
    for candidate_deg in range(min_deg, max_deg + 1):
        fit = DRUMS_Lasso(X_dict, y_values, degree=candidate_deg, intercept=True, alphas=np.array([0]))
        candidate_mse = fit['MSE']
        if candidate_deg == min_deg:
            # the lowest degree only sets the reference
            reference_mse = candidate_mse
        elif reference_mse / candidate_mse >= imp_threshold:
            chosen_deg, reference_mse = candidate_deg, candidate_mse
    return chosen_deg

***
### LASSO on $\eta$

Determine the max degree of terms to include

- We specify a minimum and a maximum degree and fit ordinary (unpenalized) linear regression at each degree. Starting from the minimum degree, a higher degree replaces the currently accepted one only when it lowers the MSE by at least the specified improvement factor. For example, if the MSE of degree=2 is 10, the MSE of degree=3 is 2, and the improvement threshold is 1.5, then degree=3 is accepted because its MSE is $10/2 = 5$ times lower, which is greater than $1.5$.

In [None]:
eta_all_comps = False

# When the model uses masking (maskb), the M column is taken from `avg_masking`
# directly. `all_data` only contains M when `masking > 0`, so indexing column 9 of
# `all_data` (or zipping ten names against it) failed or silently dropped M when
# `maskb` was on but `masking == 0`. This matches how `eta_input` is built earlier
# in the notebook.
if eta_all_comps:
    if maskb:
        comps = list('STEAYDQRFM')
        eta_features = np.concatenate([data, avg_masking[:, None]], axis=1)
    else:
        comps = list('STEAYDQRF')
        eta_features = all_data
    X_dict = {key : value for key, value in zip(comps, eta_features.T)}
else:
    if maskb:
        eta_input_data = np.concatenate([data[:, [0, 3, 4]], avg_masking[:, None]], axis=1)
        comps = list('SAYM')
    else:
        eta_input_data = all_data[:,[0,3,4]]
        comps = list('SAY')
    X_dict = {key : value for key, value in zip(comps, eta_input_data.T)}

# penalty strengths swept by the eta LASSO cell below (ascending)
alphas_list = np.linspace(1e-8, 1e-4, num=20000)

# degrees 1..5 are tried; a higher degree must improve the MSE by a factor of 2
eta_deg = determine_deg(X_dict, eta_values, 1, 5, 2)

In [None]:
# Sweep the LASSO penalty and collect every distinct equation it produces.
# NOTE: `X_dict` and `alphas_list` are shared names that the eta/beta/tau feature
# cells each overwrite; run the eta feature cell immediately before this one.
eq_set = set()
eq_q = PriorityQueue()  # entries are ordered by MSE (first tuple element)
min_eta_comps = 1000  # sentinel; lowered to the smallest component count seen

for i in alphas_list:
    lasso_dict = DRUMS_Lasso(X_dict, eta_values, degree=eta_deg, intercept=True, alphas=np.array([i]))
    equation = lasso_dict['Equation']
    if equation in eq_set:
        continue
    eq_set.add(equation)
    eq_q.put((lasso_dict['MSE'], i, equation, lasso_dict))
    terms = equation.split('+')
    # Track the true minimum. The original overwrote this with the latest value,
    # which is only the minimum if sparsity grows monotonically with alpha.
    min_eta_comps = min(min_eta_comps, len(terms) - 1)
    # NOTE(review): stops the sweep once a '+'-separated piece is exactly 'f ' —
    # confirm which DRUMS_Lasso output this is meant to detect.
    if 'f ' in terms:
        break

In [21]:
# First entry popped from the queue: the eta equation with the smallest MSE.
# Its component count is the upper end of the range explored below.
bl_eta_dict = get_best_num_comps(pq=eq_q)
max_eta_comps = len(bl_eta_dict['Equation'].split('+')) - 1

# Walk the component budget down from max-1 to min, taking the next queued
# equation that fits each budget.
best_list = [bl_eta_dict]
for i in range(max_eta_comps - 1, min_eta_comps - 1, -1):
    nb_dict = get_best_num_comps(num_comps=i, pq=eq_q)
    best_list.append(nb_dict)

In [22]:
# Folder that receives the sparse eta equations of this model. Built with
# os.path.join (instead of a hard-coded '\\') so that the directory created and
# the directory written to are the same on every operating system.
file_path = os.path.join(model.save_folder, 'eta_eq_coef')
os.makedirs(file_path, exist_ok=True)

file_stem = case_name + '_' + str(n_runs) + '_sparse_coef_'

# best_list runs from most to fewest components; reversed, entry i is stored
# under the label (min_eta_comps + i).
for i, elem in enumerate(reversed(best_list)):
    nb_file_name = file_stem + str(i + min_eta_comps) + 'comps'
    joblib.dump(elem, os.path.join(file_path, nb_file_name), compress=True)

joblib.dump(bl_eta_dict, os.path.join(file_path, file_stem + 'best'), compress=True)

['../models/covasim/no_mask/2023-07-22_10-20-01\\500000_0.1_0.3_dynamic_piecewise\\eta_eq_coef\\500000_0.1_0.3_dynamic_piecewise_64_sparse_coef_best']

***
### LASSO on $\beta$

In [26]:
beta_all_comps = False
if not beta_all_comps:
    # two features: 's' = S + A + Y (columns 0, 3, 4 summed per row) and 'x' = chi(t)
    say_sum = np.sum(all_data[:,[0,3,4]], axis=1)[:,None]
    beta_input_data = np.concatenate([say_sum, chi_t], axis=1)
    comps = list('sx')
    X_dict = dict(zip(comps, beta_input_data.T))
else:
    # every compartment as its own feature
    comps = list('STEAYDQRF')
    X_dict = dict(zip(comps, all_data.T))

# penalty strengths swept by the beta LASSO cell below
# NOTE(review): 2000 grid points here versus 20000 in the eta and tau cells — confirm intentional.
alphas_list = np.linspace(float(1e-8), float(1e-4), num=2000)

# degrees 1..4 are tried; a higher degree must improve the MSE by a factor of 1.1
beta_deg = determine_deg(X_dict, beta_values, 1, 4, 1.1)

  y = column_or_1d(y, warn=True)
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model.fit(X, y)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  y = column_or_1d(y, warn=True)
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model.fit(X, y)
  model = cd_fast.enet_coordinate_descent(
 

In [27]:
# Sweep the LASSO penalty and collect every distinct equation it produces.
# NOTE: `X_dict` and `alphas_list` are shared names that the eta/beta/tau feature
# cells each overwrite; run the beta feature cell immediately before this one.
eq_set = set()
eq_q = PriorityQueue()  # entries are ordered by MSE (first tuple element)
min_beta_comps = 1000  # sentinel; lowered to the smallest component count seen

for i in alphas_list:
    lasso_dict = DRUMS_Lasso(X_dict, beta_values, degree=beta_deg, intercept=True, alphas=np.array([i]))
    equation = lasso_dict['Equation']
    if equation in eq_set:
        continue
    eq_set.add(equation)
    eq_q.put((lasso_dict['MSE'], i, equation, lasso_dict))
    terms = equation.split('+')
    # Track the true minimum. The original overwrote this with the latest value,
    # which is only the minimum if sparsity grows monotonically with alpha.
    min_beta_comps = min(min_beta_comps, len(terms) - 1)
    # NOTE(review): stops the sweep once a '+'-separated piece is exactly 'f ' —
    # confirm which DRUMS_Lasso output this is meant to detect.
    if 'f ' in terms:
        break

  y = column_or_1d(y, warn=True)
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(
  y = column_or_1d(y, warn=True)
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(
  y = column_or_1d(y, warn=True)
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(
  y = column_or_1d(y, warn=True)
  model = cd_fast.e

In [28]:
# First entry popped from the queue: the beta equation with the smallest MSE.
# Its component count is the upper end of the range explored below.
bl_beta_dict = get_best_num_comps(pq=eq_q)
max_beta_comps = len(bl_beta_dict['Equation'].split('+')) - 1

# Walk the component budget down from max-1 to min, taking the next queued
# equation that fits each budget.
best_list = [bl_beta_dict]
for i in range(max_beta_comps - 1, min_beta_comps - 1, -1):
    nb_dict = get_best_num_comps(num_comps=i, pq=eq_q)
    best_list.append(nb_dict)

In [29]:
# Folder that receives the sparse beta equations of this model. Built with
# os.path.join (instead of a hard-coded '\\') so that the directory created and
# the directory written to are the same on every operating system.
file_path = os.path.join(model.save_folder, 'beta_eq_coef')
os.makedirs(file_path, exist_ok=True)

file_stem = case_name + '_' + str(n_runs) + '_sparse_coef_'

# best_list runs from most to fewest components; reversed, entry i is stored
# under the label (min_beta_comps + i).
for i, elem in enumerate(reversed(best_list)):
    nb_file_name = file_stem + str(i + min_beta_comps) + 'comps'
    joblib.dump(elem, os.path.join(file_path, nb_file_name), compress=True)

joblib.dump(bl_beta_dict, os.path.join(file_path, file_stem + 'best'), compress=True)

['../models/covasim/no_mask/2023-07-22_10-20-01\\500000_0.1_0.3_dynamic_piecewise\\beta_eq_coef\\500000_0.1_0.3_dynamic_piecewise_64_sparse_coef_best']

***
### LASSO on $\tau$

In [19]:
tau_all_comps = False
if not tau_all_comps:
    # two features: A and Y (columns 3 and 4)
    tau_input_data = all_data[:,[3,4]]
    comps = list('AY')
    X_dict = dict(zip(comps, tau_input_data.T))
else:
    # every compartment as its own feature
    comps = list('STEAYDQRF')
    X_dict = dict(zip(comps, all_data.T))

# penalty strengths swept by the tau LASSO cell below
alphas_list = np.linspace(float(1e-4), float(1), num=20000)

# min and max degree are both 1, so this performs a single fit and yields degree 1
tau_deg = determine_deg(X_dict, tau_values, 1, 1, 2)

  y = column_or_1d(y, warn=True)
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model.fit(X, y)
  model = cd_fast.enet_coordinate_descent(


In [20]:
# Sweep the LASSO penalty and collect every distinct equation it produces.
# NOTE: `X_dict` and `alphas_list` are shared names that the eta/beta/tau feature
# cells each overwrite; run the tau feature cell immediately before this one.
eq_set = set()
eq_q = PriorityQueue()  # entries are ordered by MSE (first tuple element)
min_tau_comps = 1000  # sentinel; lowered to the smallest component count seen

for i in alphas_list:
    lasso_dict = DRUMS_Lasso(X_dict, tau_values, degree=tau_deg, intercept=True, alphas=np.array([i]))
    equation = lasso_dict['Equation']
    if equation in eq_set:
        continue
    eq_set.add(equation)
    eq_q.put((lasso_dict['MSE'], i, equation, lasso_dict))
    terms = equation.split('+')
    # Track the true minimum. The original overwrote this with the latest value,
    # which is only the minimum if sparsity grows monotonically with alpha.
    min_tau_comps = min(min_tau_comps, len(terms) - 1)
    # NOTE(review): stops the sweep once a '+'-separated piece is exactly 'f ' —
    # confirm which DRUMS_Lasso output this is meant to detect.
    if 'f ' in terms:
        break

  y = column_or_1d(y, warn=True)


In [21]:
# First entry popped from the queue: the tau equation with the smallest MSE.
# Its component count is the upper end of the range explored below.
bl_tau_dict = get_best_num_comps(pq=eq_q)
max_tau_comps = len(bl_tau_dict['Equation'].split('+')) - 1

# Walk the component budget down from max-1 to min, taking the next queued
# equation that fits each budget.
best_list = [bl_tau_dict]
for i in range(max_tau_comps - 1, min_tau_comps - 1, -1):
    nb_dict = get_best_num_comps(num_comps=i, pq=eq_q)
    best_list.append(nb_dict)

In [22]:
# Folder that receives the sparse tau equations of this model. Built with
# os.path.join (instead of a hard-coded '\\') so that the directory created and
# the directory written to are the same on every operating system.
file_path = os.path.join(model.save_folder, 'tau_eq_coef')
os.makedirs(file_path, exist_ok=True)

file_stem = case_name + '_' + str(n_runs) + '_sparse_coef_'

# best_list runs from most to fewest components; reversed, entry i is stored
# under the label (min_tau_comps + i).
for i, elem in enumerate(reversed(best_list)):
    nb_file_name = file_stem + str(i + min_tau_comps) + 'comps'
    joblib.dump(elem, os.path.join(file_path, nb_file_name), compress=True)

joblib.dump(bl_tau_dict, os.path.join(file_path, file_stem + 'best'), compress=True)

['../models/covasim/no_mask/2023-07-22_10-20-01\\500000_0.1_0.3_dynamic_piecewise\\tau_eq_coef\\500000_0.1_0.3_dynamic_piecewise_64_sparse_coef_best']

***
### Loading learned coefficients

In [19]:
# Report the component count of the best equation found for each term.
# The three max_*_comps values are only defined by the eta, beta and tau LASSO
# cells above; if any of them was skipped in the current kernel session this
# cell raises NameError (as in the recorded output).
print(f'The maximum number of components for eta: {max_eta_comps}')
print(f'The maximum number of components for beta: {max_beta_comps}')
print(f'The maximum number of components for tau: {max_tau_comps}')

NameError: name 'max_eta_comps' is not defined

In [33]:
# The three "best" coefficient files share a file name and differ only in their folder.
# joblib.load unpickles the file, so only load files produced by this project.
coef_root = mydir + '/' + case_name
coef_file = case_name + '_' + str(n_runs) + '_sparse_coef_best'

eta_dl = joblib.load(coef_root + '/eta_eq_coef/' + coef_file)
beta_dl = joblib.load(coef_root + '/beta_eq_coef/' + coef_file)
tau_dl = joblib.load(coef_root + '/tau_eq_coef/' + coef_file)

In [34]:
eta_eq, beta_eq, tau_eq = eta_dl['Equation'], beta_dl['Equation'], tau_dl['Equation']

# For each term, print the equation without its first four characters (the
# left-hand-side prefix of the stored string), followed by the fit's MSE
# rounded to 10 decimals.
for label, equation, fit in (('eta', eta_eq, eta_dl), ('beta', beta_eq, beta_dl), ('tau', tau_eq, tau_dl)):
    print(f'{label} = {equation[4:]}')
    print(np.round(fit['MSE'], 10))

eta = 0.12386*S + -0.45155*A + 3.91910*Y + 0.02027*S^2 + -6.42370*S A + 4.64430*S Y + -0.01131*S^3 + -6.30371*S^2 A + 3.64015*S^2 Y + -0.00980*S^4 + -2.47776*S^3 A + -0.00405*S^5 + -2.00666*S^4 Y + 0.13393
5.5319e-06
beta = 3.49793*s + -5.83841*x + -1.85993*s^2 + 6.03933*s x + -1.36894
0.0001209505
tau = 0.05251
1e-10
