In [None]:
# `imp` is deprecated and was removed in Python 3.12; importlib provides the same `reload`.
from importlib import reload
import sys, os
from copy import deepcopy
print('python version %s' % sys.version)

import numpy as np
print('numpy version %s' % np.__version__)

import scipy as sp
import scipy.stats as st
print('scipy version %s' % sp.__version__)

import pandas as pd
print('pandas version %s' % pd.__version__)

import matplotlib
from matplotlib import cm
import matplotlib.pyplot as plt
print('matplotlib version %s' % matplotlib.__version__)

python version 3.8.3 (default, Jul  2 2020, 11:26:31) 
[Clang 10.0.0 ]


In [None]:
# Locations of the helper scripts and of the simulation outputs
SCRIPT_DIR = 'simulation-scripts'
SIMULATION_DIRECTORY = 'simulations'

# Data directory for a given inference date (DATA_DIRECTORY is empty, so the path is relative)
DATA_DIRECTORY = ''
INFERENCE_DATE = '2021-08-14'
SARS_DATA_DIR = os.path.join(DATA_DIRECTORY, INFERENCE_DATE)

## Inference in simulations (run for different parameter settings)

In [1]:
# make initial population
# the starting population sizes for the 6 variants
init_pop = [1000, 1000, 1000, 1000,  1000,  1000]
# the initial sequences where numbers specify location on the genome of a mutation
init_seqs = [[0],  [1],  [2],  [3],   [4],   [5]]
# the selection coefficients for the mutations, in order of site numbers
init_sel = [0,    0, 0.03, 0.03, -0.03, -0.03]
init_file = os.path.join(SIMULATION_DIRECTORY, "initial1.npz")

# make sure the output directory exists before writing into it
if SIMULATION_DIRECTORY:
    os.makedirs(SIMULATION_DIRECTORY, exist_ok=True)

# the context manager closes the file even if saving raises
with open(init_file, mode='wb') as f:
    np.savez_compressed(f, counts=init_pop,
                        sequences=init_seqs, selection=init_sel)

# output paths used by the simulation and inference runs
# file containing the branching process simulation
sim_file = os.path.join(SIMULATION_DIRECTORY, "sim-init-sample50-T50")

# file containing the inference from the branching process
inf_file = os.path.join(SIMULATION_DIRECTORY, 'inf-init-sample50-T50')

# file containing repeats of the simulation and inference 1000 times
rep_file = os.path.join(SIMULATION_DIRECTORY, 'epi-init-sample50-T50')
# scratch files passed to the scripts as -out1 / -out2
temp1 = os.path.join(SIMULATION_DIRECTORY, 'temp1')
temp2 = os.path.join(SIMULATION_DIRECTORY, 'temp2')

# script that runs the repeated simulation + inference
sim_inf = os.path.join(SCRIPT_DIR, 'Sim-epi.py')

NameError: name 'os' is not defined

In [None]:
sim_script = os.path.join(SCRIPT_DIR, 'branching.py')
inf_script = os.path.join(SCRIPT_DIR, 'epi-infer-multiple.py')
sim_out    = os.path.join(SIMULATION_DIRECTORY, 'sim-init-sample50-T50')
inf_out    = os.path.join(SIMULATION_DIRECTORY, 'inf-init-sample50-T50')

%run {sim_script} -o {sim_out} --simulations 1 --pop_limit 10000 --sample 50 --mu 0 -T 50 \
-i {init_file}
%run {inf_script} --data {sim_out + '.npz'} -o {inf_out} -R 2 --pop_size 10000

In [16]:
%run {sim_inf} --n_runs 1000 -o {rep_file} -out1 {temp1} -out2 {temp2} --sample 50 -k 0.1 -R 2 --mu 0 --simulations 1 -T 50 -i {init_file + '.npz'}

### Comparison of using real vs. time-varying parameters

In [22]:
# Make directories for the files
rep_finite_dir  = os.path.join(SIMULATION_DIRECTORY, 'replicates-finite')
rep_perfect_dir = os.path.join(SIMULATION_DIRECTORY, 'replicates-perfect')
dirs            = [rep_finite_dir, rep_perfect_dir]
sub_dirs        = ['lin', 'para-up', 'para-down']
for directory in dirs:
    for dir2 in sub_dirs:
        new_dir = os.path.join(directory, dir2)
        # makedirs creates the parent `directory` as well; exist_ok makes
        # re-running the cell a no-op instead of a check-then-create race
        os.makedirs(new_dir, exist_ok=True)

In [19]:
# Prepare repeat simulations with perfect sampling and poor (finite) sampling:
# for each population trajectory, save the trajectory into both replicate
# directories and define the output prefix used by each sampling scheme.
temp1 = os.path.join(SIMULATION_DIRECTORY, 'temp1')
temp2 = os.path.join(SIMULATION_DIRECTORY, 'temp2')

# --- upward parabola -------------------------------------------------------
population_para1 = 25 * (np.arange(51)**2) + 1000
pop1_file = os.path.join(rep_finite_dir, 'para-up', 'population-para-up.npy')
np.save(pop1_file, population_para1)
np.save(os.path.join(rep_perfect_dir, 'para-up', 'population-para-up.npy'), population_para1)
pop1_out  = os.path.join(rep_finite_dir, 'para-up', 'epi-init-parabolic-up')     # finite sampling
pop1_out2 = os.path.join(rep_perfect_dir, 'para-up', 'epi-init-parabolic-up')    # perfect sampling

# --- downward parabola -----------------------------------------------------
population_para2 = -300 * ((np.arange(51)-25)**2) + 200000
pop2_file = os.path.join(rep_finite_dir, 'para-down', 'population-para-down.npy')
np.save(pop2_file, population_para2)
np.save(os.path.join(rep_perfect_dir, 'para-down', 'population-para-down.npy'), population_para2)
pop2_out  = os.path.join(rep_finite_dir, 'para-down', 'epi-init-parabolic-down')   # finite sampling
pop2_out2 = os.path.join(rep_perfect_dir, 'para-down', 'epi-init-parabolic-down')  # perfect sampling

# --- linear growth ---------------------------------------------------------
# NOTE(review): the first entry of this trajectory is 0; the RuntimeWarning
# captured in the linear runs may come from that zero -- confirm.
population_lin = np.arange(51) * 2000
pop_lin_file = os.path.join(rep_finite_dir, 'lin', 'population-lin.npy')
np.save(pop_lin_file, population_lin)
np.save(os.path.join(rep_perfect_dir, 'lin', 'population-lin.npy'), population_lin)
pop_lin_out  = os.path.join(rep_finite_dir, 'lin', 'epi-init-linear')    # finite sampling
pop_lin_out2 = os.path.join(rep_perfect_dir, 'lin', 'epi-init-linear')   # perfect sampling

# script that runs the simulation + inference with a time-varying population size
sim_inf = os.path.join(SCRIPT_DIR, 'Sim-epi-tv-params-local.py')

In [21]:
# Run the simulations 
%run {sim_inf} --n_runs 1000 -o {pop1_out} -out1 {temp1} -out2 {temp2} --sample 25 --pop_limit 1000000 -k 0.1 -R 2 --mu 0 --simulations 1 -T 50 -i {init_file} --PopSize {pop1_file} 
%run {sim_inf} --n_runs 1000 -o {pop1_out2} -out1 {temp1} -out2 {temp2} --pop_limit 1000000 -k 0.1 -R 2 --mu 0 --simulations 1 -T 50 -i {init_file} --PopSize {pop1_file} 
%run {sim_inf} --n_runs 1000 -o {pop2_out} -out1 {temp1} -out2 {temp2} --sample 25 --pop_limit 1000000 -k 0.1 -R 2 --mu 0 --simulations 1 -T 50 -i {init_file} --PopSize {pop2_file} 
%run {sim_inf} --n_runs 1000 -o {pop2_out2} -out1 {temp1} -out2 {temp2} --pop_limit 1000000 -k 0.1 -R 2 --mu 0 --simulations 1 -T 50 -i {init_file} --PopSize {pop2_file} 
%run {sim_inf} --n_runs 1000 -o {pop_lin_out} -out1 {temp1} -out2 {temp2} --sample 25 --pop_limit 1000000 -k 0.1 -R 2 --mu 0 --simulations 1 -T 50 -i {init_file} --PopSize {pop_lin_file}
%run {sim_inf} --n_runs 1000 -o {pop_lin_out2} -out1 {temp1} -out2 {temp2} --pop_limit 1000000 -k 0.1 -R 2 --mu 0 --simulations 1 -T 50 -i {init_file} --PopSize {pop_lin_file} 

simulations/temp1 simulations/temp2
simulations/replicates-finite/para-up/epi-init-parabolic-up
simulations/temp1 simulations/temp2
simulations/replicates-perfect/para-up/epi-init-parabolic-up
simulations/temp1 simulations/temp2
simulations/replicates-finite/para-down/epi-init-parabolic-down
simulations/temp1 simulations/temp2
simulations/replicates-perfect/para-down/epi-init-parabolic-down
simulations/temp1 simulations/temp2


  coefficient1 = (1 / ((1 / (N * k)) + ((k / R) / (N * k - 1))))


simulations/replicates-finite/lin/epi-init-linear
simulations/temp1 simulations/temp2


  coefficient1 = (1 / ((1 / (N * k)) + ((k / R) / (N * k - 1))))


simulations/replicates-perfect/lin/epi-init-linear


## Migration comparison in simulations (correcting for it or not)

In [32]:
T           = 100    # the number of time points in the simulation
selection   = [0]    # the selection coefficient for the mutation on the inflowing sequences
# one entry per time index 0..T
counts      = [[[25] for _ in range(T+1)]]    # the number of sequences inflowing at each time
sequences   = [[[[10]] for _ in range(T+1)]]    # the sequences inflowing at each time
inflow_file = os.path.join(SIMULATION_DIRECTORY, 'inflow25.npz')
# the context manager closes the file even if saving raises
with open(inflow_file, mode='wb') as f:
    np.savez_compressed(f, selection=selection, counts=counts, sequences=sequences)

In [33]:
# Driver script for the inflow (migration) simulations
sim_inf    = os.path.join(SCRIPT_DIR, 'Sim-epi-inflow-local.py')
# Results are written directly into the simulation directory
inflow_dir = SIMULATION_DIRECTORY
out_file   = os.path.join(inflow_dir, 'epi-init-in1')

In [34]:
%run {sim_inf} --n_runs 1000 -o {out_file} -out1 temp1 -out2 temp2 --sample 100 -k 0.1 -R 2 --mu 0 --simulations 1 -T 100 -i {init_file} --in_flow {inflow_file}

temp1 temp2
simulations/epi-init-in1


## AUROC for different sample sizes, time-series lengths, population sizes (small), and numbers of simulations

In [36]:
# Scratch files passed to the script as -out1 / -out2
temp1 = os.path.join(SIMULATION_DIRECTORY, 'temp1')
temp2 = os.path.join(SIMULATION_DIRECTORY, 'temp2')

# Values swept for each parameter, and the directory that receives each sweep's output
sample = [5, 10, 25, 50, 100, 150, 200]
out_dir_sample = os.path.join(SIMULATION_DIRECTORY, 'epi-init-sample')

times = [5, 10, 25, 50, 100, 150, 200]
out_dir_times = os.path.join(SIMULATION_DIRECTORY, 'epi-init-T')

simulations = [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]
out_dir_sims = os.path.join(SIMULATION_DIRECTORY, 'epi-init-sim')

popsize = [25, 50, 100, 250, 500, 750, 1000]
out_dir_pop = os.path.join(SIMULATION_DIRECTORY, 'epi-init-smallpop')

# os.makedirs is the real function (os.mkdirs does not exist and raised
# AttributeError whenever a directory was missing); exist_ok makes re-runs safe
for out_dir in (out_dir_sample, out_dir_times, out_dir_sims, out_dir_pop):
    os.makedirs(out_dir, exist_ok=True)

sim_inf = os.path.join(SCRIPT_DIR, 'Sim-epi-cluster.py')

In [38]:
# Run the simulations
for i in range(len(sample)):
    sample_size = sample[i]
    out_path    = os.path.join(out_dir_sample, f'sample{sample_size}')
    %run {sim_inf} --n_runs 1000 -o {out_path} -out1 {temp1} -out2 {temp2} --sample {str(sample_size)} --pop_limit 10000 -k 0.1 -R 2 --mu 0 --simulations 1 -T 50 -i {init_file}
for i in range(len(times)):
    time     = times[i]
    out_path = os.path.join(out_dir_times, f'T{time}')
    %run {sim_inf} --n_runs 1000 -o {out_path} -out1 {temp1} -out2 {temp2} --sample 25 --pop_limit 10000 -k 0.1 -R 2 --mu 0 --simulations 1 -T {str(time)} -i {init_file}
for i in range(len(simulations)):
    num_sims = simulations[i]
    out_path = os.path.join(out_dir_sims, f'sim{num_sims}')
    %run {sim_inf} --n_runs 1000 -o {out_path} -out1 {temp1} -out2 {temp2} --sample 50 --pop_limit 10000 -k 0.1 -R 2 --mu 0 --simulations {str(num_sims)} -T 10 -i {init_file}
for i in range(len(popsize)):
    pop      = popsize[i]
    out_path = os.path.join(out_dir_pop, f'N{pop}')
    %run {sim_inf} --n_runs 1000 -o {out_path} -out1 {temp1} -out2 {temp2} --sample 50 --pop_limit {str(pop)} -k 0.1 -R 2 --mu 0 --simulations 10 -T 25 -i {init_file}