# Compilation of test data for performance tests

This notebook records the parameters for Wright-Fisher simulations used to generate our test data sets, as well as commands for running covariance estimation algorithms on the test data and compiling the results. 

## Import required libraries and define global variables

In [3]:
# Full library list and version numbers

print('This notebook was prepared using:')

import sys
print('python version %s' % sys.version)

import numpy as np
print('numpy version %s' % np.__version__)

import seaborn as sns
print('seaborn version %s' % sns.__version__)

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
print('matplotlib version %s' % mpl.__version__)

import scipy
from scipy import stats
print('scipy version %s' % scipy.__version__)

import importlib

sys.path.append('./src')
import simulation_setup as SS
import subsample as subsample
import data_parser as DP

# GLOBAL VARIABLES

N = 1000        # number of sequences in the population
L = 50          # sequence length
T = 700         # number of generations
MU = 1e-3       # mutation rate
NUM_SELECTIONS = 10  # number of sets of selection coefficients that we used
NUM_TRIALS = 20  # number of simulations run for each set of selection coefficients
SAMPLE = [1000, 500, 100, 50, 10]  # sampling depth options when subsampling
RECORD = [1, 3, 5, 10]  # sampling time interval options when subsampling
# linear shrinkage strengths divided by the sampling time interval,
# e.g., interval=5, linear=5 ==> strength=25
LINEAR = [1, 5, 10, 20, 50, 100, 300]
GAMMA = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]  # non-linear shrinkage parameter choices that we tested
TRUNCATE = [200, 300, 400, 500, 600, 700]  # truncation options, to test performance using different lengths of data
WINDOW = [0, 1, 2, 3, 4, 5, 10, 20, 40, 80, 160]  # window choices that we tested

# Loss functions for non-linear shrinkage that we tested, written as LaTeX labels
# ('Fro' / 'Nuc' prefix followed by the matrix expression).
# Raw strings keep the LaTeX backslashes (e.g. \hat) literal; in a normal string
# literal '\h' is an invalid escape sequence, which Python only tolerates with a
# DeprecationWarning (a SyntaxWarning in newer versions).
LOSS = [r'Fro | $\hat{C}-C$', r'Fro | $\hat{C}^{-1}-C^{-1}$', r'Fro | $C^{-1}\hat{C}-I$',
        r'Fro | $\hat{C}^{-1}C-I$', r'Fro | $\hat{C}^{-1/2}C\hat{C}^{-1/2}-I$',
        r'Nuc | $\hat{C}-C$', r'Nuc | $\hat{C}^{-1}-C^{-1}$', r'Nuc | $C^{-1}\hat{C}-I$',
        r'Nuc | $\hat{C}^{-1}C-I$', r'Nuc | $\hat{C}^{-1/2}C\hat{C}^{-1/2}-I$']

# GitHub directories (paths are relative to the notebook's working directory)
DATA_DIR = './data'
SELECTION_DIR = './src/selection'
INITIAL_DIR = './src/initial'
SIMULATION_OUTPUT_DIR = './data/simulation_output'
SUBSAMPLE_OUTPUT_DIR = './data/subsample_output'
ESTIMATION_OUTPUT_DIR = './data/estimation_output'
JOB_DIR = './src/jobs'

# relative directories looking from shell scripts in JOB_DIR
# (JOB_DIR is './src/jobs', so '..' is './src' and '../..' is the directory
# containing './src' and './data'; each *_REL path names the same location as
# its counterpart above)
SRC_DIR_REL = '..'
DATA_DIR_REL = '../../data'
SELECTION_DIR_REL = '../../src/selection'
INITIAL_DIR_REL = '../../src/initial'
SIMULATION_OUTPUT_DIR_REL = '../../data/simulation_output'
SUBSAMPLE_OUTPUT_DIR_REL = '../../data/subsample_output'
ESTIMATION_OUTPUT_DIR_REL = '../../data/estimation_output'

This notebook was prepared using:
python version 3.6.10 |Anaconda, Inc.| (default, Mar 25 2020, 18:53:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
numpy version 1.19.1
seaborn version 0.11.0
matplotlib version 3.3.2
scipy version 1.5.2


In [4]:
def reload():
    """Reload the local helper modules (SS, subsample, DP), in that order.

    Call this after editing their source files so that the notebook picks up
    the changes without restarting the kernel.
    """
    for module in (SS, subsample, DP):
        importlib.reload(module)

## Section 1. Generation of test data through Wright-Fisher simulations

Wright-Fisher simulations are performed using src/Wright-Fisher.py. The output of these simulations is saved for processing. The code below creates multiple job files for running many simulations in parallel on a computer cluster.

### Generate selection coefficients

In [5]:
# Generate the 10 sets of selection coefficients used for simulations
# (overwrite=False presumably leaves already-generated files untouched — TODO confirm),
# then load one set per index 0 .. NUM_SELECTIONS - 1.
SS.generate_selection_coefficients(overwrite=False)
selections = list(map(SS.load_selection, range(NUM_SELECTIONS)))

### Generate initial distributions

In [4]:
# The initial distribution with four genotypes
SS.generate_default_initial_distribution(overwrite=False)

In [5]:
initial = SS.load_default_initial()
SS.print_initial_distribution(initial)

Genotype 1 01111010010011111100101001101111001110101010111110 count=260
Genotype 2 11111010000010000100110010011100011110111010001001 count=210
Genotype 3 01100111010010100001101101000011111011011010101111 count=280
Genotype 4 10011100010100100000011011011011111111111000011000 count=250



In [8]:
# 20 initial distributions of genotypes, with 4 founder genotypes
SS.generate_initial_distributions_four_founder_genotypes(overwrite=False)

In [10]:
# Check initial distributions, print out an example
i = 15
initial = SS.load_initial_four_founder_genotypes(i)
SS.print_initial_distribution(initial, max_line=30)

Genotype 1 10101011010000010010010111011011000011100111110010 count=210
Genotype 2 00110001001100001011100010010011100010100010101101 count=300
Genotype 3 11001011001001000010101000001101000011010000011010 count=250
Genotype 4 10011111011111100010111110100000000001111110011110 count=240



In [6]:
# 20 initial distributions of genotypes, with 5 to 24 founder genotypes
SS.generate_initial_distributions(overwrite=False)

In [7]:
# Check initial distributions, print out an example
i = 15
initial = SS.load_initial(i)
SS.print_initial_distribution(initial, max_line=30)

Genotype 1 01111101101011111101110011000000100111110101101011 count=40
Genotype 2 01000101110111110010111111101011101111100010100101 count=30
Genotype 3 10100011111011111110110101110000001001000001001010 count=70
Genotype 4 11101000000000110101100100100000110001110011000001 count=50
Genotype 5 01010111001110001101000110010010001111001101001100 count=50
Genotype 6 01101100111011111110010111011011101011101000001111 count=80
Genotype 7 10000111100100100010101101100101100001111000010110 count=20
Genotype 8 00111110000010111010010011101011001001011010111100 count=100
Genotype 9 10011111111010000011001100011000001100111001011101 count=20
Genotype 10 11010111100000111100011110001100010110001001010101 count=20
Genotype 11 10001001101100111100111110110010111101100110101110 count=40
Genotype 12 10001010010101111011000100001001000011010010110010 count=60
Genotype 13 00000001100100010010111110111000101010101101000100 count=30
Genotype 14 10100111010100001001001011011100010000000100110000 count=30


### Generate simulations

In [8]:
# In total, there are num_selections x num_trials = 200 simulations.
# Create jobs to run on cluster
SS.generate_simulation_scripts()

In [13]:
# Simulations that all start from four founder genotypes, but with different
# founder genotypes across initial distributions (presumably the ones produced by
# generate_initial_distributions_four_founder_genotypes above — TODO confirm).
# reload() first re-imports the local modules so recent source edits are used.
reload()
SS.generate_simulation_scripts(varying_founder_genotype=True)

In [9]:
# Simulations with varying initial distributions (5 to 24 founder genotypes)
SS.generate_simulation_scripts(varying_initial=True)

In [10]:
# Simulations with recombination
SS.generate_simulation_scripts(recombination=True, r=1e-5)

In [11]:
# Can run locally as well. Will take several hours.
# SS.generate_simulation_scripts(local=True)
# SS.generate_simulation_scripts(local=True, varying_initial=True)
# SS.generate_simulation_scripts(local=True, recombination=True, r=1e-5)

### Test if simulations are complete

In [12]:
SS.test_load_simulation()

In [16]:
reload()
SS.test_load_simulation(varying_founder_genotype=True)

In [13]:
SS.test_load_simulation(varying_initial=True)

In [14]:
SS.test_load_simulation(recombination=True, r=1e-5)

## Section 2. Subsample the simulated results for later inference under various limited sampling scenarios
We subsample the simulated results, varying the sampling depth and the sampling time interval. The output is saved for investigating the performance of inference under different limited-sampling conditions. The code below creates multiple files for running many subsampling jobs in parallel on a computer cluster.

In [14]:
# In total, there are num_selections x num_trials x len(SAMPLE) x len(RECORD)
# = 10 x 20 x 5 x 4 = 4000 subsampling runs, aggregated as 200 jobs
# (presumably per set of simulations below — TODO confirm).
# Default simulations
SS.generate_subsample_scripts()
# Simulations with varying sets of four founder genotypes
SS.generate_subsample_scripts(varying_founder_genotype=True)
# Simulations with varying initial distributions
SS.generate_subsample_scripts(varying_initial=True)
# Simulations with recombination
SS.generate_subsample_scripts(recombination=True, r=1e-5)

### Store time-series covariance for a case for a supplementary figure

In [16]:
SS.generate_subsample_script_with_time_series_covariance()

### Test if subsamples are complete

In [17]:
SS.test_load_subsample()

In [17]:
reload()
SS.test_load_subsample(varying_founder_genotype=True)

In [18]:
SS.test_load_subsample(varying_initial=True)

In [19]:
SS.test_load_subsample(recombination=True, r=1e-5)

## Section 3. Running the covariance estimation algorithms and compiling output
Finally, we estimate covariance and infer selection coefficients for all subsampled cases generated above. The output is saved for plotting. The code below creates multiple files to run in parallel on a computer cluster.

### Estimation runs with minimal output
The jobs defined below run the estimation algorithm and store minimal output, which includes the covariance matrix only for some cases.

In [20]:
reload()

In [21]:
# In total, there are len(TRUNCATE) x len(WINDOW) = 66 jobs
# Each job (for a particular (tr, win) pair) outputs ~8MB, totaling ~500MB
SS.generate_estimate_scripts()

In [18]:
SS.generate_estimate_scripts(varying_founder_genotype=True)

In [22]:
SS.generate_estimate_scripts(varying_initial=True)

In [23]:
SS.generate_estimate_scripts(recombination=True, r=1e-5)

In [19]:
SS.test_load_estimation(varying_founder_genotype=True)

In [24]:
SS.test_load_estimation()
SS.test_load_estimation(varying_initial=True)
SS.test_load_estimation(recombination=True, r=1e-5)

### Estimation runs with complete output
The jobs defined below run the estimation algorithm and store complete output.

In [25]:
# Keyword arguments shared by the complete-output estimation runs below:
# a single truncate value (700) and a single window value (20),
# with complete output switched on.
PARAMS_COMPLETE_OUTPUT = dict(
    truncate_list=[700],
    window_list=[20],
    complete_output=True,
)

In [26]:
# Each job (for a particular (tr, win) pair) outputs ~500MB
# If running for all (tr, win) pairs, the total output will be ~30GB
SS.generate_estimate_scripts(**PARAMS_COMPLETE_OUTPUT)

In [27]:
SS.generate_estimate_scripts(**PARAMS_COMPLETE_OUTPUT, 
                             varying_initial=True)

In [28]:
SS.generate_estimate_scripts(**PARAMS_COMPLETE_OUTPUT, 
    recombination=True, r=1e-5)

## Section 4. Running Evoracle
We also test Evoracle on our simulated data, as a comparison method.

In [29]:
SS.generate_evoracle_scripts()

Then run the Evoracle scripts. Instructions for installing the *Evoracle* package can be found at https://github.com/maxwshen/evoracle

In [30]:
SS.test_load_evoracle()

## Section 5. Running haploSep

In [31]:
# First save simulation trajectories as input for haploSep
DP.save_simulation_traj_for_haplosep()

Then run the R script, *data/haploSep/run_haploSep_for_simulated_data.R*

In [34]:
%%time
# After running haploSep using the R script, 
# parse the results and save the parsed output
_ = DP.parse_simulation_results_for_haplosep(save=True)

CPU times: user 32min 40s, sys: 10.6 s, total: 32min 51s
Wall time: 32min 14s
