In [2]:
import tarfile
from pathlib import Path
from io import BytesIO
import re

import numpy as np
import pandas as pd
import tqdm.notebook

### Empirical signatures

In [3]:
# Load every empirical sweep signature from the archive into `sweeps`,
# keyed by the stem of the member's file name.
sweeps = dict()
with tarfile.open(snakemake.input["empirical"]) as tar:
    members = tar.getmembers()
    for member in members:
        # Skip directories and any entry whose path does not mention 'sweep'.
        if member.isdir() or 'sweep' not in member.name:
            continue
        extracted = tar.extractfile(member)
        if extracted is None:
            # extractfile() returns None for members that are neither regular
            # files nor links; such entries hold no array data to load.
            continue
        # Buffer the member's bytes in memory and let NumPy parse them.
        sweeps[Path(member.name).stem] = np.load(BytesIO(extracted.read()))

### Simulated sweep averages

In [51]:
# Build the per-simulation lookup table (indexed by uuid) in one chain:
#   1. read the parameter table and keep only the "sgv (true)" sweeps,
#   2. bin the log selection coefficient with pd.cut,
#   3. convert each bin edge back from log10 and round it to two decimals,
#   4. derive a readable bin label from the rounded edges.
# NOTE(review): edges are rounded to two decimals, so distinct bins with very
# small coefficients could end up with the same label -- confirm for the
# configured number of bins.
params = (
    pd.read_table(snakemake.input["parameters"])
    .loc[
        lambda table: table.sweep_mode == "sgv (true)",
        ['uuid', 'log_selection_coefficient'],
    ]
    .assign(
        selbin=lambda table: pd.cut(
            table.log_selection_coefficient,
            bins=snakemake.params["num_sel_bins"],
        )
    )
    .assign(
        lower=lambda table: [round(10**interval.left, 2) for interval in table.selbin],
        upper=lambda table: [round(10**interval.right, 2) for interval in table.selbin],
    )
    .assign(
        selbin_str=lambda table: [
            f"{i} < s < {j}" for i, j in zip(table.lower, table.upper)
        ]
    )
    .set_index('uuid')
)

In [52]:
# Display the filtered, binned parameter table for visual inspection.
params

In [56]:
# One independent, initially empty list per selection-bin label; the lists are
# filled with simulated arrays when the simulation archive is read.
simulated = {bin_label: [] for bin_label in params.selbin_str.unique()}

In [57]:
# Read the simulated arrays and group them by the selection bin of the
# simulation they belong to.
with tarfile.open(snakemake.input["simulated"]) as tar:
    members = tar.getmembers()
    for member in members:
        # The text before the first '.' of the member name is looked up as a
        # uuid in `params`; entries without a matching uuid are ignored.
        name = member.name.split('.')[0]
        if member.isdir() or name not in params.index:
            continue
        extracted = tar.extractfile(member)
        if extracted is None:
            # extractfile() returns None for members that are neither regular
            # files nor links; such entries hold no array data to load.
            continue
        # Buffer the member's bytes in memory and let NumPy parse them.
        x = np.load(BytesIO(extracted.read()))
        selbin = params.selbin_str.loc[name]
        simulated[selbin].append(x)

In [58]:
# Average the simulated arrays of every selection bin element-wise and store
# the mean signal in `sweeps` under the bin's label.
for key, arrays in simulated.items():
    if not arrays:
        # np.stack would fail with an unspecific message; name the empty bin
        # so the missing simulations can be tracked down.
        raise ValueError(
            f"No simulated arrays were loaded for selection bin {key!r}; "
            "cannot compute its mean signal."
        )
    mean = np.stack(arrays).mean(axis=0)
    sweeps[key] = mean

### Turn signals into a tidy table

In [61]:
from utils.project_parameters import summary_statistic_order, smallest_window, locus_size, data_dimension
from utils.prepare_data import save_data
from utils.feature_calculation import get_windows

# Ask the project helper for the window layout of the locus.
window_sizes, center_pos_dict = get_windows(
    locus_size, data_dimension, start_pos=1, smallest_window=smallest_window
)
# Store the sizes in reverse order: per the original author this matches the
# way NumPy assigns dimension labels, so that window_sizes[i] labels index i
# of the arrays' first axis.
window_sizes = [*reversed(window_sizes)]

In [66]:
def tidify(signal, name, s=None):
    """Convert a 3-D signal array into a long-format ("tidy") DataFrame.

    Parameters
    ----------
    signal : array-like
        Three-dimensional array whose axes are labelled, in order, as
        window size, position and feature.
    name : str
        Label written to the ``dataset`` column of every row.
    s : optional
        Unused; kept so that existing calls remain valid.

    Returns
    -------
    pandas.DataFrame
        One row per array element, in C (row-major) order, with the columns
        ``window_size``, ``position``, ``feature``, ``value`` and ``dataset``.
        First-axis indices are translated through the module-level
        ``window_sizes`` and third-axis indices through
        ``summary_statistic_order``; ``position`` keeps the raw index.

    Raises
    ------
    ValueError
        If ``signal`` is not three-dimensional.
    """
    signal = np.asarray(signal)
    if signal.ndim != 3:
        raise ValueError(
            f"tidify expects a 3-dimensional signal, got {signal.ndim} dimension(s)"
        )
    # Index grids flattened in C order, i.e. the same order in which
    # np.ndenumerate would visit the elements, without a Python-level loop
    # over every element (and without failing on an empty array).
    window_idx, position_idx, feature_idx = (
        np.indices(signal.shape, dtype=np.int64).reshape(3, -1)
    )
    return pd.DataFrame({
        'window_size': [window_sizes[i] for i in window_idx.tolist()],
        'position': position_idx,
        'feature': [summary_statistic_order[i] for i in feature_idx.tolist()],
        # Plain Python scalars let pandas infer the column dtype from values.
        'value': signal.ravel().tolist(),
        'dataset': name,
    })

In [67]:
# Build one tidy frame per signal (empirical signatures and simulated bin
# means alike) and stack them row-wise into a single table.
final_dataframes = [tidify(signal, label) for label, signal in sweeps.items()]
result = pd.concat(final_dataframes)

In [68]:
# Hand the combined tidy table to the project's save_data helper together with
# the first output path declared for this Snakemake rule.
save_data(result, snakemake.output[0])