In [2]:
from pathlib import Path
from random import sample
import tarfile
from io import TextIOWrapper
from collections import Counter

import tqdm.notebook
import pandas as pd

In [3]:
from utils.conversion import ms_to_numpy
from utils.prepare_data import save_data

In [4]:
# Locate the neutral-simulation folder under the configured raw simulation
# directory and draw a random subset of its genotype archives.
raw_sim_folder = Path(snakemake.config["raw_sim_folder"])
neutral_sim_folder = next(raw_sim_folder.glob('*neutral*'), None)
if neutral_sim_folder is None:
    # A bare next() would surface as an uninformative StopIteration.
    raise FileNotFoundError(
        f"No folder matching '*neutral*' found under {raw_sim_folder}"
    )

# glob() yields entries in filesystem-dependent order; sorting gives the
# sampler a stable population so a seeded run picks the same archives.
candidate_ms_files = sorted(neutral_sim_folder.glob('genotypes/*.tar.gz'))
num_requested = snakemake.params["num_neutral_simulations_to_use"]
if num_requested > len(candidate_ms_files):
    raise ValueError(
        f"Requested {num_requested} neutral simulations but only "
        f"{len(candidate_ms_files)} archives exist in {neutral_sim_folder / 'genotypes'}"
    )

# NOTE(review): no random seed is set in the visible cells, so the chosen
# subset differs between runs — confirm whether that is intended.
ms_files = sample(candidate_ms_files, k=num_requested)

In [5]:
# Accumulate a histogram over all sampled archives: for every file inside
# every tarball, parse it with ms_to_numpy and count how often each row sum
# of the genotype matrix occurs.
# NOTE(review): presumably each row is a site and the row sum is its number of
# alternate alleles (matches the column names used downstream) — confirm
# against ms_to_numpy.
sfs = Counter()
for ms_file in tqdm.notebook.tqdm(ms_files):
    with tarfile.open(ms_file) as archive:
        for member in archive.getmembers():
            if member.isdir():
                continue
            raw_stream = archive.extractfile(member)
            if raw_stream is None:
                # extractfile() returns None for members that are neither
                # regular files nor links (e.g. devices, FIFOs); skip them
                # instead of crashing inside TextIOWrapper.
                continue
            # Closing the text wrapper also closes the underlying extracted
            # stream, so no handle is left open per archive member.
            with TextIOWrapper(raw_stream) as fileobj:
                _positions, genotypes = ms_to_numpy(fileobj)
                sfs.update(genotypes.sum(axis=1))

In [6]:
# Turn the counter into a two-column table (row-sum value -> number of rows
# with that value), ordered by the row-sum value.
if not sfs:
    # zip(*sfs.items()) on an empty counter fails with an opaque
    # "not enough values to unpack" ValueError; fail with a clear message instead.
    raise ValueError(
        "Site frequency spectrum is empty: no genotype rows were counted "
        "from the sampled simulations."
    )
keys, values = zip(*sfs.items())
sfs_df = pd.DataFrame(
    {'num_alternate': keys, 'num_sites': values}
).sort_values(by='num_alternate')

In [7]:
# Persist the spectrum table to the first output path declared by the
# Snakemake rule that runs this notebook.
sfs_output_path = snakemake.output[0]
save_data(sfs_df, sfs_output_path)