In [2]:
import numpy as np
import pandas as pd

from utils.prepare_data import read_data

First, fold the input site frequency spectrum (SFS): we do not know which allele is ancestral and which is derived, so each site is counted by its minor allele instead.

In [3]:
# Sample size taken from the Snakemake rule's params; used below both to fold
# the spectrum (N - num_alternate) and in the heterozygosity denominator.
# NOTE(review): presumably the number of sampled chromosomes (allele copies),
# not the number of individuals — confirm against the rule definition.
N = snakemake.params["sample_size"]

In [4]:
# Load the unfolded spectrum and fold it into a minor-allele spectrum.
# The raw frame keeps its own name so re-running this cell starts from the
# same input instead of re-folding an already folded `sfs`.
sfs_unfolded = read_data(snakemake.input[0])
sfs = (
    sfs_unfolded
    # Minor allele count: the smaller of the alternate count and its
    # complement out of N.
    .assign(num_minor=lambda df: np.minimum(df.num_alternate, N - df.num_alternate))
    # Add up the site counts of the alternate-count classes that fold onto the
    # same minor count. The column is selected explicitly: the first
    # positional parameter of GroupBy.sum is `numeric_only`, not a column
    # name, so `.sum("num_sites")` only worked because the string is truthy
    # and summed every numeric column.
    .groupby("num_minor")["num_sites"]
    .sum()
    .reset_index()
    # Discard the class with minor count 0 (alternate count 0 or N).
    .loc[lambda df: df.num_minor != 0]
)

Compute the expected heterozygosity of a site whose minor allele count is $p$ out of $N$, i.e. $\frac{2p(N-p)}{N(N-1)}$:

In [8]:
# Add two columns in a single assign (keyword callables are evaluated in
# order, so the second can read the column created by the first):
#   heteroz       - 2*k*(N-k) / (N*(N-1)) for a class with minor count k
#   total_heteroz - that value multiplied by the number of sites in the class
sfs = sfs.assign(
    heteroz=lambda df: (2 * df.num_minor * (N - df.num_minor)) / (N * (N - 1)),
    total_heteroz=lambda df: df.heteroz * df.num_sites,
)

In [10]:
# Summed heterozygosity over all frequency classes, divided by the genome
# size supplied in the rule params.
genome_size = snakemake.params["total_genome_size"]
pi = sfs["total_heteroz"].sum() / genome_size

In [11]:
# Show the value of pi as this cell's output.
pi

In [13]:
# Solve pi = 4 * Ne * mu for mu, using the reference Ne from the rule params.
# NOTE(review): presumably the diploid neutral expectation (theta = 4*Ne*mu) — confirm.
reference_ne = snakemake.params["reference_ne"]
mu = pi / (4 * reference_ne)

In [15]:
# Recombination rate implied by the configured mu/r ratio: r = mu / (mu/r).
mu_over_r = snakemake.params["mu_over_r"]
rec = mu / mu_over_r

In [18]:
# Assemble the three report lines first, then write them to the rule's first
# output file in one call. The last line intentionally carries no trailing
# newline, as before.
report_lines = (
    f"Estimated pi: {pi}\n",
    f"Estimated mutation rate: mu={mu}\n",
    f"Implied recombination rate, with mu/r={snakemake.params['mu_over_r']}: r={rec}",
)
with open(snakemake.output[0], 'w') as report_file:
    report_file.writelines(report_lines)