In [12]:
import numpy as np
import pandas as pd

import allel

from utils.prepare_data import read_data, save_data

In [3]:
# Load the VCF named by the Snakemake rule's "vcf" input.
data = allel.read_vcf(snakemake.input["vcf"])

# allel.read_vcf returns None (not an empty dict) when the file holds no
# variant records; fail with a clear message instead of an opaque
# "'NoneType' object is not subscriptable" on the next line.
if data is None:
    raise ValueError(
        "No variant records could be read from {!r}".format(snakemake.input["vcf"])
    )

# Number of sampled chromosomes: two per sample, i.e. this assumes every
# individual in the VCF is diploid.
sample_size = len(data["samples"])*2

In [4]:
# Display the chromosome count (2 x number of VCF samples) as a sanity check.
sample_size

In [5]:
# Build one record per variant site:
#   position - value from 'variants/POS'
#   x        - number of non-reference allele calls at the site
#   n        - number of allele calls that are not missing
#   folded   - constant 1 (presumably tells the consumer the count is
#              unpolarised; confirm against whatever reads this file)
records = list()

for position, haps in zip(data['variants/POS'], data['calldata/GT']):
    genotypes = haps.ravel()
    # scikit-allel stores allele indices: 0 = reference, 1+ = alternate,
    # -1 = missing call. Summing the raw indices would subtract one for every
    # missing call and count an allele with index 2 twice, so count calls
    # explicitly instead. For complete, biallelic data this is identical to
    # genotypes.sum() / len(genotypes).
    num_alternate = int((genotypes > 0).sum())
    num_called = int((genotypes >= 0).sum())
    records.append((position, num_alternate, num_called, 1))

sf_data = pd.DataFrame.from_records(records, columns=['position', 'x', 'n', 'folded'])
save_data(sf_data, snakemake.output["data"])

In [15]:
# Read the unfolded SFS and keep only the segregating classes that fit the
# current sample: 0 < num_alternate < sample_size.
# Per the original author's note, this is a crude stand-in for projecting an
# SFS of sample size 220 down to 216; dadi provides a proper down-projection.
sfs = read_data(snakemake.input["sfs"]).loc[
    lambda table: (table.num_alternate > 0) & (table.num_alternate < sample_size)
]

# Fold: each class is relabelled by the smaller of the alternate count and its
# complement, then site counts are summed within every folded class.
folded = (
    sfs.assign(num_derived=np.minimum(sfs.num_alternate, sample_size - sfs.num_alternate))
    .groupby('num_derived')
    .num_sites.sum()
    .reset_index()
    .rename(columns={'num_derived': 'num_minor'})
    .assign(prop_num_sites=lambda table: table.num_sites / table.num_sites.sum())
)

# Placeholder row for the monomorphic class (0 minor alleles), fixed at zero
# sites, placed ahead of the folded classes.
monomorphic = pd.DataFrame([{'num_minor': 0, 'prop_num_sites': 0, 'num_sites': 0}], index=[-1])
sfs_result = pd.concat([monomorphic, folded], ignore_index=True)

In [20]:
# Write the folded SFS as a two-column, tab-separated file (minor-allele count,
# proportion of sites) with no header row and no index column.
sfs_result.to_csv(
    snakemake.output["sfs"],
    columns=['num_minor', 'prop_num_sites'],
    sep='\t',
    header=False,
    index=False,
)