Run Rob's code first to define functions.

In [42]:
from ness_vcf import SFS
from pysam import TabixFile
from annotation import annotation_table

def SFSs_from_annotation(annotation_tabix, coordinates, min_alleles=None, neutral_only=False):
    """
    Build a dictionary of SFS objects from regions of the annotation table.

    The dictionary holds one SFS for each number of called alleles observed,
    keyed by that number (key n -> SFS built from a zero vector of length n+1).
    It is possible to combine these SFSs afterwards by rounding
    MAF * (number of individuals sequenced) and keeping only one SFS.

    Arguments:
     - annotation_tabix: a TabixFile of the annotation table
     - coordinates: iterable of (chromosome, start, end), 1-based inclusive
     - min_alleles: optional minimum number of called alleles; sites with fewer
       called alleles are skipped (MAF_from_allele_count is given the same
       threshold and applies its own filter as well)
     - neutral_only: skip sites that aren't intergenic, intronic or 4-fold degenerate

    Returns:
     - dict mapping number of called alleles -> SFS object
       (empty if no site passes the filters)
    """
    SFSs = {}

    # Loop through the input (chromosome, start, end) regions, e.g. from a gff3
    for chromosome, start, end in coordinates:
        # start-1 converts the 1-based inclusive start to the 0-based,
        # half-open interval that tabix fetch() expects
        for line in annotation_tabix.fetch(chromosome, start - 1, end):
            # `annotation_line` is a class that has all the annotation table
            # columns as attributes
            a = annotation_table.annotation_line(line)
            allele_counts = a.quebec_alleles

            # intergenic, intronic and 4-fold degenerate are the neutral/silent classes
            site_flags = (a.intergenic, a.intronic, a.fold4)
            if neutral_only and sum(int(flag) for flag in site_flags) == 0:
                continue

            # The helper returns None for sites it rejects. Test for that
            # explicitly rather than catching TypeError from the tuple
            # unpacking, which would also hide genuine TypeErrors.
            result = MAF_from_allele_count(allele_counts, min_alleles=min_alleles)
            if result is None:
                continue
            MAF, total_alleles_called = result

            # filter sites with too few alleles called
            if min_alleles is not None and total_alleles_called < min_alleles:
                continue

            if total_alleles_called not in SFSs:
                # key = n for an SFS vector of length n+1; sites with different
                # numbers of called alleles go into separate SFSs
                SFSs[total_alleles_called] = SFS([0] * (total_alleles_called + 1))
            SFSs[total_alleles_called].add(MAF, total_alleles_called)

    return SFSs


def MAF_from_allele_count(allele_counts, min_alleles=None): # num of rare alleles/num of total alleles
    """
    Return the minor allele frequency and the number of called alleles.

    Arguments:
     - allele_counts: a single allele-count field from the annotation table,
       colon-separated integers, ie A:C:G:T
     - min_alleles: optional minimum number of called alleles; a site with
       fewer called alleles than this is rejected (a site with exactly
       min_alleles called alleles is kept)

    Returns:
     - (MAF, total_alleles_called), where MAF is the second-largest count
       divided by the total number of called alleles
     - None if the site is rejected by min_alleles or no alleles were called

    Raises:
     - ValueError if a field is not an integer
     - IndexError if allele_counts holds fewer than two fields
    """
    # Parse once and reuse for both the minor count and the total
    counts = [int(count) for count in allele_counts.split(":")]
    # The second-largest count is taken as the minor allele count
    minor_allele_count = sorted(counts)[-2]
    total_alleles_called = sum(counts)

    # Strictly-less-than: min_alleles is the smallest acceptable number of called alleles
    if min_alleles is not None and total_alleles_called < min_alleles:
        return None
    # No alleles called: the frequency is undefined
    if total_alleles_called == 0:
        return None

    MAF = minor_allele_count / float(total_alleles_called)
    return (MAF, total_alleles_called)
    
# Open the annotation table as a pysam TabixFile so regions can be pulled with
# .fetch(chromosome, start, end), as SFSs_from_annotation does.
annotation_tabix = TabixFile(filename="/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/concatenated_GFF/annotation_table.txt.gz")
## NOTE: this annotation_table has clones filtered out

In [43]:
import pickle

### generate dfe-alpha input file
`m` = number of SFSs with different numbers of alleles sampled (m)

`allele` = number of alleles sampled in SFS i (x_i) [= number of elements in the unfolded SFS vector]

In [45]:
# m: number of SFSs with different numbers of alleles sampled (a single SFS here)
m = 1
# allele: number of alleles sampled in the SFS; also used below as the key
# that selects one SFS from the dictionary returned by SFSs_from_annotation
allele = 13

In [46]:
# Load the previously pickled gene coordinates. They are passed as `coordinates`
# below, so presumably an iterable of (chromosome, start, end) — verify against
# the code that wrote the pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../../data/intermediate_data_from_gff/gene_set.pickle', "rb") as f:
    gene_set = pickle.load(f)
    
# Neutral SFS: only sites flagged intergenic, intronic or 4-fold degenerate.
# SFSs[allele] raises KeyError if no site had exactly `allele` called alleles.
SFSs= SFSs_from_annotation(annotation_tabix, gene_set, min_alleles=12, neutral_only=True)
neutral = SFSs[allele].sfs

# NOTE(review): neutral_only=False applies no site-class filter, so `selected`
# is built from ALL sites in gene_set (neutral ones included) — confirm intended.
SFSs= SFSs_from_annotation(annotation_tabix, gene_set, min_alleles=12, neutral_only=False)
selected = SFSs[allele].sfs

# Order matters: the writer cell emits these as m, allele, selected SFS, neutral SFS.
ls = [m, allele, selected, neutral]

In [47]:
# Write the dfe-alpha SFS input file from `ls` = [m, allele, selected, neutral]:
#   line 1: m
#   line 2: number of alleles sampled
#   line 3: selected SFS, space-separated
#   line 4: neutral SFS, space-separated
with open('../../data/dfe_alpha/input/sfs_input_file.txt', 'w') as f:
    # The two scalar header values, one per line
    for scalar in ls[:2]:
        f.write(str(scalar) + "\n")
    # One SFS per line. Joining the values writes each element exactly once;
    # the previous version re-wrote the last element before the newline, so
    # every SFS line carried a duplicated final value.
    for sfs_counts in ls[2:4]:
        f.write(" ".join(str(count) for count in sfs_counts) + "\n")
# No explicit f.close(): the `with` block has already closed the file.

## try to run est_dfe

In [50]:
%%bash
# Run est_dfe on the test input directory.
# Append the dfe-alpha 2.16 release directory to PATH (presumably where the
# est_dfe binary lives — confirm).
export PATH=$PATH:/scratch/research/tmp_apps/dfe-alpha-release-2.16/
# NOTE(review): ~/.bashrc is sourced after PATH is extended; if it resets PATH
# the line above is undone — check the echoed value below.
source ~/.bashrc
echo "$PATH"
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/
echo "$LD_LIBRARY_PATH"
# Stop if the directory is missing, so est_dfe never runs from the wrong place.
cd ../../data/dfe_alpha/test/input/ || exit 1
ls -a
est_dfe -c est_dfe_alpha_config_file.txt

/usr/local/bin:/usr/local/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/opt/amos/bin:/opt/bbmap:/opt/bcl2fastq2/bin:/opt/bcftools/1.10.2:/opt/beast/bin:/opt/bedtools2/bin:/opt/blat:/opt/bowtie2:/opt/bwa/0.7.17:/opt/canu/Linux-amd64/bin:/opt/diamond:/opt/fastpmaster/0.20.1:/opt/fastqc/0.11.9:/opt/freebayes/1.3.1:/opt/idba/bin:/opt/megan:/opt/miniasm:/opt/minimap2:/opt/mummer/bin:/opt/ORG.asm-1.0.00-alpha10/bin:/opt/pandoc/2.9.2.1:/opt/pilon:/opt/qualimap/2.2.1:/opt/quast:/opt/samblaster:/opt/samtools/1.10:/opt/sratoolkit/bin:/opt/star:/opt/vsearch/bin:/opt/yaha/bin:HOME/bin:/opt/amos/bin:/opt/bbmap:/opt/bcl2fastq2/bin:/opt/bcftools/1.10.2:/opt/beast/bin:/opt/bedtools2/bin:/opt/blat:/opt/bowtie2:/opt/bwa/0.7.17:/opt/canu/Linux-amd64/bin:/opt/diamond:/opt/fastpmaster/0.20.1:/opt/fastqc/0.11.9:/opt/freebayes/1.3.1:/opt/idba/bin:/opt/megan:/opt/miniasm:/opt/minimap2:/opt/mummer/bin:/opt/ORG.asm-1.0.00-alpha10/bin:/opt/pandoc/2.9.2.1:/

**directory_config.dat** `<str>`: Name of the directory containing the data files for the one- and two-epoch models. If absent, the default directory specified in directory_config.dat is used.



In [None]:
# NOTE(review): this is a shell command; in a notebook it needs a `!` prefix or
# a %%bash cell to run. It names est_dfe_config_file.txt, whereas the %%bash
# cell above uses est_dfe_alpha_config_file.txt — confirm which is intended.
est_dfe -c est_dfe_config_file.txt
