Run Rob's code first to define functions.

In [13]:
from ness_vcf import SFS
from pysam import TabixFile
from annotation import annotation_table

def SFSs_from_annotation(annotation_tabix, coordinates, min_alleles=None, neutral_only=False):
    """
    Build one site frequency spectrum (SFS) per number of called alleles.

    Every annotation table line inside the given regions is parsed, its minor
    allele frequency (MAF) is computed from the `quebec_alleles` column, and
    the site is added to the SFS keyed by the number of alleles called there.
    The dictionary therefore contains one SFS for each number of alleles that
    can be called, ie min_alleles to total number of individuals sequenced.

    It is possible to combine these SFSs afterwards by rounding
    MAF * (number of individuals sequenced) and keeping only one SFS.

    Arguments:
     - annotation_tabix: a TabixFile of the annotation table
     - coordinates: iterable of (chromosome, start, end), 1-based inclusive
     - min_alleles: optional minimum number of called alleles - sites with
       fewer called alleles are skipped
     - neutral_only: skip sites that aren't intergenic, intronic or
       4-fold degenerate

    Returns:
        dict mapping number of alleles called -> SFS object
    """
    SFSs = {}

    # Loop through input list of (chromosome, start, end) from gff3
    for chromosome, start, end in coordinates:
        # start is shifted by one because the input is 1-based inclusive while
        # the tabix fetch call is given a 0-based, half-open interval
        for line in annotation_tabix.fetch(chromosome, start - 1, end):
            # `annotation_line` is a class that has all the annotation table
            # columns as attributes
            a = annotation_table.annotation_line(line)
            allele_counts = a.quebec_alleles
            neutral_flags = (a.intergenic, a.intronic, a.fold4)
            if neutral_only and sum(int(flag) for flag in neutral_flags) == 0:
                continue
            # The helper returns None for sites it filters out; test for that
            # explicitly instead of catching TypeError from a failed unpacking,
            # which would also hide unrelated TypeErrors.
            result = MAF_from_allele_count(allele_counts, min_alleles=min_alleles)
            if result is None:
                continue
            MAF, total_alleles_called = result
            # filter sites with too few alleles called
            if min_alleles is not None and total_alleles_called < min_alleles:
                continue
            if total_alleles_called not in SFSs:
                # one bin per possible allele count, 0..total_alleles_called
                SFSs[total_alleles_called] = SFS([0] * (total_alleles_called + 1))
            SFSs[total_alleles_called].add(MAF, total_alleles_called)

    return SFSs


def MAF_from_allele_count(allele_counts, min_alleles=None):
    """
    Return the minor allele frequency and the number of called alleles.

    Arguments:
     - allele_counts: a single allele count field from the annotation table,
       colon-separated integers, ie A:C:G:T
     - min_alleles: optional minimum number of called alleles - sites with
       fewer called alleles are filtered out

    Returns:
        (MAF, total_alleles_called), where MAF is the second-largest count
        divided by the total number of called alleles, or None when the site
        is filtered out or has no called alleles at all.
    """
    counts = [int(count) for count in allele_counts.split(":")]
    # the minor allele is taken to be the second most common allele
    minor_allele_count = sorted(counts)[-2]
    total_alleles_called = sum(counts)
    # min_alleles is a minimum: a site with exactly that many called alleles
    # is kept, only sites strictly below it are dropped
    if min_alleles is not None and total_alleles_called < min_alleles:
        return None
    if total_alleles_called == 0:
        # nothing was called at this site, so the frequency is undefined
        return None
    MAF = minor_allele_count / float(total_alleles_called)
    return (MAF, total_alleles_called)
    
# Open the annotation table through TabixFile; this handle is what the cells
# below pass to SFSs_from_annotation and fetch from directly.
annotation_tabix = TabixFile(filename="/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/concatenated_GFF/annotation_table.txt.gz")
##annotation_table has clones filtered out

In [14]:
import pickle

In [None]:
# Exploratory check: for each annotation line fetched from the interval 0-20
# of chromosome_1, list the attributes of the parsed object and print its
# `line` attribute.
for line in annotation_tabix.fetch('chromosome_1',0,20):
    a = annotation_table.annotation_line(line) 
    print(dir(a))
    print(a.line)

In [26]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load gene_set.pickle if it was produced by a trusted step of this
# pipeline.
with open('../../data/intermediate_data_from_gff/gene_set.pickle', "rb") as f:
    gene_set = pickle.load(f)

# Number of called alleles whose SFS is extracted below. It was previously
# only assigned in a later cell, so running the notebook top to bottom raised
# a NameError here.
allele = 13

# SFS restricted to intergenic, intronic and 4-fold degenerate sites
SFSs = SFSs_from_annotation(annotation_tabix, gene_set, min_alleles=12, neutral_only=True)
neutral = SFSs[allele].sfs

# SFS over every site in the same regions (no neutral-site filter)
SFSs = SFSs_from_annotation(annotation_tabix, gene_set, min_alleles=12, neutral_only=False)
selected = SFSs[allele].sfs

### generate dfe-alpha input file
m = number of SFSs with different numbers of alleles sampled

allele = number of alleles sampled in SFS i (x_i) [= number of elements in the unfolded vector]

In [40]:
m = 1  # number of SFSs written to the input file
allele = 13  # key used to pick the SFS out of SFSs, ie the number of alleles called
ls = [m, allele, selected, neutral]  # written to the input file one item per line, in this order

In [59]:
# Write the dfe-alpha input file, one entry of `ls` per line. The `with` block
# closes the file on exit, so no explicit close() is needed afterwards.
# NOTE(review): str() of a Python list is written with brackets and commas -
# confirm that `selected` and `neutral` end up in the format est_dfe expects.
with open('../../data/dfe_alpha/input/sfs_input_file.txt', 'w') as f:
    for i in ls:
        f.write(str(i) + "\n")

## generate dfe-alpha config files

In [67]:
%%bash
# Run est_dfe from the input directory; stop if the directory is missing so est_dfe never runs from the wrong place.
cd ../../data/dfe_alpha/input || exit 1
pwd
est_dfe -c est_dfe_config_file.txt

/home/chenwe72/gitRepo/lipid_selection/data/dfe_alpha/input


bash: line 4: est_dfe: command not found


In [64]:
%%bash
# Print the directory that this notebook's shell cells start in.
pwd

/home/chenwe72/gitRepo/lipid_selection/analysis/03_extract_candidate_gene_info_from_gff


In [None]:
# NOTE(review): this cell has no %%bash magic or leading `!`, so as written it is not valid Python - confirm how it is meant to be run and from which directory.
est_dfe -c est_dfe_config_file.txt
