# Generate neutral and selected SFS vectors of all genes

In [37]:
import sys
sys.path.append('../../scripts/')
#sys.path
import pandas as pd
import Search_algorithms as sag
import gzip
import math
import re
import time
import numpy as np
import pickle

In [38]:
gff_file_path='/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/Creinhardtii_v5.3_223_gene.gff3.gz'
merged = pd.read_csv("../../data/intermediate_data_from_gff/merged_v5_3_1.csv") 

In [39]:
merged[:10]

Unnamed: 0,transcript_id,num_detected,num_sampled,proportion,source,annotation_version,gene_id,gene_symbol,pathway_id,transcript_id_v5.3.1
0,Cre01.g000017.t1.1,0,2,0.0,"['Bajhaiya_2016', 'Kwak_2017']",v5.5,Cre01.g000017,,,g2.t1
1,Cre01.g000033.t1.1,1,1,0.125,['Bajhaiya_2016'],v5.5,Cre01.g000033,,,g3.t1
2,Cre01.g000050.t1.1,0,1,0.0,['Gargouri_2015'],v5.5,Cre01.g000050,,,Cre01.g000050.t1.3
3,Cre01.g000100.t1.1,0,1,0.0,['Bajhaiya_2016'],v5.5,Cre01.g000100,,,Cre01.g000100.t1.3
4,Cre01.g000150.t1.2,1,1,0.125,['Bajhaiya_2016'],v5.5,Cre01.g000150,ZRT2,,Cre01.g000150.t1.2
5,Cre01.g000200.t1.1,0,1,0.0,['Bajhaiya_2016'],v5.5,Cre01.g000200,,,Cre01.g000200.t1.3
6,Cre01.g000250.t1.2,0,1,0.0,['Bajhaiya_2016'],v5.5,Cre01.g000250,,,Cre01.g000250.t1.2
7,Cre01.g000300.t1.1,1,1,0.125,['Bajhaiya_2016'],v5.5,Cre01.g000300,,"['Creinhardtii PWY-5667', 'Creinhardtii PWY-74...",Cre01.g000300.t1.3
8,Cre01.g000350.t1.1,1,2,0.125,"['Hemme_2014', 'Bajhaiya_2016']",v5.5,Cre01.g000350,,,Cre01.g000350.t1.3
9,Cre01.g000400.t1.2,0,1,0.0,['Bajhaiya_2016'],v5.5,Cre01.g000400,,,Cre01.g000400.t1.2


In [19]:
def tryextract(i, pattern):
    
    '''This function takes in a string and returns the first matching group in the search pattern'''
    
    try: 
        m = re.search(pattern, i).group(1)
        return(m)
    
    except AttributeError:
        return(None)

In [20]:
#Import gff as working dataframe. Extract Name and PAC id from the column 'attribute' as separate columns
with gzip.open(gff_file_path, "rt", encoding="utf-8") as z:
    
    df = pd.read_csv(z,delimiter=r"\s+",skiprows=1,header=None)
    df.columns = ['chromosome', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
    df['Name'] = df.attributes.apply(lambda x: tryextract(x, r"Name=(.+);pacid"))
    df['ID'] = df.attributes.apply(lambda x: tryextract(x, r"ID=(PAC:[0-9]+)"))
    

In [21]:
subset = df[df.feature =="mRNA"].sort_values(by="Name").reset_index()
search_list = list(subset.Name)
pac_id = list(np.unique(df.ID.dropna()))

In [22]:
#Order list of matching PAC id's to 'transcript_id_v5.3.1'
#import re
t0 = time.time()
pacid_list = []

for row in range(0,len(merged)):
    transcript_id = merged.loc[row,'transcript_id_v5.3.1']
    pacid = None
    
    index = sag.BinarySearch(search_list, transcript_id)
    
    if index > -1: 
        pacid = subset.loc[index, "ID"]
                    
    pacid_list.append(pacid)

t1 = time.time()
print("Time taken", t1-t0, "s")

Time taken 10.597784519195557 s


In [23]:
merged['PAC_id'] = pacid_list
#master_gene_sfs = merged

In [24]:
t0 = time.time()
exon_positions = []
features_condition = ['five_prime_UTR', 'CDS', 'three_prime_UTR']
#keep gff rows that contain CDS positions, remove gff rows that are missing PAC_ID's
filtered_df = df[df.feature.isin(features_condition)].dropna(subset = ['ID'])
#pos =0
#loop through sampled transcipts with PAC_ID's; transcript filtered by condition variable
for pac_index in range(0,len(pac_id)): 
    
    temp = filtered_df[filtered_df.ID==pac_id[pac_index]]
    #print(transcript)
    if len(temp)== 0: exon_positions.append(None)
        
    else:
        exon_positions.append([])
        #add CDS coordinates as tuple (chromosome, start, end) to gene_set
        for i in list(temp.index):
            exon_positions[pac_index].append((temp.chromosome[i], temp.start[i], temp.end[i]))
    #pos+=1
    #if count>20: break
        
t1 = time.time()
print(t1-t0)

208.23728322982788


In [40]:
with open('exon_positions.pickle', 'wb') as f:
    pickle.dump(exon_positions, f)

In [41]:
exon_positions = pickle.load(open("exon_positions.pickle", "rb"))

In [6]:
def reduce_SFS(SFS_dict, max_alleles = 18):
    
    '''This function aggregates SFS vectors from SFSs_from_annotation() dictionary output into one SFS vector.
    Default max_alleles = 18'''
    
    ref_SFS = range(0,max_alleles)
    new_sfs = [0]*max_alleles
    for key in SFS_dict.keys():
        
        sfs_length = len(SFS_dict[key].sfs)
        #print(key)
        normalized_index = [round(i/sfs_length*max_alleles)-1 for i in range(1,sfs_length+1)]
        #print(normalized_index)
        for i in range(0,sfs_length): new_sfs[normalized_index[i]]+=SFS_dict[key].sfs[i]
        
    return(new_sfs)

In [None]:
#Set up input for testing collapse_SFS()
#coordinates = exon_positions[4]
coordinates = [('chromosome_8', 2509558, 2509598), 
               ('chromosome_8', 2509599, 2509853), 
               ('chromosome_8', 2510289, 2510399), 
               ('chromosome_8', 2510839, 2510925), 
               ('chromosome_8', 2511587, 2511649), 
               ('chromosome_8', 2511975, 2512075), 
               ('chromosome_8', 2512492, 2512594), 
               ('chromosome_8', 2512961, 2513116), 
               ('chromosome_8', 2513372, 2513452), 
               ('chromosome_8', 2513453, 2514419)]
neutral = SFSs_from_annotation(annotation_tabix, coordinates, min_alleles=12, neutral_only=True)

In [None]:
#Test if collapse_SFS() returns error
reduce_SFS(neutral)

SFS vectors from SFS dictionary output of `SFSs_from_annotation()` contains the number of invariants as the first element.

In [None]:
for key in neutral.keys():
    print ("SFS vector", neutral[key].sfs)
    print("Number of alleles (also length of SFS vector):", neutral[key].alleles)
    print("Number of invariant sites:", neutral[key].invariant())
    print("\n")

In [36]:
t0 = time.time()
selected_SFS = []
neutral_SFS = []
#for i in range(len(exon_positions)):
for i in range(0,100):
    coordinates = exon_positions[i]
    if coordinates != None:
        selected = SFSs_from_annotation(coordinates, min_alleles=12, neutral_only=False)
        selected_SFS.append(reduce_SFS(selected, max_alleles = 18))
        neutral = SFSs_from_annotation(coordinates, min_alleles=12, neutral_only=True)
        neutral_SFS.append(reduce_SFS(neutral, max_alleles = 18))
t1 = time.time()
print("100 genes took:", t1-t0, "seconds.")
print("Estimated calculation time:", len(exon_positions)/100*(t1-t0)/60, "minutes")

100 genes took: 18.34585189819336 seconds.
Estimated calculation time: 59.712690286636345 minutes


In [7]:
#t0 = time.time()
def collect_reduced_SFS(exon_positions):
#for i in range(len(exon_positions)):
    coordinates = exon_positions
    if coordinates != None:
        try:
            selected = SFSs_from_annotation(annotation_tabix, coordinates, min_alleles=12, neutral_only=False)
            selected_SFS = reduce_SFS(selected, max_alleles = 18)
        except ValueError: selected_SFS = None
        try: 
            neutral = SFSs_from_annotation(annotation_tabix, coordinates, min_alleles=12, neutral_only=True)
            neutral_SFS = reduce_SFS(neutral, max_alleles = 18)
        except ValueError: neutral_SFS =  None
        
        #return coordinates, selected_SFS, neutral_SFS
        return selected_SFS, neutral_SFS
#t1 = time.time()
#print("100 genes took:", t1-t0, "seconds.")
#print("Estimated calculation time:", len(exon_positions)/100*(t1-t0)/60, "minutes")

In [224]:
#test collect_reduced_SFS
for i in range(10):
    print(collect_reduced_SFS(exon_positions[i]))


([227, 0, 7, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
([820, 39, 9, 1, 1, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0], [126, 9, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0])
([2115, 62, 40, 28, 13, 19, 13, 14, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0], [273, 12, 11, 3, 5, 6, 6, 3, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0])
([4233, 85, 33, 18, 11, 14, 10, 6, 13, 1, 0, 0, 0, 0, 0, 0, 0, 0], [791, 38, 16, 8, 8, 8, 8, 4, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0])
([1337, 32, 18, 3, 1, 13, 5, 10, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0], [127, 4, 4, 1, 1, 6, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
([2329, 40, 27, 13, 33, 12, 14, 3, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0], [228, 5, 3, 2, 3, 2, 1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0])
([3870, 136, 42, 39, 19, 36, 19, 26, 12, 4, 0, 0, 0, 0, 0, 0, 0, 0], [549, 31, 14, 16, 6, 13, 9, 5, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0])
([3847, 136, 42, 39, 19, 36, 19, 26, 12, 3, 0, 0, 0, 0, 0, 0, 0, 0], [526, 31, 14, 16, 6, 13, 9, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0])
([

In [9]:
#import sys
#sys.path.append('/home/chenwe72/local/lib/python3.5/site-packages/')
import psutil #psutil-5.7.0
from multiprocessing import Pool #multiprocessing-2.6.2.1
import time
# logical=True counts threads, but we are interested in cores

In [None]:
print(psutil.cpu_count(logical=False))
pool = Pool(psutil.cpu_count(logical=False))
print(pool)

In [242]:
variable = 5
def add_var(input):
    output = input+variable
    return output
def test(input):
    output = add_var(input)*2
    return output
%timeit list(map(test, range(24)))
print(list(map(test, range(24))))

7.59 µs ± 18.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
[10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56]


In [None]:
exon_positions[0:5]

In [240]:
print(list(map(collect_reduced_SFS, exon_positions[:5])))

[([227, 0, 7, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ([820, 39, 9, 1, 1, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0], [126, 9, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ([2115, 62, 40, 28, 13, 19, 13, 14, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0], [273, 12, 11, 3, 5, 6, 6, 3, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ([4233, 85, 33, 18, 11, 14, 10, 6, 13, 1, 0, 0, 0, 0, 0, 0, 0, 0], [791, 38, 16, 8, 8, 8, 8, 4, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0]), ([1337, 32, 18, 3, 1, 13, 5, 10, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0], [127, 4, 4, 1, 1, 6, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])]


In [235]:
with Pool(processes=6) as pool: 
    print(list(pool.map(test, range(24))))
    
with Pool(processes=1) as pool: 
    print(list(pool.map(collect_reduced_SFS, exon_positions[1:2])))

[10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56]
[([820, 39, 9, 1, 1, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0], [126, 9, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0])]


In [236]:
with Pool(processes=1) as pool: 
    print(list(pool.map(collect_reduced_SFS, exon_positions[1:10])))

[([820, 39, 9, 1, 1, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0], [126, 9, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ([2115, 62, 40, 28, 13, 19, 13, 14, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0], [273, 12, 11, 3, 5, 6, 6, 3, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ([4233, 85, 33, 18, 11, 14, 10, 6, 13, 1, 0, 0, 0, 0, 0, 0, 0, 0], [791, 38, 16, 8, 8, 8, 8, 4, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0]), ([1337, 32, 18, 3, 1, 13, 5, 10, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0], [127, 4, 4, 1, 1, 6, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ([2329, 40, 27, 13, 33, 12, 14, 3, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0], [228, 5, 3, 2, 3, 2, 1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0]), ([3870, 136, 42, 39, 19, 36, 19, 26, 12, 4, 0, 0, 0, 0, 0, 0, 0, 0], [549, 31, 14, 16, 6, 13, 9, 5, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0]), ([3847, 136, 42, 39, 19, 36, 19, 26, 12, 3, 0, 0, 0, 0, 0, 0, 0, 0], [526, 31, 14, 16, 6, 13, 9, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [237]:
with Pool(processes=2) as pool: 
    print(list(pool.map(collect_reduced_SFS, exon_positions[1:2])))

[([820, 39, 9, 1, 1, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0], [126, 9, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0])]


In [None]:
with Pool(processes=2) as pool: 
    print(list(pool.map(collect_reduced_SFS, exon_positions[1:3])))

In [23]:
from functools import partial
    
def main():
    pool = Pool(processes=1)
    func = partial(SFSs_from_annotation, min_alleles=12, neutral_only=False)
    for i in list(pool.map(func, exon_positions[0:1])):
        for key, value in i.items():
            print(value.sfs)

if __name__ == '__main__':
    main()

[188, 7, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [21]:
def main():
    pool = Pool(processes=1)
    func = partial(SFSs_from_annotation, min_alleles=12, neutral_only=False)
    for i in list(pool.map(func, exon_positions[0:3])):
        for key, value in i.items():
            print(value.sfs)

if __name__ == '__main__':
    main()

[188, 7, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[24, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[17, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[774, 36, 9, 1, 0, 0, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[169, 4, 9, 4, 0, 1, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[387, 16, 6, 4, 5, 5, 1, 3, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1156, 31, 17, 12, 4, 6, 4, 7, 7, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[68, 2, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[140, 4, 3, 0, 5, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0]
[195, 7, 6, 3, 2, 3, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0]


In [22]:
def main():
    pool = Pool(processes=2)
    func = partial(SFSs_from_annotation, min_alleles=12, neutral_only=False)
    for i in list(pool.map(func, exon_positions[0:3])):
        for key, value in i.items():
            print(value.sfs)

if __name__ == '__main__':
    main()

[188, 7, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[24, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[17, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[774, 36, 9, 1, 0, 0, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[169, 4, 9, 4, 0, 1, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[387, 16, 6, 4, 5, 5, 1, 3, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1156, 31, 17, 12, 4, 6, 4, 7, 7, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[68, 2, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[140, 4, 3, 0, 5, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0]
[195, 7, 6, 3, 2, 3, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0]


In [25]:
def main():
    pool = Pool(processes=1)
    func = partial(SFSs_from_annotation, min_alleles=12, neutral_only=False)
    selected_vector = list(pool.map(func, exon_positions[0:1]))
    
    func = partial(reduce_SFS, max_alleles = 18)
    print(list(pool.map(func, sfs_vectors)))

if __name__ == '__main__':
    main()

[[227, 0, 7, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [26]:
def main():
    pool = Pool(processes=1)
    func = partial(SFSs_from_annotation, min_alleles=12, neutral_only=False)
    sfs_vectors = list(pool.map(func, exon_positions[0:3]))
    
    func = partial(reduce_SFS, max_alleles = 18)
    print(list(pool.map(func, sfs_vectors)))

if __name__ == '__main__':
    main()

[[227, 0, 7, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [820, 39, 9, 1, 1, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2115, 62, 40, 28, 13, 19, 13, 14, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [32]:
t0 = time.time()
def main():
    pool = Pool(processes=4)
    
    exon_portion = exon_positions[0:100]
    
    func = partial(SFSs_from_annotation, min_alleles=12, neutral_only=False)
    selected = list(pool.map(func, exon_portion))
    
    func = partial(SFSs_from_annotation, min_alleles=12, neutral_only=True)
    neutral = list(pool.map(func, exon_portion))
    
    func = partial(reduce_SFS, max_alleles = 18)
    reduced_selected = list(pool.map(func, selected))
    
    func = partial(reduce_SFS, max_alleles = 18)
    reduced_neutral = list(pool.map(func, neutral))
    
    return reduced_selected, reduced_neutral

if __name__ == '__main__':
    reduced_selected, reduced_neutral = main()
    
t1 = time.time()

In [33]:
print(t1-t0)
print(reduced_selected)

4.720616340637207
[[227, 0, 7, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [820, 39, 9, 1, 1, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2115, 62, 40, 28, 13, 19, 13, 14, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4233, 85, 33, 18, 11, 14, 10, 6, 13, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1337, 32, 18, 3, 1, 13, 5, 10, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2329, 40, 27, 13, 33, 12, 14, 3, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0], [3870, 136, 42, 39, 19, 36, 19, 26, 12, 4, 0, 0, 0, 0, 0, 0, 0, 0], [3847, 136, 42, 39, 19, 36, 19, 26, 12, 3, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1448, 16, 23, 16, 4, 17, 8, 8, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [2518, 58, 36, 16, 19, 35, 21, 28, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0], [352, 4, 4, 1, 1, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [711, 16, 23, 23, 6, 14, 6, 4, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1635, 38, 12, 3, 27, 19, 14, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2644, 70, 30, 38, 22, 19, 14, 8, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2563, 67, 30, 36, 22, 19, 14,

In [13]:
def SFSs_from_annotation(coordinates, min_alleles=None, neutral_only=False): 
    annotation_tabix = TabixFile(filename="/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/concatenated_GFF/annotation_table.txt.gz")
    """
    This function will return a dictionary of SFS objects
    The dictionary will contain one SFS for each number of alleles that can be called
        ie min_alleles to total number of individuals sequenced
    It is possible to combine these SFSs by:
        rounding MAF * (number of individuals sequenced) and keeping only one SFS
    Arguments:
     - take a TabixFile of the annotation table
     - the chromosome, start and end (1-based inclusive)
     - an optional minimum number of alleles - below this the site is shiite so don't take a bite
     - neutral_only skips sites that aren't intergenic, intronic or 4-fold degenerate
    """
    SFSs = {}
    
    #Loop through input list of (chromosome, start, end) from gff3
    for i in coordinates:
        chromosome, start, end = i
        for line in annotation_tabix.fetch(chromosome, start-1, end):
        # `annotation_line` is a class that has all the annotation table columns as attributes 
            a = annotation_table.annotation_line(line) #a has a lot of attributes
            allele_counts = a.quebec_alleles
            if neutral_only and sum([int(i) for i in [a.intergenic, a.intronic, a.fold4]]) == 0: #these are all neutral/silent sites
                #because we are only looking at CDS, we will never get intergenic and intronic sequences
                #our SFS in neutral  is a fold4 site
                #fold0 sites go in selected SFS vector in est_dfe
                continue
            try:
                MAF, total_alleles_called  = MAF_from_allele_count(allele_counts,min_alleles=min_alleles)
                #if MAF > 0: print(MAF, total_alleles_called)
            except TypeError: 
                continue
            if min_alleles != None and total_alleles_called < min_alleles: #filter sites with too few alleles called
                continue
            if total_alleles_called not in SFSs: 
                #make SFS dictionary where key = n for SFS vector of length n
                #you can't feed program with different alleles, so try to standardize and round them somehow in proportion to max alleles
                SFSs[total_alleles_called] = SFS([0]*(total_alleles_called+1))
            SFSs[total_alleles_called].add(MAF,total_alleles_called)


    return SFSs


def MAF_from_allele_count(allele_counts, min_alleles=None): # num of rare alleles/num of total alleles
    """
    return the minor allele frequency and the number of called alleles
    take a single allele_counts from annotation table ie, A:C:G:T    
    optionally min_alleles will filter sites with too few alleles called
    """
    minor_allele_count = sorted([int(i) for i in allele_counts.split(":")])[-2]
    total_alleles_called = sum([int(i) for i in allele_counts.split(":")])
    if min_alleles != None and total_alleles_called <= min_alleles:
        return None
    try:
        MAF = minor_allele_count/float(total_alleles_called)
        return (MAF,total_alleles_called)
    except ZeroDivisionError:
        return None
    

##annotation_table has clones filtered out

In [45]:
%%bash

UsageError: %%bash is a cell magic, but the cell body is empty.


In [48]:
%%bash
tabix --H /scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/concatenated_GFF/annotation_table.txt.gz

##chromosome=sequence from Chlamydomonas reinhardtii v5.3 genome or mtMinus, mtDNA or cpDNA [STRING]
##position=1-based position in on the chromosome [INTEGER] 
##reference_base=nucleotide carried by Chlamydomonas reinhardtii v5.3 genome at that position and chromosome [STRING]
##genic=position is part of a gene (UTR, intron, exon, CDS) [0/1]
##exonic=position is part of an exon (UTR, CDS)[0/1]
##intronic=position is part of an intron (inferred from positions of exons)[0/1]
##intergenic=position is not annotated as genic[0/1]
##utr5=position is 5' UTR[0/1]
##utr3=position is 3' UTR[0/1]
##fold0=position is 0-fold degenerate [0/1]
##fold4=position is 4-fold degenerate [0/1]
##fold2=position is 2-fold degenerate [0/1]
##fold3=position is 3-fold degenerate [0/1]
##CDS=position is protein coding [0/1]
##mRNA=position is part of the transcribed region. This unfortunately includes the introns[0/1]
##rRNA=an annotated ribosomal RNA[0/1]
##tRNA=an annoated tranfer RNA [0/1]
##feature_names=A p

tabix: invalid option -- '-'


In [203]:
if __name__ == '__main__':
    
    import sys
    sys.path.append('../../scripts/')
    from ness_vcf import SFS
    from pysam import TabixFile
    from annotation import annotation_table
    import alice_custom as ac
    import pickle

    annotation_tabix = TabixFile(filename="/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/concatenated_GFF/annotation_table.txt.gz")
    exon_positions = pickle.load(open("exon_positions.pickle", "rb"))
    
    selected_SFS = []
    neutral_SFS = []
    
    with Pool(processes=6) as pool:
        for i in pool.map(SFSs_from_annotation, (annotation_tabix, exon_positions[:10], min_alleles=12, neutral_only=False)) :
            #pos, selected, neutral = i
            #selected_SFS.append(selected)
            #neutral_SFS.append(neutral)
            print(i)

#print(selected_SFS)
#with open('selected_SFS.pickle', 'wb') as f:
#    pickle.dump(selected_SFS, f)

#if __name__ == '__main__':
#    with Pool(processes=3) as pool:
#        results = pool.map_async(collect_reduced_SFS, exon_positions[:10])

SyntaxError: invalid syntax (<ipython-input-203-4fa6e0f2d284>, line 18)