In [175]:
import pandas as pd
import pickle as pickle
import numpy as np

In [None]:
#Define snakemake variables
#Change to code cell before running snakemake
# NOTE(review): `snakemake` is not defined in any visible cell; presumably it is the
# object Snakemake injects when it executes this notebook -- confirm.

# Input pickles, read positionally from the rule's inputs
clean_primary_data_path = str(snakemake.input[0])
sampled_genes_path = str(snakemake.input[1])
divergence_path = str(snakemake.input[2])

# Identifiers of the condition sets that are looped over when building the gene pools
condition_sets= snakemake.config['condition_sets']

# Study-level filters (consumed by filter_by_study) and pathway filter (filter_by_pathway)
type_of_study = snakemake.config['type_of_study']
not_type_of_study = snakemake.config['not_type_of_study']
pathway = snakemake.config['pathway']

# MANIPULATED_GENE selects studies by their manipulated_gene flag (0 / 1; 2 = no filter);
# the MIN_NUM_* values are count thresholds used by filter_by_columns
MANIPULATED_GENE = snakemake.config['MANIPULATED_GENE']
MIN_NUM_SAMPLED = snakemake.config['MIN_NUM_SAMPLED']
MIN_NUM_DETECTED = snakemake.config['MIN_NUM_DETECTED']

# 'genome' or 'sampled_genes': where the noncandidate pool is drawn from
NC_POOL_SOURCE = snakemake.config['NC_POOL_SOURCE']

# Path the selected gene pool is pickled to by the last cell
output_path = str(snakemake.params.prefix)

# The order of this list must match the tuple unpacking in the loop over condition_sets
local_params = [type_of_study, not_type_of_study,MANIPULATED_GENE,MIN_NUM_SAMPLED, MIN_NUM_DETECTED,pathway]

#Define notebook variables
#Change to markdown before running snakemake
# Hard-coded equivalents of the snakemake values, for running the notebook interactively.
clean_primary_data_path = "../../data/workflow/clean_primary_data.pk"
sampled_genes_path = "../../data/workflow/sampled_genes.pk"
divergence_path = "../../data/workflow/SFS_and_divergence.pk"

condition_sets= [1,2]
type_of_study = ['transcriptomics']
not_type_of_study = ['RT_qPCR']
# 2 = do not filter studies on their manipulated_gene flag (see filter_by_study)
MANIPULATED_GENE = 2
# 0 = no count threshold (see filter_by_columns)
MIN_NUM_SAMPLED = 0
MIN_NUM_DETECTED = 0
# empty list = no pathway filtering (see filter_by_pathway)
pathway = []
NC_POOL_SOURCE = 'genome'
# The order of this list must match the tuple unpacking in the loop over condition_sets
local_params = [type_of_study, not_type_of_study,MANIPULATED_GENE,MIN_NUM_SAMPLED, MIN_NUM_DETECTED,pathway]
# NOTE(review): the last cell writes to `output_path`, which is not assigned in this
# notebook-variables section; the two paths below are not read by any visible cell -- confirm
# which one should be used when running interactively.
candidate_output_path = "../../data/workflow/candidate_genes.pk"
noncandidate_output_path = "../../data/workflow/noncandidate_genes.pk"

In [95]:
def convert_to_dictionary(snakemake_config_var, condition_sets):
    
    '''
    Normalise a filter condition from the snakemake config to dictionary format,
    keyed by condition set.

    - If the condition already is a mapping (anything exposing ``.items()``), a
      shallow copy of it is returned, so the per-condition-set values are kept.
    - If the condition is not in dictionary format, it is assumed to apply to all
      condition sets and is broadcast as ``{condition_set: snakemake_config_var}``.
      The same object is shared by every key (it is not copied).

    snakemake_config_var = <dict> or any other value (<list>, <int>, ...)
    condition_sets = <iterable> of condition set identifiers

    Returns a new <dict>.
    '''
    try:
        items = snakemake_config_var.items()
    except (AttributeError, TypeError):
        # Not a mapping: the single value applies to every condition set
        return {i: snakemake_config_var for i in condition_sets}
    # Already a mapping: keep its contents instead of discarding them
    return dict(items)

In [70]:
def filter_by_study(clean_primary_data_path = clean_primary_data_path, 
                    MANIPULATED_GENE = MANIPULATED_GENE, 
                    type_of_study = type_of_study, 
                    not_type_of_study = not_type_of_study):
    
    '''
    This function filters which studies to include in candidate and noncandidate gene pools.

    clean_primary_data_path = <str> path to a pickled DataFrame with the columns
                              'source', 'manipulated_gene' and 'type_of_study'
    MANIPULATED_GENE = <int>
        2 (or None): do not filter on the manipulated_gene column
        1 / 0      : keep only studies whose manipulated_gene equals 1 / 0
    type_of_study = <list> keep a study only if at least one of these items is found
                    in its type_of_study value; empty (or None) means "no restriction"
    not_type_of_study = <list> drop a study if any of these items is found in its
                        type_of_study value; empty (or None) means "nothing excluded"

    A bare string is treated as a one-item list. Items are tested with ``item in value``,
    so the test is a substring test if the column holds strings and a membership test if
    it holds lists.

    Returns a <list> with the 'source' of every selected study. Studies with a missing
    'source' are never selected.

    Raises ValueError if MANIPULATED_GENE is not one of 0, 1, 2 or None.
    '''

    def _as_items(value):
        # None -> nothing to match; bare string -> single item (not its characters)
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    df = pd.read_pickle(clean_primary_data_path)

    # Only studies that have a valid (non-missing) name can be selected.
    # (`df.source != None` is always True element-wise, so it never filtered anything.)
    has_source = df['source'].notna()

    # Did the study manipulate a gene or not?
    if MANIPULATED_GENE is None or MANIPULATED_GENE == 2:
        condition_1 = has_source
    elif MANIPULATED_GENE in (0, 1):
        condition_1 = has_source & (df['manipulated_gene'] == MANIPULATED_GENE)
    else:
        raise ValueError(
            "MANIPULATED_GENE must be 0, 1, 2 or None, got {!r}".format(MANIPULATED_GENE))

    # Is the study included in type_of_study and absent from not_type_of_study?
    included = _as_items(type_of_study)
    excluded = _as_items(not_type_of_study)

    def _study_matches(study_types):
        # `not any(...)` rather than `~any(...)`: bitwise inversion of a Python bool
        # gives -1 / -2, which are both truthy, so the exclusion was silently ignored
        # whenever type_of_study was empty.
        if any(item in study_types for item in excluded):
            return False
        if not included:
            return True
        return any(item in study_types for item in included)

    condition_2 = df['type_of_study'].apply(_study_matches).astype(bool)

    selected_studies = list(df.loc[condition_1 & condition_2, 'source'])

    return selected_studies

In [37]:
selected_studies = []
def filter_by_columns(selected_studies = selected_studies,
                      sampled_genes_path = sampled_genes_path,
                      MIN_NUM_SAMPLED = MIN_NUM_SAMPLED, 
                      MIN_NUM_DETECTED = MIN_NUM_DETECTED):

    '''
    This function returns a list of PAC_ids of the candidate and 
    noncandidate (from sampled genes) genes.

    selected_studies = <list> of study 'source' values to keep
    sampled_genes_path = <str> path to a pickled DataFrame with the columns 'source',
                         'num_detected', 'num_sampled', 'transcript_id_v5.3.1',
                         'pathway_id' and 'PAC_id'
    MIN_NUM_SAMPLED = <int> minimum summed num_sampled for a gene to enter either pool;
                      0 disables this threshold
    MIN_NUM_DETECTED = <int> genes with summed num_detected >= this value are candidates,
                       genes below it are noncandidates; 0 disables this threshold, in
                       which case both lists contain the same genes

    Each threshold is applied independently. (Previously a threshold was ignored unless
    BOTH thresholds were non-zero.)

    Returns (candidate, noncandidate): two <list>s of PAC_id.
    '''
    
    #Subset sampled genes by filtered studies
    df = pd.read_pickle(sampled_genes_path)
    df = df[df.source.isin(selected_studies)]
    
    #Sum counts in each column for each gene
    # NOTE(review): groupby drops rows whose key columns are missing by default --
    # confirm that genes without a pathway_id are meant to be excluded here.
    gene_keys = ['transcript_id_v5.3.1', 'pathway_id', 'PAC_id']
    df = df[['num_detected', 'num_sampled'] + gene_keys]
    df = df.groupby(gene_keys).agg('sum').reset_index()
        
    #Filter by column conditions
    # Start from "every gene passes" so that a disabled threshold filters nothing
    sampled_ok = pd.Series(True, index = df.index, dtype = bool)
    if MIN_NUM_SAMPLED != 0:
        sampled_ok = df.num_sampled >= MIN_NUM_SAMPLED

    if MIN_NUM_DETECTED != 0:
        candidate = list(df[sampled_ok & (df.num_detected >= MIN_NUM_DETECTED)].PAC_id)
        noncandidate = list(df[sampled_ok & (df.num_detected < MIN_NUM_DETECTED)].PAC_id)
    else: 
        # No detection threshold: the same genes are offered to both pools
        candidate = list(df[sampled_ok].PAC_id)
        noncandidate = list(df[sampled_ok].PAC_id)
    return candidate, noncandidate

In [53]:
def filter_by_pathway(sampled_genes_path = sampled_genes_path, 
                      pathway = pathway):
    
    '''
    This function returns a list of candidate and noncandidate 
    (from all genes) genes based on pathway information.

    sampled_genes_path = <str> path to a pickled DataFrame with the columns
                         'pathway_id' and 'PAC_id'
    pathway = <list> of pathway items; a row is a candidate if any item is found in its
              pathway_id value (tested with ``item in value``). A bare string is treated
              as a one-item list. Empty (or None) means "no pathway filtering".

    Returns (candidate, noncandidate): two <list>s of PAC_id, one entry per row of the
    table (duplicates are not removed).
    - With a pathway filter: rows that match / rows that do not match.
    - Without a pathway filter: both lists contain the PAC_id of every row.
    '''
    
    df = pd.read_pickle(sampled_genes_path)

    if isinstance(pathway, str):
        # A bare string is one pathway, not a sequence of characters
        pathway = [pathway]

    # No pathway requested: every gene is eligible for both pools
    if pathway is None or len(pathway) == 0:
        candidate = list(df['PAC_id'])
        noncandidate = list(df['PAC_id'])
        return candidate, noncandidate

    # Explicit boolean mask instead of a temporary 'match' column probed through
    # AttributeError, which misbehaved if the table already had a 'match' column
    in_pathway = df['pathway_id'].apply(
        lambda pathway_ids: any(item in pathway_ids for item in pathway)).astype(bool)

    candidate = list(df.loc[in_pathway, 'PAC_id'])
    noncandidate = list(df.loc[~in_pathway, 'PAC_id'])
    
    return candidate, noncandidate

In [72]:
def get_candidate_pool(clean_primary_data_path = clean_primary_data_path,
                       sampled_genes_path = sampled_genes_path,
                       divergence_path = divergence_path, 
                       MANIPULATED_GENE = MANIPULATED_GENE, 
                       type_of_study = type_of_study, 
                       not_type_of_study = not_type_of_study, 
                       pathway = pathway, 
                       MIN_NUM_SAMPLED = MIN_NUM_SAMPLED, 
                       MIN_NUM_DETECTED = MIN_NUM_DETECTED):
    
    '''
    This function returns the list of candidate and noncandidate genes 
    based on one set of candidate gene conditions.

    Steps:
    1. filter_by_study selects the studies to include.
    2. filter_by_columns builds candidate / noncandidate PAC_id lists from the
       num_sampled / num_detected counts of those studies.
    3. filter_by_pathway builds candidate / noncandidate PAC_id lists from pathway
       information.
    4. Each pool is the intersection of its column-based and pathway-based list; genes
       that ended up in the candidate pool are removed from the noncandidate pool.

    divergence_path is accepted for interface compatibility but is not used here.

    Returns (candidate, noncandidate): two <list>s of unique PAC_id, in no particular order.
    '''
    
    #Get list of studies to include
    selected_studies = filter_by_study(clean_primary_data_path = clean_primary_data_path, 
                                       MANIPULATED_GENE = MANIPULATED_GENE, 
                                       type_of_study = type_of_study, 
                                       not_type_of_study = not_type_of_study)
        
    #Filter candidate and noncandidate genes by columns: 'num_detected', 
                                                        #'num_sampled',
                                                        #'transcript_id_v5.3.1',
                                                        #'pathway_id',
                                                        #'PAC_id'
    
    candidate_col, noncandidate_col = filter_by_columns(selected_studies = selected_studies,
                                                        sampled_genes_path = sampled_genes_path,
                                                        MIN_NUM_SAMPLED = MIN_NUM_SAMPLED, 
                                                        MIN_NUM_DETECTED = MIN_NUM_DETECTED)
    
    #Filter candidate and noncandidate genes by pathway
    candidate_path, noncandidate_path = filter_by_pathway(sampled_genes_path = sampled_genes_path,
                                                          pathway = pathway)
    
    #Get intersection of candidate_col and candidate_path genes
    candidate_ids = set(candidate_col) & set(candidate_path)
    # Set difference instead of scanning the candidate list once per noncandidate gene
    noncandidate_ids = (set(noncandidate_col) & set(noncandidate_path)) - candidate_ids
    candidate = list(candidate_ids)
    noncandidate = list(noncandidate_ids)
    return candidate, noncandidate

In [106]:
# One-off call of get_candidate_pool with the module-level settings and pathway
# filtering disabled (pathway = []).
# NOTE(review): candidate_temp / noncandidate_temp are reassigned inside the loop over
# condition_sets further down before they are read, so this call looks like a leftover
# check rather than part of the pipeline -- confirm whether it can be removed.
candidate_temp, noncandidate_temp = get_candidate_pool(clean_primary_data_path = clean_primary_data_path,
                                        sampled_genes_path = sampled_genes_path,
                                        divergence_path = divergence_path, 
                                        MANIPULATED_GENE = MANIPULATED_GENE, 
                                        type_of_study = type_of_study, 
                                        not_type_of_study = not_type_of_study, 
                                        pathway = [], 
                                        MIN_NUM_SAMPLED = MIN_NUM_SAMPLED, 
                                        MIN_NUM_DETECTED = MIN_NUM_DETECTED)

In [178]:
#convert all config parameters to dictionary format
# (one dict per parameter, keyed by condition set; same order as local_params)
local_params = [convert_to_dictionary(i, condition_sets) for i in local_params]
    
# Accumulate the PAC_ids selected by every condition set (union over condition sets)
candidate = []
noncandidate = []
for i in condition_sets:
    # Pick this condition set's value of every parameter; the order matches local_params
    (type_of_study, not_type_of_study, MANIPULATED_GENE,
     MIN_NUM_SAMPLED, MIN_NUM_DETECTED, pathway) = [param[i] for param in local_params]
        
    candidate_temp, noncandidate_temp = get_candidate_pool(clean_primary_data_path = clean_primary_data_path,
                                        sampled_genes_path = sampled_genes_path,
                                        divergence_path = divergence_path, 
                                        MANIPULATED_GENE = MANIPULATED_GENE, 
                                        type_of_study = type_of_study, 
                                        not_type_of_study = not_type_of_study, 
                                        pathway = pathway, 
                                        MIN_NUM_SAMPLED = MIN_NUM_SAMPLED, 
                                        MIN_NUM_DETECTED = MIN_NUM_DETECTED)
    candidate += candidate_temp
    noncandidate += noncandidate_temp
        
#Get SFS of gene pools
df = pd.read_pickle(divergence_path)

# De-duplicate the ids collected across condition sets
candidate_ls = list(np.unique(candidate))
noncandidate_ls = list(np.unique(noncandidate))

# From here on `candidate` / `noncandidate` are DataFrames (rows of the divergence table)
candidate = df[df.PAC_id.isin(candidate_ls)]

#if NC_POOL_SOURCE = "genome", noncandidate genes are ALL genes IN THE GENOME that are not candidate genes
if NC_POOL_SOURCE == 'genome':
    noncandidate = df[~df.PAC_id.isin(candidate_ls)]

#if NC_POOL_SOURCE = "sampled_genes", noncandidate genes are sampled genes that are not candidate genes 
#but meets the MIN_NUM_SAMPLED requirement
elif NC_POOL_SOURCE == "sampled_genes":
    # Set lookup instead of scanning candidate_ls once per noncandidate gene
    candidate_ids = set(candidate_ls)
    noncandidate_final = [i for i in noncandidate_ls if i not in candidate_ids]
    noncandidate = df[df.PAC_id.isin(noncandidate_final)]

# Fail here rather than leaving `noncandidate` as a plain list that only breaks
# later when it is written with .to_pickle
else:
    raise ValueError(
        "NC_POOL_SOURCE must be 'genome' or 'sampled_genes', got {!r}".format(NC_POOL_SOURCE))


In [None]:
# Pickle one gene pool to output_path: the noncandidate pool when the path contains
# the text "noncandidate", the candidate pool otherwise.
(noncandidate if "noncandidate" in output_path else candidate).to_pickle(output_path)