# Get list of candidate and noncandidate genes

In [67]:
import pandas as pd
import pickle

In [None]:
#Define snakemake variables
#Change to code cell before running snakemake
# Every value below is read from the `snakemake` object, which is not defined
# anywhere in this notebook (presumably injected by Snakemake at run time —
# confirm against the workflow rule).

# Input pickle paths, taken positionally from the rule's inputs.
clean_primary_data_path = str(snakemake.input[0])
sampled_genes_path = str(snakemake.input[1])
divergence_path = str(snakemake.input[2])

# Study-type and pathway filters; later cells iterate over these and treat an
# empty value (len == 0) as "no filter".
type_of_study = snakemake.config['type_of_study']
pathway = snakemake.config['pathway']

# MANIPULATED_GENE selects studies by their manipulated_gene flag
# (0 or 1 = keep rows with that flag, 2 = keep all studies).
MANIPULATED_GENE = snakemake.config['MANIPULATED_GENE']
# Minimum summed counts a gene needs in the candidate/noncandidate split.
MIN_NUM_SAMPLED = snakemake.config['MIN_NUM_SAMPLED']
MIN_NUM_MANIPULATED = snakemake.config['MIN_NUM_MANIPULATED']
MIN_NUM_DETECTED = snakemake.config['MIN_NUM_DETECTED']

# Output pickle paths, taken positionally from the rule's outputs.
candidate_output_path = str(snakemake.output[0])
noncandidate_output_path = str(snakemake.output[1])

#Define notebook variables
#Change to raw cell before running snakemake
# Hard-coded settings for running the notebook interactively. Because this
# cell comes after the snakemake cell, any name assigned here overrides the
# value assigned there when both cells are executed.

# Input pickle paths, relative to the notebook's working directory.
clean_primary_data_path = "../../data/workflow/clean_primary_data.pk"
sampled_genes_path = "../../data/workflow/sampled_genes.pk"
divergence_path = "../../data/workflow/SFS_and_divergence.pk"

# The commented-out settings are not assigned by this cell; if no earlier cell
# defined them either, the corresponding filter below keeps every row.
#type_of_study = ['RT_qPCR']
#MANIPULATED_GENE = 0
# Minimum summed counts used by the candidate/noncandidate split.
MIN_NUM_SAMPLED = 1
MIN_NUM_MANIPULATED = 0
MIN_NUM_DETECTED = 1
#pathway = ['Creinhardtii LIPAS-PWY']

# Output pickle paths for the two gene tables written at the end.
candidate_output_path = "../../data/workflow/candidate_genes.pk"
noncandidate_output_path = "../../data/workflow/noncandidate_genes.pk"

# 1. (Custom config) Filter sources by study:
#### Filter conditions:
- Whether the study manipulated a gene (`MANIPULATED_GENE`)
- Type of study (`type_of_study`)

#### Output:
- A list of the applicable studies, taken from the `source` column (`<Firstauthor>_<Year>`)

In [68]:
#change to raw cell when running snakemake
# Load the cleaned primary (per-study) table. The study filters in the next
# cells read its `source`, `manipulated_gene` and `type_of_study` columns, so
# `df` must come from clean_primary_data_path. Previously it was immediately
# overwritten with the sampled-genes pickle, which has none of the study-level
# columns; that table is loaded on its own in step 2.
# NOTE: unpickling can execute arbitrary code — only load files produced by
# this workflow.
df = pd.read_pickle(clean_primary_data_path)

In [51]:
# Display the column labels of the table just loaded (inspection only).
df.columns

Index(['source', 'manipulated_gene', 'type_of_study'], dtype='object')

In [52]:
# Build condition_1, the boolean row mask for the "manipulated gene" filter.
#   MANIPULATED_GENE == 0 or 1 -> keep studies whose manipulated_gene equals it
#   MANIPULATED_GENE == 2      -> keep every study
#   MANIPULATED_GENE undefined -> keep every study (setting not configured)
# Only the name lookup sits inside the try, so an unrelated NameError is not
# mistaken for "setting not configured".
try:
    manipulated_gene_setting = MANIPULATED_GENE
except NameError:
    manipulated_gene_setting = 2

if manipulated_gene_setting == 2:
    # Explicit all-True mask; the former `df.source != None` evaluated to True
    # for every row in pandas, which is the same result stated directly.
    condition_1 = pd.Series(True, index=df.index)
elif manipulated_gene_setting in (0, 1):
    condition_1 = (df.manipulated_gene == manipulated_gene_setting)
else:
    # Any other value used to leave condition_1 unassigned (or stale from an
    # earlier run of the cell); fail here with a clear message instead.
    raise ValueError(
        f"MANIPULATED_GENE must be 0, 1 or 2, got {manipulated_gene_setting!r}"
    )

In [53]:
# Build condition_2, the boolean row mask for the study-type filter.
# A study is kept when at least one requested type is found (via `in`) in its
# type_of_study value. With nothing requested, or with type_of_study never
# defined, every study is kept.
# NOTE(review): if type_of_study is a plain string rather than a list, the
# loop below walks its characters — confirm the config always supplies a list.
try:
    requested_study_types = type_of_study
except NameError:
    # Setting not defined: behave exactly like an empty request.
    requested_study_types = ()

if len(requested_study_types) > 0:
    df['match'] = df.apply(
        lambda row: any(wanted in row.type_of_study for wanted in requested_study_types),
        axis=1,
    )
    condition_2 = (df['match'] == True)
else:
    # Elementwise comparison against None, used here as a "keep all rows" mask.
    condition_2 = (df.source != None)

In [54]:
# Collect the `source` value of every study that passes both study-level masks.
passes_both_filters = condition_1 & condition_2
selected_studies = df.loc[passes_both_filters, 'source'].tolist()

# 2. Subset sampled genes by filtered studies

In [55]:
# Load the sampled-genes table (one read is enough; the file was previously
# unpickled twice, with the second raw pickle.load replacing an identical
# result from pd.read_pickle).
# NOTE: unpickling can execute arbitrary code — only load files produced by
# this workflow.
df = pd.read_pickle(sampled_genes_path)

In [56]:
# Keep only the rows whose study (`source`) passed the study-level filters.
from_selected_study = df['source'].isin(selected_studies)
df = df.loc[from_selected_study]

In [57]:
# Display the column labels of the study-filtered table (inspection only).
df.columns

Index(['num_detected', 'num_manipulated', 'num_sampled', 'source',
       'transcript_id', 'annotation_version', 'gene_id', 'gene_symbol',
       'pathway_id', 'transcript_id_v5.3.1', 'PAC_id'],
      dtype='object')

# 3. Sum counts in each column for each gene
#### Columns summed:
- `num_sampled`
- `num_manipulated`
- `num_detected`


In [58]:
# Total the three count columns for every (transcript, pathway, PAC id)
# combination, then turn the grouping keys back into ordinary columns.
# NOTE(review): groupby drops rows whose key is missing by default — confirm
# that rows without a pathway_id are meant to be discarded here.
count_columns = ['num_detected', 'num_manipulated', 'num_sampled']
gene_keys = ['transcript_id_v5.3.1', 'pathway_id', 'PAC_id']
per_gene_totals = df[count_columns + gene_keys].groupby(gene_keys).sum()
df = per_gene_totals.reset_index()

# 4. (Custom config) Filter candidate and noncandidate genes by column
#### Filter by the following conditions:
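- `num_sampled` (at least `MIN_NUM_SAMPLED`)
- `num_manipulated` (at least `MIN_NUM_MANIPULATED`)
- `num_detected` (at least `MIN_NUM_DETECTED` for candidates, below it for noncandidates)
- Pathway (`pathway`)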


##### By `pathway`

In [59]:
# Build `condition`, the boolean row mask for the pathway filter.
# A gene row is kept when at least one requested pathway is found (via `in`)
# in its pathway_id value. With nothing requested, or with `pathway` never
# defined, every row is kept.
# NOTE(review): if `pathway` is a plain string rather than a list, the loop
# below walks its characters — confirm the config always supplies a list.
try:
    requested_pathways = pathway
except NameError:
    # Setting not defined: behave exactly like an empty request.
    requested_pathways = ()

if len(requested_pathways) > 0:
    df['match'] = df.apply(
        lambda row: any(wanted in row.pathway_id for wanted in requested_pathways),
        axis=1,
    )
    condition = (df['match'] == True)
else:
    # Elementwise comparison against None, used here as a "keep all rows" mask.
    condition = (df.PAC_id != None)

In [60]:
# Drop every row that the pathway mask rejects.
df = df.loc[condition]

##### By `MIN_NUM_SAMPLED`, `MIN_NUM_MANIPULATED`, `MIN_NUM_DETECTED`

In [61]:
# Split the per-gene totals into two tables. Both require the sampled and
# manipulated totals to reach their minimums; they differ only in whether the
# detected total reaches MIN_NUM_DETECTED (candidate) or stays below it
# (noncandidate).
sampled_enough = df['num_sampled'] >= MIN_NUM_SAMPLED
manipulated_enough = df['num_manipulated'] >= MIN_NUM_MANIPULATED
eligible = sampled_enough & manipulated_enough

candidate = df[eligible & (df['num_detected'] >= MIN_NUM_DETECTED)]
noncandidate = df.loc[eligible & (df['num_detected'] < MIN_NUM_DETECTED)]

In [62]:
# Report (rows, columns) for the candidate table, then the noncandidate table,
# one per line.
print(candidate.shape, noncandidate.shape, sep="\n")

(661, 6)
(461, 6)


# 5. Subset SFS_and_divergence.pk to the candidate and noncandidate genes

In [63]:
# Load the SFS/divergence table that this step subsets. The cell previously
# re-read sampled_genes_path, so divergence_path was never used and the
# exported tables were not drawn from the divergence data.
# NOTE: unpickling can execute arbitrary code — only load files produced by
# this workflow.
df = pd.read_pickle(divergence_path)

In [64]:
# Replace each gene table with the rows of the freshly loaded `df` whose
# PAC_id appears in that table.
# NOTE(review): nothing here removes overlap, so a PAC_id present in both
# tables ends up in both outputs — confirm that is intended.
in_candidate_set = df['PAC_id'].isin(candidate['PAC_id'])
in_noncandidate_set = df['PAC_id'].isin(noncandidate['PAC_id'])
candidate = df.loc[in_candidate_set]
noncandidate = df.loc[in_noncandidate_set]

In [65]:
# Preview the first five noncandidate rows (inspection only).
noncandidate.head(5)

Unnamed: 0,Cincerta_transcript_ID,diffs0,sites0,diffs4,sites4,PAC_id,neutral_SFS,selected_SFS
15,g2116.t1,99,788,112,253,PAC:26893261,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
193,g12519.t1,75,1537,155,548,PAC:26904561,"[463, 1, 10, 0, 3, 7, 2, 1, 6, 0, 0, 0, 0, 0, ...","[1377, 2, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."
232,g14502.t1,51,1096,77,366,PAC:26894844,"[387, 0, 11, 2, 0, 2, 4, 2, 1, 0, 0, 0, 0, 0, ...","[1201, 3, 5, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, ..."
274,g3572.t1,5,383,40,104,PAC:26896671,"[120, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...","[442, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
281,g8628.t1,82,871,83,240,PAC:26889037,"[214, 8, 7, 2, 2, 1, 2, 7, 0, 1, 0, 0, 0, 0, 0...","[877, 8, 10, 2, 0, 0, 4, 4, 1, 0, 0, 0, 0, 0, ..."


## Export files

In [66]:
# Write both gene tables as pickles to their configured output paths,
# candidate first.
for gene_table, destination in (
    (candidate, candidate_output_path),
    (noncandidate, noncandidate_output_path),
):
    gene_table.to_pickle(destination)