# Get list of candidate and noncandidate genes

In [67]:
import pandas as pd
import pickle

In [None]:
#Define snakemake variables
#Change to code cell before running snakemake
# Every value in this block is read from the `snakemake` object, so the block
# only runs when the notebook is executed by snakemake.

# Input pickles, taken positionally from the rule's input list.
clean_primary_data_path = str(snakemake.input[0])
sampled_genes_path = str(snakemake.input[1])
divergence_path = str(snakemake.input[2])

# Study-type and pathway filters. Later cells call len() on both and iterate
# over them, so each is expected to be a (possibly empty) collection.
type_of_study = snakemake.config['type_of_study']
pathway = snakemake.config['pathway']

# MANIPULATED_GENE is compared against 0, 1 and 2 when filtering studies.
# The MIN_NUM_* values are thresholds applied to the summed per-gene counts.
MANIPULATED_GENE = snakemake.config['MANIPULATED_GENE']
MIN_NUM_SAMPLED = snakemake.config['MIN_NUM_SAMPLED']
MIN_NUM_DETECTED = snakemake.config['MIN_NUM_DETECTED']

# Selects the noncandidate pool; the values checked later are 'genome' and
# 'sampled_genes'.
NC_POOL_SOURCE = snakemake.config['NC_POOL_SOURCE']

# Destination pickle. The export step checks whether this path contains the
# substring "noncandidate" to decide which table to write.
output_path = str(snakemake.params.prefix)

#Define notebook variables
#Change to raw cell before running snakemake
# Hard-coded values for interactive runs.
# NOTE(review): if this block executes after the snakemake block, these
# assignments override the snakemake-provided paths and thresholds - confirm
# that the cell really is switched to raw for workflow runs.
clean_primary_data_path = "../../data/workflow/clean_primary_data.pk"
sampled_genes_path = "../../data/workflow/sampled_genes.pk"
divergence_path = "../../data/workflow/SFS_and_divergence.pk"

# Optional filters are left commented out; the filter cells catch the
# resulting NameError and fall back to keeping every row.
#type_of_study = ['RT_qPCR']
#MANIPULATED_GENE = 0
MIN_NUM_SAMPLED = 3
MIN_NUM_DETECTED = 2
#pathway = ['Creinhardtii LIPAS-PWY']
NC_POOL_SOURCE = 'genome'

# Notebook-mode export destinations, one per output table.
candidate_output_path = "../../data/workflow/candidate_genes.pk"
noncandidate_output_path = "../../data/workflow/noncandidate_genes.pk"

# 1. (Custom config) Filter sources by study:
#### Filter conditions:
- Whether study manipulated a gene or not
- Type of study

#### Output:
- Return list of applicable studies `<Firstauthor>_<Year>`

In [151]:
# Load the study-level table (columns: source, manipulated_gene,
# type_of_study) that the study filters below operate on.
# Read only from clean_primary_data_path: the sampled-genes pickle has no
# manipulated_gene / type_of_study columns, so loading it here would break
# the filters. The path variable is set by whichever configuration block is
# active, so this cell is the same for notebook and snakemake runs.
df = pd.read_pickle(clean_primary_data_path)

In [152]:
# Display the column names of the loaded study table as a sanity check.
df.columns

Index(['source', 'manipulated_gene', 'type_of_study'], dtype='object')

In [153]:
# Build condition_1: a boolean mask over the rows of df that selects studies
# by whether they manipulated a gene.
#   MANIPULATED_GENE == 0 -> keep rows with manipulated_gene == 0
#   MANIPULATED_GENE == 1 -> keep rows with manipulated_gene == 1
#   MANIPULATED_GENE == 2, or not defined at all -> keep every row
try:
    manipulated_setting = MANIPULATED_GENE
except NameError:
    # The setting is optional (e.g. commented out in notebook mode):
    # fall back to "no filter".
    manipulated_setting = 2

if manipulated_setting == 2:
    # Explicit all-True mask aligned on df's index.
    condition_1 = pd.Series(True, index=df.index)
elif manipulated_setting in (0, 1):
    condition_1 = (df.manipulated_gene == manipulated_setting)
else:
    # Fail here with a clear message instead of leaving condition_1
    # undefined and crashing later with an unrelated NameError.
    raise ValueError(
        "MANIPULATED_GENE must be 0, 1 or 2, got %r" % (manipulated_setting,)
    )

In [154]:
# Build condition_2: a boolean mask over the rows of df that selects studies
# whose type_of_study value contains at least one of the requested types.
# An empty or undefined type_of_study setting keeps every row.
try:
    wanted_types = list(type_of_study)
except NameError:
    # The setting is optional (e.g. commented out in notebook mode).
    wanted_types = []

if wanted_types:
    # Iterate over the single column rather than applying a function to whole
    # rows; this also works when df has no rows. The 'match' column is kept
    # on df, as before.
    df['match'] = [
        any(item in study_types for item in wanted_types)
        for study_types in df['type_of_study']
    ]
    condition_2 = df['match'].astype(bool)
else:
    # Explicit all-True mask aligned on df's index.
    condition_2 = pd.Series(True, index=df.index)

In [155]:
# Collect the `source` value of every study that passes both filters.
study_mask = condition_1 & condition_2
selected_studies = list(df.loc[study_mask, 'source'])

# 2. Subset sampled genes by filtered studies

In [156]:
# Load the per-study sampled-genes table. One read is enough: the file was
# previously unpickled twice in a row from the same path.
df = pd.read_pickle(sampled_genes_path)

In [157]:
# Keep only the rows that come from one of the selected studies.
from_selected_study = df['source'].isin(selected_studies)
df = df[from_selected_study]

In [158]:
# Display the column names of the subsetted sampled-genes table.
df.columns

Index(['transcript_id', 'source', 'num_detected', 'num_manipulated',
       'num_sampled', 'annotation_version', 'gene_id', 'gene_symbol',
       'pathway_id', 'transcript_id_v5.3.1', 'PAC_id'],
      dtype='object')

# 3. Sum counts in each column for each gene
#### Columns summed:
- Num_sampled
- Num_manipulated
- Num_detected


In [159]:
# Collapse the table to one row per (transcript, pathway, PAC id) combination,
# adding up the three count columns across studies.
count_columns = ['num_detected', 'num_manipulated', 'num_sampled']
group_keys = ['transcript_id_v5.3.1', 'pathway_id', 'PAC_id']
df = (
    df[count_columns + group_keys]
    .groupby(group_keys)
    .sum()
    .reset_index()
)

# 4. (Custom config) Filter candidate and noncandidate genes by column
#### Filter by the following conditions:
- Num_sampled
- Num_manipulated (summed above, but not used as a filter in the code below)
- Num_detected
- Pathway


##### By `pathway`

In [160]:
# Build `condition`: a boolean mask over the rows of df that selects genes
# whose pathway_id contains at least one of the requested pathways.
# An empty or undefined pathway setting keeps every row.
try:
    wanted_pathways = list(pathway)
except NameError:
    # The setting is optional (e.g. commented out in notebook mode).
    wanted_pathways = []

if wanted_pathways:
    # Iterate over the single column rather than applying a function to whole
    # rows; this also works when df has no rows. The 'match' column is kept
    # on df, as before.
    df['match'] = [
        any(item in gene_pathway for item in wanted_pathways)
        for gene_pathway in df['pathway_id']
    ]
    condition = df['match'].astype(bool)
else:
    # Explicit all-True mask aligned on df's index.
    condition = pd.Series(True, index=df.index)

In [161]:
# Drop every gene that did not pass the pathway filter.
df = df.loc[condition]

##### By `MIN_NUM_SAMPLED` and `MIN_NUM_DETECTED` (no `MIN_NUM_MANIPULATED` threshold is applied)

In [162]:
# Both groups require the gene to have been sampled often enough; they differ
# only in whether it was detected at least MIN_NUM_DETECTED times.
sampled_enough = df.num_sampled >= MIN_NUM_SAMPLED
detected_enough = df.num_detected >= MIN_NUM_DETECTED
detected_too_rarely = df.num_detected < MIN_NUM_DETECTED

candidate = df[sampled_enough & detected_enough]
noncandidate = df.loc[sampled_enough & detected_too_rarely]

# 5. Subset `SFS_and_divergence.pk` to only candidate or noncandidate genes

In [164]:
# Load the SFS/divergence table that the candidate and noncandidate gene lists
# are used to subset. Read only from divergence_path: loading the
# sampled-genes pickle here would replace the divergence data, and the
# exported tables would then lack the SFS and divergence columns.
df = pd.read_pickle(divergence_path)

In [165]:
# Replace the candidate / noncandidate gene lists by the matching rows of the
# divergence table (df), joined on PAC_id.
# Capture the candidate ids first, because `candidate` is rebound just below.
candidate_ids = candidate.PAC_id
candidate = df[df.PAC_id.isin(candidate_ids)]

if NC_POOL_SOURCE == 'genome':
    # Noncandidates are every gene in the divergence table that is not a
    # candidate.
    noncandidate = df[~df.PAC_id.isin(candidate_ids)]
elif NC_POOL_SOURCE == "sampled_genes":
    # Noncandidates are restricted to the sampled genes that fell below the
    # detection threshold.
    noncandidate = df[df.PAC_id.isin(noncandidate.PAC_id)]
else:
    # Without this check `noncandidate` would silently stay the per-gene
    # count table instead of a subset of the divergence table.
    raise ValueError(
        "NC_POOL_SOURCE must be 'genome' or 'sampled_genes', got %r"
        % (NC_POOL_SOURCE,)
    )

In [166]:
# Preview the first five noncandidate rows.
noncandidate.head(5)

Unnamed: 0,Cincerta_transcript_ID,diffs0,sites0,diffs4,sites4,PAC_id,neutral_SFS,selected_SFS
0,g6935.t1,24,2777,113,699,PAC:26888200,"[609, 11, 17, 29, 6, 10, 6, 2, 8, 0, 0, 0, 0, ...","[2775, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,g13204.t1,28,655,38,217,PAC:26904954,"[155, 4, 0, 2, 0, 4, 3, 3, 1, 0, 0, 0, 0, 0, 0...","[519, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0..."
2,g11333.t1,231,2638,311,950,PAC:26894252,"[491, 4, 9, 4, 1, 0, 7, 2, 4, 1, 0, 0, 0, 0, 0...","[1435, 9, 2, 1, 0, 0, 4, 2, 3, 0, 0, 0, 0, 0, ..."
3,g15729.t1,450,2626,325,930,PAC:26891814,"[804, 48, 28, 18, 7, 9, 5, 3, 3, 0, 0, 0, 0, 0...","[2540, 55, 27, 11, 4, 6, 4, 7, 5, 1, 0, 0, 0, ..."
4,g5327.t1,3,1483,72,412,PAC:26901315,"[408, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1478, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Report (rows, columns) for the candidate table, then the noncandidate table.
for gene_table in (candidate, noncandidate):
    print(gene_table.shape)

## Export files

In [66]:
# Write the result table(s) to disk as pickles.
try:
    export_target = output_path
except NameError:
    # No single output path is defined (interactive notebook run): write both
    # tables to the notebook-mode destinations instead of failing.
    candidate.to_pickle(candidate_output_path)
    noncandidate.to_pickle(noncandidate_output_path)
else:
    # A single output path is defined: the path itself selects the table.
    # A path containing "noncandidate" gets the noncandidate table; any other
    # path gets the candidate table.
    if "noncandidate" in export_target:
        noncandidate.to_pickle(export_target)
    else:
        candidate.to_pickle(export_target)