In [1]:
import os
import sys
import pandas as pd
import numpy as np
import re

from taigapy import TaigaClient

#PARAMETERS
min_avg_Geff = 0.2
min_sum_Geff = 1.5
min_hps_per_gene = 3

c = TaigaClient()

base_data_dir = '/Users/jmmcfarl/CPDS/demeter2'

Ach_model_dir = os.path.join(base_data_dir,'kube_results/Ach_final/1')
Ach_dataset_id = 'demeter2-achilles-5386'

DRIVE_model_dir = os.path.join(base_data_dir, 'kube_results/DRIVE_final/1')
DRIVE_dataset_id = 'demeter2-drive-0591'

Marc_model_dir = os.path.join(base_data_dir, 'kube_results/Marc_final/1')
Marc_dataset_id = 'demeter2-marcotte-a703'

comb_model_dir = os.path.join(base_data_dir, 'kube_results/comb_final/1')
comb_dataset_id = 'demeter2-combined-dc9c'

new_name_map = pd.read_csv('/Users/jmmcfarl/CPDS/demeter2/results/name_change_map.csv')
new_name_map_dict = {a: b for a, b in zip(new_name_map.old_name, new_name_map.new_name)}

In [2]:
hart_ess = c.get(name='demeter2-pos-neg-controls-a5c6', version=1, file='hart_pos_controls')['Gene_ID'].values.astype(str)
hart_non_ess = c.get(name='demeter2-pos-neg-controls-a5c6', version=1, file='hart_neg_controls')['Gene_ID'].values.astype(str)

sh_targets = c.get(name = 'gpp-shrna-mapping-8759', version = 2, file = 'shmap_19mer_noXLOC')
sh_targets = sh_targets.rename(columns = {'Barcode Sequence': 'hp'}, inplace = False).set_index('hp')

In [3]:
gene_name_df = sh_targets.ix[~sh_targets['Gene ID'].str.contains('^NO_CURRENT')].copy()
gene_name_df['Gene name'] = gene_name_df['Gene Symbol'] + ' (' + gene_name_df['Gene ID'] + ')'
gene_name_map = gene_name_df.set_index('Gene ID', inplace = False)['Gene name']
gene_name_map = gene_name_map.to_dict()

gene_sym_map = gene_name_df.set_index('Gene ID', inplace = False)['Gene Symbol']
gene_sym_map = gene_sym_map.to_dict()

In [4]:
D2_description='''
Results from DEMETER2 model fit.

Contents:

* gene_means_proc: posterior mean estimates of essentiality for each gene/CL pair
* gene_SDs_proc: posterior SD of essentiality estimates for each gene/CL pair  
* hp_data_comb: model results for each hairpin, including:
    * Geff: hairpin gene knockdown efficacy [0,1]
    * Seff: hairpin seed knockdown efficacy [0,1]
    * unpred_offset_mean: posterior mean of across-CL avg unpredicted offtarget effect
    * unpred_offset_sd: posterior SD of across-CL avg unpredicted offtarget effect
* CL_data_comb: model results for each CL, including:
    * gene_slope: RNAi efficacy parameter
    * CL_slope: overall scaling factor
    * noise_vars: noise variance
    * offset_mean: posterior mean of additive offset
    * offset_SD: posterior SD of additive offset
    
versions:
* v2: 
    * Run with final hyperparameter settings
    * Add gene symbols to entrez IDs
    * exclude genes with all NA values
    * exclude genes with poor reagents
    * normalize gene scores to have median of pos-cons at -1 and median of neg-cons at 0

* v3: 
    * Add gene symbols for gene families

'''

In [10]:
def make_processed_gene_data_D2(cur_model_dir, gene_name_map, min_avg_Geff, min_sum_Geff):
    '''Process gene means and SDs and make new files'''
    gs = pd.read_csv(os.path.join(cur_model_dir, 'gene_means.csv'), index_col = 0)
    gs_unc = pd.read_csv(os.path.join(cur_model_dir, 'gene_SDs.csv'), index_col = 0)
    
    #update cell line names
    gs.columns = gs.columns.to_series().replace(new_name_map_dict)
    gs_unc.columns = gs_unc.columns.to_series().replace(new_name_map_dict)
    
    #drop genes which are NA for all cell lines
    bad_genes = np.where(gs.isnull().sum(axis = 1) == gs.shape[1])[0]
    print('Removing {} genes with all NAs'.format(len(bad_genes)))
    gs.drop(gs.index[bad_genes], axis=0, inplace=True)
    gs_unc.drop(gs_unc.index[bad_genes], axis=0, inplace=True)

    #calc mean Geff and sum Geff per gene, and filter out bad-quality genes
    hp_data = pd.read_csv(os.path.join(cur_model_dir, 'hp_data.csv')).set_index('hp')
    hp_data = hp_data.join(sh_targets, how = 'left')
    hp_stats = hp_data.groupby('Gene ID').agg({'Geff': [np.mean, np.sum, 'count']})
    bad_genes = hp_stats.ix[(hp_stats['Geff']['mean'].values < min_avg_Geff) | (hp_stats['Geff']['sum'].values < min_sum_Geff) | \
                            (hp_stats['Geff']['count'].values < min_hps_per_gene)].index.values
    bad_genes = np.intersect1d(bad_genes, gs.index.values)
    print('Removing {} genes with all poor hp data'.format(len(bad_genes)))
    gs.drop(bad_genes, axis=0, inplace=True)
    gs_unc.drop(bad_genes, axis=0, inplace=True)

    #normalize gene scores by pos-neg control medians
    weights = 1/(gs_unc**2)
    per_gene_avgs = np.sum(gs * weights, axis = 1) / np.sum(weights, axis = 1)
    pos_con_median = np.nanmedian(per_gene_avgs.ix[hart_ess])
    neg_con_median = np.nanmedian(per_gene_avgs.ix[hart_non_ess])

    norm_gs_unc = gs_unc / (neg_con_median - pos_con_median)
    norm_gs = (gs - neg_con_median) / (neg_con_median - pos_con_median)

    #rename genes to include gene symbol
    norm_gs.rename(index = gene_name_map, inplace = True)
    norm_gs_unc.rename(index = gene_name_map, inplace = True)
    
    #handle gene names for gene families
    gene_families = np.where(norm_gs.index.str.contains('&'))[0]
    ind_names = norm_gs.index.values
    for fam_ind in gene_families:
        cur_fam = norm_gs.index.values[fam_ind]
        print(cur_fam)
        fam_syms = '&'.join([gene_sym_map[x] for x in re.split('&', cur_fam)])
#         ind_names[fam_ind] = cur_fam + ' (' + fam_syms + ')'
        ind_names[fam_ind] = fam_syms + ' (' + cur_fam + ')'
    norm_gs.index = ind_names
    norm_gs_unc.index = ind_names

    norm_gs.to_csv(os.path.join(cur_model_dir, 'gene_means_proc.csv'))
    norm_gs_unc.to_csv(os.path.join(cur_model_dir, 'gene_SDs_proc.csv'))

    
def prepare_D2_outputs(D2_model_dir):
    '''Combine batch and non-batch parameters for CL data and hp_data'''
    CL_data = pd.read_csv(os.path.join(D2_model_dir, 'CL_data.csv'), index_col = 0)
        
    CL_batch_data = pd.read_csv(os.path.join(D2_model_dir, 'CL_batch_data.csv'), index_col = 0)
   
    #rename cell lines
    CL_data.index = CL_data.index.to_series().replace(new_name_map_dict)
    CL_batch_data.index = CL_batch_data.index.to_series().replace(new_name_map_dict)
    
    CL_batch_data.reset_index(inplace=True)
    CL_batch_data['offset_var'] = CL_batch_data['offset_sd']**2
    CL_batch_means = CL_batch_data.groupby('CCLE_ID')[['CL_slope', 'noise_vars', 'offset_mean', 'offset_var']].agg('mean')
    CL_batch_means['offset_sd'] = np.sqrt(CL_batch_means['offset_var'].values)

    CL_data = pd.merge(CL_data, CL_batch_means[['CL_slope', 'noise_vars', 'offset_mean', 'offset_sd']], left_index=True, right_index = True)
    CL_data.to_csv(os.path.join(D2_model_dir, 'CL_data_comb.csv'))
    
    hp_data = pd.read_csv(os.path.join(D2_model_dir, 'hp_data.csv')).set_index('hp')
    hp_batch_data = pd.read_csv(os.path.join(D2_model_dir, 'hp_batch_data.csv')).reset_index()
    hp_batch_data['hairpin_offset_var'] = hp_batch_data['hairpin_offset_sd']**2
    hp_batch_means = hp_batch_data.groupby('hp')[['hairpin_offset_mean', 'hairpin_offset_var']].agg('mean')
    hp_batch_means['hairpin_offset_sd'] = np.sqrt(hp_batch_means['hairpin_offset_var'].values)

    hp_data = pd.merge(hp_data[['Geff', 'Seff', 'unpred_offset_mean', 'unpred_offset_sd']], hp_batch_means[['hairpin_offset_sd', 'hairpin_offset_mean']], left_index=True, right_index = True)
    hp_data.to_csv(os.path.join(D2_model_dir, 'hp_data_comb.csv'))

For Achilles data

In [6]:
make_processed_gene_data_D2(Ach_model_dir, gene_name_map, min_avg_Geff, min_sum_Geff)
prepare_D2_outputs(Ach_model_dir)

c.update_dataset(dataset_permaname=Ach_dataset_id,
#     dataset_description=D2_description,
    upload_file_path_dict={os.path.join(Ach_model_dir, 'CL_data_comb.csv'): 'NumericMatrixCSV',
                          os.path.join(Ach_model_dir, 'hp_data_comb.csv'): 'NumericMatrixCSV',
                          os.path.join(Ach_model_dir, 'gene_means_proc.csv'): 'NumericMatrixCSV',
                          os.path.join(Ach_model_dir, 'gene_SDs_proc.csv'): 'NumericMatrixCSV'})


Removing 387 genes with all NAs
Removing 358 genes with all poor hp data
Now choosing the datasets you would want to keep or remove:
	Keep CL_data_comb ? (y/n) n
	Not keeping CL_data_comb
	Keep gene_means_proc ? (y/n) n
	Not keeping gene_means_proc
	Keep gene_SDs_proc ? (y/n) n
	Not keeping gene_SDs_proc
	Keep hp_data_comb ? (y/n) n
	Not keeping hp_data_comb
	Keep pan_dependent_genes ? (y/n) n
	Not keeping pan_dependent_genes
	Keep sample_info ? (y/n) n
	Not keeping sample_info
Uploading gene_means_proc...
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Scanning through file to determine size (line 5001)
	 Conversion in progress, line 250


AttributeError: 'dict' object has no attribute 'message'

DRIVE data

In [7]:
make_processed_gene_data_D2(DRIVE_model_dir, gene_name_map, min_avg_Geff, min_sum_Geff)
prepare_D2_outputs(DRIVE_model_dir)

c.update_dataset(dataset_permaname=DRIVE_dataset_id,
#     dataset_description=D2_description,
    upload_file_path_dict={os.path.join(DRIVE_model_dir, 'CL_data_comb.csv'): 'NumericMatrixCSV',
                          os.path.join(DRIVE_model_dir, 'hp_data_comb.csv'): 'NumericMatrixCSV',
                          os.path.join(DRIVE_model_dir, 'gene_means_proc.csv'): 'NumericMatrixCSV',
                          os.path.join(DRIVE_model_dir, 'gene_SDs_proc.csv'): 'NumericMatrixCSV'})


Removing 647 genes with all NAs
Removing 571 genes with all poor hp data
Now choosing the datasets you would want to keep or remove:
	Keep CL_data_comb ? (y/n) n
	Not keeping CL_data_comb
	Keep gene_means_proc ? (y/n) n
	Not keeping gene_means_proc
	Keep gene_SDs_proc ? (y/n) n
	Not keeping gene_SDs_proc
	Keep hp_data_comb ? (y/n) n
	Not keeping hp_data_comb
	Keep pan_dependent_genes ? (y/n) n
	Not keeping pan_dependent_genes
Uploading gene_means_proc...
Conversion and upload...:
	 Downloading the file from S3
	 Scanning through file to determine size (line 6001)
	 Conversion in progress, line 6750


AttributeError: 'dict' object has no attribute 'message'

Marcotte data

In [8]:
make_processed_gene_data_D2(Marc_model_dir, gene_name_map, min_avg_Geff, min_sum_Geff)
prepare_D2_outputs(Marc_model_dir)

# c.update_dataset(dataset_permaname=Marc_dataset_id,
# #     dataset_description=D2_description,
#     upload_file_path_dict={os.path.join(Marc_model_dir, 'CL_data_comb.csv'): 'NumericMatrixCSV',
#                           os.path.join(Marc_model_dir, 'hp_data_comb.csv'): 'NumericMatrixCSV',
#                           os.path.join(Marc_model_dir, 'gene_means_proc.csv'): 'NumericMatrixCSV',
#                           os.path.join(Marc_model_dir, 'gene_SDs_proc.csv'): 'NumericMatrixCSV'})


Removing 582 genes with all NAs
Removing 3031 genes with all poor hp data


In [11]:
make_processed_gene_data_D2(comb_model_dir, gene_name_map, min_avg_Geff, min_sum_Geff)
prepare_D2_outputs(comb_model_dir)

# c.update_dataset(dataset_permaname=comb_dataset_id,
# #     dataset_description=D2_description,
#     upload_file_path_dict={os.path.join(comb_model_dir, 'CL_data_comb.csv'): 'NumericMatrixCSV',
#                           os.path.join(comb_model_dir, 'hp_data_comb.csv'): 'NumericMatrixCSV',
#                           os.path.join(comb_model_dir, 'gene_means_proc.csv'): 'NumericMatrixCSV',
#                           os.path.join(comb_model_dir, 'gene_SDs_proc.csv'): 'NumericMatrixCSV'})


Removing 414 genes with all NAs
Removing 795 genes with all poor hp data
100049076&100653061&11042&387036&653188
100093631&2970
100101478&767811
100125556&55199&653113
100129213&389834
100129269&105379331&107984772&346653
100130086&100506164
100131205&6144
100131539&100132396
100131626&105376489&105379724&107985887&400618
100131998&101060405
100132406&101060684&284565
100132464&387742
100132476&100996750
100132565&283768
100132979&390535&729786
100133050&102725009
100133161&105377826&107984841
100133220&101927601&102724093&102724117&374650&440295&642402&643707&647042
100188953&113444
100271836&641298
100271874&441208&646762
100287226&7652
100288072&105371187
100288332&101059938&101059953&613037&642778&642799&9284
100288380&89838
100288527&100288562&101929601&101929627&101930111&254958
100288695&96626
100288966&317754
100289087&7258&728137&728395&728403
100289462&1673
100293534&110384692&720
100302285&100422872&100422885&103504734
100310812&441273&729597
100422558&101060596&105371128
10

TypeError: expected string or buffer

In [32]:
gs = pd.read_csv(os.path.join(comb_model_dir, 'gene_means.csv'), index_col = 0)


In [13]:
#handle gene names for gene families
gene_families = np.where(gs.index.str.contains('&'))[0]
ind_names = gs.index.values
for fam_ind in gene_families:
    cur_fam = gs.index.values[fam_ind]
    print(cur_fam)
    fam_syms = '&'.join([gene_sym_map[x] for x in re.split('&', cur_fam)])
#         ind_names[fam_ind] = cur_fam + ' (' + fam_syms + ')'
    ind_names[fam_ind] = fam_syms + ' (' + cur_fam + ')'


100049076&100653061&11042&387036&653188
100093631&2970
100101478&767811
100125556&55199&653113
100128675&105373804
100129213&389834
100129269&105379331&107984772&346653
100130086&100506164
100130520&643382
100131067&101928238&102467003&104355149&201134&84310
100131205&6144
100131539&100132396
100131626&105376489&105379724&107985887&400618
100131655&101927380&107985083
100131998&101060405
100132406&101060684&284565
100132464&387742
100132476&100996750
100132565&283768
100132979&390535&729786
100133050&102725009
100133161&105377826&107984841
100133220&101927601&102724093&102724117&374650&440295&642402&643707&647042
100188953&113444
100271702&196047
100271836&641298
100271874&441208&646762
100287226&7652
100288072&105371187
100288332&101059938&101059953&613037&642778&642799&9284
100288380&89838
100288527&100288562&101929601&101929627&101930111&254958
100288637&85463
100288695&96626
100288966&317754
100289087&7258&728137&728395&728403
100289211&100507639&101927467&105371953&105377237
10028

TypeError: expected string or buffer

In [35]:
np.where(gs.index.str.contains('&').values.astype(bool))


(array([   13,    20,    26, ..., 18431, 18553, 18554]),)

In [52]:
gs.index

Index([                    u'1',                    u'10',
                         u'100',                  u'1000',
                       u'10000',             u'100009667',
                       u'10001',                 u'10002',
                       u'10003',             u'100033411',
       ...
                        u'9993',                  u'9994',
                        u'9997',                   u'BFP',
                         u'GFP', u'LUCIFERASE&Luciferase',
              u'LacZ&lacZsigma',                   u'RFP',
                        u'eGFP',                  u'lacZ'],
      dtype='object', length=18558)