In [8]:
import sys
import os
import argparse
import time
import pickle
import re
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
import tensorflow as tf

if "../modules" not in sys.path:
    sys.path.append("../modules")
import preprocess
from taigapy import TaigaClient

import demeter2

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

# ---- Notebook configuration -------------------------------------------------
# Relative tolerance on the change in cost, used as the convergence criterion
# of the blockwise coordinate descent procedure.
alt_tol = 1e-5
fit_CL_slopes = True 
# Seed sequence regions.
regions = [(10,17),(11,18)]
data_dir = './data/'

# Table listing the input datasets (read as tab-separated further down).
# data_files = '/data/data_files_Achilles'
data_files = '/test/data_files_Achilles_paths'
# Directory holding the fitted DEMETER2 result tables that are reloaded below.
D2_results_dir = '/results/kube_results/Ach_final/1/'

# Lookup from old to new names; applied to CCLE_ID values and matrix column
# names when the result tables are loaded.
new_name_map = pd.read_csv('/results/results/name_change_map.csv')
new_name_map_dict = dict(zip(new_name_map.old_name, new_name_map.new_name))

# data_files = '/test/data_files_DRIVE_paths'
# # data_files = '/test/data_files_DRIVE'
# # data_files = '/Users/jmmcfarl/CPDS/demeter2/data/data_files_DRIVE'
# D2_results_dir = '/results/kube_results/DRIVE_final/1/'

In [3]:
# Create the Taiga client from the first token file that exists (absolute path
# first, then the relative one). If neither file is present `tc` is left
# undefined, which only matters when a dataset actually has to be fetched
# from Taiga.
for _token_path in ('/taiga/token', 'taiga/token'):
    if os.path.exists(_token_path):
        tc = TaigaClient(token_path = _token_path)
        break
# tc = TaigaClient()

# One row per input dataset; rows that are entirely empty are discarded.
input_data = pd.read_csv(data_files, sep = '\t')
input_data = input_data.dropna(axis = 0, how = 'all')
def get_dset(input_info):
    """Load a single dataset described by one row of the data-files table.

    Parameters
    ----------
    input_info : mapping-like row (e.g. a pandas Series)
        If it has exactly one entry, that entry must be 'data_file_paths' and
        the dataset is read from that CSV path, using the first column as the
        index. Otherwise the row must provide 'data_set', 'data_file' and
        'version', and the dataset is fetched through the notebook-level
        Taiga client `tc`.

    Returns
    -------
    The loaded dataset (a DataFrame for the CSV case; whatever `tc.get`
    returns for the Taiga case).
    """
    # Local-file case: the row carries nothing but a path.
    if len(input_info) == 1:
        csv_path = input_info['data_file_paths']
        print('Loading dataset from: {}'.format(csv_path))
        return pd.read_csv(csv_path, index_col = 0)

    # Taiga case.
    set_name = input_info['data_set']
    file_name = input_info['data_file']
    version = int(input_info['version'])
    print('Fetching taiga dataset: {} file: {}, version: {}'.format(set_name, file_name, version))

    if pd.isnull(file_name):
        # No specific file given: request the dataset by name and version only.
        return tc.get(name = set_name, version = version)
    # A specific file is requested; note that only this call passes force=True.
    return tc.get(name = set_name, file = file_name, version = version, force = True)
# Load every dataset listed in the data-files table, in table order.
dsets = [get_dset(row) for _, row in input_data.iterrows()]
print('Read {} data files'.format(len(dsets)))


Loading dataset from: /data/achilles-98k-repcollapsed-lfc.csv
Loading dataset from: /data/achilles-55k-batch2-repcollapsed-lfc.csv
Loading dataset from: /data/achilles-55k-batch1-repcollapsed-lfc.csv
Read 3 data files


In [4]:
# sh_targets = tc.get(name='gpp-shrna-mapping-8759', version=2, file='shmap_19mer_noXLOC')
# shRNA -> gene mapping: rename the two columns to the names used downstream
# and discard rows that have no Gene_ID.
sh_targets = (
    pd.read_csv('/data/shRNA-mapping.csv')
    .rename(columns = {'Barcode Sequence': 'SEQ', 'Gene ID': 'Gene_ID'})
    .dropna(subset = ['Gene_ID'])
)

#load curated pos and negative control gene sets
print('Loading positive/negative control sets')
#load Entrez IDs for pos and neg con genes
# pos_con_genes = tc.get(name='demeter2-pos-neg-controls-a5c6', version=1, file='hart_pos_controls')['Gene_ID'].values
# neg_con_genes = tc.get(name='demeter2-pos-neg-controls-a5c6', version=1, file='hart_neg_controls')['Gene_ID'].values
pos_con_genes = pd.read_csv('/data/Hart-pos-controls.csv')['Gene_ID'].values
neg_con_genes = pd.read_csv('/data/Hart-neg-controls.csv')['Gene_ID'].values

# Every control gene is used for training, so the held-out "test" sets
# (controls minus training controls) come out empty.
train_pos_con_genes = pos_con_genes
train_neg_con_genes = neg_con_genes
test_pos_con_genes = np.setdiff1d(pos_con_genes, train_pos_con_genes)
test_neg_con_genes = np.setdiff1d(neg_con_genes, train_neg_con_genes)
print('Using {} positive and {} negative control genes for training'.format(
    len(train_pos_con_genes), len(train_neg_con_genes)))

#parse data
print('Making processed data')
data = preprocess.make_demeter2_data(dsets, sh_targets)

# Name vectors handed to the model, taken from the processed data.
data_names = dict(
    genes = data['unique_genes'],
    CLs = data['unique_CLs'],
    hps = data['unique_hp_seqs'],
    seeds = data['unique_seed_seqs'],
)


Loading positive/negative control sets
Using 217 positive and 926 negative control genes for training
Making processed data
Eliminating 552 promiscuous hairpins
Eliminated 3634/102295 non-targeting hairpins from map
Identified 334 gene families
Creating dataset with:
93863 hairpins
501 CLs
17507 genes
15135 seeds


In [5]:
# Re-import demeter2 so edits to the module are picked up without restarting
# the kernel, then build the model object on the processed data.
from importlib import reload
reload(demeter2)

test_inds = None
gene_controls = dict(
    pos = train_pos_con_genes,
    neg = train_neg_con_genes,
    pos_test = test_pos_con_genes,
    neg_test = test_neg_con_genes,
)
mod = demeter2.demeter(
    data['LFC_mats'],
    data['gene_matrix'],
    data['seed_matrix'],
    gene_sets = gene_controls,
    data_names = data_names,
    test_inds = test_inds,
)


In [6]:
def load_matrix(path):
    """Read a CSV matrix and index it by its first column, cast to strings.

    Parameters
    ----------
    path : str
        Path of the CSV file. Its first column holds the row labels.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, indexed by the first column converted to str
        (the index keeps the first column's header as its name).
    """
    mat = pd.read_csv(path)
    label_col = mat.columns[0]
    # Replace the column by name rather than writing through `.iloc[:, 0]`:
    # an in-place positional set of strings into a numeric column is an
    # upcasting setitem, which recent pandas versions warn about / reject.
    mat[label_col] = mat[label_col].astype(str)
    return mat.set_index(label_col)

In [9]:
# Restore previously fitted parameters from D2_results_dir into the model.
# Each table is checked against the model's own name vectors before its
# columns are assigned, so a mis-ordered or mismatched file fails loudly.

# --- per-cell-line parameters ------------------------------------------------
CL_data = pd.read_csv(os.path.join(D2_results_dir, 'CL_data.csv'))
CL_data['CCLE_ID'] = CL_data['CCLE_ID'].replace(new_name_map_dict)
CL_data.set_index('CCLE_ID', inplace = True)
# Put the rows in the model's cell-line order. `.loc` replaces the removed
# `.ix` indexer; unlike `.ix` it raises a KeyError if a cell line is missing
# from the file instead of silently producing a NaN row.
CL_data = CL_data.loc[mod.data_names['CLs']]
CL_data.reset_index(inplace=True)
np.testing.assert_array_equal(mod.data_names['CLs'],CL_data['CCLE_ID'])
# Values are passed as single-column (n, 1) arrays.
_=mod.sess.run(mod.gene_slope.assign(CL_data.gene_slope.values.reshape(-1,1)))

# --- per-cell-line, per-batch parameters -------------------------------------
# CL_batch_data = pd.read_csv(os.path.join(D2_results_dir, 'CL_batch_data.csv')).set_index('CCLE_ID')
CL_batch_data = pd.read_csv(os.path.join(D2_results_dir, 'CL_batch_data.csv'))
CL_batch_data['CCLE_ID'] = CL_batch_data['CCLE_ID'].replace(new_name_map_dict)
CL_batch_data.set_index('CCLE_ID', inplace = True)
# No reordering here: the file must already be in mod.all_CL_names order.
np.testing.assert_array_equal(mod.all_CL_names,CL_batch_data.index.values)
_=mod.sess.run(mod.CL_slope.assign(CL_batch_data.CL_slope.values.reshape(-1,1)))
_=mod.sess.run(mod.CL_offset.assign(CL_batch_data.CL_offset.values.reshape(-1,1)))
_=mod.sess.run(mod.CL_noise_vars.assign(CL_batch_data.noise_vars.values.reshape(-1,1)))

# --- per-hairpin parameters --------------------------------------------------
hp_data = pd.read_csv(os.path.join(D2_results_dir, 'hp_data.csv'))
np.testing.assert_array_equal(mod.data_names['hps'],hp_data['hp'])
_=mod.sess.run(mod.guide_Geff.assign(hp_data.Geff.values.reshape(-1,1)))
_=mod.sess.run(mod.guide_Seff.assign(hp_data.Seff.values.reshape(-1,1)))
_=mod.sess.run(mod.hairpin_unpred.assign(hp_data.unpred_offset.values.reshape(-1,1)))

# --- per-hairpin, per-batch parameters ---------------------------------------
hp_batch_data = pd.read_csv(os.path.join(D2_results_dir, 'hp_batch_data.csv')).set_index('hp')
np.testing.assert_array_equal(mod.all_hp_seqs,hp_batch_data.index.values)
_=mod.sess.run(mod.hairpin_offset.assign(hp_batch_data.hairpin_offset.values.reshape(-1,1)))


In [10]:
# Dense copies of the LFC matrices with every NaN replaced by 0; the original
# DataFrames in data['LFC_mats'] are left untouched.
LFC_mats_no_na = [LFC_mat.values.copy() for LFC_mat in data['LFC_mats']]
for cur in LFC_mats_no_na:
    cur[np.isnan(cur)] = 0

# NOTE(review): the evaluation masks are built from the NaN-filled matrices,
# not from the originals. Confirm that demeter2.make_eval_masks does not rely
# on NaNs to identify missing measurements.
train_eval_masks = demeter2.make_eval_masks(LFC_mats_no_na, None)

# Feed dict used for model evaluation: observation placeholders mapped to the
# filled matrices, merged with eval-mask placeholders mapped to their masks.
feed_dict = dict(zip(mod.obs, LFC_mats_no_na))
train_mask_dict = dict(zip(mod.eval_mask, train_eval_masks))
feed_dict = demeter2.merge_dicts(feed_dict, train_mask_dict)

In [11]:
def get_R2_df(mod, data, feeds = None):
    """Build a per-hairpin table of SSE, SST and R2 for the model's current parameters.

    Parameters
    ----------
    mod : model object
        Must expose `sess` (with a `run(fetches, feed_dict=...)` method) and the
        fetchables `shRNA_nLL`, `shRNA_oSS` (one entry per batch) and `R2`.
    data : dict
        Processed data; `data['LFC_mats']` is the list of per-batch DataFrames
        (hairpins in the index, one column per cell line).
    feeds : dict, optional
        Feed dict for the session. When omitted, the notebook-level `feed_dict`
        is used, so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by hairpin ('hp') with columns 'SSE', 'SST', 'n_CLs' and 'R2',
        where R2 = 1 - SSE / SST. Also prints the model's overall R2.
    """
    if feeds is None:
        feeds = feed_dict

    # Single graph evaluation for everything that is needed. mod.shRNA_R2 is
    # deliberately not fetched: the table is derived from nLL and oSS alone.
    shRNA_nLL, shRNA_SS, ovR2 = mod.sess.run(
        [mod.shRNA_nLL, mod.shRNA_oSS, mod.R2], feed_dict = feeds)
    print(ovR2)

    # One frame per batch, concatenated once at the end.
    # NOTE(review): weighting by n_CLs presumes the per-hairpin values are
    # averages over the batch's cell lines -- confirm against demeter2.
    per_batch = []
    for LFC_mat, batch_nLL, batch_SS in zip(data['LFC_mats'], shRNA_nLL, shRNA_SS):
        n_CLs = LFC_mat.shape[1]
        per_batch.append(pd.DataFrame({
            'hp': LFC_mat.index.values,
            'SSE': np.ravel(batch_nLL) * 2 * n_CLs, #LL are originally divided by 2
            'SST': np.ravel(batch_SS) * 2 * n_CLs,
            'n_CLs': n_CLs,
        }))
    tot_CLs = np.sum(np.array([x.shape[1] for x in data['LFC_mats']]))

    # Sum over the batches a hairpin appears in. Selecting the columns
    # explicitly keeps them flat and in a fixed order, so no positional
    # relabelling of aggregated columns is needed.
    df = pd.concat(per_batch).groupby('hp')[['SSE', 'SST', 'n_CLs']].sum()
    df[['SSE', 'SST']] = df[['SSE', 'SST']] / tot_CLs
    df['R2'] = 1 - df['SSE'] / df['SST']
    return df

In [12]:
# Load the fitted gene scores, align them to the model's cell lines, push them
# into the model and compute the per-hairpin R2 table for that state.
gene_means = load_matrix(os.path.join(D2_results_dir, 'gene_means.csv'))
gene_means.columns = gene_means.columns.to_series().replace(new_name_map_dict)
# Column selection in model order. `.loc` replaces the removed `.ix` indexer
# and raises a KeyError if a cell line is absent from the file.
gene_means = gene_means.loc[:, mod.data_names['CLs']]
np.testing.assert_array_equal(mod.data_names['CLs'],gene_means.columns.values)
# Missing scores are treated as 0.
gene_means = gene_means.fillna(0)
# The file has cell lines in columns; the model variable receives the transpose.
_=mod.sess.run(mod.gene_score.assign(gene_means.values.transpose()))

gene_R2_df = get_R2_df(mod, data)

-0.579312097121


In [13]:
# Load the fitted seed scores, align them to the model's cell lines and push
# them into the model.
seed_means = load_matrix(os.path.join(D2_results_dir, 'seed_means.csv'))
seed_means.columns = seed_means.columns.to_series().replace(new_name_map_dict)
# Column selection in model order. `.loc` replaces the removed `.ix` indexer
# and raises a KeyError if a cell line is absent from the file.
seed_means = seed_means.loc[:, mod.data_names['CLs']]
np.testing.assert_array_equal(mod.data_names['CLs'],seed_means.columns.values)
# Missing scores are treated as 0.
seed_means = seed_means.fillna(0)
_=mod.sess.run(mod.seed_score.assign(seed_means.values.transpose()))

# Zero out the gene scores (this overwrites the notebook-level `gene_means`),
# so the R2 table below is computed with seed scores set and gene scores at 0.
gene_means = gene_means*0
_=mod.sess.run(mod.gene_score.assign(gene_means.values.transpose()))

seed_R2_df = get_R2_df(mod, data)

0.759951044973


In [100]:
# Join the two per-hairpin tables on their index. Only SSE and R2 are taken
# from the seed table; the overlapping columns get '_gene' / '_seed' suffixes.
# The combined table is written next to the other result files.
seed_cols = seed_R2_df[['SSE', 'R2']]
comb_R2_df = gene_R2_df.merge(
    seed_cols,
    left_index = True,
    right_index = True,
    suffixes = ['_gene', '_seed'],
)
hp_R2_path = os.path.join(D2_results_dir, 'hp_R2_post.csv')
comb_R2_df.to_csv(hp_R2_path)
