# EVALUATE DEFCOM AND EPITOME

This notebook showcases the defcom and epitome evaluation functions. It assumes that you have installed defcom as described in evaluation/defcom/README.md and that you have already run the evaluation/defcom/get_defcom_data.py script. Some values in this notebook depend on your local setup and may need to be changed, for example the path you downloaded your defcom data to.

In [None]:
# User-specific locations; adjust these before running the rest of the notebook.

# Epitome inputs: configuration file and DeepSEA training labels.
config_path = "../../config.yml"
deepsea_labels_path = "../../data/deepsea_data/deepsea_labels_train/"

# TODO: point this at the OUTPUT_PATH you passed to
# 'get_defcom_data.py [-h] --output_path OUTPUT_PATH'
defcom_data_path = "/data/akmorrow/defcom_data/"

## Imports

In [None]:
# let's do all the necessary imports
from constants import *
from defcom_functions import *
from visualization import *
import pandas as pd
import os
# let's import the rest of the epitome library
import sys
sys.path.insert(0,'../../')

# from epitome.constants import *
# from epitome.models import *
# from epitome.generators import *
# from epitome.functions import *
# import yaml
#from epitome.viz import *

## Create User Directories

In [None]:
# Create the directories that hold DeFCoM configs, results and models.

defcom_config_dir = os.path.join(defcom_data_path, "defcom_configs")
defcom_results_dir = os.path.join(defcom_data_path, "defcom_results")
defcom_models_dir = os.path.join(defcom_data_path, "defcom_models")

# os.makedirs(..., exist_ok=True) is a no-op for directories that already
# exist and also creates defcom_data_path itself when it is missing, which
# the previous exists()/mkdir() pair could not do.
for required_dir in (defcom_config_dir, defcom_results_dir, defcom_models_dir):
    os.makedirs(required_dir, exist_ok=True)


## Evaluate Defcom model on defcom data

In [None]:

# Train one DeFCoM model per (training cell type, TF) pair, then run that
# model's predictions for every query cell type.

# DNase BAM file for each cell type, keyed by the DeFCoM cell-type name.
bam_files = {
    'k562':    os.path.join(defcom_data_path, 'wgEncodeOpenChromDnaseK562AlnRep1.bam'),
    'gm12878': os.path.join(defcom_data_path, 'wgEncodeOpenChromDnaseGm12878AlnRep2.bam'),
    'hepg2':   os.path.join(defcom_data_path, 'wgEncodeOpenChromDnaseHepg2AlnRep3.bam'),
    'h1hesc':  os.path.join(defcom_data_path, 'wgEncodeOpenChromDnaseH1hescAlnRep1.bam'),
}

# Output locations are identical for every run, so build them once.
model_out = os.path.join(defcom_data_path, 'defcom_models')
results_out = os.path.join(defcom_data_path, 'defcom_results')
config_out = os.path.join(defcom_data_path, 'defcom_configs')

for train_cell_type in DEFCOM_CELLS:
    for tf in DEFCOM_TFS:

        # defcom is missing data for K562 for Sp1
        if tf == 'sp1' and train_cell_type == 'k562':
            continue

        active_sites_file = os.path.join(defcom_data_path, tf + '_' + train_cell_type + '_pos_train.bed')
        inactive_sites_file = os.path.join(defcom_data_path, tf + '_' + train_cell_type + '_neg_train.bed')
        training_bam_file = bam_files[train_cell_type]

        # createConfigFile also takes query-side arguments when we build the
        # training config. Use the training cell type itself as the
        # placeholder: the prediction loop below needs its candidate file
        # anyway. Previously the placeholder was a variable that the
        # prediction loop rebinds, so it changed between iterations and could
        # refer to the missing sp1/k562 candidate file.
        placeholder_cell_type = train_cell_type
        placeholder_sites_file = os.path.join(defcom_data_path, tf + '_' + placeholder_cell_type + '_all_valid.bed')

        # generate the config file used for training
        config_file, prediction_results_file = createConfigFile(train_cell_type,
                                                        tf,
                                                        placeholder_cell_type,
                                                        active_sites_file,
                                                        inactive_sites_file,
                                                        placeholder_sites_file,
                                                        training_bam_file,
                                                        bam_files[placeholder_cell_type],
                                                        model_out,
                                                        results_out,
                                                        config_out)

        print('Training {} {}'.format(train_cell_type, tf))
        train_defcom(config_file)

        for query_cell_type in DEFCOM_CELLS:

            # defcom is missing data for K562 for Sp1
            if tf == 'sp1' and query_cell_type == 'k562':
                continue

            candidate_sites_file = os.path.join(defcom_data_path, tf + '_' + query_cell_type + '_all_valid.bed')
            candidate_bam_file = bam_files[query_cell_type]

            # generate a config file for this (train, tf, query) combination
            config_file, prediction_results_file = createConfigFile(train_cell_type,
                                                            tf,
                                                            query_cell_type,
                                                            active_sites_file,
                                                            inactive_sites_file,
                                                            candidate_sites_file,
                                                            training_bam_file,
                                                            candidate_bam_file,
                                                            model_out,
                                                            results_out,
                                                            config_out)

            print('Predicting {} {} '.format(query_cell_type, tf))
            predict_defcom(config_file)


## Load in Epitome Data

In [None]:


# Read the epitome configuration from the user-supplied path.
# NOTE(review): yaml, load_deepsea_label_data and Dataset are not imported by
# the visible imports cell (the epitome imports there are commented out);
# confirm they are provided by one of the star imports.
with open(config_path) as config_handle:
    config = yaml.safe_load(config_handle)

# Load the three DeepSEA label splits and key them by Dataset member.
train_data, valid_data, test_data = load_deepsea_label_data(deepsea_labels_path)
dataset_split = dict(zip(
    (Dataset.TRAIN, Dataset.VALID, Dataset.TEST),
    (train_data, valid_data, test_data),
))

# (start, end) index pairs, later passed to the model as split_indices.
# Author's notes: train start, train end, cv end, test end;
# (start, chrom_6 - 10,000), (chrom_6 - 10,000, rest of chrom_6), (chrom 7, 8, 9)
# Previously used values: [(0, 2159308), (2159309, 2169309), (2169309, 2536878)]
indices = [(0, 2169308), (2169309, 2309366), (2309367, 2608182)]

In [None]:
#
# Evaluate Epitome model on defcom data
#
# For every TF, and for every query cell type, train an Epitome model with
# the query cell type held out, score the matching DeFCoM candidate-site file,
# and save the scores as a CSV.
#

# DataFrame.to_csv does not create missing directories, so make the output
# folder up front instead of failing after the first (long) training run.
epitome_preds_dir = os.path.join(defcom_results_dir, 'epitome_single_tf_model_preds')
os.makedirs(epitome_preds_dir, exist_ok=True)

# each entry pairs the Epitome name of a TF / cell type with its DeFCoM name
for tf_EPITOME_NAME, tf_DEFCOM_NAME in EPITOME_AND_DEFCOM_TFS:

    epitome_tf = [tf_EPITOME_NAME]
    matrix, cellmap, assaymap = get_assays_from_feature_file(
                                    eligible_assays = epitome_tf,
                                    eligible_cells = EPITOME_CELLS,
                                    min_cells_per_assay = 2,
                                    min_assays_per_cell= 2)

    for query_cell_EPITOME_NAME, query_cell_DEFCOM_NAME in EPITOME_AND_DEFCOM_CELLS:

        epitome_model = MLP(data = dataset_split, # I'm assuming this is data
                    test_celltypes = [query_cell_EPITOME_NAME], # cell line reserved for testing
                    matrix = matrix,
                    assaymap = assaymap,
                    cellmap = cellmap,
                    shuffle_size=2,
                    prefetch_size = 64,
                    debug = False,
                    batch_size = 64,
                    radii=[1,3,10,30],
                    split_indices = indices
                    )

        print('Training {} | {}'.format('joint', tf_EPITOME_NAME))
        # NOTE(review): 100 is presumably the number of training iterations - confirm
        epitome_model.train(100)

        print('Predicting {} | {}'.format(query_cell_EPITOME_NAME, tf_EPITOME_NAME))
        # path to DeFCoM bed file:
        peak_file = os.path.join(defcom_data_path,'{}_{}_all_valid.bed'.format(tf_DEFCOM_NAME, query_cell_DEFCOM_NAME))

        peak_result = epitome_model.score_peak_file(peak_file) # score the peak file
        peak_result.to_csv(os.path.join(epitome_preds_dir, '{}_{}_{}.csv'.format('joint', tf_DEFCOM_NAME, query_cell_DEFCOM_NAME)), index=False)

In [None]:
#
# Evaluate the Defcom model in this cell
#
# Computes auROC and auPR for every (training cell, TF, query cell)
# combination and writes them to
# <defcom_results_dir>/eval_results/defcom_functions_test_results.csv.
#

# this code assumes the DeFCoM predictions were already written to the
# defcom_results folder under defcom_data_path; please change things as appropriate

result_columns = ['model','training_cell','transcription_factor','query_cell','auROC','auPR']

# Collect one dict per evaluation and build the dataframe once at the end
# (DataFrame.append was removed in pandas 2.0 and was quadratic anyway).
eval_rows = []

# for each cell type
for train_cell in DEFCOM_CELLS:
    # for each transcription factor
    for tf in DEFCOM_TFS:
        for query_cell in DEFCOM_CELLS:

            # we are missing data for sp1 in K562: no model is trained on it
            # and there is nothing to query, so skip it on either side
            if tf == 'sp1' and 'k562' in (train_cell, query_cell):
                continue

            # get names of files we'll need
            prediction_results_file = os.path.join(defcom_data_path, 'defcom_results/{}_{}_{}_results.bed'.format(train_cell, tf, query_cell))
            pos_file = os.path.join(defcom_data_path, '{}_{}_pos_valid.bed'.format(tf, query_cell))
            # evaluate the model
            auROC, auPRC = evaluateDefcomResults(tf, prediction_results_file, pos_file)
            # record this evaluation
            eval_rows.append({'model':'defcom_{}'.format(train_cell),
                              'training_cell': train_cell,
                              'transcription_factor': tf,
                              'query_cell': query_cell,
                              'auROC': auROC,
                              'auPR': auPRC})

# a dataframe with the evaluation results: auROC and auPR for each evaluation
eval_results_df = pd.DataFrame(eval_rows, columns=result_columns)

print(eval_results_df)

# to_csv does not create missing directories, so make sure the folder exists
eval_results_out_dir = os.path.join(defcom_results_dir, "eval_results")
os.makedirs(eval_results_out_dir, exist_ok=True)
eval_results_df.to_csv(os.path.join(eval_results_out_dir, "defcom_functions_test_results.csv"), index=False)

In [None]:
# Add the Epitome evaluation results to the DeFCoM results computed above and
# write the combined table to ../../results/eval_results/.
eval_results_df = pd.read_csv(os.path.join(defcom_results_dir, "eval_results", "defcom_functions_test_results.csv"))

# Collect one dict per successful evaluation and concatenate once at the end
# (DataFrame.append was removed in pandas 2.0).
epitome_rows = []

# for all the query cell types
for query_cell_epitome_name, query_cell in EPITOME_AND_DEFCOM_CELLS:
    # for all the transcription factors
    for tf_epitome_name, tf in EPITOME_AND_DEFCOM_TFS:
        # Read the predictions from where the Epitome scoring cell writes them
        # and the positives from the user-configured DeFCoM data path, rather
        # than from hard-coded ../../ locations.
        prediction_results_file = os.path.join(defcom_results_dir, 'epitome_single_tf_model_preds', '{}_{}_{}.csv'.format('joint', tf, query_cell))
        pos_file = os.path.join(defcom_data_path, '{}_{}_pos_valid.bed'.format(tf, query_cell))
        # try just in case we are missing data for one
        try:
            # evaluate the model
            auROC, auPR = evaluateEpitomeResults(tf_epitome_name, prediction_results_file, pos_file)
        # if something goes wrong we should know about it, but keep going
        except Exception as e:
            print('Unable to score predictions for TF: {} and query cell type {}. Exception: {}'.format(tf, query_cell, e))
            continue

        # record the results
        epitome_rows.append({'model':'epitome',
                             'training_cell': 'joint',
                             'transcription_factor': tf,
                             'query_cell': query_cell,
                             'auROC': auROC,
                             'auPR': auPR})

if epitome_rows:
    eval_results_df = pd.concat([eval_results_df, pd.DataFrame(epitome_rows)], ignore_index=True)

print(eval_results_df)

# The plotting cells read the combined table from this location.
combined_results_path = '../../results/eval_results/defcom_functions_test_results.csv'
os.makedirs(os.path.dirname(combined_results_path), exist_ok=True)
eval_results_df.to_csv(combined_results_path, index=False)

In [None]:
# Scatter plots of auROC built from the combined evaluation results.
defcom_eval_results = '../../results/eval_results/defcom_functions_test_results.csv'
save_plot_path = '../../results/plots/defcom_functions_test'

results = pd.read_csv(defcom_eval_results, index_col=False)

# leave out the rows for the Ep300 and Gabpa transcription factors
excluded_tfs = ['Ep300', 'Gabpa']
results = results[~results['transcription_factor'].isin(excluded_tfs)]

generateScatterPlots(results, 'auROC', save_plot_path)

In [None]:
# Comparative boxplots of auROC and auPR built from the combined evaluation results.
# Paths are relative to the repository root (../../), matching the cell that
# writes the combined CSV and the scatter-plot cell; the previous bare
# 'results/...' paths pointed at a location nothing in this notebook writes.
defcom_eval_results = '../../results/eval_results/defcom_functions_test_results.csv'

results = pd.read_csv(defcom_eval_results, index_col=False)
# leave out the rows for the Ep300 and Gabpa transcription factors
results = results.loc[ (results['transcription_factor'] != 'Ep300') & (results['transcription_factor'] != 'Gabpa')]

boxplot_dir = '../../results/plots/defcom_functions_test'
# NOTE(review): unclear whether generateComparativeBoxplot creates missing
# directories; creating the folder here is harmless either way.
os.makedirs(boxplot_dir, exist_ok=True)

save_plot_path = os.path.join(boxplot_dir, 'comparative_boxplot_auROC.png')
generateComparativeBoxplot(results, 'auROC', save_plot_path)
generateComparativeBoxplot(results, 'auPR', os.path.join(boxplot_dir, 'comparative_boxplot_auPR.png'))