In [1]:
import bioframe as bf
import pandas as pd
import copy
import numpy as np
from schema import DFSchema
from df_loader import CrisprDFLoader, PredDFLoader
from overlaps import (
    read_overlaps_from_file, merge_multiple_predictions, compute_crispr_overlaps
)
# pd.set_option('display.max_rows', 1000) 

In [2]:
CRISPR_FILENAME = "/oak/stanford/groups/engreitz/Projects/Benchmarking/CRISPR_data/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz"
PRED_FILENAME = "/oak/stanford/groups/engreitz/Users/atan5133/abc_run_comparisons/results_no_qnorm_08_28_dev/Predictions/EnhancerPredictionsAllPutative.tsv.gz"
OVERLAP_FILENAME = "crispr_pred_overlaps_noqnorm.csv"
ABC_THRESHOLD = 0.02
TSS_REF_FILE = "resources/genome_annotations/RefSeqCurated.170308.bed.CollapsedGeneBounds.hg38.TSS500bp.bed"
TARGET_GENE = "PLP2"

In [3]:
pred_df = PredDFLoader(PRED_FILENAME, TSS_REF_FILE, ABC_THRESHOLD).load()
crispr_df = CrisprDFLoader(CRISPR_FILENAME, TSS_REF_FILE).load()
# overlap_df = compute_crispr_overlaps(
#     crispr_df, pred_df
# )

In [4]:
overlap_df = read_overlaps_from_file(OVERLAP_FILENAME)

In [5]:
df = merge_multiple_predictions(overlap_df, ABC_THRESHOLD)

In [6]:
def format_distance(bp):
    kb = bp / 1000
    mb = kb / 1000

    if mb >= 1:
        return f"{mb:.2f} Mbp"
    elif kb >= 1:
        return f"{kb:.2f} Kbp"
    else:
        return f"{bp} bp"

In [7]:
gene_df = df[df["TargetGene_crispr"] == TARGET_GENE]
# pretty print distance
gene_df.loc[:,'distance_pred'] = gene_df['distance_pred'].apply(format_distance)

In [31]:
# Sorted by top 10 contact
gene_df_sorted = gene_df.sort_values(by="hic_contact_pl_scaled_adj_pred", ascending=False)
columns_pred = ['name', 'distance', 'activity_base', 'hic_contact', 'ABC.Score', 'IsSignificant', 'hic_contact_pl_scaled_adj','ABC.Score.Numerator', 'normalized_dhs', 'isSelfPromoter', 'powerlaw_contact', 'powerlaw_contact_reference','hic_contact_pl_scaled']
columns = ["dataset_crispr", "start_crispr", "end_crispr", "pValueAdjusted_crispr", "EffectSize_crispr", "IsSignificant_crispr"] + [col+'_pred' for col in columns_pred]
gene_df_sorted[:10][columns]

Unnamed: 0,dataset_crispr,start_crispr,end_crispr,pValueAdjusted_crispr,EffectSize_crispr,IsSignificant_crispr,name_pred,distance_pred,activity_base_pred,hic_contact_pred,ABC.Score_pred,IsSignificant_pred,hic_contact_pl_scaled_adj_pred,ABC.Score.Numerator_pred,normalized_dhs_pred,isSelfPromoter_pred,powerlaw_contact_pred,powerlaw_contact_reference_pred,hic_contact_pl_scaled_pred
10009,FlowFISH_K562,49167166,49167746,1.103467e-121,-0.378592,True,genic|chrX:49166324-49167829,4.76 Kbp,24.772761,0.087295,0.191596,True,0.271877,6.735153,12.623233,False,0.048166,0.123436,0.223711
10008,FlowFISH_K562,49166366,49166866,8.919537e-31,-0.220761,True,genic|chrX:49166324-49167829,4.76 Kbp,24.772761,0.087295,0.191596,True,0.271877,6.735153,12.623233,False,0.048166,0.123436,0.223711
10010,FlowFISH_K562,49177416,49177916,0.03925845,-0.054055,True,genic|chrX:49177318-49177903,5.77 Kbp,1.7288,0.072017,0.011304,False,0.229846,0.397358,3.155808,False,0.041681,0.108903,0.188166
10007,FlowFISH_K562,49163947,49164447,2.648354e-15,-0.14165,True,genic|chrX:49163587-49164440,7.82 Kbp,10.31196,0.039454,0.040513,True,0.138105,1.424133,14.035042,False,0.030728,0.08363,0.107377
10006,FlowFISH_K562,49161593,49162093,0.841184,0.031541,False,intergenic|chrX:49161552-49162052,10.03 Kbp,2.985601,0.039454,0.011463,False,0.134961,0.40294,1.411809,False,0.02393,0.067344,0.111031
10005,FlowFISH_K562,49155148,49156608,1.577397e-19,-0.115462,True,intergenic|chrX:49155190-49156881,15.80 Kbp,39.711592,0.022662,0.093715,True,0.082957,3.294344,34.215605,False,0.015167,0.045368,0.06779
10003,FlowFISH_K562,49148590,49149090,4.377685e-05,-0.232888,True,intergenic|chrX:49148373-49149527,22.89 Kbp,19.348904,0.017966,0.036846,True,0.06694,1.295223,5.647236,False,0.010454,0.032868,0.056486
10004,FlowFISH_K562,49150250,49150750,0.3981444,0.133775,False,intergenic|chrX:49150333-49150833,21.25 Kbp,2.743047,0.016441,0.004872,False,0.062441,0.17128,0.581333,False,0.011261,0.035055,0.05118
10001,FlowFISH_K562,49120032,49120532,0.6719259,-0.04083,False,genic|chrX:49119614-49120533,51.76 Kbp,0.627102,0.016115,0.001091,False,0.061147,0.038345,1.245714,False,0.004606,0.016159,0.056541
10011,FlowFISH_K562,49188652,49189152,0.3981444,0.031456,False,genic|chrX:49188676-49189176,17.09 Kbp,2.905495,0.013501,0.004532,False,0.054833,0.159316,1.910094,False,0.014017,0.042375,0.040815


In [33]:
# top 15 effect sizes
gene_df.sort_values(by="EffectSize_crispr", ascending=False)[columns][:15]

Unnamed: 0,dataset_crispr,start_crispr,end_crispr,pValueAdjusted_crispr,EffectSize_crispr,IsSignificant_crispr,name_pred,distance_pred,activity_base_pred,hic_contact_pred,ABC.Score_pred,IsSignificant_pred,hic_contact_pl_scaled_adj_pred,ABC.Score.Numerator_pred,normalized_dhs_pred,isSelfPromoter_pred,powerlaw_contact_pred,powerlaw_contact_reference_pred,hic_contact_pl_scaled_pred
9966,FlowFISH_K562,48800418,48800918,7.761674e-22,0.225423,True,intergenic|chrX:48800213-48801055,371.20 Kbp,42.260009,0.00247,0.014341,False,0.011929,0.504134,43.184744,False,0.000637,0.002911,0.011293
9962,FlowFISH_K562,48793298,48793798,9.751705e-08,0.138006,True,genic|chrX:48793366-48793866,378.22 Kbp,1.536079,0.000815,0.000191,False,0.004361,0.006699,1.494857,False,0.000625,0.002864,0.003736
10004,FlowFISH_K562,49150250,49150750,0.3981444,0.133775,False,intergenic|chrX:49150333-49150833,21.25 Kbp,2.743047,0.016441,0.004872,False,0.062441,0.17128,0.581333,False,0.011261,0.035055,0.05118
9959,FlowFISH_K562,48782797,48783297,4.009294e-10,0.122494,True,intergenic|chrX:48782830-48783330,388.76 Kbp,14.889316,0.002959,0.006024,False,0.014222,0.211757,25.08037,False,0.000608,0.002796,0.013614
9961,FlowFISH_K562,48789397,48789977,1.859083e-07,0.107275,True,genic|chrX:48789486-48789986,382.10 Kbp,6.395206,0.001109,0.001038,False,0.005708,0.036502,4.318474,False,0.000618,0.002839,0.005089
9902,FlowFISH_K562,47880786,47881326,0.05739911,0.103834,False,,nan bp,,,,False,,,,,,,
9881,FlowFISH_K562,47595586,47596086,0.6748666,0.084979,False,,nan bp,,,,False,,,,,,,
9960,FlowFISH_K562,48784617,48785277,2.438952e-05,0.081854,True,,nan bp,,,,False,,,,,,,
9971,FlowFISH_K562,48864144,48864644,0.05740783,0.079294,False,intergenic|chrX:48864121-48864621,307.46 Kbp,1.373911,0.000598,0.000134,False,0.003434,0.004719,1.993142,False,0.000769,0.00343,0.002665
9964,FlowFISH_K562,48794818,48795318,0.002922221,0.07589,True,genic|chrX:48794178-48795650,376.92 Kbp,27.622858,0.000815,0.003427,False,0.004362,0.120483,22.588943,False,0.000627,0.002873,0.003735


In [28]:
# Total Significant enhancers: 14
print(len(gene_df[gene_df["IsSignificant_crispr"]]))

# Total predicted signficant enhancers: 6
print(len(gene_df[gene_df["IsSignificant_pred"]]))

# This shows that normalizing across E-G pairs doesn't work for cases like this.
# We miss out on a lot of predictions

14
6
