In [3]:
from overlaps import (
    read_overlaps_from_file, merge_multiple_predictions, compute_crispr_overlaps
)
from misclassifications import get_misclassifications, label_misclassifications, LABELERS, groupby_gene
from main import OVERLAP_FILENAME
from df_loader import CrisprDFLoader, PredDFLoader
from schema import DFSchema
import pandas as pd
# Let DataFrame displays in this notebook show up to 100 rows.
pd.set_option('display.max_rows', 100) 

In [4]:
# Input locations and the score cutoff consumed by the loading cell.
# NOTE(review): the two data files are absolute cluster paths, so the notebook
# only runs where those paths are mounted.

# File handed to CrisprDFLoader.
CRISPR_FILENAME = "/oak/stanford/groups/engreitz/Projects/Benchmarking/CRISPR_data/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz"
# File handed to PredDFLoader.
PRED_FILENAME = "/oak/stanford/groups/engreitz/Users/atan5133/abc_run_comparisons/ABC-Enhancer-Gene-Prediction/results/K562/Predictions/EnhancerPredictionsAllPutative.tsv.gz"
# Score cutoff passed to both PredDFLoader and merge_multiple_predictions.
THRESHOLD = 0.016
# TSS reference file passed to both loaders (relative path).
TSS_REF_FILE = "resources/genome_annotations/RefSeqCurated.170308.bed.CollapsedGeneBounds.hg38.TSS500bp.bed"

In [7]:
# Load predictions and CRISPR data, overlap them, then merge multiple predictions.
# Alternative kept from the original cell (presumably reads a saved overlap table):
#   overlaps = read_overlaps_from_file(OVERLAP_FILENAME)

# The same score column name is needed by the loader and by the merge step.
SCORE_COL = "powerlaw.Score"

pred_df = PredDFLoader(
    PRED_FILENAME,
    TSS_REF_FILE,
    THRESHOLD,
    score_col=SCORE_COL,
).load()
crispr_df = CrisprDFLoader(
    CRISPR_FILENAME,
    TSS_REF_FILE,
).load()

overlap_df = compute_crispr_overlaps(crispr_df, pred_df)
overlaps = merge_multiple_predictions(overlap_df, THRESHOLD, SCORE_COL)

  pd.api.types.is_categorical_dtype(chrom_dtype),
  pd.api.types.is_categorical_dtype(chrom_dtype),
  df_input_2[overlap_df_idxs[:, 1] == -1] = None


In [8]:
# Show the first 20 rows of a hand-picked subset of overlap_df columns.
preview_columns = [
    'chrom_pred',
    'start_pred',
    'end_pred',
    'name_pred',
    'distance_pred',
    'normalized_dhs_pred',
    'activity_base_pred',
    'IsSignificant_crispr',
    'IsSignificant_pred',
    'powerlaw.Score_pred',
]
overlap_df[preview_columns].head(20)

Unnamed: 0,chrom_pred,start_pred,end_pred,name_pred,distance_pred,normalized_dhs_pred,activity_base_pred,IsSignificant_crispr,IsSignificant_pred,powerlaw.Score_pred
0,chr1,3774743.0,3775877.0,genic|chr1:3774743-3775877,81923.0,40.860043,40.860043,True,True,0.047217
1,chr1,3774743.0,3775877.0,genic|chr1:3774743-3775877,21194.0,40.860043,40.860043,True,True,0.146263
2,chr1,3803896.0,3804664.0,intergenic|chr1:3803896-3804664,7776.0,3.472462,3.472462,False,True,0.03471
3,chr1,3804903.0,3805717.0,intergenic|chr1:3804903-3805717,8806.0,3.293819,3.293819,False,True,0.028986
4,chr1,3774743.0,3775877.0,genic|chr1:3774743-3775877,2524.0,40.860043,40.860043,True,True,0.443125
5,chr1,3803896.0,3804664.0,intergenic|chr1:3803896-3804664,31494.0,3.472462,3.472462,False,False,0.005719
6,chr1,3804903.0,3805717.0,intergenic|chr1:3804903-3805717,32524.0,3.293819,3.293819,False,False,0.005249
7,,,,,,,,False,False,
8,chr1,5304582.0,5305082.0,intergenic|chr1:5304582-5305082,894787.0,2.156485,2.156485,False,False,0.000147
9,chr1,8197384.0,8198250.0,intergenic|chr1:8197384-8198250,236164.0,20.158125,20.158125,False,False,0.005993


In [4]:
# Collect the misclassified rows from `overlaps` and label them.
# The return value of label_misclassifications is not used; later cells read
# *_Category columns from misclass_df, so it presumably adds them in place.
# NOTE(review): `overlaps` comes from the loading cell, and the saved execution
# counts are out of order -- re-run top to bottom before trusting the outputs.
misclass_df = get_misclassifications(overlaps)
label_misclassifications(misclass_df)

# Let DataFrame displays show up to 100 rows.
pd.set_option("display.max_rows", 100)

In [5]:
# Print every labeler's category counts, each followed by one blank line.
for category_labeler in LABELERS:
    print(category_labeler.summarize_category_count(misclass_df), end="\n\n")

FalsePos_Category
True    343
Name: count, dtype: int64

FalseNeg_Category
True    220
Name: count, dtype: int64

DistToTSSSize_Category
medium (<= 100000 bp)    288
large (> 100000 bp)      193
small (<= 10000 bp)       82
Name: count, dtype: int64

Top5Gene_Category
HDAC6      10
PRDX2      10
PQBP1       9
PLP2        9
HNRNPA1     8
Name: count, dtype: int64

from_mult_pred
True    31
Name: count, dtype: int64

EnhancerSize_Category
Large     418
Normal    145
Name: count, dtype: int64



In [6]:
# Among rows labelled "Large", count how many are flagged as false positives
# and how many as false negatives.
is_large = misclass_df["EnhancerSize_Category"].eq("Large")
is_false_pos = misclass_df["FalsePos_Category"].eq(True)
is_false_neg = misclass_df["FalseNeg_Category"].eq(True)

# Summing a boolean mask counts the rows that the mask would select.
false_pos = int((is_large & is_false_pos).sum())
false_neg = int((is_large & is_false_neg).sum())
print(f"False pos large enhancers: {false_pos}\nFalse neg large enhancers: {false_neg}")

False pos large enhancers: 304
False neg large enhancers: 114


In [28]:
# Rows of overlap_df whose CRISPR target gene is GATA1, sorted by
# activity_base_pred in descending order; display the first 50.
target_gene_col = DFSchema.TARGET_GENE + DFSchema.CRISPR_SUFFIX
gata_1 = (
    overlap_df
    .loc[overlap_df[target_gene_col] == "GATA1"]
    .sort_values(by="activity_base_pred", ascending=False)
)
gata_1[
    [
        "activity_base_pred",
        "hic_contact_pl_scaled_adj_pred",
        "ABC.Score_pred",
        "Regulated_crispr",
    ]
].head(50)


Unnamed: 0,activity_base_pred,hic_contact_pl_scaled_adj_pred,ABC.Score_pred,Regulated_crispr
9927,65.643916,0.006329,0.009888,False
9937,52.973323,0.00294,0.003706,False
9895,40.108691,0.027504,0.026253,False
9894,40.108691,0.027504,0.026253,False
9873,37.37411,0.038451,0.034201,False
9874,37.37411,0.038451,0.034201,False
9867,31.472263,0.010772,0.008068,False
9793,28.581401,0.000192,0.00013,False
9896,24.908039,0.013216,0.007834,False
9831,24.068452,0.005631,0.003226,False


In [7]:
# Per-gene misclassification summary (first used to look at GATA1).
def print_target_gene_misclassification(target_gene):
    """Print how many eg pairs for ``target_gene`` were misclassified.

    Reads the notebook-level ``misclass_df`` and ``overlaps`` frames and, in
    each, counts the rows whose CRISPR target-gene column
    (``DFSchema.TARGET_GENE + DFSchema.CRISPR_SUFFIX``) equals ``target_gene``.

    Args:
        target_gene: Gene name to match against the CRISPR target-gene column.

    Returns:
        None. The summary is printed. The percentage is truncated to a whole
        number by ``int``, not rounded.

    If ``overlaps`` has no rows for the gene, a short notice is printed
    instead of raising ``ZeroDivisionError``.
    """
    gene_col = DFSchema.TARGET_GENE + DFSchema.CRISPR_SUFFIX
    incorrect = int((misclass_df[gene_col] == target_gene).sum())
    total_pairs = int((overlaps[gene_col] == target_gene).sum())

    # A gene absent from `overlaps` would otherwise divide by zero below.
    if total_pairs == 0:
        print(f"No eg pairs found for {target_gene}.")
        return

    print(f"Misclassified {incorrect}/{total_pairs} eg pairs for {target_gene}.\n{int(100 * incorrect/total_pairs)}% incorrect")

# Print the misclassification summary for GATA1, then for PLP2.
for gene_name in ("GATA1", "PLP2"):
    print_target_gene_misclassification(gene_name)

Misclassified 6/212 eg pairs for GATA1.
2% incorrect
Misclassified 9/211 eg pairs for PLP2.
4% incorrect


In [18]:
# Example queries
print(misclass_df["FalsePos_Category"].value_counts().filter([True]))
print()
print(len(misclass_df.query("FalseNeg_Category == True")))

FalsePos_Category
True    263
Name: count, dtype: int64

211


In [19]:
# Width (end - start) of every row in pred_df, then how many rows are exactly
# at the reference width and how many are wider.
NORMAL_SIZE = 500  # width counted as "normal"; anything larger counts as "large"

sizes = pred_df["end"].sub(pred_df["start"])
num_normal = sizes.eq(NORMAL_SIZE).sum()
num_large = sizes.gt(NORMAL_SIZE).sum()

In [20]:
# Display the number of pred_df rows whose end - start equals 500.
num_normal

7083290

In [21]:
# Display the number of pred_df rows whose end - start is greater than 500.
num_large

1662088