In [1]:
from overlaps import (
    read_overlaps_from_file, merge_multiple_predictions, compute_crispr_overlaps
)
from misclassifications import get_misclassifications, label_misclassifications, LABELERS, groupby_gene
from main import OVERLAP_FILENAME
from df_loader import CrisprDFLoader, PredDFLoader
from schema import DFSchema
import pandas as pd
# Let pandas render up to 100 rows before truncating a displayed frame/series.
pd.options.display.max_rows = 100

In [2]:
# Input passed to CrisprDFLoader below.
CRISPR_FILENAME = (
    "/oak/stanford/groups/engreitz/Projects/Benchmarking/CRISPR_data/"
    "EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz"
)
# Input passed to PredDFLoader below.
PRED_FILENAME = (
    "/oak/stanford/groups/engreitz/Users/atan5133/abc_run_comparisons/"
    "results_10_27_dev/Predictions/"
    "EnhancerPredictionsAllPutative.tsv.gz"
)
# NOTE: this rebinds the OVERLAP_FILENAME imported from `main` in the import cell.
OVERLAP_FILENAME = "crispr_pred_overlaps_noqnorm.csv"
# Threshold handed to both PredDFLoader and merge_multiple_predictions.
ABC_THRESHOLD = 0.024
# TSS reference file handed to both loaders.
TSS_REF_FILE = (
    "resources/genome_annotations/"
    "RefSeqCurated.170308.bed.CollapsedGeneBounds.hg38.TSS500bp.bed"
)

In [3]:
# Alternative to recomputing everything below (currently disabled):
# overlaps = read_overlaps_from_file(OVERLAP_FILENAME)

# Load the prediction and CRISPR tables through their respective loaders.
pred_df = PredDFLoader(PRED_FILENAME, TSS_REF_FILE, ABC_THRESHOLD).load()
crispr_df = CrisprDFLoader(CRISPR_FILENAME, TSS_REF_FILE).load()

# Overlap the CRISPR table with the predictions, then merge the result
# using the same threshold that was given to the prediction loader.
overlap_df = compute_crispr_overlaps(crispr_df, pred_df)
overlaps = merge_multiple_predictions(overlap_df, ABC_THRESHOLD)

  pd.api.types.is_categorical_dtype(chrom_dtype),
  pd.api.types.is_categorical_dtype(chrom_dtype),
  df_input_2[overlap_df_idxs[:, 1] == -1] = None


In [14]:
# Derive the misclassification table from the merged overlaps.
misclass_df = get_misclassifications(overlaps)
# Return value is discarded; presumably this annotates misclass_df in place
# (later cells read *_Category columns from it) -- TODO confirm.
label_misclassifications(misclass_df)
# Allow up to 100 rows when a frame/series is displayed.
pd.options.display.max_rows = 100

In [15]:
# Print each labeler's category summary, followed by one blank separator line.
for labeler in LABELERS:
    print(labeler.summarize_category_count(misclass_df), end="\n\n")

FalsePos_Category
True    263
Name: count, dtype: int64

FalseNeg_Category
True    211
Name: count, dtype: int64

DistToTSSSize_Category
medium (<= 100000 bp)    233
large (> 100000 bp)      172
small (<= 10000 bp)       69
Name: count, dtype: int64

Top5Gene_Category
PLP2     10
MYC       8
PRDX2     8
HDAC6     6
PQBP1     5
Name: count, dtype: int64

from_mult_pred
True    39
Name: count, dtype: int64

EnhancerSize_Category
Normal    314
Large     160
Name: count, dtype: int64



In [22]:
# Break the "Large" enhancer misclassifications down into false positives and false negatives.
large_enhancer = misclass_df["EnhancerSize_Category"] == "Large"
# The explicit `== True` comparison is kept as-is: it yields a plain boolean mask
# whatever the column actually stores.
is_false_pos = misclass_df["FalsePos_Category"] == True
is_false_neg = misclass_df["FalseNeg_Category"] == True

false_pos = len(misclass_df[large_enhancer & is_false_pos])
false_neg = len(misclass_df[large_enhancer & is_false_neg])
print(f"False pos large enhancers: {false_pos}\nFalse neg large enhancers: {false_neg}")

False pos large enhancers: 129
False neg large enhancers: 31


In [17]:
# Look at MYC classification
def print_target_gene_misclassification(target_gene, misclass=None, all_pairs=None, gene_col=None):
    """Print how many pairs for ``target_gene`` were misclassified, and the percentage.

    Args:
        target_gene: Value to match in the gene column.
        misclass: Frame of misclassified pairs. Defaults to the notebook-level
            ``misclass_df``.
        all_pairs: Frame of all pairs. Defaults to the notebook-level ``overlaps``.
        gene_col: Name of the gene column present in both frames. Defaults to
            ``DFSchema.TARGET_GENE + DFSchema.CRISPR_SUFFIX``.

    Returns:
        None. The summary is written to stdout. If ``all_pairs`` has no rows for
        ``target_gene``, a short notice is printed instead of dividing by zero.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if misclass is None:
        misclass = misclass_df
    if all_pairs is None:
        all_pairs = overlaps
    if gene_col is None:
        gene_col = DFSchema.TARGET_GENE + DFSchema.CRISPR_SUFFIX

    incorrect = len(misclass[misclass[gene_col] == target_gene])
    total_pairs = len(all_pairs[all_pairs[gene_col] == target_gene])

    # A gene absent from all_pairs previously raised ZeroDivisionError below.
    if total_pairs == 0:
        print(f"No eg pairs found for {target_gene}.")
        return

    # int() truncates the percentage toward zero (e.g. 7.6% prints as 7%).
    print(f"Misclassified {incorrect}/{total_pairs} eg pairs for {target_gene}.\n{int(100 * incorrect/total_pairs)}% incorrect")

# Report the misclassification rate for each gene of interest.
for target_gene in ("MYC", "PLP2"):
    print_target_gene_misclassification(target_gene)

Misclassified 8/105 eg pairs for MYC.
7% incorrect
Misclassified 10/211 eg pairs for PLP2.
4% incorrect


In [18]:
# Example queries: two ways to count rows flagged True in a category column.
# 1) value_counts() tallies each distinct value; filter([True]) keeps only the
#    entry whose label is True.
print(misclass_df["FalsePos_Category"].value_counts().filter([True]))
print()
# 2) DataFrame.query() selects the rows where the flag equals True; len() counts them.
print(len(misclass_df.query("FalseNeg_Category == True")))

FalsePos_Category
True    263
Name: count, dtype: int64

211


In [19]:
# Width of every row in pred_df, computed as end minus start.
sizes = pred_df["end"].sub(pred_df["start"])
# Rows exactly 500 wide versus rows wider than 500;
# anything narrower than 500 is counted in neither tally.
num_normal = sizes.eq(500).sum()
num_large = sizes.gt(500).sum()

In [20]:
# Display the number of pred_df rows whose end - start equals exactly 500.
num_normal

7083290

In [21]:
# Display the number of pred_df rows whose end - start is greater than 500.
num_large

1662088