In [2]:
import pandas as pd

from df_loader import PredDFLoader, CrisprDFLoader
from overlaps import (compute_all_overlaps, read_overlaps_from_file,
                      write_overlaps_to_file)
from schema import DFSchema

In [3]:
CRISPR_FILENAME = "resources/example/EPCrisprBenchmark_Fulco2019_K562_GRCh38.tsv.gz"
DEV_FILENAME = "resources/personal/dev_predictions.txt.gz"
KRISTY_FILENAME = "resources/personal/kristy_predictions.txt.gz"
TSS_REF_FILE = "resources/genome_annotations/RefSeqCurated.170308.bed.CollapsedGeneBounds.hg38.TSS500bp.bed"
ABC_THRESHOLD = 0.02


In [4]:
def get_overlap_df(filename):
    pred_df = PredDFLoader(filename, TSS_REF_FILE, ABC_THRESHOLD).load()
    crispr_df = CrisprDFLoader(CRISPR_FILENAME, TSS_REF_FILE).load()
    overlaps = compute_all_overlaps(
        crispr_df, pred_df, suffixes=[DFSchema.CRISPR_SUFFIX, DFSchema.PRED_SUFFIX]
    )
    overlaps = compute_all_overlaps(crispr_df, pred_df)
    return overlaps

In [8]:
dev_overlaps = get_overlap_df(DEV_FILENAME)

In [9]:
kristy_overlaps = get_overlap_df(KRISTY_FILENAME)

In [18]:
def pairs_in_common(df_a, df_b):
    overlaps = compute_all_overlaps(
        df_a, df_b
    )
    return overlaps
    

In [19]:
df_a = PredDFLoader(DEV_FILENAME, TSS_REF_FILE, ABC_THRESHOLD).load()
df_b = PredDFLoader(KRISTY_FILENAME, TSS_REF_FILE, ABC_THRESHOLD).load()
common = pairs_in_common(df_a, df_b)

In [23]:
pd.set_option('display.max_rows', 20)

In [26]:
common[common['name_a']=='intergenic|chr2:37189681-37190250'][['name_a', 'name_b', 'start_a', 'start_b', 'powerlaw.Score_a', 'powerlaw.Score_b', 'TargetGene_a', 'TargetGene_b']]

Unnamed: 0,name_a,name_b,start_a,start_b,powerlaw.Score_a,powerlaw.Score_b,TargetGene_a,TargetGene_b
1,intergenic|chr2:37189681-37190250,intergenic|chr2:37189681-37190250,37189681,37189681,0.000045,0.000045,LOC102723824,LOC102723824
508258,intergenic|chr2:37189681-37190250,intergenic|chr2:37189681-37190250,37189681,37189681,0.000002,0.000002,YIPF4,YIPF4
560733,intergenic|chr2:37189681-37190250,intergenic|chr2:37189681-37190250,37189681,37189681,0.000011,0.000011,SRSF7,SRSF7
807068,intergenic|chr2:37189681-37190250,intergenic|chr2:37189681-37190250,37189681,37189681,0.000088,0.000088,GEMIN6,GEMIN6
1125368,intergenic|chr2:37189681-37190250,intergenic|chr2:37189681-37190250,37189681,37189681,0.000038,0.000038,LOC728730,LOC728730
...,...,...,...,...,...,...,...,...
7760858,intergenic|chr2:37189681-37190250,intergenic|chr2:37189681-37190250,37189681,37189681,0.000315,0.000315,FEZ2,FEZ2
7916899,intergenic|chr2:37189681-37190250,intergenic|chr2:37189681-37190250,37189681,37189681,0.000045,0.000045,SOS1,SOS1
8847883,intergenic|chr2:37189681-37190250,intergenic|chr2:37189681-37190250,37189681,37189681,0.000073,0.000073,MORN2,MORN2
8934512,intergenic|chr2:37189681-37190250,intergenic|chr2:37189681-37190250,37189681,37189681,0.000020,0.000020,TTC27,TTC27


In [25]:
print(len(df_a), len(df_b), len(common))  # Around 20k differences / 9mil total

9292977 9598794 9274604


In [35]:
# 1 million differences
print(len(common[(common['powerlaw.Score_a'] - common['powerlaw.Score_b']) > .0]))

# 8247 significant differences
print(len(common[(common['powerlaw.Score_a'] - common['powerlaw.Score_b']) > .01]))



1435445
286


In [12]:
len(kristy_overlaps)

4664

In [23]:
dev_overlaps['powerlaw.Score_pred'][:20] - kristy_overlaps['powerlaw.Score_pred'][:20]

0    -4.000000e-06
1    -2.000000e-06
2    -1.000000e-06
3    -7.000000e-06
4    -5.000000e-05
5    -2.000000e-06
6     0.000000e+00
7     0.000000e+00
8     0.000000e+00
9     0.000000e+00
10   -1.000000e-06
11    0.000000e+00
12    0.000000e+00
13    0.000000e+00
14    0.000000e+00
15    0.000000e+00
16    0.000000e+00
17    0.000000e+00
18    0.000000e+00
19    0.000000e+00
Name: powerlaw.Score_pred, dtype: float64

In [15]:
dev_overlaps.iloc[0]

index_a                                                             3781
index_a                                                             3781
chrom_a                                                             chr6
start_a                                                        109304426
end_a                                                          109304926
name_a                                  CD164|chr6:109625629-109626129:*
EffectSize_a                                                   -0.657495
chrTSS_a                                                            chr6
TargetGeneTSS_a                                                109382562
endTSS_a                                                     109382812.0
TargetGene_a                                                       CD164
IsSignficant_a                                                      True
pValueAdjusted_a                                                0.001244
PowerAtEffectSize25_a                              