In [1]:
import numpy as np
import pandas as pd

### Reading in the original data

In [2]:
# Gzipped pairwise-metrics table read (tab-separated, headerless) into
# `pairwiseMetrics` below; the path is relative to the working directory.
path = "pairwiseMetrics_cancer_non-cancer_s1.txt.gz"

In [3]:
# Column labels applied to the headerless 8-column pairwise-metrics table.
names = [
    "Chromosome",
    "Start",
    "End",
    "MaxDiffState",
    "Distance",
    "Sign",
    "Pval",
    "MHPval",
]

In [4]:
# Load the headerless, tab-separated table, label its columns with `names`,
# then cast each column to an explicit (compact) dtype.
pairwiseMetrics = pd.read_csv(
    path,
    sep="\t",
    header=None,
    names=names,
).astype(
    {
        "Chromosome": str,
        "Start": np.int32,
        "End": np.int32,
        "MaxDiffState": str,
        "Distance": np.float32,
        "Sign": str,
        "Pval": np.float32,
        "MHPval": np.float32,
    }
)

In [5]:
# Keep the 100 rows with the largest |Distance|, strongest first.
# nlargest picks the top rows without sorting the whole table, ignores NaN
# distances and breaks ties by original row order. The earlier
# Series.argsort-based indexing has historically reported NaN entries as -1,
# which iloc would interpret as "the last row".
# Relies on the unique default RangeIndex that read_table produced.
originalExemplars = pairwiseMetrics.loc[
    pairwiseMetrics["Distance"].abs().nlargest(100).index
]

### Reading in the clumped data

In [6]:
# Column labels for the 9-column tables read in the cells below: the same eight
# fields as before plus a trailing "Dummy" column.
# NOTE(review): this rebinds `names`, so re-running the earlier 8-column read
# after this cell would pick up the 9-column labels.
names = [
    "Chromosome",
    "Start",
    "End",
    "MaxDiffState",
    "Distance",
    "Sign",
    "Pval",
    "MHPval",
    "Dummy",
]

In [7]:
# Gzipped table loaded into `clumpedDF` below (path relative to the working directory).
clumpedPath = "../results/highValueData.txt.gz"

In [8]:
# Load the headerless, tab-separated clumped table, label its columns with
# `names`, then cast each column to an explicit dtype.
clumpedDF = pd.read_csv(
    clumpedPath,
    sep="\t",
    header=None,
    names=names,
).astype(
    {
        "Chromosome": str,
        "Start": np.int32,
        "End": np.int32,
        "MaxDiffState": str,
        "Distance": np.float32,
        "Sign": str,
        "Pval": np.float32,
        "MHPval": np.float32,
        "Dummy": str,
    }
)

In [9]:
# Keep the 100 rows of clumpedDF with the largest |Distance|, strongest first.
# nlargest avoids sorting the whole table, ignores NaN distances and breaks
# ties by original row order. The earlier Series.argsort-based indexing has
# historically reported NaN entries as -1, which iloc reads as "the last row".
# Relies on the unique default RangeIndex that read_table produced.
clumpedExemplars = clumpedDF.loc[
    clumpedDF["Distance"].abs().nlargest(100).index
]

### Reading in the evenly spread data

In [10]:
# Gzipped table loaded into `spreadDF` below (path relative to the working directory).
spreadPath = "../results/flattenedData.txt.gz"

In [11]:
# Load the headerless, tab-separated evenly-spread table, label its columns
# with `names`, then cast each column to an explicit dtype.
spreadDF = pd.read_csv(
    spreadPath,
    sep="\t",
    header=None,
    names=names,
).astype(
    {
        "Chromosome": str,
        "Start": np.int32,
        "End": np.int32,
        "MaxDiffState": str,
        "Distance": np.float32,
        "Sign": str,
        "Pval": np.float32,
        "MHPval": np.float32,
        "Dummy": str,
    }
)

In [12]:
# Keep the 100 rows of spreadDF with the largest |Distance|, strongest first.
# nlargest avoids sorting the whole table, ignores NaN distances and breaks
# ties by original row order. The earlier Series.argsort-based indexing has
# historically reported NaN entries as -1, which iloc reads as "the last row".
# Relies on the unique default RangeIndex that read_table produced.
spreadExemplars = spreadDF.loc[
    spreadDF["Distance"].abs().nlargest(100).index
]

### Reading in PQ clumped

In [13]:
# File loaded into `clumpedPQDF` below (path relative to the working directory).
clumpedPQPath = "pq.bed.clumped"

In [14]:
# Load the headerless, tab-separated PQ clumped table, label its columns with
# `names`, then cast each column to an explicit dtype.
clumpedPQDF = pd.read_csv(
    clumpedPQPath,
    sep="\t",
    header=None,
    names=names,
).astype(
    {
        "Chromosome": str,
        "Start": np.int32,
        "End": np.int32,
        "MaxDiffState": str,
        "Distance": np.float32,
        "Sign": str,
        "Pval": np.float32,
        "MHPval": np.float32,
        "Dummy": str,
    }
)

In [15]:
# Keep the 100 rows of clumpedPQDF with the largest |Distance|, strongest first.
# nlargest avoids sorting the whole table, ignores NaN distances and breaks
# ties by original row order. The earlier Series.argsort-based indexing has
# historically reported NaN entries as -1, which iloc reads as "the last row".
# Relies on the unique default RangeIndex that read_table produced.
clumpedPQExemplars = clumpedPQDF.loc[
    clumpedPQDF["Distance"].abs().nlargest(100).index
]

### Reading in WIS clumped

In [16]:
# File loaded into `clumpedWISDF` below (path relative to the working directory).
clumpedWISPath = "wis.bed.clumped"

In [17]:
# Load the headerless, tab-separated WIS clumped table, label its columns with
# `names`, then cast each column to an explicit dtype.
clumpedWISDF = pd.read_csv(
    clumpedWISPath,
    sep="\t",
    header=None,
    names=names,
).astype(
    {
        "Chromosome": str,
        "Start": np.int32,
        "End": np.int32,
        "MaxDiffState": str,
        "Distance": np.float32,
        "Sign": str,
        "Pval": np.float32,
        "MHPval": np.float32,
        "Dummy": str,
    }
)

In [18]:
# Keep the 100 rows of clumpedWISDF with the largest |Distance|, strongest first.
# nlargest avoids sorting the whole table, ignores NaN distances and breaks
# ties by original row order. The earlier Series.argsort-based indexing has
# historically reported NaN entries as -1, which iloc reads as "the last row".
# Relies on the unique default RangeIndex that read_table produced.
clumpedWISExemplars = clumpedWISDF.loc[
    clumpedWISDF["Distance"].abs().nlargest(100).index
]

### Reading in PQ Spread

In [19]:
# File loaded into `spreadPQDF` below (path relative to the working directory).
spreadPQPath = "pq.bed.evenSpacing"

In [20]:
# Load the headerless, tab-separated PQ evenly-spaced table, label its columns
# with `names`, then cast each column to an explicit dtype.
spreadPQDF = pd.read_csv(
    spreadPQPath,
    sep="\t",
    header=None,
    names=names,
).astype(
    {
        "Chromosome": str,
        "Start": np.int32,
        "End": np.int32,
        "MaxDiffState": str,
        "Distance": np.float32,
        "Sign": str,
        "Pval": np.float32,
        "MHPval": np.float32,
        "Dummy": str,
    }
)

In [21]:
# Keep the 100 rows of spreadPQDF with the largest |Distance|, strongest first.
# nlargest avoids sorting the whole table, ignores NaN distances and breaks
# ties by original row order. The earlier Series.argsort-based indexing has
# historically reported NaN entries as -1, which iloc reads as "the last row".
# Relies on the unique default RangeIndex that read_table produced.
spreadPQExemplars = spreadPQDF.loc[
    spreadPQDF["Distance"].abs().nlargest(100).index
]

### Reading in WIS spread

In [22]:
# File loaded into `spreadWISDF` below (path relative to the working directory).
spreadWISPath = "wis.bed.evenSpacing"

In [23]:
# Load the headerless, tab-separated WIS evenly-spaced table, label its columns
# with `names`, then cast each column to an explicit dtype.
spreadWISDF = pd.read_csv(
    spreadWISPath,
    sep="\t",
    header=None,
    names=names,
).astype(
    {
        "Chromosome": str,
        "Start": np.int32,
        "End": np.int32,
        "MaxDiffState": str,
        "Distance": np.float32,
        "Sign": str,
        "Pval": np.float32,
        "MHPval": np.float32,
        "Dummy": str,
    }
)

In [24]:
# Keep the 100 rows of spreadWISDF with the largest |Distance|, strongest first.
# nlargest avoids sorting the whole table, ignores NaN distances and breaks
# ties by original row order. The earlier Series.argsort-based indexing has
# historically reported NaN entries as -1, which iloc reads as "the last row".
# Relies on the unique default RangeIndex that read_table produced.
spreadWISExemplars = spreadWISDF.loc[
    spreadWISDF["Distance"].abs().nlargest(100).index
]

### Comparing Original vs PQ vs WIS clumped

In [25]:
# Ten largest-|Distance| rows of clumpedDF (read from highValueData.txt.gz);
# the reference view for the PQ / WIS clumped comparisons in this section.
clumpedExemplars.head(10)

Unnamed: 0,Chromosome,Start,End,MaxDiffState,Distance,Sign,Pval,MHPval,Dummy
13832371,chr20,47902600,47902800,8,56.064678,+,0.0,0.00869,
12707236,chr17,41276800,41277000,1,55.905739,+,0.0,0.00869,
12582589,chr17,16347400,16347600,8,55.137058,+,2e-05,0.0102,
12582576,chr17,16344800,16345000,8,54.722752,+,0.0,0.00869,
13832373,chr20,47903000,47903200,8,54.08213,+,0.0,0.00869,
8393897,chr9,139620200,139620400,8,53.253578,+,0.0,0.00869,
8393890,chr9,139618800,139619000,7,53.210369,+,1e-05,0.00897,
13606034,chr20,2635200,2635400,8,53.081379,+,0.0,0.00869,
9455101,chr11,75113000,75113200,8,53.032398,+,0.0,0.00869,
8393903,chr9,139621400,139621600,4,52.78059,+,0.0,0.00869,


In [26]:
# Ten largest-|Distance| rows read from pq.bed.clumped, for side-by-side
# comparison with clumpedExemplars.
clumpedPQExemplars.head(10)

Unnamed: 0,Chromosome,Start,End,MaxDiffState,Distance,Sign,Pval,MHPval,Dummy
99058,chr17,41276800,41277000,1,55.905739,+,0.0,0.00869,.
95717,chr17,16344800,16345000,8,54.722752,+,0.0,0.00869,.
247604,chrX,69510200,69510400,1,54.221569,+,0.0,0.00869,.
244471,chr9,139620200,139620400,8,53.253578,+,0.0,0.00869,.
143223,chr20,2635200,2635400,8,53.081379,+,0.0,0.00869,.
42699,chr11,75113000,75113200,8,53.032398,+,0.0,0.00869,.
95718,chr17,16347800,16348000,8,52.679951,+,4e-05,0.01132,.
118151,chr19,42366400,42366600,8,52.14822,+,0.0,0.00869,.
226824,chr8,48872000,48872200,2,52.046761,-,0.0,0.00869,.
148588,chr20,47902200,47902400,8,51.112049,+,0.0,0.00869,.


In [27]:
# First ten clumped exemplars again, shown next to the detailed look-ups below.
clumpedExemplars.head(10)

Unnamed: 0,Chromosome,Start,End,MaxDiffState,Distance,Sign,Pval,MHPval,Dummy
13832371,chr20,47902600,47902800,8,56.064678,+,0.0,0.00869,
12707236,chr17,41276800,41277000,1,55.905739,+,0.0,0.00869,
12582589,chr17,16347400,16347600,8,55.137058,+,2e-05,0.0102,
12582576,chr17,16344800,16345000,8,54.722752,+,0.0,0.00869,
13832373,chr20,47903000,47903200,8,54.08213,+,0.0,0.00869,
8393897,chr9,139620200,139620400,8,53.253578,+,0.0,0.00869,
8393890,chr9,139618800,139619000,7,53.210369,+,1e-05,0.00897,
13606034,chr20,2635200,2635400,8,53.081379,+,0.0,0.00869,
9455101,chr11,75113000,75113200,8,53.032398,+,0.0,0.00869,
8393903,chr9,139621400,139621600,4,52.78059,+,0.0,0.00869,


In [28]:
# Inspect the neighbourhood of one exemplar: positional row 12707236 is the
# chr17:41276800-41277000 bin listed among clumpedExemplars. The window spans
# the 30 rows before it through the 29 rows after it.
exemplarRow = 12707236
flank = 30
clumpedDF.iloc[exemplarRow - flank:exemplarRow + flank]

Unnamed: 0,Chromosome,Start,End,MaxDiffState,Distance,Sign,Pval,MHPval,Dummy
12707206,chr17,41270800,41271000,6,0.68787,+,0.01399,0.0998,
12707207,chr17,41271000,41271200,11,0.56398,+,0.01908,0.11904,
12707208,chr17,41271200,41271400,18,0.42307,+,0.02896,0.15256,
12707209,chr17,41271400,41271600,11,0.59297,+,0.01768,0.11393,
12707210,chr17,41271600,41271800,6,0.89919,+,0.00892,0.07853,
12707211,chr17,41271800,41272000,6,1.15318,+,0.00567,0.06263,
12707212,chr17,41272000,41272200,6,1.15318,+,0.00567,0.06263,
12707213,chr17,41272200,41272400,6,1.01885,+,0.00714,0.07011,
12707214,chr17,41272400,41272600,6,1.04913,+,0.00677,0.06828,
12707215,chr17,41272600,41272800,6,1.25882,+,0.0048,0.05777,


In [29]:
# WIS clumped exemplars that fall on chr17, still in descending |Distance| order.
clumpedWISExemplars.loc[clumpedWISExemplars["Chromosome"].eq("chr17")]

Unnamed: 0,Chromosome,Start,End,MaxDiffState,Distance,Sign,Pval,MHPval,Dummy
205663,chr17,16347800,16348000,8,52.679951,+,4e-05,0.01132,.
205662,chr17,16343000,16343200,1,52.09824,+,2e-05,0.01053,.
208617,chr17,41278600,41278800,4,50.05835,+,0.00031,0.01874,.
211114,chr17,60502400,60502600,2,40.029339,+,0.08953,0.31441,.
210725,chr17,57410000,57410200,14,39.42923,+,0.0,0.00869,.
204722,chr17,8287200,8287400,1,39.163052,+,0.00459,0.05656,.
213262,chr17,77787200,77787400,1,37.25053,-,0.00011,0.01374,.
203946,chr17,3629200,3629400,6,37.19046,+,0.02019,0.12305,.
210768,chr17,57784000,57784200,1,36.25029,+,0.0,0.00869,.
211113,chr17,60498200,60498400,18,35.103779,+,0.89279,0.93879,.


In [30]:
# Twenty largest-|Distance| rows read from wis.bed.clumped.
clumpedWISExemplars.head(20)

Unnamed: 0,Chromosome,Start,End,MaxDiffState,Distance,Sign,Pval,MHPval,Dummy
247157,chrX,69510200,69510400,1,54.221569,+,0.0,0.00869,.
205663,chr17,16347800,16348000,8,52.679951,+,4e-05,0.01132,.
205662,chr17,16343000,16343200,1,52.09824,+,2e-05,0.01053,.
234026,chr20,47902200,47902400,8,51.112049,+,0.0,0.00869,.
228953,chr20,2637200,2637400,5,51.06839,-,0.00132,0.03255,.
186265,chr15,30915800,30916000,18,50.822411,-,0.53935,0.93879,.
114247,chr8,48869000,48869200,5,50.714809,+,0.00087,0.02745,.
208617,chr17,41278600,41278800,4,50.05835,+,0.00031,0.01874,.
228952,chr20,2632400,2632600,1,49.330719,+,0.00914,0.07955,.
132410,chr9,139623000,139623200,1,47.772572,-,0.00095,0.02848,.


In [31]:
# Ten largest-|Distance| rows read from wis.bed.clumped (a prefix of the
# head(20) view in the previous cell), matching the 10-row views of the
# other clumped selections.
clumpedWISExemplars.head(10)

Unnamed: 0,Chromosome,Start,End,MaxDiffState,Distance,Sign,Pval,MHPval,Dummy
247157,chrX,69510200,69510400,1,54.221569,+,0.0,0.00869,.
205663,chr17,16347800,16348000,8,52.679951,+,4e-05,0.01132,.
205662,chr17,16343000,16343200,1,52.09824,+,2e-05,0.01053,.
234026,chr20,47902200,47902400,8,51.112049,+,0.0,0.00869,.
228953,chr20,2637200,2637400,5,51.06839,-,0.00132,0.03255,.
186265,chr15,30915800,30916000,18,50.822411,-,0.53935,0.93879,.
114247,chr8,48869000,48869200,5,50.714809,+,0.00087,0.02745,.
208617,chr17,41278600,41278800,4,50.05835,+,0.00031,0.01874,.
228952,chr20,2632400,2632600,1,49.330719,+,0.00914,0.07955,.
132410,chr9,139623000,139623200,1,47.772572,-,0.00095,0.02848,.


### Comparing Original vs PQ vs WIS spread

In [32]:
# Ten largest-|Distance| rows of spreadDF (read from flattenedData.txt.gz);
# the reference view for the PQ / WIS spread comparisons in this section.
spreadExemplars.head(10)

Unnamed: 0,Chromosome,Start,End,MaxDiffState,Distance,Sign,Pval,MHPval,Dummy
13832374,chr20,47903200,47903400,8,57.402458,+,0.0,0.00869,
12582578,chr17,16345200,16345400,8,56.087971,+,0.0,0.00869,
12707236,chr17,41276800,41277000,1,55.905739,+,0.0,0.00869,
14752707,chrX,69510200,69510400,1,54.221569,+,0.0,0.00869,
8393897,chr9,139620200,139620400,8,53.253578,+,0.0,0.00869,
13606034,chr20,2635200,2635400,8,53.081379,+,0.0,0.00869,
9455101,chr11,75113000,75113200,8,53.032398,+,0.0,0.00869,
13509046,chr19,42366400,42366600,8,52.14822,+,0.0,0.00869,
7208336,chr8,48872000,48872200,2,52.046761,-,0.0,0.00869,
11691015,chr15,30918400,30918600,2,51.060848,-,0.0,0.00869,


In [33]:
# Ten largest-|Distance| rows read from pq.bed.evenSpacing, for side-by-side
# comparison with spreadExemplars.
spreadPQExemplars.head(10)

Unnamed: 0,Chromosome,Start,End,MaxDiffState,Distance,Sign,Pval,MHPval,Dummy
148583,chr20,47903200,47903400,8,57.402458,+,0.0,0.00869,.
95711,chr17,16345200,16345400,8,56.087971,+,0.0,0.00869,.
99050,chr17,41276800,41277000,1,55.905739,+,0.0,0.00869,.
247602,chrX,69510200,69510400,1,54.221569,+,0.0,0.00869,.
244466,chr9,139620200,139620400,8,53.253578,+,0.0,0.00869,.
143218,chr20,2635200,2635400,8,53.081379,+,0.0,0.00869,.
42696,chr11,75113000,75113200,8,53.032398,+,0.0,0.00869,.
118147,chr19,42366400,42366600,8,52.14822,+,0.0,0.00869,.
226817,chr8,48872000,48872200,2,52.046761,-,0.0,0.00869,.
74965,chr15,30918400,30918600,2,51.060848,-,0.0,0.00869,.


In [34]:
# Ten largest-|Distance| rows read from wis.bed.evenSpacing, for side-by-side
# comparison with spreadExemplars.
spreadWISExemplars.head(10)

Unnamed: 0,Chromosome,Start,End,MaxDiffState,Distance,Sign,Pval,MHPval,Dummy
234027,chr20,47903200,47903400,8,57.402458,+,0.0,0.00869,.
205670,chr17,16345200,16345400,8,56.087971,+,0.0,0.00869,.
208623,chr17,41276800,41277000,1,55.905739,+,0.0,0.00869,.
247157,chrX,69510200,69510400,1,54.221569,+,0.0,0.00869,.
132419,chr9,139620200,139620400,8,53.253578,+,0.0,0.00869,.
228954,chr20,2635200,2635400,8,53.081379,+,0.0,0.00869,.
153004,chr11,75113000,75113200,8,53.032398,+,0.0,0.00869,.
225555,chr19,42366400,42366600,8,52.14822,+,0.0,0.00869,.
114255,chr8,48872000,48872200,2,52.046761,-,0.0,0.00869,.
186274,chr15,30918400,30918600,2,51.060848,-,0.0,0.00869,.
