In [5]:
import os
import random

import numpy as np
import pandas as pd

In [6]:
# Gzipped, tab-separated input table; it is parsed with pd.read_table in a later cell.
path = "pairwiseMetrics_cancer_non-cancer_s1.txt.gz"

In [7]:
# Column labels for the headerless input table, listed in file order.
names = [
    "Chromosome",
    "Start",
    "End",
    "MaxDiffState",
    "Distance",
    "Sign",
    "Pval",
    "MHPval",
]

In [8]:
# Target dtype per column: 32-bit numerics to keep the frame compact, plain
# strings for the label-like columns.
columnTypes = {
    "Chromosome": str,
    "Start": np.int32,
    "End": np.int32,
    "MaxDiffState": str,
    "Distance": np.float32,
    "Sign": str,
    "Pval": np.float32,
    "MHPval": np.float32,
}

# Parse the headerless tab-separated file and cast in one chained expression so
# that no reference to the un-cast frame is kept alive.
pairwiseMetrics = (
    pd.read_csv(path, sep="\t", header=None, names=names)
    .astype(columnTypes)
)

In [9]:
# Preview the first 10 rows to sanity-check the parsed columns.
pairwiseMetrics.head(10)

Unnamed: 0,Chromosome,Start,End,MaxDiffState,Distance,Sign,Pval,MHPval
0,chr1,0,200,18,0.0,+,0.892788,0.938788
1,chr1,200,400,18,0.0,+,0.892788,0.938788
2,chr1,400,600,18,0.0,+,0.892788,0.938788
3,chr1,600,800,18,0.0,+,0.892788,0.938788
4,chr1,800,1000,18,0.0,+,0.892788,0.938788
5,chr1,1000,1200,18,0.0,+,0.892788,0.938788
6,chr1,1200,1400,18,0.0,+,0.892788,0.938788
7,chr1,1400,1600,18,0.0,+,0.892788,0.938788
8,chr1,1600,1800,18,0.0,+,0.892788,0.938788
9,chr1,1800,2000,18,0.0,+,0.892788,0.938788


In [10]:
# Exemplars are the rows with the largest absolute Distance, strongest first.
# The sort order is truncated *before* indexing the frame, so only the selected
# rows are copied instead of materialising a fully reordered copy of the table.
numExemplars = 200
descendingOrder = (-pairwiseMetrics["Distance"].abs()).argsort().to_numpy()
exemplars = pairwiseMetrics.iloc[descendingOrder[:numExemplars]]

### Generating high values around exemplars

In [11]:
# Work on a copy so the synthetic values written below never touch pairwiseMetrics.
highValueData = pairwiseMetrics.copy()

In [12]:
# Row positions that receive a synthetic high value; filled by the loop in the next cell.
newIndices = []

In [13]:
# Plant two synthetic high Distance values next to every exemplar: one at a
# random offset of 1..15 rows after it and one 1..15 rows before it. Each value
# is drawn uniformly from [exemplar Distance - 10, exemplar Distance].
#
# `i` is the exemplar's index label and is used as a row position; this relies
# on pairwiseMetrics keeping the default 0..n-1 index it gets from the reader.

# Dedicated seeded generator: the synthetic data is reproducible, and re-running
# this cell rewrites the same rows with the same values.
highValueRng = random.Random(0)
distanceCol = highValueData.columns.get_loc("Distance")
numRows = len(highValueData)
newIndices.clear()  # do not accumulate duplicates when the cell is re-run

for i, row in exemplars.iterrows():
    # (first offset, last offset): one window after the exemplar, one before it.
    for firstOffset, lastOffset in ((1, 15), (-15, -1)):
        randomDistance = highValueRng.uniform(row.Distance - 10, row.Distance)
        target = i + highValueRng.randint(firstOffset, lastOffset)
        # Skip positions outside the table: a negative position would silently
        # wrap around to the end of the frame, and one past the end would raise.
        if not 0 <= target < numRows:
            continue
        # Cast to float32 so the written value matches the column's declared
        # dtype instead of relying on pandas to coerce the column.
        highValueData.iloc[target, distanceCol] = np.float32(randomDistance)
        newIndices.append(target)

### Flattening data around exemplars

In [14]:
# Show the weakest of the selected exemplars (the last row of the selection).
# `exemplars` already holds the rows ordered by descending |Distance|, so it is
# reused here instead of sorting the whole table a second time.
exemplars.iloc[-1]

Chromosome           chr2
Start            54786200
End              54786400
MaxDiffState            3
Distance        32.428841
Sign                    -
Pval                  0.0
MHPval           0.008686
Name: 1520184, dtype: object

In [15]:
# Work on a copy so the flattening below never touches pairwiseMetrics.
flatData = pairwiseMetrics.copy()

In [16]:
# Collects the exemplar row indices handled by the flattening loop in the next
# cell; rows listed here are skipped rather than overwritten.
flattenedIndices = []

In [17]:
# Flatten the neighbourhood of every exemplar: each of the 15 rows on either
# side gets a low Distance drawn uniformly from [0, 10], while the exemplar rows
# themselves keep their original values.
#
# `i` is the exemplar's index label and is used as a row position; this relies
# on pairwiseMetrics keeping the default 0..n-1 index it gets from the reader.

# Dedicated seeded generator so the synthetic data is reproducible.
flatRng = random.Random(1)
distanceCol = flatData.columns.get_loc("Distance")
numRows = len(flatData)

# Register *all* exemplars before touching any row. Protecting only the
# exemplars visited so far would let an exemplar that lies within 15 rows of an
# earlier one be flattened before the loop reaches it.
flattenedIndices.clear()  # do not accumulate duplicates when the cell is re-run
flattenedIndices.extend(exemplars.index)
protectedRows = set(flattenedIndices)

for i in flattenedIndices:
    for j in range(-15, 16):
        target = i + j
        # Skip exemplar rows (including i itself) and positions outside the
        # table: a negative position would silently wrap around to the end of
        # the frame, and one past the end would raise.
        if target in protectedRows or not 0 <= target < numRows:
            continue
        # Cast to float32 so the written value matches the column's declared
        # dtype instead of relying on pandas to coerce the column.
        flatData.iloc[target, distanceCol] = np.float32(flatRng.uniform(0, 10))

In [18]:
# Index label of the first exemplar, i.e. the row with the largest |Distance|.
exemplars.iloc[0].name

13832374

In [19]:
# Inspect the flattened window (15 rows either side) around the top exemplar.
# The bounds are derived from `exemplars` rather than hard-coded, so the view
# stays correct if the input data, and therefore the top exemplar, changes.
topExemplarRow = exemplars.index[0]
flatData.iloc[max(topExemplarRow - 15, 0):topExemplarRow + 16]

Unnamed: 0,Chromosome,Start,End,MaxDiffState,Distance,Sign,Pval,MHPval
13832359,chr20,47900200,47900400,8,3.662477,+,1.15013e-07,0.008686
13832360,chr20,47900400,47900600,8,4.991324,+,2.12887e-07,0.008686
13832361,chr20,47900600,47900800,8,7.072759,+,1.19568e-07,0.008686
13832362,chr20,47900800,47901000,8,1.593751,+,7.40164e-08,0.008686
13832363,chr20,47901000,47901200,8,0.449191,+,3.94085e-07,0.008686
13832364,chr20,47901200,47901400,7,3.963907,+,4.55688e-07,0.008686
13832365,chr20,47901400,47901600,8,1.359495,+,4.01818e-07,0.008686
13832366,chr20,47901600,47901800,8,3.142186,+,1.28156e-07,0.008686
13832367,chr20,47901800,47902000,8,9.907576,+,5.45513e-08,0.008686
13832368,chr20,47902000,47902200,8,8.004575,+,2.42076e-08,0.008686


### Writing data

In [20]:
# Write the high-value data set as a headerless, tab-separated, gzip-compressed
# table (compression is inferred from the ".gz" suffix).
# Create the output directory first so a fresh checkout does not fail at the
# very last step of the notebook.
os.makedirs("../results", exist_ok=True)
# NOTE(review): float_format applies to every float column, so Pval/MHPval are
# also rounded to 5 decimals (e.g. 1.15e-07 is written as 0.00000) - confirm
# that downstream consumers do not need the small p-values.
highValueData.to_csv("../results/highValueData.txt.gz", sep="\t", header=False, float_format="%.5f", index=False)

In [21]:
# Write the flattened data set as a headerless, tab-separated, gzip-compressed
# table (compression is inferred from the ".gz" suffix).
# Create the output directory first so a fresh checkout does not fail at the
# very last step of the notebook.
os.makedirs("../results", exist_ok=True)
# NOTE(review): float_format applies to every float column, so Pval/MHPval are
# also rounded to 5 decimals (e.g. 1.15e-07 is written as 0.00000) - confirm
# that downstream consumers do not need the small p-values.
flatData.to_csv("../results/flattenedData.txt.gz", sep="\t", header=False, float_format="%.5f", index=False)