In this iPython notebook, the **distance to the closest TSS** was determined for each genome section.

<h3>Import data</h3>

In [2]:
import pandas as pd

In [42]:
es = pd.read_table('Spectacle/OUTPUTSAMPLE_HG19_NEW/GM12878_20_spectral_segments.bed',header=None)
es.columns = ['chr','lower','upper','state']
es['middle'] = (es['lower']+es['upper'])/2
es.head()

Unnamed: 0,chr,lower,upper,state,middle
0,chr10,0,119800,E18,59900
1,chr10,119800,120400,E11,120100
2,chr10,120400,122200,E18,121300
3,chr10,122200,122800,E11,122500
4,chr10,122800,173400,E18,148100


In [7]:
tss = pd.read_table('GM12878_Gencode_TSS.hg19.txt.gz', compression = 'gzip', header=None)
tss.columns = ['chr','pos']
tss.head()

Unnamed: 0,chr,pos
0,chr10,100206638
1,chr10,101190418
2,chr10,101380137
3,chr10,101419240
4,chr10,101491857


<h3>Find closest distance</h3>

In [36]:
# Code from http://stackoverflow.com/questions/12141150/from-list-of-integers-get-number-closest-to-a-given-value
from bisect import bisect_left

def takeClosest(myList, myNumber):
    """
    Assumes myList is sorted. Returns closest distance to myNumber.

    If two numbers are equally close, return the smallest number.
    """
    pos = bisect_left(myList, myNumber)
    if pos == 0:
        return abs(myList[0] - myNumber)
    if pos == len(myList):
        return abs(myList[-1] - myNumber)
    before = myList[pos - 1]
    after = myList[pos]
    if after - myNumber < myNumber - before:
       return abs(after - myNumber)
    else:
       return abs(before - myNumber)

In [43]:
# Separate into chromatin groups and find closest distance to tss
groups = es.groupby('chr')
for name, group in groups:
    print name
    t = tss[tss['chr']==name]
    es.loc[es['chr']==name, 'distance'] = group['middle'].apply(lambda num: takeClosest(t['pos'].tolist(), num))

chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr20
chr21
chr22
chr3
chr4
chr5
chr6
chr7
chr8
chr9


In [44]:
es.head()

Unnamed: 0,chr,lower,upper,state,middle,distance
0,chr10,0,119800,E18,59900,100146738
1,chr10,119800,120400,E11,120100,100086538
2,chr10,120400,122200,E18,121300,100085338
3,chr10,122200,122800,E11,122500,100084138
4,chr10,122800,173400,E18,148100,100058538


<h3>Write output file</h3>

In [45]:
new_filename = "GM12878_"+'tss'+".txt"
with open(new_filename, 'w') as the_file:
    es.to_csv(the_file, sep='\t', index=False)