This IPython notebook **adds a feature to an existing dataset of features that is split into one file per chromosome**. The example below adds p300 to an existing dataset of 9 columns (8 histone marks and 1 binary TPM). The code depends on `quicksect.py`, which must be importable from the notebook's working directory.

In [1]:
import glob
import pandas as pd
import numpy as np

<h3>Import data</h3>

In [None]:
# Cell name used throughout the notebook: it is spliced into the P300 input
# path, the per-chromosome data directory, and the output file name.
CELLNAME = 'GM12878'

In [2]:
def get_data(filename, chrom, bin_size=200):
    """Load one chromosome's feature table and attach bin coordinates.

    The file is read with ``pd.read_table`` (tab-separated by default) after
    skipping its first line. Row ``i`` (0-based) is assigned
    ``lower = i * bin_size + 1`` and ``upper = (i + 1) * bin_size``, i.e.
    1-200, 201-400, ... for the default bin size.

    Parameters
    ----------
    filename : str
        Path of the per-chromosome feature file.
    chrom : str
        Chromosome label stored verbatim in the ``'chr'`` column.
    bin_size : int, optional
        Width of each row's coordinate bin (default 200, the value that was
        previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        The table read from ``filename`` with three extra columns appended:
        ``'chr'``, ``'lower'`` and ``'upper'``.
    """
    dat = pd.read_table(filename, skiprows=1)
    nrows = dat.shape[0]
    dat['chr'] = chrom
    # Offset of each bin from the start of the chromosome; kept 1-D so the
    # column assignment does not depend on pandas squeezing a 2-D array.
    offsets = np.arange(nrows) * bin_size
    dat['lower'] = offsets + 1
    dat['upper'] = offsets + bin_size
    return dat

In [3]:
# Load the P300 table for CELLNAME. The gzip-compressed file has no header
# row; its two columns are named 'chr' and 'lower' below.
# (Fix: the '+' between CELLNAME and the file-name suffix was missing, which
# made this cell a syntax error.)
p300_path = 'data/' + CELLNAME + '_distal_P300_align.hg19.txt.gz'
p300 = pd.read_table(p300_path, compression='gzip', header=None)
p300.columns = ['chr', 'lower']
# Each row becomes an interval by setting its upper bound to lower + 200.
p300['upper'] = p300['lower'] + 200
p300.head()

Unnamed: 0,chr,lower,upper
0,chr10,100082601,100082801
1,chr10,10008401,10008601
2,chr10,100121001,100121201
3,chr10,100139401,100139601
4,chr10,10018001,10018201


<h3>Match overlapping intervals</h3>

In [4]:
import os
from quicksect import IntervalNode  # local dependency: quicksect.py


# Interval-tree helpers, modified from: https://www.biostars.org/p/99/
def find(start, end, tree):
    """Return 1 if (start, end) intersects an interval in `tree`, else 0.

    `tree` may be None (no intervals were available), in which case nothing
    can intersect and 0 is returned.
    """
    if tree is None:
        return 0
    hits = []
    tree.intersect(start, end, hits.append)
    return int(bool(hits))


def _build_tree(intervals):
    """Build a quicksect tree from (start, end) pairs; None if there are none."""
    tree = None
    for start, end in intervals:
        if tree is None:
            # The first interval becomes the root.
            tree = IntervalNode(start, end)
        else:
            # insert() returns the (possibly new) root, so keep rebinding it.
            tree = tree.insert(start, end)
    return tree


# glob returns paths in arbitrary order; sorting makes the processing order,
# and therefore the row order of the concatenated result, reproducible.
files = sorted(glob.glob('Spectacle/SAMPLEDATA_HG19_'+CELLNAME+'/*.txt'))

dats = []
for filename in files:
    print(filename)
    # Chromosome label from the file name: the second '_'-separated token
    # with its first three characters dropped,
    # e.g. 'K562_chr10_binary_TPM.txt' -> '10'.
    chrom = os.path.split(filename)[1].split("_")[1][3:]
    dat = get_data(filename, chrom)       # existing dataset for this chromosome
    p = p300[p300['chr'] == 'chr' + chrom]  # P300 rows on the same chromosome

    # Index this chromosome's P300 intervals. A chromosome with no P300 rows
    # yields tree=None and every bin is flagged 0 (the previous code raised
    # IndexError on data[0] in that case).
    tree = _build_tree(zip(p['lower'], p['upper']))

    # One 0/1 flag per dataset row: does the row's bin overlap any P300 interval?
    dat['p300'] = [find(start, end, tree)
                   for start, end in zip(dat['lower'], dat['upper'])]
    print(dat['p300'].value_counts())
    dats.append(dat)

Spectacle/SAMPLEDATA_HG19_K562/K562_chr10_binary_TPM.txt
0    668790
1      8883
dtype: int64
Spectacle/SAMPLEDATA_HG19_K562/K562_chr11_binary_TPM.txt
0    667960
1      7072
dtype: int64
Spectacle/SAMPLEDATA_HG19_K562/K562_chr12_binary_TPM.txt
0    662546
1      6713
dtype: int64
Spectacle/SAMPLEDATA_HG19_K562/K562_chr13_binary_TPM.txt
0    566697
1      9152
dtype: int64
Spectacle/SAMPLEDATA_HG19_K562/K562_chr14_binary_TPM.txt
0    535182
1      1565
dtype: int64
Spectacle/SAMPLEDATA_HG19_K562/K562_chr15_binary_TPM.txt
0    508173
1      4483
dtype: int64
Spectacle/SAMPLEDATA_HG19_K562/K562_chr16_binary_TPM.txt
0    447574
1      4199
dtype: int64
Spectacle/SAMPLEDATA_HG19_K562/K562_chr17_binary_TPM.txt
0    402083
1      3893
dtype: int64
Spectacle/SAMPLEDATA_HG19_K562/K562_chr18_binary_TPM.txt
0    385551
1      4835
dtype: int64
Spectacle/SAMPLEDATA_HG19_K562/K562_chr19_binary_TPM.txt
0    293348
1      2296
dtype: int64
Spectacle/SAMPLEDATA_HG19_K562/K562_chr1_binary_TPM.txt
0   

In [5]:
# Stack the per-chromosome tables into a single frame.
result = pd.concat(dats)

# Rearrange columns so the coordinates come first, followed by every other
# column in its existing order (the feature columns, then 'p300').
# Selecting by name replaces the removed `.ix` indexer and the hard-coded
# positions [9,10,11,0,...,8,12], which were only valid for exactly nine
# feature columns; for that layout the resulting order is identical.
coord_cols = ['chr', 'lower', 'upper']
other_cols = [col for col in result.columns if col not in coord_cols]
result = result[coord_cols + other_cols]

<h3>Write to output file</h3>

In [6]:
# Write the combined table as tab-separated text named after CELLNAME,
# without the index column, then preview the first rows.
new_filename = "%s_features.txt" % CELLNAME
with open(new_filename, 'w') as out_handle:
    result.to_csv(out_handle, sep='\t', index=False)
result.head()

Unnamed: 0,H3K27ac,H3K27me3,H3K36me3,H3K4me1,H3K4me2,H3K4me3,H3K9ac,H4K20me1,TPM,chr,lower,upper,p300
0,0,0,0,0,0,0,0,0,0,10,1,200,0
1,0,0,0,0,0,0,0,0,0,10,201,400,0
2,0,0,0,0,0,0,0,0,0,10,401,600,0
3,0,0,0,0,0,0,0,0,0,10,601,800,0
4,0,0,0,0,0,0,0,0,0,10,801,1000,0
