In this IPython notebook, we **add a feature to an existing dataset of features that is split into one file per chromosome**. The example below adds p300 to an existing dataset of 9 columns (8 histone marks and 1 binary TPM). The code depends on quicksect.py.

In [1]:
import glob
import pandas as pd
import numpy as np

<h3>Import data</h3>

In [2]:
def get_data(filename, chrom, bin_size=200):
    """Load one chromosome's feature table and attach bin coordinates.

    The file is read as delimited text with ``pd.read_table`` and its first
    line is skipped. Each remaining data row is treated as one consecutive
    bin of width ``bin_size``.

    Parameters
    ----------
    filename : str
        Path to the per-chromosome feature file.
    chrom : str
        Chromosome label; stored unchanged in the ``chr`` column.
    bin_size : int, optional
        Step between consecutive bins (default 200, the value that was
        previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        The loaded table with three extra columns appended in this order:
        ``chr``, ``lower`` (1, bin_size + 1, ...) and ``upper``
        (bin_size, 2 * bin_size, ...).
    """
    dat = pd.read_table(filename, skiprows=1)
    nrows = dat.shape[0]
    dat['chr'] = chrom
    # Row i gets lower = i * bin_size + 1 and upper = (i + 1) * bin_size.
    # Both columns are plain 1-D arrays; no (nrows, 1) reshape is needed
    # to assign a column.
    lower = np.arange(nrows) * bin_size + 1
    dat['lower'] = lower
    dat['upper'] = lower + (bin_size - 1)
    return dat

In [28]:
# Load the gzip-compressed p300 table; the file has no header row, so the
# two columns are named here.
p300_path = 'data/GM12878_distal_P300_align.hg19.txt.gz'
p300 = pd.read_table(p300_path, header=None, compression='gzip')
p300.columns = ['chr', 'lower']
# The upper bound is placed 200 above the lower bound of each row.
p300['upper'] = p300['lower'] + 200
p300.head()

Unnamed: 0,chr,lower,upper
0,chr10,100006601,100006801
1,chr10,100011001,100011201
2,chr10,100099401,100099601
3,chr10,100099601,100099801
4,chr10,100099801,100100001


<h3>Match overlapping intervals</h3>

In [29]:
import os
# quicksect.py is a local module; the interval-tree usage below is modified
# from https://www.biostars.org/p/99/
from quicksect import IntervalNode


def find(start, end, tree):
    """Return True if ``tree.intersect`` reports at least one stored interval
    for the query ``(start, end)``, otherwise False."""
    out = []
    tree.intersect(start, end, out.append)
    return bool(out)


# glob returns paths in arbitrary order; sort so that the per-chromosome
# frames are always collected in the same order.
files = sorted(glob.glob('Spectacle/SAMPLEDATA_HG19_NEW/*.txt'))

dats = []
for filename in files:
    print(filename)
    # File names look like GM12878_chr10_binary_new.txt: take the second
    # "_"-separated field and strip its leading "chr".
    chrom = os.path.basename(filename).split("_")[1][3:]
    dat = get_data(filename, chrom)          # existing dataset for this chromosome
    p = p300[p300['chr'] == 'chr' + chrom]   # p300 rows on the same chromosome

    # Materialise the (lower, upper) pairs as lists so that indexing and
    # slicing work on both Python 2 and Python 3.
    query = list(zip(dat['lower'], dat['upper']))
    data = list(zip(p['lower'], p['upper']))

    if data:
        # Root the tree at the first interval, then insert the rest.
        start, end = data[0]
        tree = IntervalNode(start, end)
        for start, end in data[1:]:
            tree = tree.insert(start, end)
        overlap = [find(start, end, tree) for start, end in query]
    else:
        # No p300 rows for this chromosome: nothing can overlap. Previously
        # this case raised IndexError on data[0].
        overlap = [False] * len(query)

    # Store the flag as 0/1 integers.
    dat['p300'] = [int(hit) for hit in overlap]
    print(dat['p300'].value_counts())
    dats.append(dat)

Spectacle/SAMPLEDATA_HG19_NEW/GM12878_chr10_binary_new.txt
0    671279
1      6394
dtype: int64
Spectacle/SAMPLEDATA_HG19_NEW/GM12878_chr11_binary_new.txt
0    669375
1      5657
dtype: int64
Spectacle/SAMPLEDATA_HG19_NEW/GM12878_chr12_binary_new.txt
0    663439
1      5820
dtype: int64
Spectacle/SAMPLEDATA_HG19_NEW/GM12878_chr13_binary_new.txt
0    573521
1      2328
dtype: int64
Spectacle/SAMPLEDATA_HG19_NEW/GM12878_chr14_binary_new.txt
0    533193
1      3554
dtype: int64
Spectacle/SAMPLEDATA_HG19_NEW/GM12878_chr15_binary_new.txt
0    509108
1      3548
dtype: int64
Spectacle/SAMPLEDATA_HG19_NEW/GM12878_chr16_binary_new.txt
0    446652
1      5121
dtype: int64
Spectacle/SAMPLEDATA_HG19_NEW/GM12878_chr17_binary_new.txt
0    401075
1      4901
dtype: int64
Spectacle/SAMPLEDATA_HG19_NEW/GM12878_chr18_binary_new.txt
0    388021
1      2365
dtype: int64
Spectacle/SAMPLEDATA_HG19_NEW/GM12878_chr19_binary_new.txt
0    291474
1      4170
dtype: int64
Spectacle/SAMPLEDATA_HG19_NEW/GM12878_ch

In [31]:
# Stack the per-chromosome frames into one table. Each frame carries its own
# 0..n-1 row index, so renumber the rows to keep the combined index unique.
result = pd.concat(dats, ignore_index=True)

<h3>Write to output file</h3>

In [37]:
# Write the combined table as tab-separated text without the row index.
# The path is passed straight to pandas so that it opens and closes the file
# itself, rather than writing through a text-mode handle opened here.
# NOTE(review): this writes every column currently in `result`; the cell that
# narrows `result` to the feature columns appears further down - confirm the
# intended execution order.
new_filename = "features_data.txt"
result.to_csv(new_filename, sep='\t', index=False)
result.head()

In [41]:
# Keep the first nine columns plus the p300 flag, dropping the helper columns
# (chr, lower, upper) that sit between them. `.ix` is deprecated and removed
# in current pandas, so select by label instead; unlike the positional form
# this cell also gives the same result when it is run a second time.
# Assumes the input files have exactly nine feature columns - TODO confirm.
feature_columns = list(result.columns[:9]) + ['p300']
result = result.loc[:, feature_columns]