In [1]:
import pandas as pd
import numpy as np
import sys
from common import commons
home = commons.home
from features_preprocess import get_winid
import os
import re

In [2]:
def nearest_tss(tss,sites_df):
    """Annotate each site with the distance to the closest TSS on its chromosome.

    Parameters
    ----------
    tss : pandas.DataFrame
        TSS table with columns 'chr', 'coordinate' and 'strand'. A row counts
        as a TSS when its 'strand' value is not null.
    sites_df : pandas.DataFrame
        Site table with at least 'id', 'chr' and 'coordinate'.

    Returns
    -------
    pandas.DataFrame
        The rows of ``sites_df`` (sorted by 'chr', 'coordinate') with one extra
        column 'dist_to_nearest_tss'. The value is NaN when the site's
        chromosome has no TSS at all. Neither input frame is modified.
    """
    # Several TSS entries can share one position; keeping a single row per
    # position stops the outer merge from duplicating a site that sits on it.
    tss = tss.drop_duplicates(subset=['chr','coordinate'])
    merged = pd.merge(sites_df,tss,how='outer',on=['chr','coordinate'])
    merged = merged.sort_values(['chr','coordinate'])

    # Coordinate of the row when the row is a TSS, NaN otherwise.
    is_tss = merged['strand'].notna()
    tss_coordinate = merged['coordinate'].where(is_tss).astype('float64')

    # Fill within each chromosome only, so the "previous"/"next" TSS can never
    # be taken from a neighbouring chromosome in the sorted table.
    per_chr = tss_coordinate.groupby(merged['chr'])
    dist_to_before_tss = (merged['coordinate']-per_chr.ffill()).abs()
    dist_to_after_tss = (merged['coordinate']-per_chr.bfill()).abs()

    # Row-wise min skips NaN, so a site with a TSS on only one side still gets
    # that side's distance.
    merged['dist_to_nearest_tss'] = pd.concat(
        [dist_to_before_tss,dist_to_after_tss],axis=1).min(axis=1)

    merged = merged.drop(columns=['strand'])
    # Rows without an id are TSS-only rows introduced by the outer merge.
    merged = merged.dropna(axis=0,subset=['id'])
    return merged

def rename_features(x):   #rename repetitive features
    """Make the column labels of ``x`` unique, in place.

    The first occurrence of a label is kept as is. Each later occurrence gets
    an integer suffix appended (``name1``, ``name2``, ...). If a suffixed name
    is already used by another column, the next free integer is used instead.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose string column labels may contain repeats. It is modified
        in place.

    Returns
    -------
    None
    """
    # Plain int counters keep the suffix an integer ("name1"), independent of
    # which default dtype pandas picks for an empty Series.
    occurrences = {}
    taken = set(x.columns)
    features = []
    for name in x.columns:
        count = occurrences.get(name, 0)
        if count == 0:
            occurrences[name] = 1
            features.append(name)
            continue
        candidate = name+str(count)
        # Skip suffixes that would clash with an existing or generated label.
        while candidate in taken:
            count += 1
            candidate = name+str(count)
        taken.add(candidate)
        occurrences[name] = count+1
        features.append(candidate)
    x.columns = features
    return

In [3]:
# Dataset whose features are assembled in this notebook.
dataset = 'Cd'
if dataset == 'AD_CpG':
    # For AD_CpG the data folder is further qualified by two settings read
    # from `commons`.
    type_name = commons.type_name  ## amyloid, cerad, tangles
    with_cell_type = commons.with_cell_type ## with or without
    dataset = '/'.join([dataset, type_name+with_cell_type])

# Read the site table from its HDF5 store and give it a fresh 0..n-1 index.
sites_store_path = home+'data/'+dataset+'/all_sites_winid'
with pd.HDFStore(sites_store_path,'r') as h5s:
    all_sites = h5s['all_sites_winid']
all_sites = all_sites.reset_index(drop=True)

In [26]:
# Show only the first rows; the full table is too large to be a useful output.
all_sites.head(10)

Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label,winid,A549,Astrocy,...,NCFF774VLD_WGBS_counts,NCFF795DNO_WGBS_counts,NCFF801OHX_WGBS_counts,NCFF811QOG_WGBS_counts,NCFF831OYO_WGBS_counts,NCFF843SYR_WGBS_counts,NCFF847OWL_WGBS_counts,NCFF874GGB_WGBS_counts,NCFF913ZNZ_WGBS_counts,NCFF923CZC_WGBS_counts
0,cg23440882,1,875880,0.0238,0.358200,0.030570,0,4380,9,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,cg24685837,1,982225,0.0459,0.169100,0.604002,0,4912,5,1,...,64.0,0.0,32.0,12.0,14.0,6.0,13.0,21.0,5.0,5.0
2,cg02494066,1,983386,0.0421,0.238000,0.587702,0,4917,4,3,...,41.0,1.0,31.0,8.0,8.0,9.0,11.0,9.0,2.0,6.0
3,cg20685419,1,1007730,0.0436,0.229800,0.443294,0,5039,4,4,...,53.0,1.0,46.0,4.0,7.0,5.0,0.0,6.0,1.0,6.0
4,cg00305285,1,1017115,0.0044,0.926000,0.722598,0,5086,3,5,...,38.0,0.0,45.0,4.0,15.0,5.0,13.0,8.0,6.0,7.0
5,cg15207999,1,1021210,0.0071,0.810100,0.802394,0,5107,3,3,...,71.0,5.0,58.0,13.0,16.0,21.0,16.0,31.0,13.0,12.0
6,cg05929553,1,1086836,0.0199,0.577400,0.835167,0,5435,2,2,...,75.0,0.0,43.0,4.0,5.0,10.0,8.0,18.0,5.0,4.0
7,cg07115976,1,1155731,-0.1745,0.000138,0.906688,-1,5779,8,2,...,47.0,1.0,44.0,16.0,12.0,9.0,11.0,4.0,13.0,11.0
8,cg00211609,1,1178039,0.0233,0.451700,0.402827,0,5891,5,0,...,87.0,0.0,58.0,4.0,11.0,3.0,10.0,5.0,12.0,6.0
9,cg02136596,1,1384930,0.0116,0.787200,0.776592,0,6925,0,6,...,7.0,10.0,12.0,69.0,38.0,57.0,27.0,22.0,46.0,36.0


In [5]:
# Directory holding the per-source feature tables for the selected dataset.
feature_dir = home+'data/features/'+dataset+'/'
# Feature tables are the files whose name ends in "all.csv"; the dot is
# escaped so it only matches a literal ".".
pattern = r'.*all\.csv$'
reg = re.compile(pattern)
# os.listdir returns entries in arbitrary order. Sorting fixes the order in
# which feature columns are appended, so column order is the same on every run.
files = sorted(name for name in os.listdir(feature_dir) if reg.search(name))




In [6]:
# Read every feature table first and append them column-wise in one concat;
# concatenating inside the loop would copy the growing frame on every pass.
feature_frames = []
for file in files:
    feature = pd.read_csv(feature_dir+file)
    print(len(feature.columns))
    # Column-wise concat aligns on the row index, so a table with a different
    # number of rows would be silently padded with NaN instead of failing.
    if len(feature) != len(all_sites):
        raise ValueError('%s has %d rows, expected %d'
                         % (file, len(feature), len(all_sites)))
    feature_frames.append(feature)
all_sites = pd.concat([all_sites]+feature_frames,axis=1)

31
267
317
73
80
735
303


In [7]:
# Make repeated column labels unique by suffixing later occurrences.
# rename_features changes all_sites in place and returns nothing.
rename_features(all_sites)

In [8]:
# (rows, columns) of the site table after the feature files were appended.
all_sites.shape

(3008, 1816)

In [12]:
# `re` is already imported in the notebook's first cell, so it is not
# imported again here.
# Array of the current column labels, used for the pattern search below.
columns = all_sites.columns.values

In [25]:
# Print every column label that contains "WGBS", followed by how many there are.
pattern = re.compile(r'.*WGBS.*')
i = 0
for col in columns:
    if pattern.search(col) is None:
        continue
    print(col)
    i += 1
print(i)

ENCFF003JVR_WGBS_counts
ENCFF043NUK_WGBS_counts
ENCFF064GJQ_WGBS_counts
ENCFF092FNE_WGBS_counts
ENCFF103DNU_WGBS_counts
ENCFF116DGM_WGBS_counts
ENCFF121VIX_WGBS_counts
ENCFF121ZES_WGBS_counts
ENCFF164EAU_WGBS_counts
ENCFF168HTX_WGBS_counts
ENCFF179VKR_WGBS_counts
ENCFF189WPY_WGBS_counts
ENCFF200MJQ_WGBS_counts
ENCFF210XTE_WGBS_counts
ENCFF219GCQ_WGBS_counts
ENCFF223LJW_WGBS_counts
ENCFF247ILV_WGBS_counts
ENCFF254DBF_WGBS_counts
ENCFF266NGW_WGBS_counts
ENCFF279HCL_WGBS_counts
ENCFF297CJG_WGBS_counts
ENCFF303ZGP_WGBS_counts
ENCFF315ZJB_WGBS_counts
ENCFF318AMC_WGBS_counts
ENCFF331VRY_WGBS_counts
ENCFF333OHK_WGBS_counts
ENCFF355UVU_WGBS_counts
ENCFF366UWF_WGBS_counts
ENCFF428TVT_WGBS_counts
ENCFF459EEM_WGBS_counts
ENCFF477AUC_WGBS_counts
ENCFF477GKI_WGBS_counts
ENCFF479QJK_WGBS_counts
ENCFF487XOB_WGBS_counts
ENCFF489CEV_WGBS_counts
ENCFF500DKA_WGBS_counts
ENCFF510EMT_WGBS_counts
ENCFF511FUP_WGBS_counts
ENCFF513ITC_WGBS_counts
ENCFF536RSX_WGBS_counts
ENCFF545MIY_WGBS_counts
ENCFF550FZT_WGBS

In [10]:
# Remove the 'start' and 'end' columns from the site table.
all_sites = all_sites.drop(columns=['start','end'])

In [11]:
additional_features = ['ATAC','CADD','DANN','Eigen','GenoCanyon','RNASeq','WGBS']
#merge with additional features
# Collect all tables from the store, then append them in a single concat
# rather than re-copying all_sites once per feature group.
# NOTE(review): 'addtional_features' (sic) is kept as is because it names the
# store on disk; rename the file and this string together if that is fixed.
feature_frames = []
with pd.HDFStore(feature_dir+'addtional_features','r') as h5s:
    for feature in additional_features:
        feature_frames.append(h5s[feature])
all_sites = pd.concat([all_sites]+feature_frames,axis=1)
# Where a column label occurs more than once, keep only the first occurrence.
all_sites = all_sites.loc[:,~all_sites.columns.duplicated()]
all_sites['chr'] = all_sites['chr'].astype('i8')

In [11]:
# Preview the first 1950 columns. `.ix` has been removed from pandas; `.iloc`
# is the positional equivalent of this slice. Only the first rows are shown.
all_sites.iloc[:,:1950].head(10)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


Unnamed: 0,id,chr,coordinate,beta_sign,pvalue,beta,label,winid,A549,Astrocy,...,ENCFF723ZMR_RNASeq_counts,ENCFF301ROZ_RNASeq_counts,ENCFF888ZFS_RNASeq_counts,ENCFF105THO_RNASeq_counts,ENCFF760IDU_RNASeq_counts,ENCFF624VBI_RNASeq_counts,ENCFF552FTX_RNASeq_counts,ENCFF623UTC_RNASeq_counts,ENCFF535JQR_RNASeq_counts,ENCFF003JVR_WGBS_counts
0,cg23440882,1,875880,0.0238,0.358200,0.030570,0,4380,9,5,...,0.0,0.0,0.0,0.0,0.0,6.0,0.0,14.0,0.0,0.0
1,cg24685837,1,982225,0.0459,0.169100,0.604002,0,4912,5,1,...,1540.0,12.0,0.0,147.0,0.0,93.0,36.0,179.0,0.0,1.0
2,cg02494066,1,983386,0.0421,0.238000,0.587702,0,4917,4,3,...,369.0,5.0,0.0,69.0,0.0,48.0,28.0,74.0,0.0,3.0
3,cg20685419,1,1007730,0.0436,0.229800,0.443294,0,5039,4,4,...,0.0,0.0,0.0,0.0,2.0,9.0,0.0,1.0,1.0,2.0
4,cg00305285,1,1017115,0.0044,0.926000,0.722598,0,5086,3,5,...,0.0,0.0,0.0,26.0,1.0,3.0,3.0,3.0,1.0,4.0
5,cg15207999,1,1021210,0.0071,0.810100,0.802394,0,5107,3,3,...,225.0,23.0,1.0,173.0,11.0,80.0,36.0,27.0,7.0,7.0
6,cg05929553,1,1086836,0.0199,0.577400,0.835167,0,5435,2,2,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,5.0,6.0
7,cg07115976,1,1155731,-0.1745,0.000138,0.906688,-1,5779,8,2,...,37.0,2.0,0.0,59.0,2.0,19.0,7.0,33.0,1.0,7.0
8,cg00211609,1,1178039,0.0233,0.451700,0.402827,0,5891,5,0,...,0.0,0.0,0.0,2.0,0.0,16.0,0.0,10.0,1.0,2.0
9,cg02136596,1,1384930,0.0116,0.787200,0.776592,0,6925,0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
#nearest tss distance    
# Distinct chromosome values present in the site table.
chrs = all_sites['chr'].unique()
cols=['chr', 'coordinate','strand']
# Whitespace-separated TSS table; the first line is skipped and the three
# columns are named explicitly. The separator is a raw string so that "\s" is
# not treated as a (invalid) string escape.
tss =  pd.read_csv(home+'tss.txt',sep=r'\s+',header=None,names=cols,skiprows=1)
# NOTE(review): presumably maps chromosome labels to the numeric form used in
# all_sites['chr'] -- confirm against get_winid.convert_chr_to_num.
tss = get_winid.convert_chr_to_num(tss,chrs)
tss = tss.sort_values(['chr','coordinate'])
# Adds the 'dist_to_nearest_tss' column to the site table.
all_sites = nearest_tss(tss,all_sites)


['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [13]:
# Write the assembled feature table to a new HDF5 store, replacing any
# existing file at that path (mode 'w').
features_store_path = home+'data/'+dataset+'/all_features'
with pd.HDFStore(features_store_path,'w') as h5s:
    h5s.put('all_features', all_sites)