## Get coordinates of fragile sites, centromeres and telomeres
- Telomeres are all 10,000 bp (0.01 Mb) long
- Centromeres are all 3,000,000 bp (3 Mb) long

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import os

def get_data_path(folders, fname):
    """Return the normalized path to ``fname`` below ``$THIRD_PARTY_DIR``.

    ``folders`` is an iterable of sub-directory names that are joined with
    '/' between the root directory and the file name. Raises ``KeyError``
    when the ``THIRD_PARTY_DIR`` environment variable is not set.
    """
    root = os.environ['THIRD_PARTY_DIR']
    subdir = '/'.join(folders)
    return os.path.normpath('/'.join((root, subdir, fname)))


def get_local_data_path(folders, fname):
    """Return the normalized path to ``fname`` below ``../local_data``.

    ``folders`` is an iterable of sub-directory names that are joined with
    '/' between ``../local_data`` and the file name.
    """
    subdir = '/'.join(folders)
    return os.path.normpath('/'.join(('../local_data', subdir, fname)))

# Input: third-party reference files, resolved through get_data_path
file_pcawg_fsites = get_data_path(['PCAWG'], 'major_fragile_sites.txt')
file_genome_gaps = get_data_path(['UCSC_genome'], 'hg19_gaps.txt')
file_chr_sizes = get_data_path(['UCSC_genome'], 'hg19.chrom.sizes.txt')

# Output: CSV files written by this notebook into the local 'processed' folder
file_chrom_info = get_local_data_path(['processed'], 'hg19_chrom_info.csv')
file_fragsites = get_local_data_path(['processed'], 'hg19_fragile_sites.csv')

### Fragile sites

In [2]:
# Load the space-separated PCAWG fragile-site table.
raw_fragsites = pd.read_csv(file_pcawg_fsites, sep=' ')

# Drop chrX rows, then turn labels such as 'chr3' into the integer 3
# (the first three characters are stripped before the int conversion).
non_x_fragsites = raw_fragsites.loc[raw_fragsites['chrom'] != 'chrX']
non_x_fragsites = non_x_fragsites.assign(
    chrom=non_x_fragsites['chrom'].apply(lambda label: int(label[3:]))
)

# Update PARK2 symbol to PRKN (manually checked this in HGNC)
column_names = {'start': 'startpos', 'end': 'endpos', 'chrom': 'chr'}
pcawg_fragsites = (
    non_x_fragsites
    .rename(columns=column_names)
    .replace({'PARK2': 'PRKN'})
)

print('N major fragile sites in PCAWG (non-X):', len(pcawg_fragsites))

# Write without the row index.
pcawg_fragsites.to_csv(file_fragsites, index=False)

N major fragile sites in PCAWG (non-X): 15


In [3]:
# Display the processed fragile-site table as the cell output.
pcawg_fragsites

Unnamed: 0,chr,startpos,endpos,width,gene_name,CFS_name
0,1,71800000,72800000,1000001,NEGR1,FRA1L
1,1,245800000,246800000,1000001,SMYD3,FRA1I
2,2,140900000,143000000,2100001,LRP1B,FRA2F
3,2,205300000,206600000,1300001,PARD3B,FRA2I
4,3,59600000,61300000,1700001,FHIT,FRA3B
5,3,115400000,117800000,2400001,LSAMP,FRA3L
6,3,174100000,175600000,1500001,NAALADL2,FRA3O
7,4,90900000,92600000,1700001,CCSER1,FRA4F
8,5,58200000,59900000,1700001,PDE4D,FRA5H
9,6,161700000,163200000,1500001,PRKN,FRA6E


### Centromeres + telomeres

In [4]:
# Load the tab-separated gap table and keep only the columns used below.
gap_columns = ['chrom', 'chromStart', 'chromEnd', 'type']
genome_gaps = pd.read_csv(file_genome_gaps, sep='\t')[gap_columns]

# Keep telomere and centromere gaps only. The explicit .copy() makes the
# filtered frame independent of the full table, so the column assignment
# below does not write into a slice (avoids pandas' SettingWithCopyWarning).
is_telo_or_cen = genome_gaps['type'].isin(['telomere', 'centromere'])
genome_gaps = genome_gaps[is_telo_or_cen].copy()

# Drop the first three characters of each chromosome label
# (presumably the 'chr' prefix, e.g. 'chr1' -> '1'); labels stay strings.
genome_gaps['chrom'] = genome_gaps['chrom'].str[3:]

# Split by gap type, renumbering rows from 0 in each table.
centromeres = genome_gaps[genome_gaps['type'] == 'centromere'].reset_index(drop=True)
telomeres = genome_gaps[genome_gaps['type'] == 'telomere'].reset_index(drop=True)

In [5]:
# Preview the first centromere row as the cell output.
centromeres[:1]

Unnamed: 0,chrom,chromStart,chromEnd,type
0,1,121535434,124535434,centromere


In [6]:
# Read the first 24 rows of the header-less, tab-separated chromosome size
# table, then drop the first three characters of every chromosome label.
chr_sizes = (
    pd.read_csv(file_chr_sizes, sep='\t', names=['chrom', 'size'], nrows=24)
    .assign(chrom=lambda sizes: sizes['chrom'].apply(lambda label: label[3:]))
)

In [7]:
# Sanity check: after renaming 'chromEnd' to 'size', the merge joins on the
# shared columns 'chrom' and 'size', so it keeps only telomeres that end
# exactly at the chromosome length.
telomere_ends = telomeres.rename(columns={'chromEnd': 'size'})
matched_ends = pd.merge(chr_sizes, telomere_ends)
assert len(matched_ends) == 23  # chr 17 missing from telomeres

In [8]:
# Combine centromere coordinates with chromosome sizes (joined on the shared
# 'chrom' column), drop the gap type, and index the table by chromosome.
centromere_columns = {'chromStart': 'centStart', 'chromEnd': 'centEnd', 'chrom': 'chr'}
chrom_info = (
    centromeres
    .merge(chr_sizes)
    .rename(columns=centromere_columns)
    .drop(columns=['type'])
    .set_index('chr')
)
chrom_info.head(2)

Unnamed: 0_level_0,centStart,centEnd,size
chr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,121535434,124535434,249250621
2,92326171,95326171,243199373


In [9]:
# Save the chromosome table as CSV at the configured output path.
chrom_info.to_csv(file_chrom_info)