## TCGA ASCAT results

Data downloaded from: https://github.com/VanLoo-lab/ascat/tree/master/ReleasedData/TCGA_SNP6_hg19

These calls are all based on **hg19/GRCh37** coordinates.

From CCDS Readme:  
- cds_from = chromosome position of CDS start (or CDS end for minus strand genes), in 0-based coordinates      
- cds_to = chromosome position of CDS end (or CDS start for minus strand genes), in 0-based coordinates  
- cds_locations = comma separated list of from-to, of chromosome positions of CDS exons, in 0-based coordinates, ordered from low to high

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import os
import matplotlib.pyplot as plt

import processing_utils as util
%load_ext autoreload
%autoreload 1
%aimport processing_utils

def get_data_path(folders, fname):
    """Return the normalised path to `fname` inside `folders` under $THIRD_PARTY_DIR."""
    root = os.environ['THIRD_PARTY_DIR']
    return os.path.normpath(root + '/' + '/'.join(folders) + '/' + fname)


def get_local_data_path(folders, fname):
    """Return the normalised path to `fname` inside `folders` under ../local_data."""
    return os.path.normpath('../local_data/' + '/'.join(folders) + '/' + fname)


# INPUT
# ASCAT release: one segments file per sample plus a cohort-level summary table
folder_ascat_segments = get_data_path(['TCGA', 'ASCATv3_SNP6_hg19'], 'segments')
file_ascat_summary = get_data_path(['TCGA', 'ASCATv3_SNP6_hg19'], 'summary.ascatv3TCGA.penalty70.hg19.tsv')
# Gene annotations and per-chromosome centromere/size table
file_ccds = get_data_path(['ccds_gene_annotations'], 'hg19.txt')
file_chrom_info = get_local_data_path(['processed'], 'hg19_chrom_info.csv')

# OUTPUT
file_ascat_homdels = get_local_data_path(['processed', 'TCGA', 'ASCAT'], 'homdels.csv')
file_ascat_loh = get_local_data_path(['processed', 'TCGA', 'ASCAT'], 'LOH.csv')
file_ascat_segment_bounds = get_local_data_path(['processed', 'TCGA', 'ASCAT'], 'segment_boundaries.csv')

### ASCAT summary

In [2]:
# Cohort-level ASCAT summary: one row per sample (tab-separated file)
ascat_summary_full = pd.read_csv(file_ascat_summary, sep='\t')
ascat_summary_full.head(1)

Unnamed: 0,name,patient,cancer_type,sex,barcodeTumour,barcodeNormal,tumour_mapd,normal_mapd,GC_correction_before,GC_correction_after,...,homdel_segs,homdel_largest,homdel_size,homdel_fraction,LOH,mode_minA,mode_majA,WGD,GI,QC
0,TCGA-02-0001,TCGA-02-0001,GBM,XX,TCGA-02-0001-01C-01D-0182-01,TCGA-02-0001-10A-01D-0182-01,0.2828,0.2716,50bp=0.1387 / 5kb=0.12,50bp=0 / 5kb=0,...,1,351838,351838,0.00012,0.2042,1,1,0,0.6196,Pass


In [3]:
# Keep only samples whose ASCAT fit passed QC
passed_qc = ascat_summary_full['QC'] == 'Pass'
ascat_summary = ascat_summary_full[passed_qc]
print('N samples with solution:', len(ascat_summary), '/', len(ascat_summary_full))
# Each retained row must correspond to a distinct patient
assert ascat_summary['patient'].nunique() == len(ascat_summary)

N samples with solution: 9966 / 10674


### Extract homozygous deletion segments
All segments with total copy number 0 (nMajor + nMinor = 0).

In [7]:
# Collect, per QC-passing sample, the segments whose total copy number is zero
homdels_list = []
for i, sname in enumerate(ascat_summary['name'].values):
    print(i, end='\r')  # lightweight progress counter
    segments = pd.read_csv(f'{folder_ascat_segments}/{sname}.segments.txt', sep='\t')
    # Total copy number = major + minor allele copies
    segments['CN'] = segments['nMajor'] + segments['nMinor']
    homdels_list.append(segments.loc[segments['CN'] == 0])

9965

In [8]:
# Stack per-sample deletions and align the sample identifier column with the summary table
homdel_segments = (
    pd.concat(homdels_list)
      .reset_index(drop=True)
      .rename(columns={'sample': 'name'})
)
# Annotate each deletion with patient / cancer type / tumour barcode (joined on the shared columns)
sample_annotations = ascat_summary[['name', 'patient', 'cancer_type', 'barcodeTumour']]
homdels_all = sample_annotations.merge(homdel_segments)
# Segment length in bp
homdels_all['del_len'] = homdels_all['endpos'] - homdels_all['startpos']

In [9]:
# Preview the first two annotated homozygous deletions
homdels_all.head(2)

Unnamed: 0,name,patient,cancer_type,barcodeTumour,chr,startpos,endpos,nMajor,nMinor,CN,del_len
0,TCGA-02-0001,TCGA-02-0001,GBM,TCGA-02-0001-01C-01D-0182-01,9,21907786,22259623,0,0,0,351837
1,TCGA-02-0006,TCGA-02-0006,GBM,TCGA-02-0006-01B-01D-0182-01,10,89335769,90811390,0,0,0,1475621


In [10]:
# Sanity check: per-patient count of extracted HD segments must equal the count in the summary report
print('N HDs:', len(homdels_all))
observed_counts = homdels_all.groupby('patient').size().reset_index(name='n_segs')
reported_counts = ascat_summary[['patient', 'homdel_segs']]
# Right join keeps patients without any extracted HD; their missing count becomes 0
qc_check = observed_counts.merge(reported_counts, how='right').fillna(0).astype({'n_segs': int})
mismatches = qc_check[qc_check.n_segs != qc_check.homdel_segs]
assert mismatches.shape[0] == 0

N HDs: 13692


In [12]:
# Save homozygous deletions without the copy-number columns (CN is 0 for every row by construction).
# Create the output folder first so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(file_ascat_homdels), exist_ok=True)
homdels_all.drop(columns=['nMajor', 'nMinor', 'CN']).to_csv(file_ascat_homdels, index=False)

### Extract all LOH segments (minor CN = 0; major CN > 0)

In [13]:
# Collect, per QC-passing sample, the segments with one allele lost and the other retained
loh_list = []
for i, sname in enumerate(ascat_summary['name'].values):
    print(i, end='\r')  # lightweight progress counter
    segments = pd.read_csv(f'{folder_ascat_segments}/{sname}.segments.txt', sep='\t')
    minor_lost = segments['nMinor'] == 0
    major_kept = segments['nMajor'] > 0
    loh_list.append(segments[minor_lost & major_kept])

9965

In [14]:
# Stack the per-sample LOH segments into one table with a fresh 0..n-1 index
loh = pd.concat(loh_list, ignore_index=True)

In [15]:
# Attach patient and ploidy to each LOH segment (joined on the shared columns)
sample_ploidy = ascat_summary[['name', 'patient', 'ploidy']]
loh = sample_ploidy.merge(loh.rename(columns={'sample': 'name'}))
# Keep autosomes only, so the chromosome label can be stored as an integer
on_sex_chrom = loh.chr.isin(['X', 'Y'])
loh = loh[~on_sex_chrom].astype({'chr': int}).reset_index(drop=True)
# Segment length in bp
loh['del_len'] = loh['endpos'] - loh['startpos']

In [16]:
# Report how many LOH segments were kept and how many patients contribute at least one
n_loh_segments = len(loh)
n_loh_patients = loh['patient'].nunique()
print('N LOH segments:', n_loh_segments)
print('N samples w/ LOH segments:', n_loh_patients)

N LOH segments: 250410
N samples w/ LOH segments: 9713


In [17]:
# Save the autosomal LOH segments.
# Create the output folder first so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(file_ascat_loh), exist_ok=True)
loh.to_csv(file_ascat_loh, index=False)

### Identify ASCAT segment boundaries - telomere and centromere adjacent segments
- First segment start + last segment end for each chromosome.  
- Expect this to be the same for each sample, bounded by the SNP array, but check!   
- NOTE: short arm not mapped for chromosomes 13,14,15,21,22

In [18]:
# Per-chromosome reference table, indexed by its first column (chr);
# the centStart / centEnd columns are used below to locate centromere-adjacent segments
chrom_info = pd.read_csv(file_chrom_info, index_col=0)
chrom_info.head(1)

Unnamed: 0_level_0,centStart,centEnd,size
chr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,121535434,124535434,249250621


In [19]:
# One small (chr, position) table per sample is collected for each boundary type;
# the lists are concatenated in the next cell.
chr_start, chr_end, cent_start, cent_end = [], [], [], []

for i, sname in enumerate(ascat_summary['name'].values):
    print(i, end='\r')  # lightweight progress counter
    segments = pd.read_csv(f'{folder_ascat_segments}/{sname}.segments.txt', sep='\t')
    by_chr = segments.groupby('chr')

    # Per chromosome: largest endpos that does not exceed the centromere start
    # (NaN when no segment ends at or before the centromere start)
    last_before_cent = by_chr.apply(
        lambda grp: grp.endpos[grp.endpos <= chrom_info.loc[grp.name, 'centStart']].max()
    )
    cent_start.append(last_before_cent.reset_index(name='endpos'))

    # Per chromosome: smallest startpos that is not below the centromere end
    first_after_cent = by_chr.apply(
        lambda grp: grp.startpos[grp.startpos >= chrom_info.loc[grp.name, 'centEnd']].min()
    )
    cent_end.append(first_after_cent.reset_index(name='startpos'))

    # Chromosome edges: overall smallest startpos / largest endpos
    chr_start.append(by_chr.startpos.min().reset_index())
    chr_end.append(by_chr.endpos.max().reset_index())

9965

In [20]:
# Stack the per-sample boundary tables into one long table per boundary type
chr_start_posns, chr_end_posns, cent_start_posns, cent_end_posns = (
    pd.concat(per_sample_tables, ignore_index=True)
    for per_sample_tables in (chr_start, chr_end, cent_start, cent_end)
)

In [21]:
# List the distinct first-start / last-end positions seen across samples for each autosome;
# a single value per chromosome means every sample shares the same boundaries
for chrom in map(str, range(1, 23)):
    distinct_starts = chr_start_posns.loc[chr_start_posns.chr == chrom, 'startpos'].unique()
    distinct_ends = chr_end_posns.loc[chr_end_posns.chr == chrom, 'endpos'].unique()
    print('Chr', chrom, '\tstart:', distinct_starts,
          '   \tend:', distinct_ends)

Chr 1 	start: [61735]    	end: [249224388]
Chr 2 	start: [12784]    	end: [243089456]
Chr 3 	start: [60345]    	end: [197896118]
Chr 4 	start: [12281]    	end: [191027923]
Chr 5 	start: [15532]    	end: [180790320]
Chr 6 	start: [149661]    	end: [171051005]
Chr 7 	start: [43259]    	end: [159127004]
Chr 8 	start: [31254]    	end: [146298155]
Chr 9 	start: [46587]    	end: [141091394]
Chr 10 	start: [72759]    	end: [135506704]
Chr 11 	start: [198572]    	end: [134944770]
Chr 12 	start: [150442]    	end: [133778189]
Chr 13 	start: [19026949]    	end: [115108397]
Chr 14 	start: [19002124]    	end: [107285437]
Chr 15 	start: [20016328]    	end: [102469040]
Chr 16 	start: [60777]    	end: [90287535]
Chr 17 	start: [526]    	end: [81049726]
Chr 18 	start: [48133]    	end: [78015057]
Chr 19 	start: [90910]    	end: [59097854]
Chr 20 	start: [61305]    	end: [62956153]
Chr 21 	start: [10736871]    	end: [48096957]
Chr 22 	start: [16052528]    	end: [51234455]


In [22]:
# Chromosome edges across all samples: earliest segment start and latest segment end
first_start = chr_start_posns.groupby('chr').startpos.min().reset_index()
last_end = chr_end_posns.groupby('chr').endpos.max().reset_index()
bounds = first_start.merge(last_end)

# Centromere-adjacent positions across all samples
# No segments before centromere on chromosomes 13, 14, 15, 22
before_cent = cent_start_posns.groupby('chr').endpos.max().rename('cent_start').reset_index()
after_cent = cent_end_posns.groupby('chr').startpos.min().rename('cent_end').reset_index()
cent_bounds = before_cent.merge(after_cent)

bounds = bounds.merge(cent_bounds)
# Autosomes only, ordered numerically and indexed by chromosome
on_sex_chrom = bounds.chr.isin(['X', 'Y'])
bounds = bounds[~on_sex_chrom].astype({'chr': int}).sort_values('chr').set_index('chr')

In [23]:
# Display the assembled per-chromosome segment boundary table
bounds

Unnamed: 0_level_0,startpos,endpos,cent_start,cent_end
chr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,61735,249224388,121482979.0,144007049.0
2,12784,243089456,92305784.0,95327887.0
3,60345,197896118,90502862.0,93519478.0
4,12281,191027923,49658612.0,52685699.0
5,15532,180790320,46389273.0,49432831.0
6,149661,171051005,58774716.0,61886440.0
7,43259,159127004,58019983.0,61063974.0
8,31254,146298155,43824048.0,46847534.0
9,46587,141091394,47217176.0,65616971.0
10,72759,135506704,39076221.0,42433540.0


In [25]:
# Save the per-chromosome segment boundaries (the index is written as the first column).
# Create the output folder first so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(file_ascat_segment_bounds), exist_ok=True)
bounds.to_csv(file_ascat_segment_bounds)