## PCAWG consensus CNA

Downloaded from https://dcc.icgc.org/releases/PCAWG/consensus_cnv/

The donor table was downloaded from the main PCAWG publication.

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import os
import matplotlib.pyplot as plt
import tarfile

import processing_utils as util
%load_ext autoreload
%autoreload 1
%aimport processing_utils

def get_data_path(folders, fname):
    """Return the normalised path of `fname` below the THIRD_PARTY_DIR root.

    `folders` is a sequence of sub-directory names; they are joined with '/'
    between the directory named by the THIRD_PARTY_DIR environment variable
    and the file name.
    """
    root = os.environ['THIRD_PARTY_DIR']
    return os.path.normpath(root + '/' + '/'.join(folders) + '/' + fname)


def get_local_data_path(folders, fname):
    """Return the normalised path of `fname` below the relative '../local_data/' folder."""
    return os.path.normpath('../local_data/' + '/'.join(folders) + '/' + fname)

# Input: segment archive, donor table and purity/ploidy table, all under <THIRD_PARTY_DIR>/PCAWG
folder_pcawg_icgc_segments = get_data_path(folders=['PCAWG'], fname='consensus.20170119.somatic.cna.icgc.public.tar.gz')
file_pcawg_samples = get_data_path(folders=['PCAWG'], fname='pcawg_donors_tableS1.xlsx')
file_pcawg_sample_info = get_data_path(folders=['PCAWG'], fname='consensus.20170218.purity.ploidy.txt')

# Output: processed tables written under ../local_data/processed/ICGC
file_pcawg_icgc_homdels = get_local_data_path(folders=['processed','ICGC'], fname='homdels.csv')
file_pcawg_icgc_loh = get_local_data_path(folders=['processed','ICGC'], fname='LOH.csv')

### PCAWG samples + purity/ploidy
2,583 samples had data of optimal quality (white-listed donors)

In [5]:
# Donor table; the two rows above the header row of the spreadsheet are skipped
sample_list = pd.read_excel(file_pcawg_samples, skiprows=2, engine='openpyxl')
print('Total:', len(sample_list))
print('TCGA:', sample_list['tcga_donor_uuid'].nunique())
# Donors without a TCGA uuid are counted by their ICGC donor id
print('non-TCGA:', sample_list.loc[sample_list['tcga_donor_uuid'].isna(), 'icgc_donor_id'].nunique())
sample_list.head(1)

Total: 2583
TCGA: 801
non-TCGA: 1782


Unnamed: 0,tumour_specimen_aliquot_id,normal_specimen_aliquot_id,icgc_donor_id,icgc_sample_id,icgc_specimen_id,dcc_specimen_type,project_code,gender,age,organ_system,...,specimen_donor_treatment_type,donor_wgs_included_excluded,specimen_library_strategy,gain_count,loss_count,hd_count,wgd,all.SNVs,all.MNVs,all.Indels
0,0009b464-b376-4fbc-8a56-da538269a02f,5ef2ed4d-464e-4a51-95e0-401d9ae7be86,DO46416,SA505245,SP101724,Recurrent tumour - other,OV-AU,female,54.0,OVARY,...,other therapy,Included,WGS,288,349,0,1,15273,225,670


In [6]:
# Purity/ploidy table (tab-separated). Note: this includes grey list samples
sample_info = pd.read_csv(file_pcawg_sample_info, delimiter='\t')
print('N samples:', sample_info['samplename'].nunique())
sample_info.head(1)

N samples: 2778


Unnamed: 0,samplename,purity,ploidy,purity_conf_mad,wgd_status,wgd_uncertain
0,0009b464-b376-4fbc-8a56-da538269a02f,0.885,3.355,0.039,wgd,False


### Extract homozygous deletion segments
All segments with copy number 0 (major and minor copy number both 0), except those on the X and Y chromosomes.

In [4]:
# Collect, for every per-sample segment file in the archive, the segments whose
# major and minor copy number are both 0, tagged with the sample name.
cols = ['chromosome','start','end','total_cn','major_cn','minor_cn','star']
homdels_list = []
n=0
with tarfile.open(folder_pcawg_icgc_segments, "r:*") as tar:
    for member in tar.getmembers():
        # Only regular files carry data; extractfile() returns None for other entries
        if not member.isfile():
            continue
        print(n, end='\r')
        # Hand over the TarInfo itself so the archive is not searched by name again
        sample = pd.read_csv(tar.extractfile(member), header=0, sep="\t")
        # Sample name = file name without the fixed suffix (and without any directory prefix)
        sample_name = os.path.basename(member.name).replace('.consensus.20170119.somatic.cna.txt', '')
        is_homdel = (sample.major_cn==0) & (sample.minor_cn==0)
        homdels_list.append(sample[is_homdel][cols].assign(samplename=sample_name))
        n += 1

1949

In [5]:
# Stack the per-sample tables, standardise the column names, and keep autosomes only
homdels = (
    pd.concat(homdels_list)
      .reset_index(drop=True)
      .drop(columns=['major_cn','minor_cn','star'])
      .rename(columns={'chromosome':'chr','start':'startpos','end':'endpos'})
      .astype({'startpos':int,'endpos':int})
      .loc[lambda segs: ~segs['chr'].isin(['X','Y'])]
      .astype({'chr':int})
      .reset_index(drop=True)
)
# Only including white listed donors
homdels_whitelisted = homdels['samplename'].isin(sample_list.tumour_specimen_aliquot_id)
print('Non white listed:', homdels.loc[~homdels_whitelisted, 'samplename'].nunique())
homdels = homdels.loc[homdels_whitelisted].reset_index(drop=True)

Non white listed: 75


In [9]:
# Merge consecutive segments
def merge_consecutive_segments(x):
    """Collapse runs of directly adjacent segments into single segments.

    Two neighbouring rows belong to the same run when the earlier row's
    `endpos` equals the later row's `startpos - 1`. Each run is reduced to its
    first row, with `endpos` replaced by the `endpos` of the run's last row;
    all other columns keep the values of the first row.

    Parameters
    ----------
    x : pandas.DataFrame
        Segments with `startpos` and `endpos` columns. Only neighbouring rows
        are compared, so the rows must already be in positional order.

    Returns
    -------
    pandas.DataFrame
        One row per run, indexed by the label of the run's first row. Column
        dtypes are preserved and `x` itself is never modified. Frames with
        zero or one row are returned as they are.
    """
    n_rows = x.shape[0]
    if n_rows <= 1:
        return x
    starts = x['startpos'].to_numpy()
    ends = x['endpos'].to_numpy()
    # A row opens a new run unless it starts right after the previous row ends
    opens_run = np.ones(n_rows, dtype=bool)
    opens_run[1:] = ends[:-1] != starts[1:] - 1
    first_rows = np.flatnonzero(opens_run)
    # A run ends on the row just before the next run opens (or on the last row)
    last_rows = np.append(first_rows[1:], n_rows) - 1
    # Work on a copy so the caller's frame is left untouched
    merged = x.iloc[first_rows].copy()
    merged['endpos'] = ends[last_rows]
    return merged

In [7]:
print('N HD segments before merging:', homdels.shape[0])
# Merge adjacent segments separately for every (sample, chromosome) pair.
# The groups are iterated explicitly instead of using groupby().apply():
# apply() operating on the grouping columns is deprecated in recent pandas,
# and samplename/chr have to remain as columns of the merged rows.
homdels_merged = pd.concat(
    [merge_consecutive_segments(segs) for _, segs in homdels.groupby(['samplename','chr'])],
    ignore_index=True,
)
# del_len is the difference between the end and start coordinates of the merged segment
homdels_merged = homdels_merged.assign(del_len = homdels_merged.endpos - homdels_merged.startpos)
print('N HD segments after merging:', homdels_merged.shape[0])

N HD segments before merging: 4471
N HD segments after merging: 3545


In [8]:
# Create the output folder first if needed, so the write does not fail on a missing directory
os.makedirs(os.path.dirname(file_pcawg_icgc_homdels), exist_ok=True)
homdels_merged.to_csv(file_pcawg_icgc_homdels, index=False)

### Extract all LOH segments (minor cn = 0; major cn >0)

In [2]:
# Collect, for every per-sample segment file in the archive, the autosomal
# segments where exactly one of minor_cn / major_cn is 0 and the other is > 0.
cols = ['chromosome','start','end','total_cn','major_cn','minor_cn']
dels_list = []
n=0
with tarfile.open(folder_pcawg_icgc_segments, "r:*") as tar:
    for member in tar.getmembers():
        # Only regular files carry data; extractfile() returns None for other entries
        if not member.isfile():
            continue
        print(n, end='\r')
        # Hand over the TarInfo itself so the archive is not searched by name again
        sample = pd.read_csv(tar.extractfile(member), header=0, sep="\t")
        # Sample name = file name without the fixed suffix (and without any directory prefix)
        sample_name = os.path.basename(member.name).replace('.consensus.20170119.somatic.cna.txt', '')
        sample = sample[~sample['chromosome'].isin(['X','Y'])][cols]
        is_loh = (((sample.minor_cn==0) & (sample.major_cn>0)) |
                  ((sample.major_cn==0) & (sample.minor_cn>0)))
        dels_list.append(sample[is_loh].assign(samplename=sample_name))
        n +=1

1949

In [3]:
# Stack the per-sample LOH tables, then standardise column names and integer dtypes
loh = (
    pd.concat(dels_list, ignore_index=True)
      .rename(columns={'chromosome':'chr','start':'startpos','end':'endpos'})
      .astype({'startpos':int,'endpos':int,'chr':int})
)

In [7]:
# Only include white listed donors
loh_whitelisted = loh['samplename'].isin(sample_list.tumour_specimen_aliquot_id)
print('Non white listed:', loh.loc[~loh_whitelisted, 'samplename'].nunique())
loh = loh.loc[loh_whitelisted]

Non white listed: 155


In [10]:
print('N del segments before merging:', loh.shape[0])
# Merge adjacent segments separately for every (sample, chromosome, minor_cn, major_cn)
# combination, so only neighbours with the same copy-number state are joined.
# The groups are iterated explicitly instead of using groupby().apply():
# apply() operating on the grouping columns is deprecated in recent pandas,
# and the grouping columns have to remain as columns of the merged rows.
loh_merged = pd.concat(
    [merge_consecutive_segments(segs)
     for _, segs in loh.groupby(['samplename','chr','minor_cn','major_cn'])],
    ignore_index=True,
)
# del_len is the difference between the end and start coordinates of the merged segment
loh_merged = loh_merged.assign(del_len = loh_merged.endpos - loh_merged.startpos)
print('N del segments after merging:', loh_merged.shape[0])

N del segments before merging: 110790
N del segments after merging: 59843


In [11]:
# Attach each sample's ploidy; only samples present in both tables are kept
loh_final = loh_merged.merge(sample_info[['samplename','ploidy']], how='inner', on='samplename')

In [12]:
# Create the output folder first if needed, so the write does not fail on a missing directory
os.makedirs(os.path.dirname(file_pcawg_icgc_loh), exist_ok=True)
loh_final.to_csv(file_pcawg_icgc_loh, index=False)