This notebook processes the FACETS copy-number variant (CNV) calls. It merges the primary and relapse calls of each patient to obtain private and shared variants. After manual inspection, the coordinates of each shared variant (START and END) are set to the smallest start and the greatest end position of its overlapping segments. 

In [None]:
import os
import sys

# Put the directory of the running Python interpreter first on PATH, so that
# executables installed next to it are found before any others.
os.environ["PATH"] = os.pathsep.join(
    (os.path.dirname(sys.executable), os.environ["PATH"])
)

In [None]:
import os
import pandas as pd
import numpy as np
import pybedtools
import glob
from aux_functions import process_cnv,stage_mapping

# Show every column and every row when a dataframe is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Do not truncate cell contents. `None` is the supported spelling since
# pandas 1.0 (the old `-1` is rejected by pandas 2.x); releases before 1.0 only
# accept the integer form, so fall back to it when `None` is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

%load_ext autoreload
%autoreload 2

In [None]:
## FUNCTIONS

def alteration(rw):
    """Label one segment with its alteration type, in place.

    Reads ``rw['tcn.em']`` and ``rw['lcn.em']`` and stores the label under
    ``rw['alteration']``:

    * ``'amp'`` when ``tcn.em`` is above 2,
    * ``'del'`` when ``tcn.em`` is below 2,
    * ``'loh'`` when ``tcn.em`` equals 2 and ``lcn.em`` equals 0.

    Any other combination is printed and left without an ``'alteration'``
    entry. The same (mutated) ``rw`` object is returned.
    """
    total_copies = rw['tcn.em']

    if total_copies > 2:
        label = 'amp'
    elif total_copies < 2:
        label = 'del'
    elif total_copies == 2 and rw['lcn.em'] == 0:
        label = 'loh'
    else:
        label = None

    if label is None:
        # Not classifiable: show the row so it can be inspected by hand.
        print(rw)
        return rw

    rw['alteration'] = label
    return rw

# FACETS column names -> the names used in the processed tables.
_FACETS_TO_GENERIC = {'lcn.em': 'minor_cnv', 'tcn.em': 'total_cnv', 'cf.em': 'ccf'}

# Columns kept for each sample; all but 'patient' receive a _pry/_rel suffix.
_SIDE_COLUMNS = ['chrom', 'start', 'end', 'minor_cnv', 'total_cnv', 'ccf',
                 'patient', 'alteration']

# Layout of the two "private" tables.
_PRIVATE_COLUMNS = ['chrom', 'start', 'end', 'minor_cnv', 'total_cnv', 'ccf',
                    'alteration', 'subset', 'patient']

# Layout of the "shared" table.
_SHARED_COLUMNS = ['chrom_pry', 'start_pry', 'end_pry', 'chrom_rel', 'start_rel',
                   'end_rel', 'minor_cnv_pry', 'total_cnv_pry', 'ccf_pry', 'patient',
                   'alteration_pry', 'minor_cnv_rel', 'total_cnv_rel', 'ccf_rel',
                   'alteration_rel', 'subset']


def _annotate_side(df, suffix):
    """Label the alterations of one sample and suffix its per-sample columns."""
    df = df.apply(alteration, axis=1).rename(columns=_FACETS_TO_GENERIC)
    # Selecting and casting returns new frames, so the caller's data (and any
    # slice it was taken from) is never written to.
    df = df[_SIDE_COLUMNS].astype({'chrom': str, 'start': int, 'end': int})
    return df.rename(columns={name: name + suffix
                              for name in _SIDE_COLUMNS if name != 'patient'})


def _region_label(chrom, start, end):
    """Build 'chrom:start-end' strings from three aligned columns."""
    return chrom.astype(str) + ':' + start.astype(str) + '-' + end.astype(str)


def _to_private(df, suffix, subset):
    """Strip `suffix` from the per-sample columns and tag the rows with `subset`."""
    generic = {name + suffix: name for name in _SIDE_COLUMNS if name != 'patient'}
    return df.rename(columns=generic).assign(subset=subset)[_PRIVATE_COLUMNS]


def merge_bedtools_shared(df_pry, df_rel):
    """Split the primary and relapse segments into shared and private sets.

    Both inputs need the columns ``chrom``, ``start``, ``end``, ``lcn.em``,
    ``tcn.em``, ``cf.em`` and ``patient``. Each row is first labelled with
    `alteration`; the primary segments are then intersected with the relapse
    segments through ``BedTool.intersect(..., loj=True)``.

    * An overlapping primary/relapse pair whose alteration labels are equal is
      reported as *shared* (one row per overlapping pair).
    * A pair whose labels differ is reported once in each private table.
    * Segments that overlap nothing on the other side are private.

    Returns
    -------
    (merged_sh, df_pp, df_pr)
        ``merged_sh`` holds the shared pairs with ``*_pry`` / ``*_rel``
        columns and ``subset == 'shared'``; ``df_pp`` and ``df_pr`` hold the
        private primary and private relapse segments with ``subset`` set to
        ``'private primary'`` / ``'private relapse'``. ``merged_sh`` keeps its
        full set of columns even when it has no rows.
    """
    df_pry = _annotate_side(df_pry, '_pry')
    df_rel = _annotate_side(df_rel, '_rel')

    pry = pybedtools.BedTool.from_dataframe(df_pry[['chrom_pry', 'start_pry', 'end_pry']])
    rel = pybedtools.BedTool.from_dataframe(df_rel[['chrom_rel', 'start_rel', 'end_rel']])
    result = pry.intersect(rel, loj=True)

    merged = pd.read_table(result.fn, names=['chrom_pry', 'start_pry', 'end_pry',
                                             'chrom_rel', 'start_rel', 'end_rel'])
    merged = merged.astype({'chrom_pry': str, 'start_pry': int, 'end_pry': int,
                            'chrom_rel': str, 'start_rel': int, 'end_rel': int})

    # Bring back the per-segment annotation; 'patient' comes from the primary
    # side and is then also required to match on the relapse side.
    merged = merged.merge(df_pry, how='left', on=['chrom_pry', 'start_pry', 'end_pry'])
    merged = merged.merge(df_rel, how='left', on=['chrom_rel', 'start_rel', 'end_rel', 'patient'])

    merged['pry_alt'] = _region_label(merged['chrom_pry'], merged['start_pry'], merged['end_pry'])
    merged['rel_alt'] = _region_label(merged['chrom_rel'], merged['start_rel'], merged['end_rel'])
    merged['subset'] = 'shared'

    # The left outer join keeps primary segments without any relapse overlap,
    # filling the relapse fields with '.' / -1; only real overlaps stay here.
    has_overlap = ((merged['chrom_rel'] != '.')
                   & (merged['start_rel'] != -1)
                   & (merged['end_rel'] != -1))
    merged = merged[has_overlap]

    # Segments that never appear in an overlapping pair are private.
    pry_keys = _region_label(df_pry['chrom_pry'], df_pry['start_pry'], df_pry['end_pry'])
    rel_keys = _region_label(df_rel['chrom_rel'], df_rel['start_rel'], df_rel['end_rel'])
    df_pp = _to_private(df_pry[~pry_keys.isin(merged['pry_alt'])], '_pry', 'private primary')
    df_pr = _to_private(df_rel[~rel_keys.isin(merged['rel_alt'])], '_rel', 'private relapse')

    # Overlapping pairs with different labels count as private on both sides.
    # `!=` is used on purpose: a missing label never equals anything, so pairs
    # with an unlabelled segment end up here as well.
    # NOTE(review): a segment that overlaps several discordant partners is
    # added once per pair, exactly as before - confirm whether that is wanted.
    differs = merged['alteration_pry'] != merged['alteration_rel']
    discordant = merged[differs]
    if not discordant.empty:
        df_pp = pd.concat([df_pp, _to_private(discordant, '_pry', 'private primary')],
                          ignore_index=True, sort=False)
        df_pr = pd.concat([df_pr, _to_private(discordant, '_rel', 'private relapse')],
                          ignore_index=True, sort=False)

    # Selecting by column list on the filtered frame keeps the columns even
    # when no pair is concordant, so callers always get the same layout.
    merged_sh = merged.loc[~differs, _SHARED_COLUMNS].reset_index(drop=True)
    return merged_sh, df_pp, df_pr

def merge_bedtools_private(df, ttype):
    """Format segments that are present in only one sample.

    Each row is labelled with `alteration`, the FACETS columns ``lcn.em``,
    ``tcn.em`` and ``cf.em`` are renamed to ``minor_cnv``, ``total_cnv`` and
    ``ccf``, and every row gets ``subset`` set to ``ttype`` (for example
    ``'private primary'`` or ``'private relapse'``).

    Returns a dataframe with the columns ``chrom, start, end, minor_cnv,
    total_cnv, ccf, patient, alteration, subset``.
    """
    facets_to_generic = {'lcn.em': 'minor_cnv', 'tcn.em': 'total_cnv', 'cf.em': 'ccf'}
    output_columns = ['chrom', 'start', 'end', 'minor_cnv', 'total_cnv', 'ccf',
                      'patient', 'alteration', 'subset']

    labelled = df.apply(alteration, axis=1).rename(columns=facets_to_generic)
    labelled['subset'] = ttype
    return labelled[output_columns]

In [None]:
# clinical: dataframe with the clinical data. Read it from the Excel table
# "Table S1" of Additional file 2 of the paper, then pass it through
# stage_mapping to obtain the dataframe used below.

## path to the Excel file (Additional file 2 of the paper); fill in before running
clinical_path = ""

clinical = pd.read_excel(clinical_path, sheet_name="Table S1", skiprows=[0,1])
clinical = stage_mapping(clinical)

In [None]:
# Preview the first rows of the clinical table.
clinical.head()

In [None]:
ttype = 'both'

## path to FACETS run results
dire_in = ""

# Chromosome labels compared against the 'chrom' column: "1" ... "22" and "X".
chroms = [str(number) for number in range(1, 23)] + ["X"]

In [None]:
facets_results = process_cnv(dire_in, clinical)

# Drop the segments with two total copies and one minor copy; only segments
# with some other copy-number state are kept as CNVs.
two_copies_one_minor = (facets_results['tcn.em'] == 2) & (facets_results['lcn.em'] == 1)
facets_cnv = facets_results[~two_copies_one_minor]
# Per-chromosome counts; the result is not stored and, not being the last
# expression of the cell, is not displayed either.
facets_cnv.groupby('chrom').count()

# Split the remaining segments by sample stage.
stage = facets_cnv['stage']
facets_cnv_pry = facets_cnv[stage == 'primary']
facets_cnv_rel = facets_cnv[stage == 'relapse']

In [None]:
# Preview the CNV segments of the primary samples.
facets_cnv_pry.head()

In [None]:
# Preview the CNV segments of the relapse samples.
facets_cnv_rel.head()

In [None]:
# Compare primary and relapse CNV segments patient by patient and chromosome
# by chromosome, collecting shared, private-primary and private-relapse sets.

# Patients left out of the comparison. The previous test,
# `(g != 'PAT3') | (g != 'PAT4')`, was true for every patient and therefore
# skipped nobody; it is written here as the exclusion it appears to intend.
# Use an empty tuple to process every patient.
excluded_patients = ('PAT3', 'PAT4')

grps = facets_cnv_pry.groupby("patient")

# Per-chromosome results are collected in lists and concatenated once at the
# end: repeated DataFrame.append copies the whole table on every call and is
# no longer available in pandas 2.
shared_parts = []
private_pry_parts = []
private_rel_parts = []

# NOTE(review): only patients with at least one primary CNV segment are
# visited, so a patient with relapse segments only contributes nothing to
# the private-relapse table - confirm this is intended.
for g in grps.groups:
    if g in excluded_patients:
        continue
    print(g)
    df_pat_pry = grps.get_group(g)
    df_pat_rel = facets_cnv_rel[facets_cnv_rel['patient'] == g]
    for chr_ in chroms:
        # NOTE(review): assumes the 'chrom' column holds strings such as "1"
        # or "X"; with numeric chromosomes these filters never match.
        df_chr_pry = df_pat_pry[df_pat_pry['chrom'] == chr_]
        df_chr_rel = df_pat_rel[df_pat_rel['chrom'] == chr_]
        has_pry = not df_chr_pry.empty
        has_rel = not df_chr_rel.empty

        if has_pry and has_rel:
            df_chr_sh, df_chr_pp, df_chr_pr = merge_bedtools_shared(df_chr_pry, df_chr_rel)
            shared_parts.append(df_chr_sh)
            private_pry_parts.append(df_chr_pp)
            private_rel_parts.append(df_chr_pr)
        elif has_pry:
            private_pry_parts.append(merge_bedtools_private(df_chr_pry, 'private primary'))
        elif has_rel:
            private_rel_parts.append(merge_bedtools_private(df_chr_rel, 'private relapse'))
        else:
            print('both empty')


def _stack(parts):
    """Concatenate the collected tables; an empty list gives an empty dataframe."""
    if not parts:
        return pd.DataFrame()
    return pd.concat(parts, ignore_index=True, sort=False)


cnv_shared = _stack(shared_parts)
cnv_private_pry = _stack(private_pry_parts)
cnv_private_rel = _stack(private_rel_parts)

In [None]:
# Display the shared CNV pairs (one row per overlapping primary/relapse pair).
cnv_shared

In [None]:
# Display the CNV segments private to the primary samples.
cnv_private_pry

In [None]:
# Display the CNV segments private to the relapse samples.
cnv_private_rel

In [None]:
# Write the three tables as tab-separated files into a directory with the
# processed results, here <dire_in>/cnv_results_processed/.
out_dir = os.path.join(dire_in, "cnv_results_processed")
# Create the directory on first use instead of failing on the first write.
os.makedirs(out_dir, exist_ok=True)

cnv_private_pry.to_csv(os.path.join(out_dir, "private_primary_cnv.csv"),
                       sep='\t', index=False)

cnv_private_rel.to_csv(os.path.join(out_dir, "private_relapse_cnv.csv"),
                       sep='\t', index=False)

# check the overlapping segments to determine the truly shared variants
cnv_shared.to_csv(os.path.join(out_dir, "shared_cnv.csv"),
                  sep='\t', index=False)