In [None]:
import numpy as np
import scipy.stats
import dalmatian
import pandas as pd
import matplotlib.pyplot as plt
from calc_ccf import calc_ccf
import collections
import pdb

In [None]:
# Open the workspace through dalmatian and fetch its pairs table; the later
# cells look up per-pair file paths and purity values in `pairs`.
WORKSPACE_NAME = 'broad-firecloud-ibmwatson/Wu_Getzlab_IBM_Richters_Clean_NoNormal'

wm = dalmatian.WorkspaceManager(WORKSPACE_NAME)
pairs = wm.get_pairs()

In [None]:
# IDs of the pairs to process; each one is used as a row key into the
# workspace pairs table when the maf and CN profile are loaded.
pair_list = [
    'F-014-CLL-01-Pair',
    'F-014-CLL-03-Pair',
    'F-014-RS-01-Pair',
]

In [None]:
# For every pair: load its annotated maf and CN profile, recompute the
# distribution stored in the maf's trailing columns as a CP distribution
# (via calc_ccf(..., cp=True)), drop mutations that cannot be processed,
# and write the result to the pair's *_CP maf path.
#
# maintain a list of the HUGO symbols that do not fall into a CN seg and check against known driver list
# so we don't lose drivers to the process

# Number of trailing maf columns that hold the per-mutation distribution.
N_CP_BINS = 101
# Mutations whose CP distribution sums to this value or less are treated as incomplete.
MIN_CP_MASS = 0.9


def _segment_copy_numbers(chr_segs, position):
    """Return (minor_cn, major_cn) of the segment containing `position`.

    `chr_segs` must already be restricted to the mutation's chromosome and
    expose `start`, `end`, `minor_cn` and `major_cn` columns. If several
    segments contain the position, the last one in frame order wins. Returns
    (None, None) when no segment contains the position.
    """
    # NOTE(review): bounds are strict, so a mutation sitting exactly on a
    # segment start/end is reported as falling in no segment — confirm this
    # matches the coordinate convention of the CN profile.
    hits = chr_segs[(chr_segs.start < position) & (chr_segs.end > position)]
    if hits.empty:
        return None, None
    return hits.minor_cn.iloc[-1], hits.major_cn.iloc[-1]


hugo_symbols = []

for pair_id in pair_list:

    maf_df = pd.read_csv(pairs.unmatched_absolute_annotated_maf_capture[pair_id], sep='\t')

    cn_segs = pd.read_csv(pairs.CN_Profile_ccf[pair_id], sep='\t')

    purity = float(pairs.unmatched_absolute_called_purity[pair_id])

    # The distribution occupies the last N_CP_BINS columns of the maf; resolve
    # them once, before any rows or columns are touched.
    cp_cols = maf_df.columns[-N_CP_BINS:]

    muts_no_segs = []

    # for every mutation in the maf, calculate the CP distribution and replace the CCF distribution with the CP
    # also add to a list all mutations that do not fall into a predetermined CN segment
    for mut in maf_df.index:
        chr_segs = cn_segs[cn_segs.chromosome == maf_df.Chromosome[mut]]
        minor_cn, major_cn = _segment_copy_numbers(chr_segs, maf_df.Start_position[mut])

        # pd.isna covers both "no segment found" (None) and NaN copy numbers.
        if pd.isna(minor_cn) or pd.isna(major_cn):
            print(pair_id + '\'s mutation ' + maf_df.Hugo_Symbol[mut] + ' did not fall in a predetermined segment of the profile.')
            hugo_symbols.append(maf_df.Hugo_Symbol[mut])
            muts_no_segs.append(mut)
            continue

        try:
            new_cp_dstn = calc_ccf(minor_cn, major_cn, maf_df.alt[mut], maf_df.ref[mut], purity, cp=True)
            maf_df.loc[mut, cp_cols] = np.asarray(new_cp_dstn).ravel()
        except ValueError as e:
            # Fail loudly with context instead of dropping into a debugger and
            # then writing a stale distribution from a previous mutation.
            raise ValueError(
                'Could not compute the CP distribution for {} mutation {} (row {}): {}'.format(
                    pair_id, maf_df.Hugo_Symbol[mut], mut, e
                )
            ) from e

    maf_df = maf_df.drop(muts_no_segs).infer_objects().reset_index(drop=True)

    # drop all mutations with incomplete CP distribution (this happens for some low coverage muts)
    has_complete_cp = maf_df[cp_cols].sum(axis=1) > MIN_CP_MASS
    maf_df = maf_df[has_complete_cp].reset_index(drop=True)

    maf_df.to_csv(pairs.unmatched_absolute_annotated_maf_capture_CP[pair_id], sep='\t', index=False)

In [None]:
# Show the distinct HUGO symbols of mutations that fell outside every CN segment.
unplaced_symbols = set(hugo_symbols)
unplaced_symbols