In [1]:
import pandas as pd
import numpy as np

## Generate simulation pmats

In [78]:
def generate_gt_pmat(n_rows, clones, samples, n_mutations, high_prob, low_prob,
                     matrix_path='../../data/simulated_matrix_output_test/gt.csv',
                     random_state=None):
    """Build a ground-truth probability matrix and optionally write it to disk.

    Rows are split into consecutive groups of ``n_mutations`` rows. Each clone
    is paired with one distinct, randomly drawn row group: every cell of that
    clone gets ``high_prob`` in the rows of its group and ``low_prob`` in all
    other rows. A leading ``bulk`` column filled with 0.0001 is then inserted.

    Parameters
    ----------
    n_rows : int
        Total number of rows in the matrix.
    clones : int
        Number of clones; columns are named ``c<clone>_s<sample>``.
    samples : int
        Number of cells per clone.
    n_mutations : int
        Number of consecutive rows forming one row group (must be >= 1).
    high_prob, low_prob : float
        Values written where a clone's group is present / absent.
    matrix_path : str or None, optional
        Where the matrix is written (space-separated, no header, no index).
        ``None`` skips writing. The default is the path used originally.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or generator for the group draw. ``None`` keeps using numpy's
        global random state.

    Returns
    -------
    pandas.DataFrame
        ``n_rows`` x ``1 + clones * samples`` frame, ``bulk`` column first.

    Raises
    ------
    ValueError
        If ``n_mutations`` is below 1 or there are fewer row groups than clones.
    """
    if n_mutations < 1:
        raise ValueError(f'n_mutations must be >= 1, got {n_mutations}')

    row_groups = np.arange(n_rows) // n_mutations
    group_ids = np.unique(row_groups)
    if clones > len(group_ids):
        raise ValueError(
            f'{clones} clones need {clones} distinct row groups, but n_rows={n_rows} '
            f'and n_mutations={n_mutations} only give {len(group_ids)}'
        )

    clone_names = [f'c{k + 1}' for k in range(clones)]
    sample_names = [f's{k + 1}' for k in range(samples)]
    cell_names = [f'{clone}_{sample}' for clone in clone_names for sample in sample_names]

    # dtype=float keeps the frame float-typed even when low_prob is an int.
    gt = pd.DataFrame(low_prob, index=pd.RangeIndex(n_rows), columns=cell_names, dtype=float)

    if random_state is None:
        rng = np.random  # global state, exactly as before
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # One distinct row group per clone, drawn without replacement. Clones are
    # walked in lexicographic name order, which is the order the previous
    # MultiIndex-based implementation used, so seeded results are unchanged.
    drawn_groups = rng.choice(group_ids, clones, replace=False)
    for clone, group in zip(sorted(clone_names), drawn_groups):
        clone_cells = [f'{clone}_{sample}' for sample in sample_names]
        gt.loc[row_groups == group, clone_cells] = high_prob

    gt.insert(loc=0, column='bulk', value=0.0001)

    if matrix_path is not None:
        gt.to_csv(matrix_path, index=False, sep=' ', header=False)

    return gt

    
def add_noise(cells_w_noise, gt, low_prob=0.0001, high_prob=0.9,
              output_dir='../../data/simulated_matrix_output_test',
              random_state=None):
    """Write noisy variants of ``gt``, one file per entry of ``cells_w_noise``.

    For each count ``n`` an extra row (labelled ``noise1`` internally) is
    appended to the matrix. It holds ``high_prob`` for ``n`` randomly chosen
    cells and ``low_prob`` for the rest; the ``bulk`` column is 0.0001 in every
    row. The chosen cell names are printed and the matrix is written to
    ``<output_dir>/gt_w_<n>_noisy_samples.csv`` (space-separated, no header,
    no index). ``gt`` itself is not modified.

    Parameters
    ----------
    cells_w_noise : iterable of int
        Numbers of noisy cells to simulate.
    gt : pandas.DataFrame
        Matrix with a ``bulk`` column plus one column per cell.
    low_prob, high_prob : float, optional
        Values of the noise row for unaffected / noisy cells. They used to be
        a notebook global and a hard-coded 0.9 respectively.
    output_dir : str, optional
        Directory receiving the files; defaults to the original location.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or generator shared by all draws. ``None`` keeps using numpy's
        global random state.

    Raises
    ------
    ValueError
        If any requested count is negative or exceeds the number of cells.
        This is checked before any file is written.
    """
    cells = gt.drop('bulk', axis=1)
    n_cells = cells.shape[1]

    noise_levels = list(cells_w_noise)
    invalid = [n for n in noise_levels if not 0 <= n <= n_cells]
    if invalid:
        raise ValueError(
            f'cannot mark {invalid} cells as noisy: gt only has {n_cells} cells'
        )

    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        # One generator for the whole loop so successive draws are independent.
        rng = np.random.RandomState(random_state)

    for n in noise_levels:
        # Transposed view: one row per cell, so cells can be sampled as rows.
        per_cell = cells.T.copy()
        per_cell['noise1'] = low_prob
        noisy_cells = per_cell.sample(n, random_state=rng).index
        per_cell.loc[noisy_cells, 'noise1'] = high_prob

        gt_w_noise = per_cell.T
        gt_w_noise.insert(loc=0, column='bulk', value=0.0001)

        print(noisy_cells.tolist())  # which cells received noise, for checking
        matrix_path = f'{output_dir}/gt_w_{n}_noisy_samples.csv'
        gt_w_noise.to_csv(matrix_path, index=False, sep=' ', header=False)


        
def generate_all_pmats(n_rows, clones, samples, n_mutations, high_prob, low_prob, cells_w_noise):
    """Create the ground-truth matrix, then derive its noisy variants.

    The first six arguments are forwarded unchanged to ``generate_gt_pmat``.
    The matrix it returns is passed, together with ``cells_w_noise``, to
    ``add_noise``. Nothing is returned.
    """
    add_noise(
        cells_w_noise,
        generate_gt_pmat(n_rows, clones, samples, n_mutations, high_prob, low_prob),
    )
    

In [79]:
# Simulation settings shared by the cells below.
clones = 6        # number of clones
samples = 6       # number of cells per clone
n_mutations = 1   # number of mutations per clone
n_rows = 6        # total number of mutations (rows) in the pmat
high_prob = 0.9   # probability for presence of a mutation
low_prob = 1e-4   # probability for absence of a mutation

# Noisy-cell counts to simulate: 2, 4, 8, 16, 32.
cells_w_noise = [2 ** exponent for exponent in range(1, 6)]

In [80]:
# Build gt.csv and one noisy matrix per entry of cells_w_noise.
generate_all_pmats(
    n_rows=n_rows,
    clones=clones,
    samples=samples,
    n_mutations=n_mutations,
    high_prob=high_prob,
    low_prob=low_prob,
    cells_w_noise=cells_w_noise,
)

['c2_s2', 'c1_s2']
['c2_s4', 'c5_s1', 'c6_s2', 'c5_s5']
['c4_s6', 'c1_s4', 'c2_s3', 'c4_s4', 'c6_s1', 'c6_s6', 'c5_s6', 'c5_s2']
['c1_s3', 'c1_s5', 'c6_s2', 'c2_s3', 'c6_s3', 'c2_s4', 'c1_s2', 'c5_s1', 'c3_s4', 'c2_s6', 'c2_s5', 'c5_s6', 'c4_s2', 'c3_s5', 'c3_s6', 'c1_s6']
['c4_s3', 'c5_s5', 'c6_s5', 'c4_s5', 'c3_s6', 'c3_s2', 'c3_s5', 'c5_s3', 'c2_s5', 'c5_s2', 'c1_s2', 'c3_s1', 'c2_s2', 'c4_s1', 'c6_s2', 'c2_s1', 'c6_s1', 'c2_s3', 'c4_s6', 'c2_s6', 'c4_s2', 'c1_s4', 'c6_s3', 'c1_s6', 'c5_s4', 'c3_s4', 'c6_s6', 'c5_s1', 'c1_s3', 'c2_s4', 'c3_s3', 'c4_s4']


## Remove clone-specific mutations

In [21]:
# Drop rows 1-4 of gt one after another: rm_k is gt without rows 1..k.
# DataFrame.drop returns a new frame, so gt itself stays untouched.
rm_1 = gt.drop([1])
rm_2 = rm_1.drop([2])
rm_3 = rm_2.drop([3])
rm_4 = rm_3.drop([4])

# Write each reduced matrix to <k>.csv (space-separated, no header, no index).
for n_removed, reduced in enumerate((rm_1, rm_2, rm_3, rm_4), start=1):
    matrix_path = f'../../data/simulated_matrix_output/{n_removed}.csv'
    reduced.to_csv(matrix_path, index=False, sep=' ', header=False)

In [92]:
# Display the ground-truth matrix held in `gt`.
# NOTE(review): no cell visible here assigns `gt` at notebook level (the
# generator functions only use it as a local) - confirm this survives
# Restart & Run All.
gt

Unnamed: 0,bulk,c1_s1,c1_s2,c1_s3,c1_s4,c1_s5,c1_s6,c2_s1,c2_s2,c2_s3,...,c5_s3,c5_s4,c5_s5,c5_s6,c6_s1,c6_s2,c6_s3,c6_s4,c6_s5,c6_s6
0,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,...,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001
1,0.0001,0.9,0.9,0.9,0.9,0.9,0.9,0.0001,0.0001,0.0001,...,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001
2,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,...,0.9,0.9,0.9,0.9,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001
3,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,...,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001
4,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.9,0.9,0.9,...,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001
5,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,...,0.0001,0.0001,0.0001,0.0001,0.9,0.9,0.9,0.9,0.9,0.9


In [99]:
cells_w_noise = [30]

for n in cells_w_noise:
    # One row per cell, one column per mutation; 'bulk' is re-added at the end.
    gt_w_noise = gt.drop('bulk', axis=1).T
    gt_w_noise['noisee'] = low_prob  # extra position, low probability by default
    print(gt_w_noise)

    # Pick n cells and raise their value in the extra position. The previous
    # `noisy_samples.noise = 0.9` only set a Python attribute on the sampled
    # frame (no column is called 'noise'), so the following update() never
    # changed anything. Writing through .loc targets the column that exists.
    noisy_samples = gt_w_noise.sample(n)
    gt_w_noise.loc[noisy_samples.index, 'noisee'] = high_prob

    gt_w_noise = gt_w_noise.T
    gt_w_noise.insert(loc=0, column='bulk', value=0.0001)

            0       1       2       3       4       5  noisee
c1_s1  0.0001  0.9000  0.0001  0.0001  0.0001  0.0001  0.0001
c1_s2  0.0001  0.9000  0.0001  0.0001  0.0001  0.0001  0.0001
c1_s3  0.0001  0.9000  0.0001  0.0001  0.0001  0.0001  0.0001
c1_s4  0.0001  0.9000  0.0001  0.0001  0.0001  0.0001  0.0001
c1_s5  0.0001  0.9000  0.0001  0.0001  0.0001  0.0001  0.0001
c1_s6  0.0001  0.9000  0.0001  0.0001  0.0001  0.0001  0.0001
c2_s1  0.0001  0.0001  0.0001  0.0001  0.9000  0.0001  0.0001
c2_s2  0.0001  0.0001  0.0001  0.0001  0.9000  0.0001  0.0001
c2_s3  0.0001  0.0001  0.0001  0.0001  0.9000  0.0001  0.0001
c2_s4  0.0001  0.0001  0.0001  0.0001  0.9000  0.0001  0.0001
c2_s5  0.0001  0.0001  0.0001  0.0001  0.9000  0.0001  0.0001
c2_s6  0.0001  0.0001  0.0001  0.0001  0.9000  0.0001  0.0001
c3_s1  0.9000  0.0001  0.0001  0.0001  0.0001  0.0001  0.0001
c3_s2  0.9000  0.0001  0.0001  0.0001  0.0001  0.0001  0.0001
c3_s3  0.9000  0.0001  0.0001  0.0001  0.0001  0.0001  0.0001
c3_s4  0