In [3]:
from Bio import SeqIO
from glob import glob
import os

In [4]:
import numpy
numpy.__version__

'1.22.3'

In [5]:
from mito.genotyping import *

## Load reference

In [6]:
# Path to the reference genome FASTA file
reference_path = '../../data/reference/mito_GRCh38_gimlet.fasta'

# Parse from the handle opened here so the file is closed as soon as the
# `with` block exits. (Before, the handle was opened but never used, while
# SeqIO.parse opened the path a second time on its own.)
with open(reference_path, 'r') as reference_handle:
    # Only the first record of the FASTA file is used.
    reference_seq = next(SeqIO.parse(reference_handle, "fasta"))

# One base per row, default 0-based integer index
reference = pd.Series(list(reference_seq.seq))

In [7]:
reference

0        G
1        A
2        T
3        C
4        A
        ..
16564    C
16565    G
16566    A
16567    T
16568    G
Length: 16569, dtype: object

## Load read matrices

In [8]:
meta = pd.read_csv('../../data/Joanna_AC-files/a1_06_meta_data.csv')

In [36]:
meta

Unnamed: 0,cell_id,clone_id
0,A1_06_D14_p4_H2,0
1,A1_06_D14_p4_D4,0
2,A1_06_D14_p6_C18,0
3,A1_06_D14_p2_E17,0
4,A1_06_D14_p4_P5,0
...,...,...
3551,A1_06_D180_p1_D13,1118
3552,A1_06_D180_p3_P11,1119
3553,A1_06_D14_p5_P23,1120
3554,A1_06_D180_p1_P17,1121


In [31]:
# Ids of the clones that contain at least 10 cells (counted as non-null
# 'cell_id' entries per clone).
clone_sizes_n_10 = (
    meta.groupby('clone_id')['cell_id']
    .count()
    .loc[lambda clone_size: clone_size >= 10]
    .index
)

In [32]:
clone_sizes_n_10

Int64Index([  0,   1,   3,   5,   7,   9,  13,  16,  19,  22,  28,  30,  31,
             36,  39,  40,  41,  47,  48,  50,  53,  55,  57,  60,  61,  64,
             65,  68,  69,  70,  74,  80,  85,  94, 100, 106, 107, 113, 122,
            130, 137, 142, 151, 154, 155, 159, 161, 171, 177, 181, 187, 192,
            199, 204, 212, 247, 255, 327, 330, 345, 360, 362, 397, 480, 555,
            584],
           dtype='int64', name='clone_id')

In [46]:
# Keep only the cells whose clone is listed in clone_sizes_n_10 and index the
# table by cell id.
# NOTE: this rebinds `meta` and moves 'cell_id' from a column into the index,
# so running this cell a second time without reloading `meta` raises a KeyError.
meta = meta.loc[meta['clone_id'].isin(clone_sizes_n_10)].set_index('cell_id')

In [48]:
meta

Unnamed: 0_level_0,clone_id
cell_id,Unnamed: 1_level_1
A1_06_D14_p4_H2,0
A1_06_D14_p4_D4,0
A1_06_D14_p6_C18,0
A1_06_D14_p2_E17,0
A1_06_D14_p4_P5,0
...,...
A1_06_D14_p1_I4,584
A1_06_D14_p1_N14,584
A1_06_D14_p6_A22,584
A1_06_D14_p1_N15,584


In [49]:
# Directory holding the per-cell allele count files (*.txt) loaded below
INPUT_PATH = '../../data/Joanna_AC-files/A1_06/'


# Parameters
# The commented-out lines are an earlier input layout and an alternative
# error rate, kept for reference.
#DATA_NAME = 'A1_06'
#BASE_PATH = f'../../../data/20210712_{DATA_NAME}'
#INPUT_PATH = f'{BASE_PATH}/cells/'
#e_rate = 0.0778
# Error rate; presumably the read error rate fed to the genotyping model - TODO confirm
e_rate = 0.0556
# Mutation probability of 1 in 500; presumably the prior - TODO confirm.
# NOTE(review): cell_prob() uses its own p_mutation of the same value and does
# not read this variable.
p_mutation = 1 / 500.0

In [50]:
cell_count = []
cell_name = []
# 1-based positions that every count table is aligned to.
# NOTE(review): 16571 is two more than the 16569-base reference loaded above;
# cell_prob() later trims the extra rows - confirm the padding is intended.
locations = range(1, 16571+1)

# Get all cell counts files in the input path
cell_count_filenames = list(glob(os.path.join(INPUT_PATH, '*.txt')))

for filename in sorted(cell_count_filenames):
    # Cell id is the file name up to its first '.'
    name = os.path.basename(filename).split('.')[0]

    # Skip files whose cell is not in the (filtered) metadata table.
    # An explicit membership test replaces the former bare `except:`, which
    # also swallowed unrelated errors such as KeyboardInterrupt.
    if name not in meta.index:
        continue

    # Load one cell count file
    counts = pd.read_csv(filename, sep='\t')
    print(f'Reading {filename}, {counts.shape[0]} locations')
    # Some files are missing some locations; reindexing inserts them as
    # all-NaN rows so that every table has the same rows in the same order.
    counts = counts.set_index('POS').reindex(locations).reset_index()
    # Plain Python attribute on the frame; pandas does not carry it over to
    # objects derived from `counts` (slices, copies, ...).
    counts.name = name

    cell_count.append(counts)
    cell_name.append(name)

Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_A1.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_A14.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_A16.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_A21.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_A22.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_A23.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_A4.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_A5.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_A9.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_B1.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_B16.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_B20.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p1_B22

Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_A4.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_A5.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_A6.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_A7.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_B10.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_B12.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_B14.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_B15.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_B16.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_B22.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_B5.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_B9.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p2_C1.t

Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_A14.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_A24.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_A3.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_A6.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_A9.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_B12.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_B13.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_B15.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_B16.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_B2.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_B20.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_B21.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p3_B2

Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_C6.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_C9.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_D1.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_D11.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_D12.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_D13.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_D15.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_D16.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_D17.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_D18.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_D19.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_D20.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p4_D

Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_F19.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_F22.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_F24.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_F3.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_F6.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_F7.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_F8.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_F9.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_G1.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_G13.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_G16.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_G22.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p5_G3.t

Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_G10.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_G15.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_G17.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_G19.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_G2.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_G22.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_G23.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_G3.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_G7.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_G8.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_H1.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_H12.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D14_p6_H14

Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p1_J17.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p1_J18.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p1_J23.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p1_J24.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p1_J3.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p1_J4.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p1_J9.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p1_K12.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p1_K18.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p1_K21.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p1_K22.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p1_K24.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1

Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p2_L8.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p2_M1.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p2_M11.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p2_M12.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p2_M14.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p2_M15.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p2_M20.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p2_M21.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p2_M24.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p2_M3.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p2_M6.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p2_M9.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_0

Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p3_N16.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p3_N2.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p3_N21.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p3_N24.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p3_O13.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p3_O14.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p3_O19.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p3_O20.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p3_O21.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p3_P12.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p3_P17.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p3_P19.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/

Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p4_K15.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p4_K18.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p4_K19.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p4_K2.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p4_K20.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p4_K22.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p4_K23.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p4_K24.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p4_K3.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p4_K6.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p4_L10.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p4_L12.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1

Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p5_F23.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p5_F24.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p5_F3.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p5_F4.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p5_G1.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p5_G12.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p5_G15.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p5_G16.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p5_G2.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p5_G20.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p5_G21.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_06_D180_p5_G22.txt, 16569 locations
Reading ../../data/Joanna_AC-files/A1_06/A1_

In [52]:
len(cell_name)

1471

In [50]:
def remove_low_cov_pos(cell_count, threshold=100):
    """Return the 1-based positions whose mean value across cells is below ``threshold``.

    Parameters
    ----------
    cell_count : list of pandas.DataFrame
        Per-cell count tables that share the same 0-based integer row index.
        Each table must contain the columns '#CHR', 'POS', 'Count_A',
        'Count_C', 'Count_G' and 'Count_T'; these are dropped and every
        remaining column is averaged (presumably one depth/coverage column
        per cell - TODO confirm the column name against the input files).
    threshold : float, optional
        Rows with a mean strictly below this value are reported.
        Defaults to 100, the value that used to be hard-coded.

    Returns
    -------
    list of int
        Row index + 1 for every row below the threshold, in row order.

    Notes
    -----
    The first remaining column is discarded before averaging; the original
    code labelled it the "bulk" column.
    NOTE(review): this assumes the first table in ``cell_count`` is the bulk
    sample - verify against the loading loop, which orders tables by file name.
    Rows whose mean is NaN (no data in any cell) are never reported because
    NaN < threshold is False.
    """
    # Side-by-side concat of all tables, then keep only the non-count columns
    depth = pd.concat(cell_count, axis=1).drop(
        columns=['#CHR', 'POS', 'Count_A', 'Count_C', 'Count_G', 'Count_T']
    )

    # Drop the first ("bulk") column
    depth = depth.iloc[:, 1:]

    # Mean across the remaining columns for every row
    mean_depth = depth.mean(axis=1)

    # Rows are 0-based, positions are 1-based
    is_low = (mean_depth < threshold).to_numpy()
    return (mean_depth.index[is_low] + 1).tolist()

In [51]:
# Error rates to evaluate. Only a single rate is active at the moment.


# Alternative single rate that was tried before:
#e_rates = [0.0778]
e_rates = [0.0556]

# Earlier sweep over a grid of rates, left disabled.
# NOTE(review): it refers to `e_rates1`, which is not defined in the visible cells.
#e_rates = np.linspace(0.0001, 0.0009, 10).round(4)
#extra_error_rates = [0.001, 0.002, 0.005]
#extra_error_rates_ar = np.array(extra_error_rates)
#all_e_rates = np.concatenate((e_rates1, extra_error_rates_ar))

all_e_rates = e_rates
all_e_rates

[0.0556]

In [52]:
#np.linspace(0.0001, 0.22, 10).round(4)

## Compute mutation probabilities

In [53]:
def cell_prob(e_rate, p_mutation=1 / 500.0):
    """Build the position-by-cell mutation probability matrix.

    Reads the module-level ``cell_count`` (list of per-cell count tables) and
    ``reference`` (reference bases, one per position).

    Parameters
    ----------
    e_rate : float
        Passed to ``nucleotide_mutation_prob`` as both
        ``error_rate_when_no_mutation`` and ``error_rate_when_mutation``.
    p_mutation : float, optional
        Passed to ``nucleotide_mutation_prob`` as ``p_mutation``.
        Defaults to 1/500, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The '#CHR' and 'POS' columns of the first cell's result, followed by
        one column per cell (labelled 0..n_cells-1, in ``cell_count`` order)
        holding that cell's 'Prob_mutation' values.
    """
    n_positions = reference.shape[0]

    # Per-cell nucleotide probabilities; each count table is first cut to the
    # reference length to discard trailing positions.
    nucleotide_probs = [
        nucleotide_mutation_prob(
            cell_counts=counts.iloc[:n_positions],
            reference=reference,
            error_rate_when_no_mutation=e_rate,
            error_rate_when_mutation=e_rate,
            p_mutation=p_mutation,
        )
        for counts in cell_count
    ]

    # Compute P(mutation | read counts)
    # (distinct loop names: the old loop variable rebound the list being
    # iterated and shadowed this function's own name)
    cells_p_mutation = [
        mutation_prob(nucleotide_prob, reference)
        for nucleotide_prob in nucleotide_probs
    ]

    # Make mutation matrix: position columns from the first cell ...
    mutation_matrix = cells_p_mutation[0][['#CHR', 'POS']].copy()
    # ... plus one probability column per cell. column_stack always yields a
    # (positions, cells) array, whereas dstack(...).squeeze() collapsed the
    # wrong axis when there was only a single position.
    mutation_matrix_data = pd.DataFrame(
        np.column_stack([c['Prob_mutation'].values for c in cells_p_mutation])
    )
    # NOTE(review): the axis=1 concat aligns on the row index, so the result of
    # mutation_prob is assumed to carry a 0..n-1 index - confirm.
    mutation_matrix = pd.concat([mutation_matrix, mutation_matrix_data], axis=1)

    return mutation_matrix

In [54]:
# NOTE(review): this literal repeats the value assigned to `e_rate` / `e_rates`
# in earlier cells; consider passing that variable so the rate lives in one place.
mutation_matrix = cell_prob(0.0556)

  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


In [42]:
mutation_matrix.shape

(16569, 3282)

In [41]:
mutation_matrix

Unnamed: 0,#CHR,POS,0,1,2,3,4,5,6,7,...,3270,3271,3272,3273,3274,3275,3276,3277,3278,3279
0,MT,1,0.000134,,,,0.002000,,0.000668,0.002,...,0.000501,0.002000,,0.000223,0.000200,0.001001,0.000223,,,0.002
1,MT,2,0.000134,,,,0.002000,,0.000668,0.002,...,0.000401,0.002000,,0.000223,0.000167,0.001001,0.000223,,,0.002
2,MT,3,0.000134,,,,0.002000,,0.000668,0.002,...,0.000401,0.002000,,0.000223,0.000167,0.001001,0.000182,,,0.002
3,MT,4,0.000134,,,,0.002000,,0.000668,0.002,...,0.000401,0.002000,,0.000223,0.000167,0.001001,0.000182,,,0.002
4,MT,5,0.000134,,,,0.002000,,0.000668,0.002,...,0.000401,0.002000,,0.000223,0.000167,0.001001,0.000182,,,0.002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16564,MT,16565,,0.002,,,0.002000,,0.001001,,...,,0.001001,,0.000668,0.000334,0.000401,0.000668,,,
16565,MT,16566,,0.002,,,0.002000,,0.001001,,...,,0.001001,,0.000668,0.000334,0.000401,0.000668,,,
16566,MT,16567,,0.002,,,0.001001,,0.001001,,...,,0.001001,,0.000501,0.000334,0.000401,0.000668,,,
16567,MT,16568,,0.002,,,0.001001,,0.001001,,...,,0.001001,,0.000501,0.000334,0.000401,0.000668,,,


In [43]:
mutation_matrix.isna().sum().sum()

5839158

## Filter the mutation probability matrix (pmat)

In [27]:
# Filtering pipeline for the mutation probability matrix, written inline
# (the flt_pmat(e_rate, cell_count) wrapper is currently disabled).
print('no filters' + str(mutation_matrix.shape))

# Remove germline mutations
#mutation_matrix = mutation_matrix.loc[mutation_matrix[0] < 0.9]

# Positions x cells table of probabilities, indexed by POS
mutation_threshold = 0.9
data = mutation_matrix.drop(columns='#CHR').set_index('POS')
above_threshold = data > mutation_threshold

# Probabilities for location with mutation at any of the cells
mutation_any_cell = data[above_threshold.any(axis=1)]

# Rows where more than one cell (i.e. at least two) exceeds the threshold
mask = above_threshold.to_numpy().sum(axis=1) > 1
two_cells_have_mut = data[mask]

# Define size of 50% of the population
half = two_cells_have_mut.shape[1] / 2

# Drop the rows in which more than half of the cells are NaN
nan_count_per_row = two_cells_have_mut.isna().sum(axis=1)
nan_rows = two_cells_have_mut.drop(index=nan_count_per_row[nan_count_per_row > half].index)
print('remove rows where 50% is nan' + str(nan_rows.shape))

# Replace NaN with row mean (fillna matches a Series on column labels, hence
# the transpose round trip)
row_means = nan_rows.mean(axis=1)
imputed = nan_rows.T.fillna(row_means).T

# To skip removal of sites with overall high probability for mut
#high_prob_rows = imputed.copy()

# Keep a position only if the next retained position is more than 4 away;
# the sentinel makes the last position always pass.
# NOTE(review): only the gap to the *next* position is checked, so the last
# member of every run of nearby positions is kept - confirm that keeping one
# representative per cluster is intended.
index = imputed.index.tolist() + [2000000]
ind = [pos for pos, next_pos in zip(index, index[1:]) if next_pos - pos > 4]
clust = imputed.loc[ind]
print('remove cluster mutations' + str(clust.shape))

# Replace 1.0
replaced = clust.replace(1.0, 0.99999)

# Remove pos with low cov
below_thres_lst = remove_low_cov_pos(cell_count)

low_dp = replaced.loc[~replaced.index.isin(below_thres_lst)].copy()

print('remove positions with low dp' + str(low_dp.shape))

# save matrix
#np.set_printoptions(suppress=True)
#e_rate_name = str(e_rate)
#matrix_path = '../../data/A6_old_files' + e_rate_name + '.csv'
#low_dp.to_csv(matrix_path, index=False, sep=' ', header= False)
    
    #return low_dp
    


no filters(16569, 3282)
remove rows where 50% is nan(672, 3280)
remove cluster mutations(487, 3280)
remove positions with low dp(151, 3280)


In [57]:
#for e_rate in all_e_rates:
    #flt_pmat(e_rate, cell_count)