In [43]:
from Bio import SeqIO
from glob import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [44]:
# Record the NumPy version used for this run (displayed as the cell output)
import numpy
numpy.__version__

'1.22.3'

In [45]:
from mito.genotyping import *

## Load reference

In [46]:
# Path to reference genome
reference_path = '../../data/reference/mito_GRCh38_gimlet.fasta'

# Parse from the handle opened by the context manager. Previously the handle
# was opened but never used, and SeqIO.parse re-opened the path on its own.
with open(reference_path, 'r') as reference_handle:
    # Only the first record of the FASTA file is used
    reference_seq = next(SeqIO.parse(reference_handle, "fasta"))

# One nucleotide per row, in sequence order
# NOTE(review): `pd` is not imported explicitly in the visible cells; presumably
# it is provided by `from mito.genotyping import *` — confirm
reference = pd.Series(list(reference_seq.seq))

## Load read matrices

In [47]:
# Path to allele count files
INPUT_PATH = '../../data/P9855_and_bulk_ac/'

In [48]:
# Every allele count file in the input path, visited in sorted order
cell_count_filenames = list(glob(os.path.join(INPUT_PATH, '*.txt')))
ordered_filenames = sorted(cell_count_filenames)

# Load one allele count table per file
cell_count = []
for path in ordered_filenames:
    print('Reading {}'.format(path))
    table = pd.read_csv(path, sep='\t')
    # Tag the table with the file's base name up to its first '.'
    table.name = os.path.basename(path).split('.')[0]
    cell_count.append(table)

# Sample order: base name of each file up to its first '-'
# NOTE(review): the table names above split on '.', these split on '-'. For
# file names without a '-' this keeps the whole base name including the
# extension — confirm that is intended.
sample_list = [os.path.basename(path).split('-')[0] for path in ordered_filenames]

sample_list_df = pd.DataFrame(sample_list).rename(columns={0: 'cell'})

Reading ../../data/P9855_and_bulk_ac/P3861_218.clean.dedup_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2085_S108_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2089_S112_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2090_S113_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2091_S114_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2093_S116_L005_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2096_S119_L005_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2101_S124_L006_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2102_S125_L006_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2104_S127_L006_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2110_S133_L007_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2111_S134_L007_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2112_S135_L007_ac.txt


In [49]:
# Candidate error rates; each value is passed to flt_pmat, which writes one
# filtered matrix per rate (file named after the rate)
e_rates = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001]


## Compute mutation probabilities

In [50]:
def cell_prob(e_rate):
    """Build the per-position mutation probability matrix for all loaded cells.

    Reads the notebook globals ``cell_count`` (list of allele count tables)
    and ``reference`` (reference nucleotides as a Series), and calls
    ``nucleotide_mutation_prob`` and ``mutation_prob`` (presumably provided by
    ``mito.genotyping`` — confirm).

    Parameters
    ----------
    e_rate : float
        Used as both ``error_rate_when_no_mutation`` and
        ``error_rate_when_mutation``.

    Returns
    -------
    pandas.DataFrame
        Columns ``#CHR`` and ``POS`` taken from the first cell's result,
        followed by one integer-labelled column (0, 1, ...) per entry of
        ``cell_count`` holding that cell's ``Prob_mutation`` values.
    """
    p_mutation = 1 / 1000.0
    n_positions = reference.shape[0]

    # Distinct local names: the original reused the function's own name for
    # both the list and its loop variable.
    nucleotide_probs = [
        nucleotide_mutation_prob(
            cell_counts=counts.iloc[:n_positions],  # discard trailing positions
            reference=reference,
            error_rate_when_no_mutation=e_rate,
            error_rate_when_mutation=e_rate,
            p_mutation=p_mutation,
        )
        for counts in cell_count
    ]

    # Compute P(mutation | read counts)
    cells_p_mutation = [mutation_prob(probs, reference) for probs in nucleotide_probs]

    # Make mutation matrix
    mutation_matrix = cells_p_mutation[0][['#CHR', 'POS']].copy()
    # column_stack always yields (positions, cells); dstack(...).squeeze()
    # collapsed the wrong axis when there was only a single position.
    # Passing the index keeps rows aligned in the concat below even if the
    # first cell's frame does not carry a default RangeIndex.
    mutation_matrix_data = pd.DataFrame(
        np.column_stack([c['Prob_mutation'].values for c in cells_p_mutation]),
        index=mutation_matrix.index,
    )
    return pd.concat([mutation_matrix, mutation_matrix_data], axis=1)

## Filter pmat

In [55]:
def flt_pmat(e_rate, output_dir='../../data/P9855_matrix_output/', pos_output_dir=None):
    """Filter the mutation probability matrix for one error rate and write it to CSV.

    Parameters
    ----------
    e_rate : float
        Error rate forwarded to ``cell_prob``; ``str(e_rate)`` is also used as
        the output file name.
    output_dir : str, optional
        Directory receiving ``<e_rate>.csv`` (space-separated, no header, no
        index). Created if it does not exist. Defaults to the original
        hard-coded location.
    pos_output_dir : str or None, optional
        When given, a second tab-separated copy that keeps the ``POS`` index
        as its first column is written to ``<pos_output_dir>/<e_rate>.csv``.
        Default ``None`` writes nothing extra.

    Returns
    -------
    None
    """
    germline_threshold = 0.9
    mutation_threshold = 0.9

    mutation_matrix = cell_prob(e_rate)

    # Remove germline mutations: drop positions where column 0 (the first
    # sample in cell_count) reaches the threshold.
    # NOTE(review): presumably column 0 is the bulk sample — confirm
    mutation_matrix = mutation_matrix.loc[mutation_matrix[0] < germline_threshold]

    # Probabilities only, indexed by position
    data = mutation_matrix.set_index('POS').drop('#CHR', axis=1)

    # Keep positions where more than one column exceeds the threshold
    n_above_threshold = (data.values > mutation_threshold).sum(axis=1)
    two_cells_have_mut = data[n_above_threshold > 1]

    # Drop rows where more than 50% of the columns are NaN. A boolean mask is
    # used so the selection is per row rather than per index label.
    half = len(two_cells_have_mut.columns) / 2
    nan_per_row = two_cells_have_mut.isna().sum(axis=1)
    nan_rows = two_cells_have_mut[nan_per_row <= half]

    # Replace remaining NaN with the row mean (fillna matches the Series of
    # row means against column labels, hence the transpose round-trip)
    imputed = nan_rows.transpose().fillna(nan_rows.mean(axis=1)).transpose()

    # Replace values that are exactly 1.0
    high_prob_rows = imputed.replace(1.0, 0.99999)

    # Kept from the original: affects NumPy array printing only, not to_csv
    np.set_printoptions(suppress=True)

    e_rate_name = str(e_rate)

    # Make sure the target directory exists before writing
    os.makedirs(output_dir, exist_ok=True)
    matrix_path = os.path.join(output_dir, e_rate_name + '.csv')
    high_prob_rows.to_csv(matrix_path, index=False, sep=' ', header=False)

    # Optional copy that keeps the positions (previously commented-out code)
    if pos_output_dir is not None:
        os.makedirs(pos_output_dir, exist_ok=True)
        pos_matrix_path = os.path.join(pos_output_dir, e_rate_name + '.csv')
        high_prob_rows.to_csv(pos_matrix_path, sep='\t', header=False)
    


In [56]:
# Write one filtered matrix per candidate error rate
for e_rate in e_rates:
    flt_pmat(e_rate)

  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  


## Now run mt-SCITE

In [39]:
# prepare pmat data: one row per matrix file, with its name and row count

pmat_names = []
shapes = []

pmat_input_path = '../../data/P9855_matrix_output/'
pmats = list(glob(os.path.join(pmat_input_path, '*.csv')))
tree_name = []  # not used in this cell; kept in case later cells fill it

for filename in sorted(pmats):
    # Base name without its final extension, e.g. '0.0001.csv' -> '0.0001'.
    # splitext is used instead of splitting on '-' so that names such as
    # '1e-05.csv' (what str(1e-05) produces) are not truncated to '1e'.
    name = os.path.splitext(os.path.basename(filename))[0]
    pmat_names.append(name)
    df = pd.read_csv(filename, sep=' ', header=None)
    # Number of rows in the matrix
    shapes.append(len(df))

# make df with pmat info
pmat_data = pd.DataFrame(
    {'pmat_names': pmat_names,
     'len': shapes,
    })

pmat_data

Unnamed: 0,pmat_names,len
0,0.0001,176
1,0.0002,64
2,0.0003,40
3,0.0004,35
4,0.0005,34
5,0.0006,30
6,0.0007,28
7,0.0008,26
8,0.0009,24
9,0.001,24


In [40]:
# Check n samples
# The matrices are written without an index column, so the column count of a
# matrix is the number of samples; 0.0006.csv is read as a representative file.
a_pmat = pd.read_csv('../../data/P9855_matrix_output/0.0006.csv', sep=' ', header=None)
n_cells = a_pmat.shape[1]

In [41]:
# Run mt-SCITE

SCITE_PATH = '../../../mt-SCITE'
PMAT_PATH = f'../../data/P9855_matrix_output/'
OUTPUT = f'../../../mt-SCITE/mt-SCITE_output/P9855/'

for rep in range(1,11): #11
    
    for pmat in pmat_names:        
        run_id = pmat + '_' + str(rep)
        print('Running tree inference for error rate ' + pmat + ' repetition ' + str(rep))

        # Get number of mutations
        n = pmat_data.loc[pmat_data['pmat_names'] == pmat, 'len'].iloc[0].astype(str)

        try:
            os.makedirs(OUTPUT + '/stdout/') 
        except FileExistsError :
            pass
        except :
            raise

        ! $SCITE_PATH/mt-SCITE/scite -i $PMAT_PATH/$pmat\.csv -n $n -m $n_cells -r 1 -l 200000 -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a -o $OUTPUT/$run_id 1> $OUTPUT/stdout/$run_id\.stdout.txt
        
 

Running tree inference for error rate 0.0001 repetition 1
Running tree inference for error rate 0.0002 repetition 1
Running tree inference for error rate 0.0003 repetition 1
Running tree inference for error rate 0.0004 repetition 1
Running tree inference for error rate 0.0005 repetition 1
Running tree inference for error rate 0.0006 repetition 1
Running tree inference for error rate 0.0007 repetition 1
Running tree inference for error rate 0.0008 repetition 1
Running tree inference for error rate 0.0009 repetition 1
Running tree inference for error rate 0.001 repetition 1
Running tree inference for error rate 0.0001 repetition 2
Running tree inference for error rate 0.0002 repetition 2
Running tree inference for error rate 0.0003 repetition 2
Running tree inference for error rate 0.0004 repetition 2
Running tree inference for error rate 0.0005 repetition 2
Running tree inference for error rate 0.0006 repetition 2
Running tree inference for error rate 0.0007 repetition 2
Running tree in

In [58]:
# Load the tab-separated matrix for error rate 0.0007 from the "with positions"
# output folder (presumably the first column holds POS — confirm).
# NOTE(review): the default flt_pmat(e_rate) calls in this notebook do not write
# to P9855_matrix_output_w_pos, so this file must already exist on disk before
# this cell runs.
true_tree = pd.read_csv('../../data/P9855_matrix_output_w_pos/0.0007.csv', sep='\t', header=None)
true_tree

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,36,1.41561e-06,6.92581e-08,0.99999,0.99999,2.017038e-07,1.762886e-07,3.716115e-07,7.251941e-08,3.518114e-06,9.968306e-08,1.107639e-07,7.145525e-08,1.021586e-07
1,312,2.218039e-07,0.99999,5.058882e-08,5.873729e-08,3.478587e-08,0.99999,2.498707e-08,4.025095e-08,6.606385e-08,1.948997e-07,6.156975e-08,9.123551e-08,2.300719e-07
2,791,2.733173e-07,0.99999,7.627657e-08,4.244745e-07,2.109461e-08,0.99999,5.230957e-10,3.913396e-09,1.160457e-07,3.161729e-08,1.560368e-08,1.652183e-07,4.732761e-07
3,1126,2.842819e-07,5.885587e-07,2.969149e-07,1.750307e-07,9.07279e-08,1.691451e-07,7.565573e-08,1.674595e-07,1.058588e-07,1.0,1.896711e-07,2.291222e-07,0.9999981
4,1782,1.682192e-07,5.377589e-08,0.99999,0.99999,5.243574e-09,1.829185e-08,5.901172e-08,4.517252e-08,2.245854e-08,7.274125e-08,7.71106e-08,5.017605e-08,3.142429e-08
5,2361,1.661019e-07,1.250446e-07,2.811671e-08,1.278892e-08,2.19109e-09,3.038116e-09,1.371787e-07,1.403317e-06,3.524677e-09,0.99999,1.028411e-08,2.040664e-08,0.99999
6,2844,1.71325e-07,1.818665e-07,1.463766e-07,4.167147e-07,3.232247e-08,9.774378e-08,1.481055e-06,4.75438e-09,1.156411e-07,3.074085e-08,0.99999,0.99999,3.647753e-06
7,3109,1.788725e-07,0.99999,0.99999,0.99999,0.99999,0.99999,0.99999,0.99999,0.99999,0.99999,0.99999,0.99999,0.99999
8,3164,1.9464e-07,0.9999071,2.257035e-08,5.338698e-08,1.283497e-07,1.0,1.777175e-07,3.249343e-09,1.307059e-08,3.720967e-08,7.259147e-08,3.395028e-08,7.458818e-08
9,5130,2.943786e-07,3.324482e-08,9.712984e-08,3.718328e-08,9.832548e-09,5.703708e-09,4.384649e-08,9.374565e-08,1.152245e-07,3.73744e-08,0.99999,0.99999,1.783914e-07
