In [40]:
from Bio import SeqIO
from glob import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [41]:
import numpy
numpy.__version__

'1.21.5'

In [42]:
# NOTE(review): `pd` and `np` are used throughout this notebook but are never
# imported explicitly in the cells shown; presumably they arrive through this
# star import, together with nucleotide_mutation_prob / mutation_prob — confirm.
from mito.genotyping import *

## Load reference

In [43]:
# Path to reference genome (FASTA)
reference_path = '../../data/reference/mito_GRCh38_gimlet.fasta'

# Parse from the handle opened here so the file is opened exactly once and is
# closed when the `with` block exits. Only the first FASTA record is used.
with open(reference_path, 'r') as reference_handle:
    reference_seq = next(SeqIO.parse(reference_handle, "fasta"))

# One nucleotide per row, in sequence order
reference = pd.Series(list(reference_seq.seq))

## Load read matrices

In [44]:
# Path to allele count files (the directory that is globbed for '*.txt' tables below)
INPUT_PATH = '../../data/P9855_and_bulk_ac/'

In [45]:
# Get all allele counts files in the input path
cell_count_filenames = list(glob(os.path.join(INPUT_PATH, '*.txt'))) #*.txt

cell_count = []   # one allele-count DataFrame per file, in sorted filename order
sample_list = []  # one label per file, in the same order as cell_count

for filename in sorted(cell_count_filenames):
    print('Reading {}'.format(filename))
    base = os.path.basename(filename)

    # Load one allele count file and tag it with the text before the first '.'
    counts = pd.read_csv(filename, sep='\t')
    counts.name = base.split('.')[0]
    cell_count.append(counts)

    # Get sample order: the label is the text before the first '-' in the file name
    name = base.split('-')[0]
    sample_list.append(name)

# Single-column frame of sample labels, column named 'cell'
sample_list_df = pd.DataFrame(sample_list).rename(columns={0: 'cell'})

Reading ../../data/P9855_and_bulk_ac/P3861_218.clean.dedup_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2085_S108_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2089_S112_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2090_S113_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2091_S114_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2093_S116_L005_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2096_S119_L005_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2101_S124_L006_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2102_S125_L006_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2104_S127_L006_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2110_S133_L007_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2111_S134_L007_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2112_S135_L007_ac.txt


In [15]:
#e_rates = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001]
#e_rates = [0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017, 0.0019, 0.0019, 0.002]
#e_rates = [0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01]
#e_rates = [0.02, 0.025, 0.03, 0.035]


#e_rates = [0.00009]

## Get e_rates
## 0.0001 -> 0.036 in steps of 0.0001

In [16]:
#0.0001 -> 0.036 in steps of 0.0001

In [31]:
# Grid of candidate error rates: start, start + step, ..., end (end included)
start = 0.0001
end = 0.036
step = 0.0001

# Express the bounds as integer multiples of `step` using round(), not int():
# in floating point 0.036 / 0.0001 evaluates to slightly less than 360, so
# truncating with int() silently dropped the final rate (0.036) from the grid.
first_multiple = round(start / step)
last_multiple = round(end / step)

# Rounding each value to 4 decimals strips float noise (e.g. 3 * 0.0001),
# which also keeps the file names derived from these rates clean.
e_rates = [round(multiple * step, 4) for multiple in range(first_multiple, last_multiple + 1)]
print(e_rates)

[0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001, 0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017, 0.0018, 0.0019, 0.002, 0.0021, 0.0022, 0.0023, 0.0024, 0.0025, 0.0026, 0.0027, 0.0028, 0.0029, 0.003, 0.0031, 0.0032, 0.0033, 0.0034, 0.0035, 0.0036, 0.0037, 0.0038, 0.0039, 0.004, 0.0041, 0.0042, 0.0043, 0.0044, 0.0045, 0.0046, 0.0047, 0.0048, 0.0049, 0.005, 0.0051, 0.0052, 0.0053, 0.0054, 0.0055, 0.0056, 0.0057, 0.0058, 0.0059, 0.006, 0.0061, 0.0062, 0.0063, 0.0064, 0.0065, 0.0066, 0.0067, 0.0068, 0.0069, 0.007, 0.0071, 0.0072, 0.0073, 0.0074, 0.0075, 0.0076, 0.0077, 0.0078, 0.0079, 0.008, 0.0081, 0.0082, 0.0083, 0.0084, 0.0085, 0.0086, 0.0087, 0.0088, 0.0089, 0.009, 0.0091, 0.0092, 0.0093, 0.0094, 0.0095, 0.0096, 0.0097, 0.0098, 0.0099, 0.01, 0.0101, 0.0102, 0.0103, 0.0104, 0.0105, 0.0106, 0.0107, 0.0108, 0.0109, 0.011, 0.0111, 0.0112, 0.0113, 0.0114, 0.0115, 0.0116, 0.0117, 0.0118, 0.0119, 0.012, 0.0121, 0.0122, 0.0123, 0.0124, 0.0125, 0.0126, 0.01

## Compute mutation probabilities

In [32]:
def cell_prob(e_rate):
    """Build the position x sample mutation-probability matrix for one error rate.

    For every table in the notebook-level `cell_count` list, the counts are
    trimmed to the reference length, passed to `nucleotide_mutation_prob`, and
    the result is passed to `mutation_prob`, which yields a frame with
    '#CHR', 'POS' and 'Prob_mutation' columns.

    Parameters
    ----------
    e_rate : float
        Error rate, used for both `error_rate_when_no_mutation` and
        `error_rate_when_mutation`.

    Returns
    -------
    pandas.DataFrame
        '#CHR' and 'POS' taken from the first sample, followed by one column
        per sample (labelled 0, 1, ...) holding that sample's 'Prob_mutation'.

    Raises
    ------
    ValueError
        If `cell_count` is empty.
    """
    if len(cell_count) == 0:
        raise ValueError('cell_count is empty: no allele count tables were loaded')

    # The same rate is applied whether or not a mutation is present
    error_rate_when_no_mutation = error_rate_when_mutation = e_rate
    p_mutation = 1 / 1000.0

    # Distinct local names: the original reused `cell_prob` for both a list and
    # its loop variable, shadowing this function inside its own body.
    cells_p_mutation = []
    for counts in cell_count:
        counts = counts.iloc[:reference.shape[0]]  # discard trailing positions
        nucleotide_probs = nucleotide_mutation_prob(
            cell_counts=counts,
            reference=reference,
            error_rate_when_no_mutation=error_rate_when_no_mutation,
            error_rate_when_mutation=error_rate_when_mutation,
            p_mutation=p_mutation,
        )

        # Compute P(mutation | read counts)
        cells_p_mutation.append(mutation_prob(nucleotide_probs, reference))

    # Make mutation matrix. column_stack always yields (positions, samples);
    # dstack(...).squeeze() dropped an axis when either dimension had length 1.
    mutation_matrix = cells_p_mutation[0][['#CHR', 'POS']].copy()
    mutation_matrix_data = pd.DataFrame(
        np.column_stack([c['Prob_mutation'].values for c in cells_p_mutation])
    )
    return pd.concat([mutation_matrix, mutation_matrix_data], axis=1)

## Filter pmat

In [33]:
def flt_pmat(e_rate, mutation_threshold=0.9, germline_threshold=0.9,
             min_cells=2, output_dir='../../data/P9855_matrix_output/'):
    """Filter the mutation-probability matrix for one error rate and write it to CSV.

    Steps, in order:
      1. drop positions whose probability in sample column 0 is not below
         `germline_threshold` (treated as germline);
      2. keep positions where at least `min_cells` samples exceed
         `mutation_threshold`;
      3. drop positions where more than half of the samples are NaN;
      4. replace remaining NaN with the mean of that position's row;
      5. replace exact 1.0 with 0.99999;
      6. write the result to `<output_dir>/<e_rate>.csv`, space-separated,
         without header or index.

    Parameters
    ----------
    e_rate : float
        Error rate forwarded to `cell_prob`; its `str()` is the file stem.
    mutation_threshold : float, default 0.9
        Probability above which a sample counts as carrying the mutation.
    germline_threshold : float, default 0.9
        Cut-off applied to sample column 0 in step 1.
    min_cells : int, default 2
        Minimum number of samples that must exceed `mutation_threshold`.
    output_dir : str
        Directory for the CSV; created if it does not exist.

    Returns
    -------
    None
    """
    mutation_matrix = cell_prob(e_rate)

    # Remove germline mutations
    mutation_matrix = mutation_matrix.loc[mutation_matrix[0] < germline_threshold]

    # Index by position and keep only the per-sample probability columns
    data = mutation_matrix.set_index('POS').drop('#CHR', axis=1)

    # Count, per position, the samples above the threshold and keep positions
    # supported by at least `min_cells` of them
    n_cells_mutated = (data.values > mutation_threshold).sum(axis=1)
    shared_mutations = data[n_cells_mutated >= min_cells]

    # Remove rows where more than 50% of the columns are NaN
    half = len(shared_mutations.columns) / 2
    too_many_nan = shared_mutations.isna().sum(axis=1) > half
    dense_rows = shared_mutations.loc[~too_many_nan]

    # Replace NaN with row mean (fillna aligns on columns, hence the transposes)
    imputed = dense_rows.transpose().fillna(dense_rows.mean(axis=1)).transpose()

    # Sites with overall high mutation probability are deliberately kept.
    # Replace 1.0
    high_prob_rows = imputed.replace(1.0, 0.99999)

    # Global numpy print setting retained from the original cell
    np.set_printoptions(suppress=True)

    # Make sure the destination exists so a fresh checkout does not fail on write
    os.makedirs(output_dir, exist_ok=True)
    e_rate_name = str(e_rate)
    matrix_path = os.path.join(output_dir, e_rate_name + '.csv')
    high_prob_rows.to_csv(matrix_path, index=False, sep=' ', header=False)

    # Alternative output that also keeps the positions:
    #matrix_path = '../../data/P9855_matrix_output_w_pos/' + e_rate_name + '.csv'
    #high_prob_rows.to_csv(matrix_path, sep='\t', header=False)
    


In [34]:
# Write one filtered probability matrix (<e_rate>.csv) per candidate error rate
for e_rate in e_rates:
    flt_pmat(e_rate)

  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)


## Now run mt-SCITE

In [46]:
# prepare pmat data

# prepare pmat data: one entry per CSV found in pmat_input_path, recording
# the label derived from the file name and the number of rows in the file

pmat_input_path = f'../../data/P9855_matrix_output/'
#print(pmat_input_path)
pmats = list(glob(os.path.join(pmat_input_path, '*.csv')))
tree_name = []

pmat_names, shapes = [], []

for filename in sorted(pmats):
    # Label = base name up to the first '-', with its last extension removed
    before_dash = os.path.basename(filename).split('-')[0]
    name = before_dash.rsplit('.', 1)[0]
    #print(name)
    pmat_names.append(name)

    # Row count of the matrix
    df = pd.read_csv(filename, sep=' ', header=None)
    shapes.append(len(df))

# make df with pmat info
pmat_data = pd.DataFrame({'pmat_names': pmat_names, 'len': shapes})

pmat_data

Unnamed: 0,pmat_names,len
0,0.0001,176
1,0.0002,64
2,0.0003,40
3,0.0004,35
4,0.0005,34
...,...,...
354,0.0355,2
355,0.0356,2
356,0.0357,2
357,0.0358,2


In [47]:
# Check n samples
a_pmat = pd.read_csv('../../data/P9855_matrix_output/0.0006.csv', sep=' ', header=None)
n_cells = a_pmat.shape[1]
n_cells

13

In [None]:
# Run mt-SCITE

SCITE_PATH = '../../../mt-SCITE'
PMAT_PATH = f'../../data/P9855_matrix_output/'
OUTPUT = f'../../../mt-SCITE/mt-SCITE_output/P9855/'

for rep in range(1,11): #11
    
    for pmat in pmat_names:        
        run_id = pmat + '_' + str(rep)
        print('Running tree inference for error rate ' + pmat + ' repetition ' + str(rep))

        # Get number of mutations
        n = pmat_data.loc[pmat_data['pmat_names'] == pmat, 'len'].iloc[0].astype(str)

        try:
            os.makedirs(OUTPUT + '/stdout/') 
        except FileExistsError :
            pass
        except :
            raise

        ! $SCITE_PATH/mt-SCITE/scite -i $PMAT_PATH/$pmat\.csv -n $n -m $n_cells -r 1 -l 200000 -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a -o $OUTPUT/$run_id 1> $OUTPUT/stdout/$run_id\.stdout.txt
        
 

Running tree inference for error rate 0.0001 repetition 1
Running tree inference for error rate 0.0002 repetition 1
Running tree inference for error rate 0.0003 repetition 1
Running tree inference for error rate 0.0004 repetition 1
Running tree inference for error rate 0.0005 repetition 1
Running tree inference for error rate 0.0006 repetition 1
Running tree inference for error rate 0.0007 repetition 1


In [14]:
#true_tree = pd.read_csv('../../data/P9855_matrix_output_w_pos/0.0007.csv', sep='\t', header=None)
#true_tree