In [56]:
from Bio import SeqIO
from glob import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [57]:
# Show the installed NumPy version as the cell output, so the environment this
# notebook was run with is recorded next to the results.
import numpy
numpy.__version__

'1.21.5'

In [58]:
from mito.genotyping import *

## Load reference

In [60]:
# Path to reference genome
reference_path = '../../data/reference/mito_GRCh38_gimlet.fasta'

# Read the first record of the FASTA file. The parser is given the handle
# managed by the `with` block, so the file is closed as soon as the record has
# been read (only the first record is consumed via next()).
with open(reference_path, 'r') as reference_file:
    reference_seq = next(SeqIO.parse(reference_file, "fasta"))

# One nucleotide per row, indexed 0..len-1
reference = pd.Series(list(reference_seq.seq))

## Load read matrices

In [61]:
# Path to allele count files: directory that is globbed for '*.txt' files in
# the loading cell further down.
INPUT_PATH = '../../data/P9855_and_bulk_ac/'

In [62]:
# Get all allele counts files in the input path; they are processed in sorted
# filename order, which also defines the sample order used below.
cell_count_filenames = glob(os.path.join(INPUT_PATH, '*.txt')) #*.txt
ordered_filenames = sorted(cell_count_filenames)

# Load one tab-separated allele count table per file
cell_count = []
for filename in ordered_filenames:
    print('Reading {}'.format(filename))
    counts = pd.read_csv(filename, sep='\t')
    # Label the table with the part of the file name before the first '.'
    counts.name = os.path.basename(filename).split('.')[0]
    cell_count.append(counts)

# Get sample order: one entry per file, taken as the part of the file name
# before the first '-'
sample_list = []
for filename in ordered_filenames:
    name = os.path.basename(filename).split('-')[0]
    sample_list.append(name)

# Single-column table of sample names, column labelled 'cell'
sample_list_df = pd.DataFrame(sample_list).rename(columns={0: 'cell'})

Reading ../../data/P9855_and_bulk_ac/P3861_218.clean.dedup_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2085_S108_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2089_S112_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2090_S113_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2091_S114_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2093_S116_L005_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2096_S119_L005_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2101_S124_L006_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2102_S125_L006_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2104_S127_L006_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2110_S133_L007_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2111_S134_L007_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2112_S135_L007_ac.txt


In [63]:
#e_rates = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001]
#e_rates = [0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016, 0.0017, 0.0018, 0.0019, 0.002]
#e_rates = [0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01]
#e_rates = [0.02, 0.025, 0.03, 0.035]


#e_rates = [0.00009]

## Get e_rates
## 0.0001 -> 0.0021 in steps of 0.0001, then 0.0031 -> 0.0351 in steps of 0.001

In [16]:
#0.0001 -> 0.05 in steps of 0.0001

In [64]:
#start = 0.0001
#end = 0.036
#step = 0.0001

#e_rates = []

#for number in range(int(start / step), int(end / step) + 1):
#    current_number = number * step
#    rounded_number = round(current_number, 4)
#    e_rates.append(rounded_number)
#print(e_rates)

In [70]:
# Error rates from performance notebook, learn error rates plot
#
# The grid is written in integer units of 1e-4 to keep it compact:
#   1 .. 21         -> 0.0001 .. 0.0021 in steps of 0.0001
#   31, 41 .. 351   -> 0.0031 .. 0.0351 in steps of 0.001
# Dividing the integers by 10000 yields exactly the same floats as the decimal
# literals, so str(e_rate) (used for file names) is unchanged.
_e_rate_units = list(range(1, 22)) + list(range(31, 352, 10))
e_rates = [units / 10000 for units in _e_rate_units]

## Compute mutations probabilities

In [72]:
def cell_prob(e_rate):
    """Build the mutation probability matrix for one error rate.

    Every allele count table in the notebook-level ``cell_count`` list is
    truncated to the length of ``reference``, passed to
    ``nucleotide_mutation_prob`` and then to ``mutation_prob``; the resulting
    ``Prob_mutation`` columns are placed side by side.

    Parameters
    ----------
    e_rate : float
        Error rate, used for both ``error_rate_when_no_mutation`` and
        ``error_rate_when_mutation``.

    Returns
    -------
    pandas.DataFrame
        The ``#CHR`` and ``POS`` columns of the first cell's result, followed
        by one column per cell labelled 0, 1, ... in ``cell_count`` order.

    Raises
    ------
    IndexError
        If ``cell_count`` is empty (there is no first cell to take the
        position columns from).
    """
    p_mutation = 1 / 1000.0  # value passed as `p_mutation` for every cell

    # Per-cell nucleotide probabilities
    nucleotide_probs = []
    for count in cell_count:
        trimmed = count.iloc[:reference.shape[0]]  # discard trailing positions
        nucleotide_probs.append(
            nucleotide_mutation_prob(
                cell_counts=trimmed,
                reference=reference,
                error_rate_when_no_mutation=e_rate,
                error_rate_when_mutation=e_rate,
                p_mutation=p_mutation,
            )
        )

    # Compute P(mutation | read counts)
    cells_p_mutation = [
        mutation_prob(nucleotide_prob, reference)
        for nucleotide_prob in nucleotide_probs
    ]

    # Make mutation matrix: position columns first, then one column per cell.
    # column_stack always yields a (positions, cells) array, including when
    # there is only a single position or a single cell.
    positions = cells_p_mutation[0][['#CHR', 'POS']].copy()
    probabilities = pd.DataFrame(
        np.column_stack([c['Prob_mutation'].values for c in cells_p_mutation])
    )
    # NOTE(review): the concat aligns on the row index; this assumes the frames
    # returned by mutation_prob use a default 0..n-1 index -- confirm.
    mutation_matrix = pd.concat([positions, probabilities], axis=1)
    return mutation_matrix

## Filter pmat

In [73]:
def flt_pmat(e_rate):
    """Filter the mutation matrix for one error rate and write it to disk.

    Steps:
      1. Compute the matrix with ``cell_prob(e_rate)``.
      2. Keep only positions where column 0 is below the germline threshold
         (rows where column 0 is NaN are dropped as well).
      3. Keep positions whose probability exceeds the mutation threshold in
         more than one cell.
      4. Drop positions where more than half of the cells are NaN and replace
         the remaining NaN values with the mean of their row.
      5. Replace probabilities of exactly 1.0 with 0.99999.
      6. Write the values, space separated and without index or header, to
         ``../../data/P9855_matrix_output/<e_rate>.csv``.

    Parameters
    ----------
    e_rate : float
        Error rate forwarded to ``cell_prob``; ``str(e_rate)`` is the output
        file name.

    Returns
    -------
    None
        The result is only written to disk.
    """
    germline_threshold = 0.9
    mutation_threshold = 0.9
    output_dir = '../../data/P9855_matrix_output/'

    mutation_matrix = cell_prob(e_rate)

    # Remove germline mutations (column 0 is the first table in cell_count)
    mutation_matrix = mutation_matrix.loc[mutation_matrix[0] < germline_threshold]

    # One row per position, one column per cell
    data = mutation_matrix.set_index('POS').drop('#CHR', axis=1)

    # Select rows that exceed mutation_threshold in more than one cell
    n_cells_mutated = (data.values > mutation_threshold).sum(axis=1)
    two_cells_have_mut = data[n_cells_mutated > 1]

    # Remove rows where more than 50% of the columns are NaN
    half = len(two_cells_have_mut.columns) / 2
    n_missing = two_cells_have_mut.isna().sum(axis=1)
    nan_rows = two_cells_have_mut.loc[n_missing <= half]

    # Replace NaN with row mean (fillna works column-wise, hence the transposes)
    imputed = nan_rows.transpose().fillna(nan_rows.mean(axis=1)).transpose()

    # To skip removal of sites with overall high probability for mut
    high_prob_rows = imputed.copy()

    # Replace 1.0
    high_prob_rows = high_prob_rows.replace(1.0, 0.99999)

    # Kept from the original: switches NumPy printing to non-scientific
    # notation for the rest of the session.
    np.set_printoptions(suppress=True)

    # Create the output directory on first use instead of failing in to_csv
    os.makedirs(output_dir, exist_ok=True)
    matrix_path = output_dir + str(e_rate) + '.csv'
    high_prob_rows.to_csv(matrix_path, index=False, sep=' ', header= False)

    #print positions
    #matrix_path = '../../data/P9855_matrix_output_w_pos/' + e_rate_name + '.csv'
    #high_prob_rows.to_csv(matrix_path, sep='\t', header=False)
    


In [74]:
# Write one filtered probability matrix per error rate; flt_pmat saves each
# matrix as '<e_rate>.csv' and returns nothing.
for e_rate in e_rates:
    flt_pmat(e_rate)

  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)


## Now run mt-SCITE

In [75]:
# prepare pmat data

pmat_input_path = '../../data/P9855_matrix_output/'
#print(pmat_input_path)
pmats = glob(os.path.join(pmat_input_path, '*.csv'))
tree_name = []

# One entry per matrix file, visited in sorted (string) filename order
pmat_names = []
shapes = []
for filename in sorted(pmats):
    # Matrix name: file name up to the first '-', with the last extension removed
    stem = os.path.basename(filename).split('-')[0]
    name = stem.rsplit('.', 1)[0]
    #print(name)
    pmat_names.append(name)
    # Number of rows in the matrix file
    df = pd.read_csv(filename, sep=' ', header=None)
    shapes.append(len(df))

# make df with pmat info
pmat_data = pd.DataFrame({'pmat_names': pmat_names, 'len': shapes})

pmat_data

Unnamed: 0,pmat_names,len
0,0.0001,176
1,0.0002,64
2,0.0003,40
3,0.0004,35
4,0.0005,34
5,0.0006,30
6,0.0007,28
7,0.0008,26
8,0.0009,24
9,0.001,24


In [76]:
# Check n samples
# The matrices are written without index or header and with one column per
# cell, so the column count of any one of them is the number of samples.
# The file for error rate 0.0006 is used here.
a_pmat = pd.read_csv('../../data/P9855_matrix_output/0.0006.csv', sep=' ', header=None)
n_cells = a_pmat.shape[1]
n_cells

13

In [77]:
# Run mt-SCITE

SCITE_PATH = '../../../mt-SCITE'
PMAT_PATH = f'../../data/P9855_matrix_output/'
OUTPUT = f'../../../mt-SCITE/mt-SCITE_output/P9855/'

for rep in range(1,11): #11
    
    for pmat in pmat_names:        
        run_id = pmat + '_' + str(rep)
        print('Running tree inference for error rate ' + pmat + ' repetition ' + str(rep))

        # Get number of mutations
        n = pmat_data.loc[pmat_data['pmat_names'] == pmat, 'len'].iloc[0].astype(str)

        try:
            os.makedirs(OUTPUT + '/stdout/') 
        except FileExistsError :
            pass
        except :
            raise

        ! $SCITE_PATH/mt-SCITE/scite -i $PMAT_PATH/$pmat\.csv -n $n -m $n_cells -r 1 -l 200000 -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a -o $OUTPUT/$run_id 1> $OUTPUT/stdout/$run_id\.stdout.txt
        
 

Running tree inference for error rate 0.0001 repetition 1
Running tree inference for error rate 0.0002 repetition 1
Running tree inference for error rate 0.0003 repetition 1
Running tree inference for error rate 0.0004 repetition 1
Running tree inference for error rate 0.0005 repetition 1
Running tree inference for error rate 0.0006 repetition 1
Running tree inference for error rate 0.0007 repetition 1
Running tree inference for error rate 0.0008 repetition 1
Running tree inference for error rate 0.0009 repetition 1
Running tree inference for error rate 0.001 repetition 1
Running tree inference for error rate 0.0011 repetition 1
Running tree inference for error rate 0.0012 repetition 1
Running tree inference for error rate 0.0013 repetition 1
Running tree inference for error rate 0.0014 repetition 1
Running tree inference for error rate 0.0015 repetition 1
Running tree inference for error rate 0.0016 repetition 1
Running tree inference for error rate 0.0017 repetition 1
Running tree in

Running tree inference for error rate 0.0161 repetition 3
Running tree inference for error rate 0.0171 repetition 3
Running tree inference for error rate 0.0181 repetition 3
Running tree inference for error rate 0.0191 repetition 3
Running tree inference for error rate 0.0201 repetition 3
Running tree inference for error rate 0.0211 repetition 3
Running tree inference for error rate 0.0221 repetition 3
Running tree inference for error rate 0.0231 repetition 3
Running tree inference for error rate 0.0241 repetition 3
Running tree inference for error rate 0.0251 repetition 3
Running tree inference for error rate 0.0261 repetition 3
Running tree inference for error rate 0.0271 repetition 3
Running tree inference for error rate 0.0281 repetition 3
Running tree inference for error rate 0.0291 repetition 3
Running tree inference for error rate 0.0301 repetition 3
Running tree inference for error rate 0.0311 repetition 3
Running tree inference for error rate 0.0321 repetition 3
Running tree i

Running tree inference for error rate 0.0015 repetition 6
Running tree inference for error rate 0.0016 repetition 6
Running tree inference for error rate 0.0017 repetition 6
Running tree inference for error rate 0.0018 repetition 6
Running tree inference for error rate 0.0019 repetition 6
Running tree inference for error rate 0.002 repetition 6
Running tree inference for error rate 0.0021 repetition 6
Running tree inference for error rate 0.0031 repetition 6
Running tree inference for error rate 0.0041 repetition 6
Running tree inference for error rate 0.0051 repetition 6
Running tree inference for error rate 0.0061 repetition 6
Running tree inference for error rate 0.0071 repetition 6
Running tree inference for error rate 0.0081 repetition 6
Running tree inference for error rate 0.0091 repetition 6
Running tree inference for error rate 0.0101 repetition 6
Running tree inference for error rate 0.0111 repetition 6
Running tree inference for error rate 0.0121 repetition 6
Running tree in

Running tree inference for error rate 0.0301 repetition 8
Running tree inference for error rate 0.0311 repetition 8
Running tree inference for error rate 0.0321 repetition 8
Running tree inference for error rate 0.0331 repetition 8
Running tree inference for error rate 0.0341 repetition 8
Running tree inference for error rate 0.0351 repetition 8
Running tree inference for error rate 0.0001 repetition 9
Running tree inference for error rate 0.0002 repetition 9
Running tree inference for error rate 0.0003 repetition 9
Running tree inference for error rate 0.0004 repetition 9
Running tree inference for error rate 0.0005 repetition 9
Running tree inference for error rate 0.0006 repetition 9
Running tree inference for error rate 0.0007 repetition 9
Running tree inference for error rate 0.0008 repetition 9
Running tree inference for error rate 0.0009 repetition 9
Running tree inference for error rate 0.001 repetition 9
Running tree inference for error rate 0.0011 repetition 9
Running tree in

In [14]:
#true_tree = pd.read_csv('../../data/P9855_matrix_output_w_pos/0.0007.csv', sep='\t', header=None)
#true_tree