In [1]:
from Bio import SeqIO
from glob import glob
import os

In [2]:
from mito.genotyping import *

## Load reference

In [3]:
# Path to reference genome
reference_path = '../../data/reference/mito_GRCh38_gimlet.fasta'

# Parse from the handle we opened so that the file is closed when the `with`
# block exits (passing the path again would make SeqIO open a second handle
# that stays open, because only the first record is pulled from the parser).
with open(reference_path, 'r') as reference_handle:
    reference_seq = next(SeqIO.parse(reference_handle, "fasta"))

# One nucleotide per row, indexed 0..len-1
reference = pd.Series(list(reference_seq.seq))

## Load read matrices

In [4]:
# Directory containing the allele count files (one tab-separated `*.txt` file
# per sample); it is globbed when the read matrices are loaded.
INPUT_PATH = '../../data/P9855_and_bulk_ac/'

In [5]:
# Every allele-count file found in the input directory
cell_count_filenames = list(glob(os.path.join(INPUT_PATH, '*.txt')))

# Fix the processing order once; both passes below walk the files in this order
ordered_filenames = sorted(cell_count_filenames)

# Read each tab-separated allele-count table and tag it with its file stem
# (the part of the basename before the first '.')
cell_count = []
for filename in ordered_filenames:
    print('Reading {}'.format(filename))
    counts = pd.read_csv(filename, sep='\t')
    name = os.path.basename(filename).split('.')[0]
    counts.name = name
    cell_count.append(counts)

# Sample order: one label per file, in the same order as `cell_count`.
# NOTE(review): the label is the basename up to the first '-'; for basenames
# without a '-' this keeps the whole file name, extension included -- confirm
# that this is the intended label.
sample_list = []
for filename in ordered_filenames:
    name = os.path.basename(filename).split('-')[0]
    sample_list.append(name)

# Single-column frame of sample labels, column named 'cell'
sample_list_df = pd.DataFrame(sample_list).rename(columns={0: 'cell'})

Reading ../../data/P9855_and_bulk_ac/P3861_218.clean.dedup_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2085_S108_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2089_S112_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2090_S113_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2091_S114_L004_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2093_S116_L005_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2096_S119_L005_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2101_S124_L006_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2102_S125_L006_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2104_S127_L006_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2110_S133_L007_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2111_S134_L007_ac.txt
Reading ../../data/P9855_and_bulk_ac/P9855_2112_S135_L007_ac.txt


In [6]:
# Candidate error rates to sweep: 10 evenly spaced values spanning
# 0.0001 to 0.01
e_rates = np.linspace(start=0.0001, stop=0.01, num=10)

e_rates

array([0.0001, 0.0012, 0.0023, 0.0034, 0.0045, 0.0056, 0.0067, 0.0078,
       0.0089, 0.01  ])

## Compute mutation probabilities

In [7]:
def cell_prob(e_rate, p_mutation=1 / 500.0):
    """Build the mutation-probability matrix for every loaded cell.

    Reads the notebook-level ``cell_count`` (list of allele-count tables) and
    ``reference`` (Series of reference nucleotides).

    Parameters
    ----------
    e_rate : float
        Error rate passed to ``nucleotide_mutation_prob`` as both
        ``error_rate_when_no_mutation`` and ``error_rate_when_mutation``.
    p_mutation : float, optional
        Value passed to ``nucleotide_mutation_prob`` as ``p_mutation``.
        Defaults to ``1 / 500.0``.

    Returns
    -------
    pandas.DataFrame
        The ``'#CHR'`` and ``'POS'`` columns of the first cell's result,
        followed by one column per entry of ``cell_count`` (labelled
        ``0 .. n-1`` in ``cell_count`` order) holding that cell's
        ``'Prob_mutation'`` values.
    """
    n_positions = reference.shape[0]

    # Compute P(mutation | read counts) for each cell
    cells_p_mutation = []
    for count in cell_count:
        count = count.iloc[:n_positions]  # discard trailing positions
        nucleotide_prob = nucleotide_mutation_prob(
            cell_counts=count,
            reference=reference,
            error_rate_when_no_mutation=e_rate,
            error_rate_when_mutation=e_rate,
            p_mutation=p_mutation,
        )
        cells_p_mutation.append(mutation_prob(nucleotide_prob, reference))

    # Make mutation matrix: position columns + one probability column per cell
    positions = cells_p_mutation[0][['#CHR', 'POS']].copy()
    # column_stack always yields a (positions, cells) array, whereas
    # dstack(...).squeeze() collapses the wrong axis when there is a single
    # position. Reusing the positions' index keeps the concat row-aligned even
    # if that index is not the default 0..n-1 range.
    probabilities = pd.DataFrame(
        np.column_stack([c['Prob_mutation'].values for c in cells_p_mutation]),
        index=positions.index,
    )
    return pd.concat([positions, probabilities], axis=1)

## Filter pmat

In [8]:
def flt_pmat(e_rate, mutation_threshold=0.9, germline_threshold=0.9,
             output_dir='../../data/P9855_matrix_output/'):
    """Filter the mutation-probability matrix for one error rate and save it.

    Steps, in order:
      1. Build the matrix with ``cell_prob(e_rate)``.
      2. Keep only positions whose probability in column ``0`` is below
         ``germline_threshold`` (rows where column ``0`` is NaN are dropped
         too, since the comparison is False for NaN).
      3. Keep positions where more than one cell exceeds
         ``mutation_threshold``.
      4. Drop positions where more than half of the cells are NaN.
      5. Fill the remaining NaN values with the row mean.
      6. Replace exact ``1.0`` values with ``0.99999``.
      7. Write the result to ``<output_dir>/<e_rate>.csv`` (space separated,
         no header, no index column).

    Parameters
    ----------
    e_rate : float
        Error rate forwarded to ``cell_prob``; ``str(e_rate)`` is also used
        as the output file stem.
    mutation_threshold : float, optional
        Probability above which a cell counts as mutated at a position.
    germline_threshold : float, optional
        Cut-off applied to column ``0`` to discard germline positions.
    output_dir : str, optional
        Directory the CSV is written to; created if it does not exist.

    Returns
    -------
    None
    """
    mutation_matrix = cell_prob(e_rate)

    # Remove germline mutations.
    # NOTE(review): column 0 is the first entry of `cell_count`; presumably
    # that is the bulk sample -- confirm against the file ordering.
    mutation_matrix = mutation_matrix.loc[mutation_matrix[0] < germline_threshold]

    # Per-cell probabilities indexed by position
    data = mutation_matrix.set_index('POS').drop('#CHR', axis=1)

    # Keep rows where more than one cell exceeds the mutation threshold
    # (count the True values per row)
    mask = (data.values > mutation_threshold).sum(axis=1) > 1
    two_cells_have_mut = data[mask]

    # Drop rows where more than half of the columns are NaN
    half = len(two_cells_have_mut.columns) / 2
    too_many_nan = two_cells_have_mut.isna().sum(axis=1) > half
    nan_rows = two_cells_have_mut.drop(two_cells_have_mut[too_many_nan].index)

    # Replace NaN with row mean (fillna matches the Series by column label,
    # hence the transpose round-trip)
    imputed = nan_rows.transpose().fillna(nan_rows.mean(axis=1)).transpose()

    # Sites with overall high mutation probability are deliberately kept;
    # only exact 1.0 values are replaced
    high_prob_rows = imputed.replace(1.0, 0.99999)

    # Make sure the target directory exists before writing
    os.makedirs(output_dir, exist_ok=True)
    matrix_path = os.path.join(output_dir, str(e_rate) + '.csv')
    high_prob_rows.to_csv(matrix_path, index=False, sep=' ', header=False)

In [9]:
# Filter and write one probability matrix per candidate error rate
# (flt_pmat saves each result as a CSV named after the error rate).
for e_rate in e_rates:
    flt_pmat(e_rate)

  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mut

  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mut

  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mut

  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
  prob_mutation['Prob_mutation'] = 1 - p.lookup(p.index, reference.values)
