In [1]:
from Bio import SeqIO
from glob import glob
import os

In [2]:
# Display the installed NumPy version as this cell's output
import numpy
numpy.__version__

'1.22.3'

In [3]:
from mito.genotyping import *

## Load reference

In [4]:
# Path to reference genome
reference_path = '../../data/reference/mito_GRCh38_gimlet.fasta'

# Parse from the managed handle so the file opened here is the one that is
# read (and closed); only the first FASTA record is used.
with open(reference_path, 'r') as reference_handle:
    reference_seq = next(SeqIO.parse(reference_handle, "fasta"))

# One reference nucleotide per row
reference = pd.Series(list(reference_seq.seq))

## Load read matrices

In [5]:
# Directory holding the allele count files; the loading cell globs '*.txt'
# in this directory and reads each match as a tab-separated table
INPUT_PATH = '../../data/YFV2001_scRNAseq_sub1/'

In [6]:
# Discover every allele count file (*.txt) in the input path
cell_count_filenames = list(glob(os.path.join(INPUT_PATH, '*.txt')))

# Read the tables in sorted filename order; each table is tagged with the
# part of its file name that precedes the first '.'
cell_count = []
for filename in sorted(cell_count_filenames):
    print('Reading {}'.format(filename))
    name = os.path.basename(filename).split('.')[0]
    counts = pd.read_csv(filename, sep='\t')
    counts.name = name
    cell_count.append(counts)

# Sample order: one label per file, in the same sorted order as cell_count.
# NOTE(review): the label is the base name up to the first '-', so a file
# name without '-' keeps its extension -- confirm this is intended.
sample_list = []
for filename in sorted(cell_count_filenames):
    name = os.path.basename(filename).split('-')[0]
    sample_list.append(name)

sample_list_df = pd.DataFrame(sample_list).rename(columns={0: 'cell'})

Reading ../../data/YFV2001_scRNAseq_sub1/1_12_P3861_210.txt
Reading ../../data/YFV2001_scRNAseq_sub1/1_150312_BC6BFMANXX_P1902_1009_ac.txt
Reading ../../data/YFV2001_scRNAseq_sub1/1_150312_BC6BFMANXX_P1902_1029_ac.txt
Reading ../../data/YFV2001_scRNAseq_sub1/1_150312_BC6BFMANXX_P1902_1047_ac.txt
Reading ../../data/YFV2001_scRNAseq_sub1/1_150312_BC6BFMANXX_P1902_1051_ac.txt
Reading ../../data/YFV2001_scRNAseq_sub1/1_150312_BC6BFMANXX_P1902_1065_ac.txt
Reading ../../data/YFV2001_scRNAseq_sub1/1_150312_BC6BFMANXX_P1902_1075_ac.txt
Reading ../../data/YFV2001_scRNAseq_sub1/1_150312_BC6BFMANXX_P1902_1082_ac.txt
Reading ../../data/YFV2001_scRNAseq_sub1/1_150312_BC6BFMANXX_P1902_1087_ac.txt
Reading ../../data/YFV2001_scRNAseq_sub1/1_150312_BC6BFMANXX_P1902_1094_ac.txt
Reading ../../data/YFV2001_scRNAseq_sub1/2_140812_AC492YACXX_P1299_1141_ac.txt
Reading ../../data/YFV2001_scRNAseq_sub1/2_140812_AC492YACXX_P1299_1160_ac.txt
Reading ../../data/YFV2001_scRNAseq_sub1/2_150312_BC6BFMANXX_P1902_1099

In [7]:
def remove_low_cov_pos(cell_count, threshold=100):
    """Return the positions whose mean coverage is below ``threshold``.

    The per-sample tables are concatenated side by side, the columns
    '#CHR', 'POS', 'Count_A', 'Count_C', 'Count_G' and 'Count_T' are
    dropped, and the first remaining column (the bulk column) is excluded.
    Every column still left is averaged row-wise.

    Parameters
    ----------
    cell_count : list of pandas.DataFrame
        One allele count table per sample; the first entry contributes the
        column that is excluded from the average.
    threshold : float, optional
        Rows whose mean is strictly below this value are reported.
        Defaults to 100.

    Returns
    -------
    list
        Row label + 1 for every low-coverage row (1-based positions when
        the tables carry the default 0-based row index).

    Raises
    ------
    ValueError
        If no column is left after dropping the listed columns.
    """
    # Side-by-side concat keeps one block of columns per sample
    depth = pd.concat(cell_count, axis=1)
    depth = depth.drop(
        ['#CHR', 'POS', 'Count_A', 'Count_C', 'Count_G', 'Count_T'], axis=1
    )
    if depth.shape[1] == 0:
        raise ValueError('no coverage column left after dropping the count columns')

    # Drop the bulk column (integer position 0) and average the rest per row
    mean_coverage = depth.iloc[:, 1:].mean(axis=1)

    # Positions are derived from the row labels, shifted to be 1-based
    low_coverage = mean_coverage[mean_coverage < threshold]
    return (low_coverage.index + 1).tolist()

In [8]:
# Error rates to sweep over
e_rates = [
    0.0001, 0.0112, 0.0223, 0.0334, 0.0445, 0.0556,
    0.0667, 0.0778, 0.0889, 0.1, 0.2, 0.22,
]

# Disabled alternative: a linspace grid extended with a few extra rates
#e_rates = np.linspace(0.0001, 0.0009, 10).round(4)
#extra_error_rates = [0.001, 0.002, 0.005]
#extra_error_rates_ar = np.array(extra_error_rates)
#all_e_rates = np.concatenate((e_rates1, extra_error_rates_ar))

# The sweep below iterates over all_e_rates
all_e_rates = e_rates
all_e_rates

[0.0001,
 0.0112,
 0.0223,
 0.0334,
 0.0445,
 0.0556,
 0.0667,
 0.0778,
 0.0889,
 0.1,
 0.2,
 0.22]

In [9]:
#np.linspace(0.0001, 0.22, 10).round(4)

## Compute mutation probabilities

In [10]:
def cell_prob(e_rate):
    """Build the mutation probability matrix for one error rate.

    Reads the notebook-level ``cell_count`` (list of allele count tables)
    and ``reference`` (one nucleotide per row). For every table it calls
    ``nucleotide_mutation_prob`` and then ``mutation_prob``, and gathers the
    resulting 'Prob_mutation' columns side by side.

    Parameters
    ----------
    e_rate : float
        Used for both ``error_rate_when_no_mutation`` and
        ``error_rate_when_mutation``.

    Returns
    -------
    pandas.DataFrame
        The '#CHR' and 'POS' columns of the first sample followed by one
        column per entry of ``cell_count`` (labelled 0, 1, 2, ...).

    Raises
    ------
    IndexError
        If ``cell_count`` is empty.
    """
    error_rate_when_no_mutation = error_rate_when_mutation = e_rate
    p_mutation = 1 / 500.0  # value passed as p_mutation to nucleotide_mutation_prob

    # Per-sample nucleotide probabilities. The list gets its own name so it
    # no longer shadows this function.
    nucleotide_probs = []
    for count in cell_count:
        count = count.iloc[:reference.shape[0]]  # discard trailing positions
        nucleotide_probs.append(
            nucleotide_mutation_prob(
                cell_counts=count,
                reference=reference,
                error_rate_when_no_mutation=error_rate_when_no_mutation,
                error_rate_when_mutation=error_rate_when_mutation,
                p_mutation=p_mutation,
            )
        )

    # Compute P(mutation | read counts)
    cells_p_mutation = [mutation_prob(probs, reference) for probs in nucleotide_probs]

    # Make mutation matrix: positions as rows, samples as columns.
    # column_stack always yields (positions, samples), including the
    # single-position case that dstack(...).squeeze() would collapse.
    mutation_matrix = cells_p_mutation[0][['#CHR', 'POS']].copy()
    mutation_matrix_data = pd.DataFrame(
        np.column_stack([c['Prob_mutation'].values for c in cells_p_mutation])
    )
    mutation_matrix = pd.concat([mutation_matrix, mutation_matrix_data], axis=1)
    return mutation_matrix

## Filter the mutation probability matrix (pmat)

In [11]:
def flt_pmat(e_rate, cell_count, output_dir='../../data/YFV2001_matrix_output_2/'):
    """Filter the mutation probability matrix for one error rate and save it.

    Steps, in order:
      1. build the matrix with ``cell_prob(e_rate)``;
      2. keep rows whose column 0 is below 0.9 (germline removal);
      3. keep rows where more than one column exceeds 0.9;
      4. drop rows where more than half of the columns are NaN;
      5. replace the remaining NaN with the row mean;
      6. drop positions followed by another kept position within 4 bases;
      7. replace exact 1.0 with 0.99999;
      8. drop positions reported by ``remove_low_cov_pos(cell_count)``;
      9. write the values as a space-separated file without header or index.

    Parameters
    ----------
    e_rate : float
        Error rate forwarded to ``cell_prob``; also used as the file name.
    cell_count : list of pandas.DataFrame
        Allele count tables, forwarded to ``remove_low_cov_pos``.
        NOTE(review): ``cell_prob`` reads the notebook-level ``cell_count``,
        not this argument -- keep the two in sync.
    output_dir : str, optional
        Directory for '<e_rate>.csv'. It is created if it does not exist.

    Returns
    -------
    None
    """
    germline_threshold = 0.9
    mutation_threshold = 0.9
    min_gap = 4  # a position is kept only if the next kept one is further away

    mutation_matrix = cell_prob(e_rate)

    # Remove germline mutations (column 0 is presumably the bulk sample)
    mutation_matrix = mutation_matrix.loc[mutation_matrix[0] < germline_threshold]

    data = mutation_matrix.set_index('POS').drop('#CHR', axis=1)

    # Keep rows with a probability above the threshold in more than one column
    mask = (data.values > mutation_threshold).sum(axis=1) > 1
    two_cells_have_mut = data[mask]

    # Drop rows where more than 50% of the columns are NaN
    half = len(two_cells_have_mut.columns) / 2
    nan_counts = two_cells_have_mut.isna().sum(axis=1)
    nan_rows = two_cells_have_mut.drop(two_cells_have_mut[nan_counts > half].index)

    # Replace NaN with row mean (fillna aligns on columns, hence the transposes)
    imputed = nan_rows.transpose().fillna(nan_rows.mean(axis=1)).transpose()

    # Remove locations close to another location; the sentinel makes the
    # last position always pass the gap test
    index = imputed.index.tolist() + [2000000]
    ind = [a for a, b in zip(index[:-1], index[1:]) if b - a > min_gap]
    clust = imputed.loc[ind]

    # Replace 1.0
    replaced = clust.replace(1.0, 0.99999)

    # Remove positions with low coverage (the index holds POS)
    below_thres_lst = remove_low_cov_pos(cell_count)
    low_dp = replaced[~replaced.index.isin(below_thres_lst)]

    # Kept from the original for its effect on NumPy's print settings in the
    # rest of the session; to_csv below does not use NumPy print options
    np.set_printoptions(suppress=True)

    # Save matrix
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    matrix_path = os.path.join(output_dir, str(e_rate) + '.csv')
    low_dp.to_csv(matrix_path, index=False, sep=' ', header=False)
    


In [12]:
# Run the filter-and-save step once per candidate error rate; each call
# writes its own '<e_rate>.csv' file
for e_rate in all_e_rates:
    flt_pmat(e_rate, cell_count)

1


  print(p.lookup(p.index, reference.values))
  print(len(p.lookup(p.index, reference.values)))
  print(type(p.lookup(p.index, reference.values)))


[0.9999955  0.99999561 0.99999564 ... 0.99999735 0.99999731 0.99999725]
16569
<class 'numpy.ndarray'>
2
                  A             C             G             T   N
0      1.501123e-06  1.501123e-06  9.999955e-01  1.501123e-06 NaN
1      9.999956e-01  1.461706e-06  1.461706e-06  1.461706e-06 NaN
2      1.452173e-06  1.452173e-06  1.452173e-06  9.999956e-01 NaN
3      1.397489e-06  9.999958e-01  1.397489e-06  1.397489e-06 NaN
4      9.999959e-01  1.371663e-06  1.371663e-06  1.371663e-06 NaN
...             ...           ...           ...           ...  ..
16564  8.498740e-07  9.999975e-01  8.498740e-07  8.498740e-07 NaN
16565  8.686618e-07  8.686618e-07  9.999974e-01  8.686618e-07 NaN
16566  9.999973e-01  8.835991e-07  8.835991e-07  8.835991e-07 NaN
16567  8.966455e-07  8.966455e-07  8.966455e-07  9.999973e-01 NaN
16568  9.150696e-07  9.150696e-07  9.999973e-01  9.150696e-07 NaN

[16569 rows x 5 columns]
3
['G' 'A' 'T' ... 'A' 'T' 'G']
16569


NameError: name 'quit' is not defined

## Now run mt-SCITE