In [3]:
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from Bio import SeqIO
from glob import glob
import os

from mito.genotyping import nucleotide_mutation_prob, mutation_prob, COUNTS_COLUMNS


# Plotting style
sns.set_style('white')
sns.set_context('notebook')
pd.set_option('max_rows', 1000)
pd.set_option('max_columns', 100)

def plot_style(figsize=(12, 6), labelsize=20, titlesize=24, ticklabelsize=14, **kwargs):
   basic_style = {
       'figure.figsize': figsize,
       'axes.labelsize': labelsize,
       'axes.titlesize': titlesize,
       'xtick.labelsize': ticklabelsize,
       'ytick.labelsize': ticklabelsize,
       'axes.spines.top': False,
       'axes.spines.right': False,
       'axes.spines.left': False,
       'axes.grid': False,
       'axes.grid.axis': 'y',
   }
   basic_style.update(kwargs)
   return plt.rc_context(rc=basic_style)

blue = sns.xkcd_rgb['ocean blue']


# Load reference

In [4]:
reference_path = 'data/reference/mito_GRCh38_gimlet.fasta'

with open(reference_path, 'r'):
    reference_seq = next(SeqIO.parse(reference_path, "fasta"))
reference = pd.Series(list(reference_seq.seq))

# Load cells read matrixes

In [5]:
INPUT_PATH = 'data/invivo_scRNAseq/train_data/'

In [6]:
cell_count = []

# Get all cell counts files in the input path
cell_count_filenames = list(glob(os.path.join(INPUT_PATH, '*.txt'))) #*.txt

for filename in sorted(cell_count_filenames):
    print('Reading {}'.format(filename))
    name = os.path.basename(filename).split('.')[0]
    

    # Load one cell count file
    counts = pd.read_csv(filename, sep='\t')
    counts.name = name
    cell_count.append(counts)


Reading data/invivo_scRNAseq/train_data/1_12_P3861_210.txt
Reading data/invivo_scRNAseq/train_data/1_150312_BC6BFMANXX_P1902_1009_ac.txt
Reading data/invivo_scRNAseq/train_data/1_150312_BC6BFMANXX_P1902_1029_ac.txt
Reading data/invivo_scRNAseq/train_data/1_150312_BC6BFMANXX_P1902_1047_ac.txt
Reading data/invivo_scRNAseq/train_data/1_150312_BC6BFMANXX_P1902_1051_ac.txt
Reading data/invivo_scRNAseq/train_data/1_150312_BC6BFMANXX_P1902_1065_ac.txt
Reading data/invivo_scRNAseq/train_data/1_150312_BC6BFMANXX_P1902_1075_ac.txt
Reading data/invivo_scRNAseq/train_data/1_150312_BC6BFMANXX_P1902_1082_ac.txt
Reading data/invivo_scRNAseq/train_data/1_150312_BC6BFMANXX_P1902_1087_ac.txt
Reading data/invivo_scRNAseq/train_data/1_150312_BC6BFMANXX_P1902_1094_ac.txt
Reading data/invivo_scRNAseq/train_data/2_140812_AC492YACXX_P1299_1141_ac.txt
Reading data/invivo_scRNAseq/train_data/2_140812_AC492YACXX_P1299_1160_ac.txt
Reading data/invivo_scRNAseq/train_data/2_150312_BC6BFMANXX_P1902_1099_ac.txt
Readi

In [11]:
e_rate = 0.2

## Construct matrix

In [12]:
# Compute P(mutation at X | read counts)

%time
error_rate_when_no_mutation = error_rate_when_mutation = e_rate
p_mutation = 1 / 500.0


cell_prob = []
for count in cell_count:
    count = count.iloc[:reference.shape[0]]  # discard trailing positions
    p = nucleotide_mutation_prob(
        cell_counts=count,
        reference=reference,
        error_rate_when_no_mutation=error_rate_when_no_mutation,
        error_rate_when_mutation=error_rate_when_mutation,
        p_mutation=p_mutation,
    )

    cell_prob.append(p)
    
    
# Compute P(mutation | read counts)
cells_p_mutation = []
for cell_prob in cell_prob:
    p = mutation_prob(cell_prob, reference)
    cells_p_mutation.append(p)
    
    
# Mutation matrix

mutation_matrix = cells_p_mutation[0][['#CHR', 'POS']].copy()
mutation_matrix_data = pd.DataFrame(np.dstack([c['Prob_mutation'].values for c in cells_p_mutation]).squeeze())
mutation_matrix = pd.concat([mutation_matrix, mutation_matrix_data], axis=1)



CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.06 µs


  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan


  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan


  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan


### Matrix filter

In [13]:
mutation_matrix.shape

(16569, 75)

In [14]:
# Remove germline mutations
mutation_matrix = mutation_matrix.loc[mutation_matrix[0] < 0.9]
mutation_matrix.shape

(6523, 75)

In [22]:
# Probabilities for location with mutation at any of the cells
mutation_threshold = 0.9
data = mutation_matrix.set_index('POS').drop('#CHR', axis=1)
mutation_any_cell = data[(data > mutation_threshold).any(axis=1)]
mutation_any_cell.shape

(119, 73)

In [23]:
# select rows that have a value greater than mutation_threshold in >1 cells
mutation_threshold = 0.9
mask = (data.values > mutation_threshold).sum(axis=1) > 1
two_cells_have_mut = data[mask]
two_cells_have_mut.shape

(17, 73)

In [24]:
# Remove rows if more than 50% have missing data
halva = len(two_cells_have_mut.columns) / 2
nan_rows = two_cells_have_mut.drop(two_cells_have_mut[(two_cells_have_mut.isna()).sum(axis=1)> halva].index)
nan_rows.shape

(17, 73)

In [25]:
# Fill nans with row mean
imputed = nan_rows.transpose().fillna(nan_rows.mean(axis=1)).transpose()
imputed.shape

(17, 73)

In [26]:
# To skip removal of sites with overall high probability for mut
#high_prob_rows = imputed.copy()

# Replace 1.0
imputed = imputed.apply(lambda x: [y if y <= 0.9999 else 0.99 for y in x])
imputed.shape

(17, 73)

In [28]:
# remove locations close to another location
ind = [ a for a,b in zip(imputed.index,imputed.index[1:]) if b-a > 4]
clust = imputed.loc[ind]
clust.shape

(12, 73)

In [None]:
#Save matrix
e_rate_name = str(e_rate)
matrix_path = 'data/matrix_output/' + e_rate_name + '.csv'
clust.to_csv(matrix_path, index=False, sep=' ', header= False)

In [None]:
# TODO
# remove positions were the average total counts < x

# Now run mito-SCITE...