In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from Bio import SeqIO
from glob import glob
import os

from mito.genotyping import nucleotide_mutation_prob, mutation_prob, COUNTS_COLUMNS


# Plotting style
# Notebook-wide seaborn defaults, applied once here; plot_style() layers
# per-figure rc overrides on top of these.
sns.set_style('white')
sns.set_context('notebook')
# Display up to 1000 rows / 100 columns before pandas truncates a frame.
# The fully qualified option names are required: the short patterns
# 'max_rows' / 'max_columns' also match the `styler.render.*` options in
# recent pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)

def plot_style(figsize=(12, 6), labelsize=20, titlesize=24, ticklabelsize=14, **kwargs):
    """Return a matplotlib rc context carrying this notebook's figure styling.

    The defaults set the figure size and the label / title / tick-label font
    sizes, hide the top, right and left spines and switch the grid off.
    Additional rcParams passed as keyword arguments are applied last and
    override the defaults.

    Typical use: ``with plot_style(): ...``
    """
    tick_sizes = {
        'xtick.labelsize': ticklabelsize,
        'ytick.labelsize': ticklabelsize,
    }
    rc_overrides = {
        'figure.figsize': figsize,
        'axes.labelsize': labelsize,
        'axes.titlesize': titlesize,
        **tick_sizes,
        'axes.spines.top': False,
        'axes.spines.right': False,
        'axes.spines.left': False,
        'axes.grid': False,
        'axes.grid.axis': 'y',
        # caller-supplied rcParams win over everything above
        **kwargs,
    }
    return plt.rc_context(rc=rc_overrides)

# Shared accent colour looked up by name in seaborn's xkcd colour table
# (not used in the cells shown here; presumably for later plots).
blue = sns.xkcd_rgb['ocean blue']


# Load reference

In [4]:
#reference_path = 'data/reference/mito_GRCh38_gimlet.fasta'

reference_path = '../../../../pberkes/data/reference/mito_GRCh38_gimlet.fasta'

# Parse from the handle opened here: previously the `with` block opened the
# file without using it while SeqIO.parse opened the path a second time and,
# because only the first record is consumed, never closed that second handle.
with open(reference_path, 'r') as reference_handle:
    # Only the first FASTA record is used as the reference sequence
    reference_seq = next(SeqIO.parse(reference_handle, "fasta"))

# One nucleotide per row, indexed 0..len-1 in sequence order
reference = pd.Series(list(reference_seq.seq))

# Load cell read matrices

In [12]:
# Path with the cell count files
# Relative to the notebook's working directory; the loading cell reads every '*.txt' file found here.


INPUT_PATH = '../../../kth_data_science_assignment_4_aug2020/data/atac/train_atac_data'

# Alternative input locations (uncomment exactly one to switch data set)
#INPUT_PATH = '../kth_data_science_assignment_4_aug2020/data/atac/train_atac_data'
#INPUT_PATH = '../kth_data_science_assignment_4_aug2020/data/atac/test_atac_data'

In [13]:
# Get all cell counts files in the input path and fix their processing order
cell_count_filenames = list(glob(os.path.join(INPUT_PATH, '*.txt'))) #*.txt
ordered_filenames = sorted(cell_count_filenames)

# One tab-separated count table per cell, loaded in sorted filename order
cell_count = []
for path in ordered_filenames:
    print('Reading {}'.format(path))
    table = pd.read_csv(path, sep='\t')
    # Tag the table with the file name up to its first '.'
    table.name = os.path.basename(path).split('.')[0]
    cell_count.append(table)

# Get sample order: file name up to the first '-', in the same order as cell_count.
# NOTE(review): for file names without a '-' this keeps the whole name including
# the extension -- confirm that is the intended sample label.
sample_list = [os.path.basename(path).split('-')[0] for path in ordered_filenames]

sample_list_df = pd.DataFrame(sample_list).rename(columns={0: 'cell'})

#sample_list_df
#sample_list_df.to_csv('data/matrix_output/sample_order_test.txt', sep=',')
#sample_list_df.to_csv('data/matrix_output/sample_order_train.txt', sep=',')
#sample_list_df.to_csv('data/matrix_output/sample_order_full.txt', sep=',')
#sample_list_df.to_csv('data/matrix_output/sample_order_test.txt', sep=',')

Reading ../../../kth_data_science_assignment_4_aug2020/data/atac/train_atac_data/P3861_218.clean.dedup_ac.txt
Reading ../../../kth_data_science_assignment_4_aug2020/data/atac/train_atac_data/P9855_2089_S112_L004_ac.txt
Reading ../../../kth_data_science_assignment_4_aug2020/data/atac/train_atac_data/P9855_2090_S113_L004_ac.txt
Reading ../../../kth_data_science_assignment_4_aug2020/data/atac/train_atac_data/P9855_2104_S127_L006_ac.txt
Reading ../../../kth_data_science_assignment_4_aug2020/data/atac/train_atac_data/P9855_2110_S133_L007_ac.txt
Reading ../../../kth_data_science_assignment_4_aug2020/data/atac/train_atac_data/P9855_2111_S134_L007_ac.txt
Reading ../../../kth_data_science_assignment_4_aug2020/data/atac/train_atac_data/P9855_2112_S135_L007_ac.txt


In [90]:
# Candidate sequencing error rates: 10 evenly spaced values from 0.0001 to 0.1,
# both end points included
e_rates = np.linspace(start=0.0001, stop=0.1, num=10)
e_rates


array([0.0001, 0.0112, 0.0223, 0.0334, 0.0445, 0.0556, 0.0667, 0.0778,
       0.0889, 0.1   ])

In [14]:

# Run this cell for each error rate
e_rate = 0.0112

# Make matrix

# Compute P(mutation at X | read counts)

%time
error_rate_when_no_mutation = error_rate_when_mutation = e_rate
p_mutation = 1 / 500.0


cell_prob = []
for count in cell_count:
    count = count.iloc[:reference.shape[0]]  # discard trailing positions
    p = nucleotide_mutation_prob(
        cell_counts=count,
        reference=reference,
        error_rate_when_no_mutation=error_rate_when_no_mutation,
        error_rate_when_mutation=error_rate_when_mutation,
        p_mutation=p_mutation,
    )

    cell_prob.append(p)
    
    
# Compute P(mutation | read counts)
cells_p_mutation = []
for cell_prob in cell_prob:
    p = mutation_prob(cell_prob, reference)
    cells_p_mutation.append(p)
    
    
# Make mutation matrix

mutation_matrix = cells_p_mutation[0][['#CHR', 'POS']].copy()
mutation_matrix_data = pd.DataFrame(np.dstack([c['Prob_mutation'].values for c in cells_p_mutation]).squeeze())
mutation_matrix = pd.concat([mutation_matrix, mutation_matrix_data], axis=1)


# Remove germline mutations
mutation_matrix = mutation_matrix.loc[mutation_matrix[0] < 0.9]
#print('no germline: ' + str(len(mutation_matrix)))


# Probabilities for location with mutation at any of the cells
mutation_threshold = 0.9
data = mutation_matrix.set_index('POS').drop('#CHR', axis=1)
mutation_any_cell = data[(data > mutation_threshold).any(axis=1)]
#print('any cell: ' + str(len(mutation_any_cell)))


# select rows that have a value greater than mutation_threshold in more than x cells
mutation_threshold = 0.9
mask = (data.values > mutation_threshold).sum(axis=1) > 1 #create an array with boolean values and sum these along the axis and select rows with more than 1 True
two_cells_have_mut = data[mask]
#print('two cells: ' + str(len(two_cells_have_mut)))


# Fix matrix

half = len(two_cells_have_mut.columns) / 2

# Remove rows where 50% of cols are nan and save in new df
nan_rows = two_cells_have_mut.drop(two_cells_have_mut[(two_cells_have_mut.isna()).sum(axis=1)> half].index)
#print('no nans: ' + str(len(nan_rows)))


# Replace NaN with row mean
imputed = nan_rows.transpose().fillna(nan_rows.mean(axis=1)).transpose()
#print(len(imputed))


# To skip removal of sites with overall high probability for mut
high_prob_rows = imputed.copy()


# Replace 1.0
high_prob_rows = high_prob_rows.apply(lambda x: [y if y <= 0.9999 else 0.99 for y in x])
#len(high_prob_rows)


# To skip removal of cluster muts
#clust = high_prob_rows.copy()

##### for scRNAseq ####
ind = [ a for a,b in zip(high_prob_rows.index,high_prob_rows.index[1:]) if b-a > 4]
clust = high_prob_rows.loc[ind]
#print('clust flt: ' + str(len(clust)))

#len(high_prob_rows)

# Turn e_rate float variable into str and save matrix
e_rate_name = str(e_rate)
matrix_path = 'data/matrix_output/' + e_rate_name + '.csv'
clust.to_csv(matrix_path, index=False, sep=' ', header= False)


CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.87 µs


  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan
  prob_mutation[col] = pd.np.nan


FileNotFoundError: [Errno 2] No such file or directory: 'data/matrix_output/0.0112.csv'

In [43]:
# list with shapes of matrices
# Collects one (n_sites, n_cells) tuple per execution of this cell, i.e. one per
# error rate when it is re-run after each matrix build.
# The list is created on first use so the cell also works on a fresh kernel
# (its initialisation used to be commented out, which raised NameError).
if 'mx_shape' not in globals():
    mx_shape = []
shape = clust.shape
mx_shape.append(shape)
mx_shape

[(739, 139),
 (286, 139),
 (175, 139),
 (130, 139),
 (114, 139),
 (107, 139),
 (105, 139),
 (102, 139),
 (101, 139),
 (99, 139),
 (39, 139),
 (13, 139)]

# Now run mtSCITE...

In [216]:
#sc RNAseq train

# Summary per tested error rate; the i-th entry of every list belongs to the
# i-th error rate (values presumably transcribed from the mtSCITE runs -- verify).
train_stats = dict(
    error_rate=[0.0001, 0.0112, 0.0223, 0.0334, 0.0445, 0.0556, 0.0667, 0.0778, 0.0889, 0.1],
    n_mutations=[804, 158, 110, 98, 91, 84, 84, 80, 76, 73],
    n_trees=[110, 14, 1, 3, 1, 3, 3, 2, 13, 2],
    log_lhood=[
        -15864.824542912322,
        -2255.3024230724359,
        -1747.5377732338588,
        -1556.7459523984112,
        -1510.4281888874198,
        -1358.8919474351001,
        -1527.6094153251213,
        -1282.9984406630792,
        -1485.6061285236638,
        -1197.0117054675788,
    ],
)
train_df = pd.DataFrame(data=train_stats)
#train_df
#train_df.to_csv('data/invivo_scRNAseq/train_results/invivo_scRNAseq_train_stats.csv', header=True, sep='\t', index=None)

# Independent copy: `train_df` is rebound by later cells
sc_RNAseq = train_df.copy()


In [217]:
#sc RNAseq train no clust muts (nc)

# Same layout as the table with clustered mutations: the i-th entry of every
# list belongs to the i-th error rate.
train_stats = dict(
    error_rate=[0.0001, 0.0112, 0.0223, 0.0334, 0.0445, 0.0556, 0.0667, 0.0778, 0.0889, 0.1],
    n_mutations=[482, 116, 74, 63, 57, 51, 51, 48, 46, 44],
    n_trees=[22, 5, 3, 1, 1, 1, 1, 3, 1, 1],
    log_lhood=[
        -8211.3977585565463,
        -1358.4833067070274,
        -982.45894518366504,
        -899.11114827340623,
        -743.79017348788091,
        -713.17903779946141,
        -686.14027150293055,
        -661.00174275164545,
        -654.25592210320633,
        -641.02502373188327,
    ],
)
train_df = pd.DataFrame(data=train_stats)

# Independent copy kept under its own name for the plots
sc_RNAseq_nc = train_df.copy()

#train_df.to_csv('data/invivo_scRNAseq/train_results_nc/invivo_scRNAseq_train_nc_stats.csv', header=True, sep='\t', index=None)



## Plots

In [219]:
# Index every stats table by error rate for plotting.
# Guarded so the cell can be re-run: once 'error_rate' has become the index it
# is no longer a column, and an unconditional set_index raises KeyError.
# NOTE(review): `atac` is not defined in the cells shown here -- make sure the
# cell that creates it runs before this one.
for stats_table in (atac, sc_RNAseq, sc_RNAseq_nc):
    if 'error_rate' in stats_table.columns:
        stats_table.set_index('error_rate', inplace=True)

In [276]:
sc_RNAseq

Unnamed: 0_level_0,n_mutations,n_trees,log_lhood
error_rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0001,804,110,-15864.824543
0.0112,158,14,-2255.302423
0.0223,110,1,-1747.537773
0.0334,98,3,-1556.745952
0.0445,91,1,-1510.428189
0.0556,84,3,-1358.891947
0.0667,84,3,-1527.609415
0.0778,80,2,-1282.998441
0.0889,76,13,-1485.606129
0.1,73,2,-1197.011705
