In [2]:
import csv
import os
import pickle as pic
import random
import subprocess
import sys

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
#import seaborn as sns
from tqdm import tqdm_notebook

import cassiopeia.TreeSolver.simulation_tools.simulation_utils as sim_utils
import cassiopeia.TreeSolver.simulation_tools.dataset_generation as data_gen
from cassiopeia.TreeSolver.Node import Node
from cassiopeia.TreeSolver.Cassiopeia_Tree import Cassiopeia_Tree

In [8]:
def simulate_mutation(sample, mutation_prob_map):
    """Return a copy of ``sample`` in which unmutated sites may acquire a state.

    A site holding ``'0'`` is replaced by a state drawn with
    ``np.random.choice`` from ``mutation_prob_map[position]`` (a mapping of
    state label -> probability).  Any other value is copied through unchanged,
    and the map is not consulted for that position.
    """
    mutated = []
    for position, state in enumerate(sample):
        if state != '0':
            # Already mutated (or otherwise non-'0'): keep as is.
            mutated.append(state)
            continue
        outcomes, weights = zip(*mutation_prob_map[position].items())
        mutated.append(np.random.choice(outcomes, p=weights))
    return mutated

def simulate_dropout(sample, variable_dropout_probability_map):
    """Return a copy of ``sample`` with sites independently replaced by ``'-'``.

    One ``random.uniform(0, 1)`` draw is made per position; the site is
    replaced when the draw is <= ``variable_dropout_probability_map[position]``.
    """
    return [
        '-' if random.uniform(0, 1) <= variable_dropout_probability_map[position] else sample[position]
        for position in range(len(sample))
    ]

def get_character_matrix(nodes):
    """Build a DataFrame with one row per node and one column per character.

    Each node's ``char_string`` is cut at the first ``"_"`` and the leading
    part is split on ``"|"`` to obtain that node's row.
    """
    rows = [node.char_string.split("_")[0].split("|") for node in nodes]
    return pd.DataFrame(rows)

def compute_priors(C, S, p, mean=0.01, disp=0.1, skew_factor = 0.05, num_skew=1, empirical = np.array([]), mixture = 0):
    """Build, for every character, a prior distribution over mutation states.

    For each of the ``C`` characters a list of state weights is obtained,
    sorted in ascending order and normalised to sum to one.  State ``'0'``
    (no mutation) receives ``1 - mut_rate`` and state ``str(j)`` receives
    ``mut_rate`` times the j-th normalised weight, where
    ``mut_rate = p * (1 + num_skew * skew_factor)``.

    Parameters
    ----------
    C : int
        Number of characters; keys of the returned priors are ``0 .. C-1``.
    S : int
        Number of mutated states; state labels are ``'1' .. str(S)``.
    p : float
        Base mutation probability.
    mean, disp :
        Passed positionally to ``np.random.negative_binomial`` (as its ``n``
        and ``p`` arguments) when ``empirical`` is empty; one draw per state.
    skew_factor, num_skew :
        Scale ``p`` as shown in the formula above.
    empirical : sequence
        When non-empty these values are used as the weights instead of random
        draws.  Only the first ``S`` sorted entries are turned into states, so
        the sequence needs at least ``S`` entries (fewer raises ``IndexError``).
    mixture : float
        When > 0, each normalised weight is replaced by a fresh
        ``np.random.uniform()`` draw with probability ``mixture`` and the
        weights are normalised again.

    Returns
    -------
    (prior_probabilities, sp)
        ``prior_probabilities`` maps character index -> {state label: prob}.
        ``sp`` maps character index -> the weights after the mixture step but
        before re-normalisation; it stays empty when ``mixture <= 0``.

    Raises
    ------
    ValueError
        If the normalised weights are not finite (e.g. every weight was zero),
        because such priors cannot be sampled from.
    """
    def _normalise(weights):
        # Rescale the weights so they sum to one.
        total = np.sum(weights)
        return [w / (1.0 * total) for w in weights]

    # The rate is the same for every character, so it is computed once.
    # (The previous per-character `C % num_skew` was never used and made
    # num_skew=0 fail with ZeroDivisionError.)
    mut_rate = p * (1 + num_skew * skew_factor)

    sp = {}
    prior_probabilities = {}
    for i in range(0, C):
        if len(empirical) > 0:
            weights = sorted(empirical)
        else:
            # One scalar draw per state keeps the random stream identical to
            # the original implementation for a given seed.
            weights = sorted(np.random.negative_binomial(mean, disp) for _ in range(S))
        weights = _normalise(weights)

        if mixture > 0:
            for k in range(len(weights)):
                if np.random.uniform() <= mixture:
                    weights[k] = np.random.uniform()

            sp[i] = weights
            weights = _normalise(weights)

        # A zero weight total yields NaN/inf here; fail loudly instead of
        # handing unusable probabilities to the caller.
        if not np.all(np.isfinite(weights)):
            raise ValueError(
                "compute_priors: state weights for character %d could not be "
                "normalised (do they sum to zero?)" % i
            )

        prior_probabilities[i] = {'0': (1 - mut_rate)}
        for j in range(1, S + 1):
            prior_probabilities[i][str(j)] = mut_rate * weights[j - 1]

    return prior_probabilities, sp

def count_all_dropouts_leaves(leaves):
    """Count the sites marked ``'-'`` or ``'H'`` across all given nodes.

    Each node's ``get_character_string()`` result is split on ``'|'`` and
    every entry equal to one of the two markers adds one to the total.
    """
    missing_markers = ('-', 'H')
    return sum(
        1
        for leaf in leaves
        for state in leaf.get_character_string().split('|')
        if state in missing_markers
    )

In [9]:
def generate_simulated_ground_tree(mutation_prob_map, characters=10, depth=12, num_cells=400):
    """Simulate a binary tree of mutating cells and subsample its leaves.

    Starting from a root whose ``characters`` sites are all ``'0'``, every node
    of the current generation gets two children produced by
    ``simulate_mutation``; this is repeated ``depth`` times.  Node labels are
    then replaced by ``Node`` objects, and leaves are removed at random until
    ``num_cells`` of them are left.

    Returns the tuple ``(Cassiopeia_Tree, remaining_leaves)``.
    ``random.sample`` raises ``ValueError`` if ``num_cells`` exceeds the number
    of simulated leaves.
    """
    to_label = sim_utils.node_to_string

    tree = nx.DiGraph()
    root = [['0'] * characters, '0']
    tree.add_node(to_label(root))

    next_id = 1
    frontier = [root]
    for _generation in range(depth):
        next_frontier = []
        for parent in frontier:
            for _sibling in range(2):
                child_states = simulate_mutation(parent[0], mutation_prob_map)
                # The frontier entry keeps the id as an int, while the edge is
                # labelled with the string form of the same id.
                next_frontier.append([child_states, next_id])
                tree.add_edge(to_label(parent), to_label([child_states, str(next_id)]))
                next_id += 1
        frontier = next_frontier

    # Swap every string label for a Node carrying the split states and the id.
    label_to_node = {}
    for index, label in enumerate(tree.nodes):
        parts = label.split("_")
        label_to_node[label] = Node("StateNode" + str(index), parts[0].split("|"), pid = parts[1], is_target=False)
    tree = nx.relabel_nodes(tree, label_to_node)

    leaves = [candidate for candidate in tree if tree.out_degree(candidate) == 0]
    discarded = random.sample(leaves, len(leaves) - num_cells)
    for leaf in discarded:
        tree.remove_node(leaf)

    remaining = list(set(leaves)-set(discarded))

    return Cassiopeia_Tree('simulated', network = tree), remaining

def hereditary_helper(network, node, dropout_probability_map, hereditary_drop_indices, counter):
    """Apply heritable dropout to ``node`` and, recursively, to its descendants.

    Sites listed in ``hereditary_drop_indices`` are set to ``'-'``.  Every
    other site is dropped when a ``np.random.sample()`` draw is <=
    ``dropout_probability_map[index]``; such a site is also passed down to the
    node's successors, and ``counter[0]`` is incremented for it.  The node's
    ``char_vec`` and ``char_string`` are overwritten in place.  The list passed
    as ``hereditary_drop_indices`` itself is not modified.
    """
    states = node.get_character_string().split('|')
    # Copy so that sites dropped here reach only this node's descendants.
    dropped_for_children = hereditary_drop_indices.copy()

    updated = []
    for index, state in enumerate(states):
        if index in hereditary_drop_indices:
            updated.append('-')
            continue
        if np.random.sample() <= dropout_probability_map[index]:
            dropped_for_children.append(index)
            counter[0] += 1
            state = '-'
        updated.append(state)

    node.char_vec = updated
    node.char_string = '|'.join(str(c) for c in updated)

    if network.out_degree(node) > 0:
        for child in network.successors(node):
            hereditary_helper(network, child, dropout_probability_map, dropped_for_children, counter)

def stochastic_helper(network, node, dropout_probability_map, counter):
    """Apply ``simulate_dropout`` to ``node`` and to every node below it.

    The node's ``char_vec`` and ``char_string`` are overwritten with the
    result.  ``counter`` is only forwarded to the recursive calls; it is never
    read or updated here.
    """
    observed = simulate_dropout(node.get_character_string().split('|'), dropout_probability_map)

    node.char_vec = observed
    node.char_string = '|'.join(map(str, observed))

    if network.out_degree(node) == 0:
        return
    for child in network.successors(node):
        stochastic_helper(network, child, dropout_probability_map, counter)
            
def stochastic_helper_leaves(leaves, dropout_probability_map, counter):
    """Apply ``simulate_dropout`` to each node in ``leaves`` (no recursion).

    Every node's ``char_vec`` and ``char_string`` are overwritten with the
    result.  ``counter`` is accepted but not used.
    """
    for leaf in leaves:
        observed = simulate_dropout(leaf.get_character_string().split('|'), dropout_probability_map)
        leaf.char_vec = observed
        leaf.char_string = '|'.join(map(str, observed))

In [10]:
# Simulation parameters used by the cells below.
# ('hdropout_percent', which appears in the simulation loop, is the string form of the
# heritable dropout percentage and is only used to build output paths.)

no_mut_rate = 0.985        # 1 - no_mut_rate is passed to compute_priors as the mutation probability p
number_of_states = 40      # number of mutated states per character (S in compute_priors)
dropout = 0.02             # per-character stochastic dropout probability
depth = 11                 # rounds of binary division, i.e. 2**11 leaves before subsampling
number_of_characters = 40  # number of characters (sites) per cell
NUM_CELLS = 1500           # number of leaves kept after subsampling

In [11]:
# Heritable dropout levels to simulate: path label (percent, as a string) -> probability.
dropout_map = {
    "0": 0.00,
    "1": 0.01,
    "2": 0.02,
    "3": 0.03,
    "4": 0.04,
    "5": 0.05,
}

In [13]:
# Simulate replicate trees for every heritable-dropout level and save the results.
# Per level, <output_root>/<level>percent/ receives for each replicate: the priors
# pickle, the ground-truth and post-dropout character matrices, and the pickled tree.
# The observed dropout proportions are written to one CSV per level in <output_root>.
output_root = "/data/yosef2/users/richardz/projects/dropout_testing/ground_truth_testing/" + str(NUM_CELLS) + "cells"
num_replicates = 50

for hdropout_percent in dropout_map:

    hdropout = dropout_map[hdropout_percent]

    print(hdropout_percent)
    print(hdropout)

    #Create dropout maps
    # NOTE(review): `dropouts` is not read anywhere in this cell; kept in case a later cell uses it.
    dropouts = pd.DataFrame(np.full((number_of_characters, 1), dropout, dtype=float))
    dropout_prob_map = {site: dropout for site in range(0,number_of_characters)}
    hdropout_prob_map = {site: hdropout for site in range(0,number_of_characters)}

    #Establish the path, and create it (including missing parents) if it doesn't yet exist
    path = os.path.join(output_root, hdropout_percent + "percent")
    os.makedirs(path, exist_ok=True)

    #Main loop for simulation
    counts = []
    for i in range(0, num_replicates):

        #Compute Priors and generate the simulated tree
        prior_probabilities = compute_priors(number_of_characters, number_of_states, 1-no_mut_rate, 5, 0.5, skew_factor=0.0, num_skew=1)[0]
        out, leaves = generate_simulated_ground_tree(prior_probabilities, characters=number_of_characters, num_cells = NUM_CELLS, depth=depth)
        network = out.get_network()
        # Context managers make sure every pickle file is flushed and closed.
        with open(os.path.join(path, 'sim_net_priors' + str(i) + '.pkl'), 'wb') as priors_file:
            pic.dump(prior_probabilities, priors_file)

        #Save the ground truth character matrix
        ground_cm = get_character_matrix(leaves)
        ground_cm.to_csv(os.path.join(path, 'ground_truth_cm' + str(i) + '.txt'), sep = '\t')

        #Introduce both stochastic and heritable dropout
        root = [n for n in network if network.in_degree(n) == 0][0]
        counter = [0]
        stochastic_helper_leaves(leaves, dropout_prob_map, counter)
        counter2 = [0]
        hereditary_helper(network, root, hdropout_prob_map, [], counter2)

        #Create the character matrix post dropout, giving names to the indices
        dropout_cm = get_character_matrix(leaves)
        dropout_cm = dropout_cm.astype(str)
        row_names = ['c' + str(row) for row in range(dropout_cm.shape[0])]
        dropout_cm.index = row_names
        dropout_cm.to_csv(os.path.join(path, 'dropout_cm' + str(i) + '.txt'), sep = '\t')
        with open(os.path.join(path, 'dropout_net' + str(i) + '.pkl'), 'wb') as net_file:
            pic.dump(out, net_file)

        #Count the dropout proportion and save it
        # Each leaf has one entry per *character*, so the matrix holds
        # NUM_CELLS * number_of_characters entries (not number_of_states).
        count = count_all_dropouts_leaves(leaves)/(NUM_CELLS*number_of_characters)
        counts.append(count)
        print(i, count)

    #Write the dropout proportions to CSV
    # newline='' is what the csv module asks for when opening files for a writer.
    with open(os.path.join(output_root, 'dropout_percentage' + hdropout_percent + '.csv'), 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(counts)

0
0.0
0 0.020883333333333334
1 0.0197
2 0.020816666666666667
3 0.019416666666666665
4 0.019083333333333334
5 0.02015
6 0.019783333333333333
7 0.020433333333333335
8 0.019983333333333332
9 0.020566666666666667
10 0.018816666666666666
11 0.020166666666666666
12 0.02015
13 0.020166666666666666
14 0.021166666666666667
15 0.02033333333333333
16 0.020266666666666665
17 0.019466666666666667
18 0.021116666666666666
19 0.020083333333333335
20 0.020133333333333333
21 0.01965
22 0.021
23 0.019733333333333332
24 0.01995
25 0.020066666666666667
26 0.019933333333333334
27 0.020716666666666668
28 0.019933333333333334
29 0.01985
30 0.02055
31 0.020133333333333333
32 0.0199
33 0.020116666666666668
34 0.01948333333333333
35 0.020633333333333333
36 0.019633333333333332
37 0.01965
38 0.02115
39 0.020766666666666666
40 0.0195
41 0.02033333333333333
42 0.020383333333333333
43 0.019933333333333334
44 0.01985
45 0.020566666666666667
46 0.019916666666666666
47 0.019616666666666668
48 0.019016666666666668
49 0.

In [None]:
# Single simulation run, kept in memory only (nothing is written to disk).
# NOTE(review): `dropout_prob_map` and `hdropout_prob_map` are not defined in this cell;
# they are whatever the simulation loop cell left behind, so that cell must run first.
# Unlike the loop cell, stochastic dropout is applied here to the whole tree
# (stochastic_helper from the root) rather than to the leaves only.
counts = []
i = 1
prior_probabilities = compute_priors(number_of_characters, number_of_states, 1-no_mut_rate, 5, 0.5, skew_factor=0.0, num_skew=1)[0]
out, leaves = generate_simulated_ground_tree(prior_probabilities, characters=number_of_characters, num_cells = NUM_CELLS, depth=depth)

network = out.get_network()
# Character matrix before any dropout is introduced.
ground_cm = get_character_matrix(leaves)

root = [n for n in network if network.in_degree(n) == 0][0]
counter = [0]
stochastic_helper(network, root, dropout_prob_map, counter)
counter2 = [0]
hereditary_helper(network, root, hdropout_prob_map, [], counter2)

# Character matrix after dropout, with rows named c0, c1, ...
dropout_cm = get_character_matrix(leaves)
dropout_cm = dropout_cm.astype(str)
row_names = ['c' + str(row) for row in range(dropout_cm.shape[0])]
dropout_cm.index = row_names

# Each leaf has one entry per *character*, so the matrix holds
# NUM_CELLS * number_of_characters entries (not number_of_states).
count = count_all_dropouts_leaves(leaves)/(NUM_CELLS*number_of_characters)
counts.append(count)

In [None]:
# Dimensions of the ground-truth matrix built in the previous cell (one row per leaf passed in).
ground_cm.shape

In [None]:
# Dimensions of the post-dropout matrix; it is built from the same leaves as ground_cm.
dropout_cm.shape