In [1]:
import os
import pickle as pic
import random
import statistics as stat
import subprocess
import sys

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
#import seaborn as sns

import cassiopeia.TreeSolver.simulation_tools.simulation_utils as sim_utils
import cassiopeia.TreeSolver.simulation_tools.dataset_generation as data_gen
from cassiopeia.TreeSolver.Node import Node
from cassiopeia.TreeSolver.Cassiopeia_Tree import Cassiopeia_Tree

In [2]:
def simulate_mutation(sample, mutation_prob_map):
    """Return a copy of ``sample`` in which unedited sites may acquire a state.

    A site is treated as unedited when its value is the string ``'0'``. For
    such a site at position ``k`` a replacement is drawn with
    ``np.random.choice`` from the keys of ``mutation_prob_map[k]``, weighted
    by the matching values. Every other site is copied through unchanged.

    Args:
        sample: Sequence of character states (strings).
        mutation_prob_map: Mapping position -> {state: probability}. Only
            looked up for positions whose current state is ``'0'``.

    Returns:
        A new list with one entry per site of ``sample``.
    """
    def _draw_state(position):
        # Split the {state: probability} dict into two aligned tuples.
        states, weights = zip(*mutation_prob_map[position].items())
        return np.random.choice(states, p=weights)

    return [
        _draw_state(position) if state == '0' else state
        for position, state in enumerate(sample)
    ]

def simulate_dropout(sample, variable_dropout_probability_map):
    """Return a copy of ``sample`` with sites independently replaced by ``'-'``.

    For every position one value is drawn with ``random.uniform(0, 1)``; the
    site becomes ``'-'`` when the draw is less than or equal to
    ``variable_dropout_probability_map[position]`` and is copied otherwise.

    Args:
        sample: Sequence of character states.
        variable_dropout_probability_map: Mapping position -> dropout
            probability for that position.

    Returns:
        A new list with one entry per site of ``sample``.
    """
    return [
        '-' if random.uniform(0, 1) <= variable_dropout_probability_map[position] else state
        for position, state in enumerate(sample)
    ]

def get_character_matrix(nodes):
    """Build a DataFrame with one row per node and one column per character.

    Each node's ``char_string`` is cut at the first ``"_"`` (anything after it
    is discarded) and the remaining text is split on ``"|"`` to obtain the
    row's character states.

    Args:
        nodes: Iterable of objects exposing a ``char_string`` attribute.

    Returns:
        ``pd.DataFrame`` built from the per-node lists of state strings, in
        iteration order, with default integer row and column labels.
    """
    rows = [node.char_string.partition("_")[0].split("|") for node in nodes]
    return pd.DataFrame(rows)

def _normalized_weights(weights):
    """Return ``weights`` rescaled to sum to one, as a new list."""
    total = np.sum(weights)
    # An all-zero draw would otherwise turn every prior into NaN and only
    # surface much later inside np.random.choice; fail here instead.
    if len(weights) > 0 and total == 0:
        raise ValueError("state weights sum to zero; cannot normalize them into probabilities")
    return [w / (1.0 * total) for w in weights]


def compute_priors(C, S, p, mean=0.01, disp=0.1, skew_factor = 0.05, num_skew=1, empirical = np.array([]), mixture = 0):
    """Build per-character state probabilities for the mutation simulation.

    For each of the ``C`` characters a vector of ``S`` state weights is
    obtained (from ``empirical`` when it is non-empty, otherwise from
    ``np.random.negative_binomial``), sorted ascending and normalized to sum
    to one. State ``'0'`` (no mutation) receives ``1 - mut_rate`` and state
    ``str(j)`` receives ``mut_rate * weight[j - 1]``, where
    ``mut_rate = p * (1 + num_skew * skew_factor)``.

    Args:
        C: Number of characters.
        S: Number of non-zero states per character.
        p: Base mutation probability.
        mean: Passed as the first argument (``n``) of
            ``np.random.negative_binomial``.
        disp: Passed as the second argument (``p``) of
            ``np.random.negative_binomial``.
        skew_factor: Multiplier used together with ``num_skew`` to scale ``p``.
        num_skew: See ``skew_factor``.
        empirical: Optional array of state weights. When non-empty it is used
            for every character instead of random draws and must hold at
            least ``S`` values.
        mixture: When > 0, each normalized weight is replaced by a fresh
            ``np.random.uniform()`` draw with this probability, and the
            vector is then renormalized.

    Returns:
        ``(prior_probabilities, sp)``. ``prior_probabilities`` maps character
        index -> ``{state string: probability}``. ``sp`` maps character index
        -> the mixed weights *before* renormalization and is only populated
        when ``mixture > 0`` (otherwise it is an empty dict).

    Raises:
        ValueError: If fewer than ``S`` weights are available or the weights
            sum to zero.
    """
    sp = {}
    prior_probabilities = {}

    # Identical for every character, so compute it once.
    mut_rate = p * (1 + num_skew * skew_factor)

    for i in range(0, C):
        if len(empirical) > 0:
            weights = sorted(empirical)
        else:
            # One scalar draw per state keeps the random stream unchanged.
            weights = sorted([np.random.negative_binomial(mean, disp) for _ in range(1, S + 1)])

        if len(weights) < S:
            raise ValueError(
                "need at least %d state weights but only %d are available" % (S, len(weights))
            )

        weights = _normalized_weights(weights)

        if mixture > 0:
            for k in range(len(weights)):
                if np.random.uniform() <= mixture:
                    weights[k] = np.random.uniform()

            # Keep the mixed (not yet renormalized) weights for inspection.
            sp[i] = weights
            weights = _normalized_weights(weights)

        prior_probabilities[i] = {'0': (1 - mut_rate)}
        for j in range(1, S + 1):
            prior_probabilities[i][str(j)] = mut_rate * weights[j - 1]

    return prior_probabilities, sp

def count_all_dropouts_leaves(leaves):
    """Count how many character sites across ``leaves`` are missing.

    A site is missing when its state is ``'-'`` or ``'*'``. Sites are read by
    splitting each leaf's ``get_character_string()`` on ``'|'``.

    Args:
        leaves: Iterable of objects exposing ``get_character_string()``.

    Returns:
        Total number of missing sites over all leaves (int).
    """
    missing_markers = ('-', '*')
    return sum(
        1
        for leaf in leaves
        for state in leaf.get_character_string().split('|')
        if state in missing_markers
    )

In [30]:
def generate_simulated_ground_tree(mutation_prob_map, characters=10, depth=10, min_division_rate=0.8, cell_death=0.01):
    """Simulate a lineage tree generation by generation and wrap it in a Cassiopeia_Tree.

    Starting from a single root whose ``characters`` sites are all ``'0'``,
    every node alive at the current generation is processed once per
    generation, for ``depth`` generations:

    * with probability ``cell_death`` the node dies: it is removed from the
      graph together with any ancestors that are left without children;
    * otherwise, with probability ``min_division_rate`` it divides into two
      children, each obtained by running ``simulate_mutation`` on the parent;
    * otherwise it does not divide: it is mutated and relabeled in place.

    Args:
        mutation_prob_map: Per-position ``{state: probability}`` map handed to
            ``simulate_mutation``.
        characters: Number of character sites per cell.
        depth: Number of generations to simulate.
        min_division_rate: Used directly as the per-generation division
            probability (the randomized variant is commented out below).
        cell_death: Per-generation probability that a live node dies.

    Returns:
        ``(state_tree, leaves)`` where ``state_tree`` is
        ``Cassiopeia_Tree('simulated', network=network)`` and ``leaves`` is the
        list of ``Node`` objects with out-degree 0 and in-degree 1.
    """
    network = nx.DiGraph()
    # Each live cell is tracked as [list of site states, identifier].
    current_depth = [[['0' for _ in range(0, characters)], '0']]
    # NOTE(review): node_to_string is defined in sim_utils and not visible here;
    # the parsing further down (split("_") then split("|")) suggests it yields
    # "<s1>|<s2>|..._<id>" -- confirm.
    network.add_node(sim_utils.node_to_string(current_depth[0]))
    # Next identifier to hand out to a newly created child.
    uniq = 1
    
#     division_rate = min_division_rate+((1-min_division_rate)*np.random.random())
    division_rate = min_division_rate
    
    for i in range(0, depth):
        # Cells that will be alive at the next generation.
        temp_current_depth = []
        for node in current_depth:
            # First draw decides survival, second draw decides division.
            if np.random.random() >= cell_death:
                if np.random.random() <= division_rate:
                    for _ in range(0,2):
                        child_node = simulate_mutation(node[0], mutation_prob_map)
                        # NOTE(review): the tracked identifier is the int `uniq`
                        # while the graph label is built from str(uniq); later
                        # lookups pass the int to node_to_string -- confirm both
                        # forms serialize to the same label.
                        temp_current_depth.append([child_node, uniq])
                        network.add_edge(sim_utils.node_to_string(node), sim_utils.node_to_string([child_node, str(uniq)]))
                        uniq +=1
                else:
                    # No division: the cell keeps its identifier, and its graph
                    # label is rewritten in place to the mutated state.
                    child_node = simulate_mutation(node[0], mutation_prob_map)
                    temp_current_depth.append([child_node, node[1]])
                    network = nx.relabel_nodes(network, {sim_utils.node_to_string(node): sim_utils.node_to_string([child_node, node[1]])}, copy = False)
            else:
                # Death: drop this node, then walk towards the root removing
                # every ancestor that has no remaining children. The walk stops
                # at a node with children or at a node without a parent.
                curr_parent = sim_utils.node_to_string(node)
                while network.out_degree(curr_parent) < 1 and network.in_degree(curr_parent) > 0:
                    next_parent = list(network.predecessors(curr_parent))[0]
                    network.remove_node(curr_parent)
                    curr_parent = next_parent
                
        current_depth = temp_current_depth

    # Replace every string label by a Node object. `i` is reused here as a
    # running index for the "StateNode<i>" names.
    rdict = {}
    i = 0
    for n in network.nodes:
        # Label text before the first "_" holds the "|"-separated states; the
        # text after it is passed as pid.
        nn = Node("StateNode" + str(i), n.split("_")[0].split("|"), pid = n.split("_")[1], is_target=False)
        i += 1
        rdict[n] = nn

    network = nx.relabel_nodes(network, rdict)
    
#     source = [x for x in network.nodes() if network.in_degree(x)==0][0]

#     max_depth = max(nx.shortest_path_length(network,source,node) for node in network.nodes())
#     shortest_paths = nx.shortest_path_length(network,source)

#     leaves = [x for x in network.nodes() if network.out_degree(x)==0 and network.in_degree(x) == 1 and shortest_paths[x] == max_depth]

    # Requiring in_degree == 1 excludes a root that ended up with no children.
    leaves = [n for n in network if network.out_degree(n) == 0 and network.in_degree(n) == 1] 
    
    state_tree = Cassiopeia_Tree('simulated', network = network)
    return state_tree, leaves

def hereditary_helper(network, node, dropout_probability_map, hereditary_drop_indices, counter):
    """Apply heritable dropout to ``node`` and, recursively, to its descendants.

    Positions listed in ``hereditary_drop_indices`` are forced to ``'-'``. For
    every other position one ``np.random.sample()`` draw is compared with
    ``dropout_probability_map[position]``; on a hit the site becomes ``'-'``,
    the position is added to the list handed to the children, and
    ``counter[0]`` is incremented. The node's ``char_vec`` and ``char_string``
    are overwritten with the result.

    Args:
        network: Graph exposing ``out_degree(node)`` and ``successors(node)``.
        node: Object exposing ``get_character_string()``; mutated in place.
        dropout_probability_map: Mapping position -> dropout probability.
        hereditary_drop_indices: Positions already dropped in an ancestor.
            Not modified; a copy is extended and passed down.
        counter: One-element list; ``counter[0]`` accumulates the number of
            newly introduced dropout events.
    """
    states = node.get_character_string().split('|')
    passed_down = hereditary_drop_indices.copy()

    masked = []
    for position, state in enumerate(states):
        if position in hereditary_drop_indices:
            # Inherited dropout: no random draw, no count.
            masked.append('-')
            continue

        if np.random.sample() <= dropout_probability_map[position]:
            masked.append('-')
            passed_down.append(position)
            counter[0] += 1
        else:
            masked.append(state)

    node.char_vec = masked
    node.char_string = '|'.join(map(str, masked))

    if network.out_degree(node) <= 0:
        return

    for child in network.successors(node):
        hereditary_helper(network, child, dropout_probability_map, passed_down, counter)

def stochastic_helper(network, node, dropout_probability_map, counter):
    """Apply independent dropout to ``node`` and every node below it.

    The node's states (``get_character_string()`` split on ``'|'``) are passed
    through ``simulate_dropout`` and written back to ``char_vec`` and
    ``char_string``; the same is then done for each successor, depth first.

    Args:
        network: Graph exposing ``out_degree(node)`` and ``successors(node)``.
        node: Object exposing ``get_character_string()``; mutated in place.
        dropout_probability_map: Mapping position -> dropout probability.
        counter: Forwarded to the recursive calls; not read or modified here.
    """
    states = node.get_character_string().split('|')
    dropped = simulate_dropout(states, dropout_probability_map)

    node.char_vec = dropped
    node.char_string = '|'.join(map(str, dropped))

    if network.out_degree(node) <= 0:
        return

    for child in network.successors(node):
        stochastic_helper(network, child, dropout_probability_map, counter)
            
def stochastic_helper_leaves(leaves, dropout_probability_map, counter):
    """Apply independent dropout to each node in ``leaves``, in place.

    Each leaf's states (``get_character_string()`` split on ``'|'``) are passed
    through ``simulate_dropout`` and written back to ``char_vec`` and
    ``char_string``.

    Args:
        leaves: Iterable of objects exposing ``get_character_string()``.
        dropout_probability_map: Mapping position -> dropout probability.
        counter: Accepted but not read or modified by this function.
    """
    for leaf in leaves:
        dropped = simulate_dropout(
            leaf.get_character_string().split('|'),
            dropout_probability_map,
        )
        leaf.char_vec = dropped
        leaf.char_string = '|'.join(map(str, dropped))
        
def states_per_char(cm):
    """Count the distinct observed states in every column of a character matrix.

    Values equal to ``'0'`` or ``'-'`` are ignored; every other value is
    counted once per column.

    Args:
        cm: ``pd.DataFrame`` with one column per character.

    Returns:
        List with one integer per column, in column order.
    """
    n_rows, n_cols = cm.shape

    unique_chars = []
    for col in range(n_cols):
        observed = []
        for row in range(n_rows):
            state = cm.iloc[row, col]
            if state == '0' or state == '-':
                continue
            if state not in observed:
                observed.append(state)
        unique_chars.append(len(observed))
    return unique_chars

In [144]:
# Simulation parameters shared by the cells below.

# --- Tree shape ---
depth = 11                   # generations simulated per tree
death_rate = 0.06            # per-generation cell death probability
NUM_CELLS = 1500             # not referenced by the cells visible in this notebook

# --- Characters and mutation ---
number_of_characters = 30    # sites per cell
number_of_states = 100       # non-zero states available per site
mut_rate = 0.015             # base mutation probability given to compute_priors

# --- Dropout ---
s_dropout = 0.2              # stochastic dropout probability per site
h_dropout = 2e-4             # heritable dropout probability per site

In [145]:
# dropout_map = {"0": 0.00, "1": 0.01, "2": 0.02, "3": 0.03, "4": 0.04, "5": 0.05, "7": 0.07, "10": 0.10}

In [146]:
# for dropout_str in dropout_map:

#     dropout = dropout_map[dropout_str]
#     print(dropout_str)

#     #Create dropout maps
#     dropouts = pd.DataFrame(np.full((number_of_characters, 1), dropout, dtype=float))
# Per-character dropout probabilities: the same value for every character.
s_dropout_prob_map = {i: s_dropout for i in range(0,number_of_characters)}
h_dropout_prob_map = {i: h_dropout for i in range(0,number_of_characters)}

#Establish the path, and create it (including missing parents) if it doesn't yet exist
path = "/data/yosef2/users/richardz/projects/Yule/benchmarking/test"
os.makedirs(path, exist_ok=True)

# Number of simulated trees, and the accepted range for the number of leaves.
NUM_SIMULATIONS = 50
MIN_LEAVES, MAX_LEAVES = 300, 500

#Main loop for simulation
counts = []     # fraction of missing sites per simulated tree
size = []       # number of leaves per simulated tree
avg_spc = []    # mean number of distinct states per character per tree
for i in range(0, NUM_SIMULATIONS):

    #Compute Priors and generate the simulated tree
    prior_probabilities = compute_priors(number_of_characters, number_of_states, mut_rate, 5, 0.5, skew_factor=0.0, num_skew=1)[0]
    out, leaves = generate_simulated_ground_tree(prior_probabilities, characters=number_of_characters, depth=depth, min_division_rate= 0.7, cell_death = death_rate)

    # Re-simulate until the tree has an acceptable number of leaves.
    # NOTE(review): the first attempt uses min_division_rate=0.7 but retries
    # use 0.8. Kept as-is to preserve results; confirm which is intended.
    while len(leaves) < MIN_LEAVES or len(leaves) > MAX_LEAVES:
        out, leaves = generate_simulated_ground_tree(prior_probabilities, characters=number_of_characters, depth=depth, min_division_rate= 0.8, cell_death = death_rate)

    network = out.get_network()
    with open(path + '/sim_net_priors' + str(i) + '.pkl', 'wb') as priors_file:
        pic.dump(prior_probabilities, priors_file)

    #Save the ground truth character matrix
    ground_cm = get_character_matrix(leaves)
    ground_cm.to_csv(path + '/ground_truth_cm' + str(i) + '.txt', sep = '\t')

    #Introduce stochastic dropout on the leaves, then heritable dropout from the root down
    root = [n for n in network if network.in_degree(n) == 0][0]
    counter = [0]
    stochastic_helper_leaves(leaves, s_dropout_prob_map, counter)
    counter2 = [0]
    hereditary_helper(network, root, h_dropout_prob_map, [], counter2)

    #Create the character matrix post dropout, giving names to the indices
    dropout_cm = get_character_matrix(leaves)
    dropout_cm = dropout_cm.astype(str)
    row_names = ['c' + str(row) for row in range(dropout_cm.shape[0])]
    dropout_cm.index = row_names
    dropout_cm.to_csv(path + '/dropout_cm' + str(i) + '.txt', sep = '\t')
    with open(path + '/dropout_net' + str(i) + '.pkl', 'wb') as net_file:
        pic.dump(out, net_file)

    #Count the dropout proportion and save it
    num_leaves = len(leaves)
    size.append(num_leaves)

    count = count_all_dropouts_leaves(leaves)/(num_leaves*number_of_characters)
    counts.append(count)

    spc = states_per_char(dropout_cm)
    mean_spc = sum(spc)/len(spc)
    avg_spc.append(mean_spc)

    print(i, count, num_leaves, mean_spc)

# import csv

# #Write the dropout proportions to CSV
# with open('/data/yosef2/users/richardz/projects/dropout_testing/cell_death_testing_new_both/dropout_percentage' + dropout_str + '.csv', 'w') as csvFile:
#     writer = csv.writer(csvFile)
#     writer.writerow(counts)
# csvFile.close()

# with open('/data/yosef2/users/richardz/projects/dropout_testing/cell_death_testing_new_both/sample_size' + dropout_str + '.csv', 'w') as csvFile:
#     writer = csv.writer(csvFile)
#     writer.writerow(size)
# csvFile.close()


0 0.20087976539589442 341 9.066666666666666
1 0.20797285835453774 393 10.766666666666667
2 0.20476190476190476 434 11.733333333333333
3 0.19663512092534174 317 7.533333333333333
4 0.1947075208913649 359 9.2
5 0.19725829725829727 462 11.8
6 0.20479351032448379 452 11.566666666666666
7 0.22758620689655173 464 12.533333333333333
8 0.19705573080967403 317 8.266666666666667
9 0.2021164021164021 378 9.9
10 0.20540540540540542 370 9.333333333333334


KeyboardInterrupt: 

In [141]:
# `stat` (the standard-library statistics module) is imported in the import
# cell at the top of the notebook.
# Mean and median, over the simulated trees, of the average number of
# distinct states per character.
print(stat.mean(avg_spc))
print(stat.median(avg_spc))

5.806666666666667
5.85


In [142]:
# Mean, then median, of the per-tree fraction of missing sites.
for summarize in (stat.mean, stat.median):
    print(summarize(counts))

0.2018153573601391
0.20166352228089635


In [143]:
# Mean, then median, of the number of leaves per simulated tree.
for summarize in (stat.mean, stat.median):
    print(summarize(size))

392.2
391.5
