In [1]:
import sys 
import networkx as nx
import pandas as pd
import numpy as np
import pickle as pic
import random

import cassiopeia.TreeSolver.simulation_tools.simulation_utils as sim_utils
import cassiopeia.TreeSolver.simulation_tools.dataset_generation as data_gen
from cassiopeia.TreeSolver.Node import Node
from cassiopeia.TreeSolver.Cassiopeia_Tree import Cassiopeia_Tree

from tqdm import tqdm_notebook
import matplotlib.pyplot as plt

import subprocess

#import seaborn as sns
import os

In [None]:
def compute_priors(C, S, p, mean=0.01, disp=0.1, skew_factor = 0.05, num_skew=1, empirical = np.array([]), mixture = 0):
    """Build per-character prior probabilities over the unmutated state and S mutated states.

    For each of the ``C`` characters a vector of state weights is obtained,
    sorted ascending and normalized to sum to one. The weights are either
    ``empirical`` (when it is non-empty) or ``S`` independent draws from
    ``np.random.negative_binomial(mean, disp)``. Each normalized weight is then
    scaled by the overall mutation rate ``p * (1 + num_skew * skew_factor)``.

    Parameters
    ----------
    C : int
        Number of characters; output keys are ``0 .. C-1``.
    S : int
        Number of mutated states; output state keys are ``'1' .. str(S)``.
    p : float
        Base mutation probability.
    mean, disp : float
        Forwarded positionally to ``np.random.negative_binomial`` (i.e. as its
        ``n`` and ``p`` arguments respectively).
    skew_factor, num_skew : float, int
        Scale the mutation rate as ``p * (1 + num_skew * skew_factor)``.
    empirical : array-like
        Optional fixed weights used for every character instead of sampling.
        Must provide at least ``S`` entries, since entries ``0 .. S-1`` of the
        sorted vector are indexed.
    mixture : float
        When > 0, each normalized weight is independently replaced by a
        ``Uniform(0, 1)`` draw with probability ``mixture`` and the vector is
        renormalized.

    Returns
    -------
    (prior_probabilities, sp)
        ``prior_probabilities[i]`` maps ``'0'`` to ``1 - mut_rate`` and
        ``str(j)`` to ``mut_rate * weight_j``. ``sp[i]`` holds the weight vector
        after mixture replacement but before renormalization; it is only
        populated when ``mixture > 0``.

    Raises
    ------
    ValueError
        If the weights of a character do not have a positive sum, which would
        otherwise silently produce NaN probabilities.
    """
    sp = {}
    prior_probabilities = {}

    # The mutation rate is identical for every character, so compute it once.
    mut_rate = p * (1 + num_skew * skew_factor)

    for i in range(0, C):
        if len(empirical) > 0:
            weights = sorted(empirical)
        else:
            weights = sorted(np.random.negative_binomial(mean, disp) for _ in range(S))

        total = np.sum(weights)
        # `not total > 0` also rejects NaN. An empty vector (S == 0) is left
        # alone because nothing is divided in that case.
        if weights and not total > 0:
            raise ValueError(
                "State weights for character %d sum to %r; cannot normalize them "
                "into probabilities." % (i, total)
            )
        state_probabilities = [w / (1.0 * total) for w in weights]

        if mixture > 0:
            # Blend in uniform noise: each entry is replaced with probability `mixture`.
            for k in range(len(state_probabilities)):
                if np.random.uniform() <= mixture:
                    state_probabilities[k] = np.random.uniform()

            sp[i] = state_probabilities
            total = np.sum(state_probabilities)
            state_probabilities = [w / (1.0 * total) for w in state_probabilities]

        prior_probabilities[i] = {'0': (1 - mut_rate)}
        for j in range(1, S + 1):
            prior_probabilities[i][str(j)] = (mut_rate) * state_probabilities[j - 1]

    return prior_probabilities, sp

def get_character_matrix(nodes):
    """Return a DataFrame with one row per node and one column per character.

    Each node's ``char_string`` is cut at the first ``"_"`` and the leading
    part is split on ``"|"`` to obtain the row values.
    """
    return pd.DataFrame(
        [entry.char_string.split("_")[0].split("|") for entry in nodes]
    )


def normalize(x):
    """Divide every element of ``x`` by the sum of ``x`` and return the result as a list."""
    denominator = 1.0 * np.sum(x)
    scaled = []
    for value in x:
        scaled.append(value / denominator)
    return scaled

def simulate_hereditary(tree, dropout_probability_map):
    """Apply hereditary dropout to ``tree`` and report how many new dropout events occurred.

    A copy of the tree's network is taken, hereditary dropout is propagated
    from the first node with in-degree zero, and the copy is stored back on
    ``tree.network``.

    :param tree: object exposing ``get_network()`` and a writable ``network`` attribute
    :param dropout_probability_map: per-character-index dropout probability
    :return: ``(tree, number_of_new_dropout_events)``
    """
    # NOTE(review): presumably a shallow graph copy, so the node objects that
    # get rewritten are shared with the original network -- confirm.
    working_network = tree.get_network().copy()

    roots = [candidate for candidate in working_network if working_network.in_degree(candidate) == 0]
    new_events = [0]
    hereditary_helper(working_network, roots[0], dropout_probability_map, [], new_events)

    tree.network = working_network
    return tree, new_events[0]

def hereditary_helper(network, node, dropout_probability_map, hereditary_drop_indices, counter):
    """Recursively rewrite ``node`` and its descendants with hereditary dropout ('H').

    Positions listed in ``hereditary_drop_indices`` are set to 'H'. Every
    other position is newly dropped with probability
    ``dropout_probability_map[position]``; a new drop increments ``counter[0]``
    and is passed down to all successors of ``node``.

    :param network: graph exposing ``out_degree(node)`` and ``successors(node)``
    :param node: node exposing ``get_character_string()``; its ``char_vec`` and
        ``char_string`` attributes are overwritten
    :param dropout_probability_map: per-character-index dropout probability
    :param hereditary_drop_indices: indices already dropped in an ancestor (not mutated)
    :param counter: single-element list used as a mutable tally of new drops
    """
    original_states = node.get_character_string().split('|')
    # Successors inherit everything inherited here plus whatever is lost at this node.
    drops_for_children = hereditary_drop_indices.copy()

    updated_states = []
    for position, state in enumerate(original_states):
        if position in hereditary_drop_indices:
            updated_states.append('H')
            continue
        if np.random.sample() <= dropout_probability_map[position]:
            updated_states.append('H')
            drops_for_children.append(position)
            counter[0] += 1
            continue
        updated_states.append(state)

    node.char_vec = updated_states
    node.char_string = '|'.join(str(state) for state in updated_states)

    if network.out_degree(node) <= 0:
        return
    for child in network.successors(node):
        hereditary_helper(network, child, dropout_probability_map, drops_for_children, counter)

def count_stochastic_dropouts(tree):
    """Count '-' entries over all nodes reachable from the root of ``tree``.

    The root is taken to be the first node of the network with in-degree zero.
    """
    network = tree.get_network()
    roots = [candidate for candidate in network if network.in_degree(candidate) == 0]

    tally = [0]
    count_stochastic_dropouts_helper(network, roots[0], tally)
    return tally[0]

def count_stochastic_dropouts_helper(network, node, counter):
    """Add the number of '-' entries in ``node`` and all of its descendants to ``counter[0]``."""
    states = node.get_character_string().split('|')
    counter[0] += states.count('-')

    if network.out_degree(node) > 0:
        for child in network.successors(node):
            count_stochastic_dropouts_helper(network, child, counter)

def count_all_dropouts(tree):
    """Count '-' and 'H' entries over all nodes reachable from the root of ``tree``.

    The root is taken to be the first node of the network with in-degree zero.
    """
    network = tree.get_network()
    roots = [candidate for candidate in network if network.in_degree(candidate) == 0]

    tally = [0]
    count_all_dropouts_helper(network, roots[0], tally)
    return tally[0]

def count_all_dropouts_helper(network, node, counter):
    """Add the number of '-' or 'H' entries in ``node`` and all of its descendants to ``counter[0]``."""
    states = node.get_character_string().split('|')
    counter[0] += sum(1 for state in states if state in ('-', 'H'))

    if network.out_degree(node) > 0:
        for child in network.successors(node):
            count_all_dropouts_helper(network, child, counter)
            
def count_all_dropouts_leaves(tree, num_cells):
    """Count '-' and 'H' entries in the last ``num_cells`` leaves of ``tree``.

    Leaves are the nodes of ``tree.get_network()`` with out-degree zero, taken
    in the network's iteration order; only the final ``num_cells`` of them are
    inspected.

    :param tree: object exposing ``get_network()``
    :param num_cells: number of trailing leaves to inspect; a value of zero or
        less inspects no leaves
    :return: total number of dropout entries across the inspected leaves
    """
    # Guard the `-0` slice pitfall: leaves[-0:] is the WHOLE list, so asking
    # for zero cells used to count every leaf.
    if num_cells <= 0:
        return 0

    network = tree.get_network()
    leaves = [n for n in network if network.out_degree(n) == 0]
    # NOTE(review): presumably the sampled cells are the last zero-out-degree
    # nodes in iteration order -- confirm against how the tree was subsampled.
    inspected = leaves[-num_cells:]

    return sum(
        1
        for leaf in inspected
        for state in leaf.get_character_string().split('|')
        if state in ('-', 'H')
    )

def generate_simulated_full_hereditary_tree(mutation_prob_map, variable_dropout_prob_map, characters=10, depth=12, num_cells = 400, dropout=True, hereditary_dropout_prob_map=None):
    """Simulate a full binary lineage tree with stochastic and hereditary dropout.

    Starting from an all-'0' root, every node in each of ``depth`` generations
    gets two children produced by ``simulate_mutation``. In the last generation
    (when ``dropout`` is true) ``simulate_dropout`` is applied to each child.
    Hereditary dropout is then propagated from the root with
    ``hereditary_helper``, and leaves are removed at random until
    ``num_cells`` of the original leaves remain.

    :param mutation_prob_map: per-character mapping of state -> probability,
        forwarded to ``simulate_mutation``
    :param variable_dropout_prob_map: per-character stochastic dropout
        probability, forwarded to ``simulate_dropout``
    :param characters: number of characters per cell
    :param depth: number of generations below the root
    :param num_cells: number of leaves to keep; ``random.sample`` raises
        ``ValueError`` if this exceeds the number of leaves
    :param dropout: whether stochastic dropout is applied in the last generation
    :param hereditary_dropout_prob_map: per-character hereditary dropout
        probability. When ``None`` the notebook-level ``hdropout_prob_map`` is
        used, which keeps existing calls working.
    :return: ``(Cassiopeia_Tree, number_of_new_hereditary_dropout_events)``
    """
    if hereditary_dropout_prob_map is None:
        # Backward compatibility: earlier versions read this global directly.
        # Resolving it up front makes a missing global fail before the
        # simulation is run rather than after.
        hereditary_dropout_prob_map = hdropout_prob_map

    network = nx.DiGraph()
    current_depth = [[['0' for _ in range(0, characters)], '0']]
    network.add_node(sim_utils.node_to_string(current_depth[0]))
    uniq = 1
    for i in range(0, depth):
        temp_current_depth = []
        for node in current_depth:
            for _ in range(0,2):
                child_node = simulate_mutation(node[0], mutation_prob_map)
                # Stochastic dropout only affects the final generation (the observed cells).
                if i == depth - 1 and dropout:
                    child_node = simulate_dropout(child_node, variable_dropout_prob_map)
                temp_current_depth.append([child_node, uniq])
                network.add_edge(sim_utils.node_to_string(node), sim_utils.node_to_string([child_node, str(uniq)]))
                uniq +=1

        current_depth = temp_current_depth

    # Replace the "<characters>_<id>" string labels with Node objects.
    rdict = {}
    for index, label in enumerate(network.nodes):
        label_parts = label.split("_")
        rdict[label] = Node("StateNode" + str(index), label_parts[0].split("|"), pid = label_parts[1], is_target=False)

    network = nx.relabel_nodes(network, rdict)

    root = [n for n in network if network.in_degree(n) == 0][0]
    counter = [0]
    hereditary_helper(network, root, hereditary_dropout_prob_map, [], counter)

    # Subsample the leaves down to `num_cells` by deleting the rest.
    leaves = [n for n in network if network.out_degree(n) == 0]
    subsampled_population_for_removal = random.sample(leaves, len(leaves) - num_cells)
    for node in subsampled_population_for_removal:
        network.remove_node(node)

    state_tree = Cassiopeia_Tree('simulated', network = network)
    return state_tree, counter[0]

def simulate_mutation(sample, mutation_prob_map):
    """Return a copy of ``sample`` in which each unmutated ('0') character may mutate.

    For a '0' at position ``i`` the new state is drawn with
    ``np.random.choice`` from the keys of ``mutation_prob_map[i]`` using its
    values as probabilities. Characters that are not '0' are copied unchanged.
    """
    mutated = []
    for position, state in enumerate(sample):
        if state != '0':
            mutated.append(state)
            continue
        outcomes, weights = zip(*mutation_prob_map[position].items())
        mutated.append(np.random.choice(outcomes, p=weights))
    return mutated

def simulate_dropout(sample, variable_dropout_probability_map):
    """Return a copy of ``sample`` with characters independently replaced by '-'.

    The character at position ``i`` is dropped when a ``random.uniform(0, 1)``
    draw is less than or equal to ``variable_dropout_probability_map[i]``.
    """
    return [
        '-' if random.uniform(0, 1) <= variable_dropout_probability_map[position] else state
        for position, state in enumerate(sample)
    ]

In [None]:
# --- Simulation parameters ---

# Tree shape
depth = 11                  # generations below the root
NUM_CELLS = 400             # leaves kept per simulated tree

# Characters and states
number_of_characters = 40   # characters per cell
number_of_states = 40       # mutated states per character
no_mut_rate = 0.985         # 1 - no_mut_rate is passed as the mutation probability

# Dropout
dropout = 0.02              # per-character stochastic dropout probability
hdropout = 0.0              # per-character hereditary dropout probability
hdropout_percent = "0"      # label embedded in output directory and file names

#np.arange(.85, .99, .014)

In [None]:
# for i in np.arange(1,11):
    
#     prior_probabilities = compute_priors(number_of_characters, number_of_states, 1-no_mut_rate, 5, 0.5, skew_factor=0.0, num_skew=1)[0]
#     dropouts = pd.DataFrame(np.full((number_of_characters, 1), dropout, dtype=float))
#     dropout_prob_map = {i: dropout for i in range(0,number_of_characters)}
#     data = data_gen.generate_simulated_full_tree(prior_probabilities, dropout_prob_map, characters=number_of_characters, subsample_percentage= NUM_CELLS / (2**depth), depth=depth)
#     pic.dump(data, open('sim_net' + str(i) + '.pkl', 'wb'))

In [None]:
#prior_probabilities = compute_priors(number_of_characters, number_of_states, 1-no_mut_rate, 5, 0.5, skew_factor=0.0, num_skew=1)[0]
# Every character shares the same stochastic / hereditary dropout probability.
dropout_prob_map = dict.fromkeys(range(number_of_characters), dropout)
hdropout_prob_map = dict.fromkeys(range(number_of_characters), hdropout)

# Column vector holding the stochastic dropout rate, one row per character.
dropouts = pd.DataFrame(
    np.full((number_of_characters, 1), dropout, dtype=float)
)
#data = generate_simulated_full_hereditary_tree(prior_probabilities, dropout_prob_map, characters=number_of_characters, num_cells= NUM_CELLS, depth=depth)[0]
#pic.dump(data, open('sim_net' + str(i) + '.pkl', 'wb'))

In [None]:
# out = simulate_hereditary(data, hdropout_prob_map)[0]
# network = out.get_network()
# root = [n for n in network if network.in_degree(n) == 0][0]
# print(root.get_character_string())
# for i in network.successors(root):
#     print(i.get_character_string())

In [None]:
import csv

# Simulate a batch of trees, pickle each tree together with its priors, and
# append the per-tree leaf dropout fractions as one CSV row.
NUM_TREES = 50

base_dir = "/data/yosef2/users/richardz/projects/Cassiopeia/dropout_testing/" + str(NUM_CELLS) + "cells"
path = base_dir + "/sim_nets_dropout_" + hdropout_percent
# makedirs + exist_ok: creates missing parents and lets the cell be re-run.
os.makedirs(path, exist_ok=True)

counts = []
for i in range(0, NUM_TREES):
    prior_probabilities = compute_priors(number_of_characters, number_of_states, 1-no_mut_rate, 5, 0.5, skew_factor=0.0, num_skew=1)[0]
    out = generate_simulated_full_hereditary_tree(prior_probabilities, dropout_prob_map, characters=number_of_characters, num_cells = NUM_CELLS, depth=depth)[0]
    # Kept as a notebook-level name in case other cells inspect it.
    network = out.get_network()

    # Each leaf carries one entry per CHARACTER, so the fraction is normalized
    # by the number of characters (not the number of states).
    counts.append(count_all_dropouts_leaves(out, NUM_CELLS)/(NUM_CELLS*number_of_characters))

    file_prefix = path + '/sim_net_' + hdropout_percent + '-' + str(i)
    # Context managers make sure every pickle is flushed and closed.
    with open(file_prefix + '.pkl', 'wb') as tree_file:
        pic.dump(out, tree_file)
    with open(file_prefix + '_priors.pkl', 'wb') as priors_file:
        pic.dump(prior_probabilities, priors_file)
    print(i)

# newline='' is what the csv module expects for files it writes to.
with open(base_dir + '/dropout_percentage' + hdropout_percent + '.csv', 'a', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(counts)

In [None]:
# characters = number_of_characters
# def simulate_mutation(sample, mutation_prob_map):
#     new_sample = []
#     for i in range(0, len(sample)):
#         character = sample[i]
#         if character == '0':
#             values, probabilities = zip(*mutation_prob_map[i].items())
#             new_character = np.random.choice(values, p=probabilities)
#             new_sample.append(new_character)
#         else:
#             new_sample.append(character)
#     return new_sample
# mutation_prob_map = prior_probabilities
# def simulate_dropout(sample, variable_dropout_probability_map):
#     """
#     Applies dropout to a given sample

#     :param sample:
#         Samples in list form: I.e. ['0','1','0','1']
#     :param variable_dropout_prob_map:
#         A dictionary containing dropout probabilities for each individual character
#         I.e {0: 0.05, 1: 0.01, 2: 0.2,...}
#     :return:
#         A sample with characters potential dropped out (Dropped out characters in the form '-')
#     """
#     new_sample = []
#     for i in range(0, len(sample)):
#         if random.uniform(0, 1) <= variable_dropout_probability_map[i]:
#             new_sample.append('-')
#         else:
#             new_sample.append(sample[i])
#     return new_sample
# variable_dropout_prob_map = dropout_prob_map
# subsample_percentage = 0.1

In [None]:
# Display the normalized leaf dropout count recorded for each simulated tree.
counts

In [None]:
# Mean normalized leaf dropout count; dividing by len(counts) keeps this
# correct when the number of simulated trees changes.
sum(counts)/len(counts)

In [None]:
# Generate one additional tree (and its priors) for interactive inspection.
prior_probabilities = compute_priors(
    number_of_characters,
    number_of_states,
    1 - no_mut_rate,
    5,
    0.5,
    skew_factor=0.0,
    num_skew=1,
)[0]
out = generate_simulated_full_hereditary_tree(
    prior_probabilities,
    dropout_prob_map,
    characters=number_of_characters,
    num_cells=NUM_CELLS,
    depth=depth,
)[0]


In [None]:
# Print the string form of the most recently generated tree object.
print(out)

In [None]:
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The context manager closes the file handle once the object has been read.
with open("/data/yosef2/users/richardz/projects/Cassiopeia/dropout_testing/400cells/nngreedy_avg_priors/sim_net_2-11_nngreedy_avg.pkl", "rb") as test_file:
    test = pic.load(test_file, encoding = "latin1")
test