In [36]:
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import pickle as pic
from matplotlib.backends.backend_pdf import PdfPages
import time
# Wall-clock timestamp captured when this cell runs (presumably used further down to report elapsed time).
start_time = time.time()

In [2]:
class Node:
    """One cell of the simulated lineage: a vector of character states plus tree links.

    A state of 0 marks a character that has not mutated yet; any non-zero
    state is kept unchanged by `duplicate`.
    """

    def __init__(self, char_list, num_states, parent=None, left=None, right=None):
        # num_states counts the 0 state as well: states {0, 1} means num_states=2.
        self.chars = char_list
        self.num_chars = len(char_list)
        self.num_states = num_states
        self.parent = parent
        self.left = left
        self.right = right

    def is_leaf(self):
        """Return True when the node has neither a left nor a right child."""
        return not self.left and not self.right

    def duplicate(self, p, q=None, dropout_rate=0):
        """Create a child node, mutating each still-unmutated character.

        p[l] is the probability that character l (currently 0) mutates in
        this division. q[l], when given, is the distribution over the
        non-zero states 1..num_states-1 used for the new state; without q the
        new state is drawn uniformly. `dropout_rate` is accepted but not used.
        The returned node has no parent/children links set.
        """
        assert len(p) == len(self.chars), "invalid p vector"
        if q:
            for idx in range(len(p)):
                assert len(q[idx]) + 1 == self.num_states, "invalid q[" + str(idx) + "] vector"
        else:
            # p=None makes np.random.choice fall back to a uniform draw.
            q = [None for _ in self.chars]

        mutated_states = np.arange(1, self.num_states)
        child_chars = []
        for site, state in enumerate(self.chars):
            # Only unmutated (0) characters consume random numbers.
            if state == 0 and np.random.random() < p[site]:
                state = np.random.choice(mutated_states, 1, p=q[site])[0]
            child_chars.append(state)

        return Node(child_chars, self.num_states)

    def __str__(self):
        """Render the character vector as 's0|s1|...'."""
        return '|'.join(str(state) for state in self.chars)
                    
                    
def simulation(p, num_states, time, q=None):
    """Grow a complete binary lineage for `time` generations and return its leaves.

    The root starts with every character at state 0. In each generation every
    node is replaced by two children produced with `Node.duplicate(p, q)`;
    parent/left/right links are filled in so the tree can be walked from
    either end. The returned list has 2**time nodes.
    """
    generation = [Node([0 for _ in p], num_states)]
    for _ in range(time):
        offspring = []
        for parent in generation:
            # Left child is drawn first, then the right one.
            for side in ('left', 'right'):
                child = parent.duplicate(p, q)
                child.parent = parent
                setattr(parent, side, child)
                offspring.append(child)
        generation = offspring
    return generation

def find_lineage(node):
    """Return the chain of nodes from the root down to `node` (inclusive)."""
    chain = [node]
    current = node
    # Climb through parent links, then flip so the root comes first.
    while current.parent:
        current = current.parent
        chain.append(current)
    chain.reverse()
    return chain

def find_root(samples):
    """Return the root of the tree that contains the first sample."""
    lineage = find_lineage(samples[0])
    return lineage[0]

def print_tree(root):
    """Print the tree under `root`, one node per line, indented one tab per level."""

    def render(node, depth, lines):
        # Pre-order walk: node first, then left subtree, then right subtree.
        lines.append("\t" * depth + str(node) + "\n")
        for child in (node.left, node.right):
            if child:
                render(child, depth + 1, lines)
        return lines

    print("".join(render(root, 0, [])))
    


In [3]:
def generate_frequency_matrix(samples, subset=None):
    """Count how often each state occurs at each character.

    Returns an integer array of shape (num_chars, num_states + 1) where entry
    [c][s] is the number of samples (restricted to `subset` indices, or all
    samples when `subset` is empty/None) whose character c has state s. The
    extra last column collects state -1 through negative indexing.
    """
    first = samples[0]
    num_chars = first.num_chars
    counts = np.zeros((num_chars, first.num_states + 1), dtype=int)
    members = subset if subset else list(range(len(samples)))
    for idx in members:
        states = samples[idx].chars
        for c in range(num_chars):
            counts[c][states[c]] += 1
    return counts
            
def split_data(F, rows=5, per_row=4):
    """Print the most frequent (character, state) pairs of a frequency matrix.

    F is a (num_chars, num_states + 1) count matrix as produced by
    `generate_frequency_matrix`. Column 0 (unmutated) and the last column are
    excluded. Pairs are ranked by descending count and the top
    `rows * per_row` are printed, `per_row` per line.

    Unlike the earlier fixed 5x4 loop, this stops cleanly when the matrix has
    fewer ranked pairs than requested instead of raising IndexError.
    Returns None.
    """
    k, m = F.shape[0], F.shape[1]
    ranked = [(i, j) for i in range(k) for j in range(1, m - 1)]
    # Stable sort: ties keep their (character, state) enumeration order.
    ranked.sort(key=lambda tup: F[tup[0]][tup[1]], reverse=True)

    shown = ranked[:rows * per_row]
    for start in range(0, len(shown), per_row):
        s = ''
        for a, b in shown[start:start + per_row]:
            s += str((a, b)) + " freq =" + str(F[a][b]) + " "
        print(s)
            
    
def construct_connectivity_graph(samples, subset=None):
    """Build the weighted graph used by the max-cut style splitters.

    Nodes are the sample indices in `subset` (all samples when `subset` is
    empty/None). For each pair i < j a score is accumulated over characters,
    skipping characters where either sample is missing (-1) or both are 0:

    * same non-zero state x: subtract 3 * (number of subset samples that are
      neither in state x nor missing at that character);
    * exactly one of them unmutated: add (frequency of the mutated state - 1);
    * two different non-zero states: add (freq[x] + freq[y] - 2).

    An edge with `weight=score` is added once per pair, only when the final
    score is non-zero. (Previously the edge was written inside the character
    loop, so a pair whose score returned to 0 kept a stale partial weight.)
    """
    n = len(samples)
    k = samples[0].num_chars
    G = nx.Graph()
    if not subset:
        subset = range(n)
    for i in subset:
        G.add_node(i)
    F = generate_frequency_matrix(samples, subset)
    subset_size = len(subset)
    for i in subset:
        for j in subset:
            if j <= i:
                continue
            n1 = samples[i]
            n2 = samples[j]
            # compute similarity score
            score = 0
            for l in range(k):
                x = n1.chars[l]
                y = n2.chars[l]
                if min(x, y) >= 0 and max(x, y) > 0:
                    if x == y:
                        score -= 3 * (subset_size - F[l][x] - F[l][-1])
                    elif min(x, y) == 0:
                        score += F[l][max(x, y)] - 1
                    else:
                        score += (F[l][x] + F[l][y]) - 2

            # Record the edge once, using the score over all characters.
            if score != 0:
                G.add_edge(i, j, weight=score)
    return G


In [4]:
def max_cut_heuristic(G, sdimension, iterations, show_steps=False):
    """Heuristic max-cut: embed nodes on a sphere, round with random hyperplanes, hill-climb.

    Parameters
    ----------
    G : weighted graph exposing `nodes`, `edges()`, `neighbors()` and
        `G[u][v]['weight']`.
    sdimension : embeddings live in R^(sdimension + 1).
    iterations : number of embedding update sweeps.
    show_steps : when True, print the relaxed objective after the sweeps
        (the flag was previously accepted but ignored).

    Returns the set of nodes on one side of the best cut found, after
    `improve_cut` has been applied to it.
    """
    d = sdimension + 1
    emb = {}
    for i in G.nodes():
        x = np.random.normal(size=d)
        emb[i] = x / np.linalg.norm(x)

    def show_relaxed_objective():
        score = 0
        for e in G.edges():
            u = e[0]
            v = e[1]
            score += G[u][v]['weight'] * np.linalg.norm(emb[u] - emb[v])
        print(score)

    for _ in range(iterations):
        new_emb = {}
        for i in G.nodes:
            # Push node i away from its neighbours, weighted by edge weight and distance.
            cm = np.zeros(d, dtype=float)
            for j in G.neighbors(i):
                cm -= G[i][j]['weight'] * np.linalg.norm(emb[i] - emb[j]) * emb[j]
            norm = np.linalg.norm(cm)
            # Isolated nodes (or exactly cancelling pulls) give a zero vector;
            # keep the previous position instead of dividing by zero into NaN.
            new_emb[i] = cm / norm if norm > 0 else emb[i]
        emb = new_emb

    if show_steps:
        print("final relaxed objective:")
        show_relaxed_objective()

    # Round with 3*d random hyperplanes and keep the best-scoring side.
    return_set = set()
    best_score = 0
    for _ in range(3 * d):
        b = np.random.normal(size=d)
        b = b / np.linalg.norm(b)
        S = set()
        for i in G.nodes():
            if np.dot(emb[i], b) > 0:
                S.add(i)
        this_score = evaluate_cut(S, G)
        if this_score > best_score:
            return_set = S
            best_score = this_score

    improved_S = improve_cut(G, return_set)
    return improved_S

def improve_cut(G, S):
    """Greedy single-node hill climb on a cut; returns an improved copy of S.

    For every node a gain is maintained: the change in cut weight obtained by
    moving that node to the other side. The node with the largest strictly
    positive gain is flipped repeatedly until no positive gain remains or
    2 * |V| flips have been made. S itself is not modified.
    """
    side = S.copy()

    # Gain of flipping a node = (weight of its uncut edges) - (weight of its cut edges).
    gain = {}
    for node in G.nodes():
        total = 0
        for nbr in G.neighbors(node):
            w = G[node][nbr]['weight']
            if cut(node, nbr, side):
                total -= w
            else:
                total += w
        gain[node] = total

    max_moves = 2 * len(G.nodes)
    moves = 0
    while moves < max_moves:
        # First node (in graph order) with the largest positive gain.
        chosen, chosen_gain = 0, 0
        for node in G.nodes():
            if gain[node] > chosen_gain:
                chosen, chosen_gain = node, gain[node]
        if not chosen_gain > 0:
            break

        # Flipping `chosen` toggles every incident edge between cut and uncut.
        for nbr in G.neighbors(chosen):
            w = G[chosen][nbr]['weight']
            if cut(chosen, nbr, side):
                gain[nbr] += 2 * w
            else:
                gain[nbr] -= 2 * w
        gain[chosen] = -gain[chosen]
        if chosen in side:
            side.remove(chosen)
        else:
            side.add(chosen)
        moves += 1
    return side


In [5]:
def evaluate_cut(S, G, B=None, show_total=False):
    """Score a cut S: weight of G edges it separates minus weight of B edges it separates.

    G holds the edges whose separation is rewarded; B (optional) holds edges
    whose separation is penalised. With `show_total` the total weights of
    both graphs are printed as well.
    """
    total_good = 0
    total_bad = 0
    cut_score = 0

    for u, v in G.edges():
        weight = float(G[u][v]['weight'])
        total_good += weight
        if cut(u, v, S):
            cut_score += weight

    if B:
        for u, v in B.edges():
            weight = float(B[u][v]['weight'])
            total_bad += weight
            if cut(u, v, S):
                cut_score -= weight

    if show_total:
        print("total good = ", total_good)
        print("total bad = ", total_bad)
    return cut_score

def greedy_cut(samples, subset=None):
    """Split `subset` on its most frequent informative (character, state) pair.

    The chosen pair is the most frequent non-zero state that does not cover
    every non-missing sample. Samples carrying that state go to the returned
    set; samples missing the character (-1) are then assigned one by one to
    whichever side they share more mutations with on average. When no
    informative pair exists a random non-trivial cut is returned instead.
    """
    F = generate_frequency_matrix(samples, subset)
    k, m = F.shape[0], F.shape[1]
    if not subset:
        subset = list(range(len(samples)))

    # Pick the most frequent state that still leaves someone on the other side.
    best_freq, best_char, best_state = 0, 0, 0
    for c in range(k):
        non_missing = len(subset) - F[c][-1]
        for s in range(1, m - 1):
            count = F[c][s]
            if count > best_freq and count < non_missing:
                best_char, best_state = c, s
                best_freq = count
    if best_freq == 0:
        return random_nontrivial_cut(subset)

    S = set()
    Sc = set()
    missing = set()
    for i in subset:
        value = samples[i].chars[best_char]
        if value == best_state:
            S.add(i)
        elif value == -1:
            missing.add(i)
        else:
            Sc.add(i)

    if not Sc:
        return S

    def shared_mutations(i, group):
        # Number of (member, character) pairs where i shares a non-zero state.
        mine = samples[i].chars
        return sum(1 for j in group for l in range(k)
                   if mine[l] > 0 and mine[l] == samples[j].chars[l])

    # Both sides grow as missing samples are placed, so order matters here.
    for i in missing:
        s_score = shared_mutations(i, S)
        sc_score = shared_mutations(i, Sc)
        if s_score / len(S) > sc_score / len(Sc):
            S.add(i)
        else:
            Sc.add(i)

    return S
    
def random_cut(subset):
    """Return a random subset: each element is kept when its uniform draw exceeds 0.5."""
    return {member for member in subset if np.random.random() > 0.5}

def random_nontrivial_cut(subset):
    """Return a random cut that is guaranteed to be neither empty nor everything.

    The first element is always included and the second is always left out;
    every later element is included when its uniform draw exceeds 0.5.
    """
    assert len(subset) > 1
    members = list(subset)
    chosen = {members[0]}
    for candidate in members[2:]:
        if np.random.random() > 0.5:
            chosen.add(candidate)
    return chosen


def cut(u, v, S):
    """Return True when exactly one of u, v belongs to S (the edge crosses the cut)."""
    return (u in S) != (v in S)

            
def num_incorrect(S, h):
    """Count leaves of a height-h tree that S puts on the wrong side of the root split.

    The reference split is: indices in the first half of range(2**h) inside S,
    second half outside. Because the two sides of a cut are interchangeable,
    the smaller of the count and its complement is returned.
    """
    total = 2**h
    half = int(2**h/2)
    misplaced = sum(1 for i in range(half) if i not in S)
    misplaced += sum(1 for i in range(half, total) if i in S)
    return min(misplaced, total - misplaced)

In [6]:
class Experiment:
    """A simulated lineage together with its ground-truth tree and scoring helpers."""

    def __init__(self, p, num_states, time, q=None, dropout_rate=0):
        """Simulate `time` generations and record the true tree.

        p, num_states, q are forwarded to `simulation`. When `dropout_rate`
        is non-zero, every character of every leaf is independently replaced
        by -1 with that probability.

        `ground_truth_tree` is a DiGraph whose leaves are labelled with the
        sample index (0..num_samples-1); internal nodes get labels in creation
        order starting at num_samples, so the root is labelled num_samples.
        """
        self.p = p
        self.num_states = num_states
        self.q = q
        self.time = time
        self.samples = simulation(p, num_states, time, q)
        if dropout_rate:
            for n in self.samples:
                for i in range(len(n.chars)):
                    if np.random.random() < dropout_rate:
                        n.chars[i] = -1
        self.num_samples = len(self.samples)
        self.root = find_root(self.samples)
        T = nx.DiGraph()
        node_map = {self.samples[i]: i for i in range(self.num_samples)}
        self.node_map = node_map
        for i in range(self.num_samples):
            T.add_node(i)

        def build_gt_tree(n):
            if n.is_leaf():
                return node_map[n]
            root_label = len(T.nodes())
            T.add_node(root_label)
            if n.left:
                left_label = build_gt_tree(n.left)
                T.add_edge(root_label, left_label)
            if n.right:
                right_label = build_gt_tree(n.right)
                T.add_edge(root_label, right_label)
            return root_label

        build_gt_tree(self.root)
        self.ground_truth_tree = T

    def build_tree(self, method='greedy', subset=None):
        """Reconstruct a tree over `subset` (default: all samples) by recursive bipartition.

        method is 'greedy', 'SDP' (max-cut heuristic) or 'greedy+' (greedy cut
        refined by hill climbing). Samples with identical character vectors
        are collapsed first. Whenever a splitter returns a trivial cut the
        greedy cut is used instead. Leaves keep their sample index; internal
        nodes are labelled len(samples) + (number of internal nodes so far).
        """
        if not subset:
            subset = list(range(len(self.samples)))
        else:
            subset = list(subset)
        subset = remove_duplicates(self.samples, subset)
        T = nx.DiGraph()
        for i in subset:
            T.add_node(i)

        def build_helper(S):
            assert S, "error, S = " + str(S)
            if len(S) == 1:
                return list(S)[0]
            left_set = set()
            if method == 'greedy':
                left_set = greedy_cut(self.samples, subset=S)
            elif method == 'SDP':
                G = construct_connectivity_graph(self.samples, subset=S)
                left_set = max_cut_heuristic(G, 3, 50)
            elif method == 'greedy+':
                G = construct_connectivity_graph(self.samples, subset=S)
                left_set = greedy_cut(self.samples, subset=S)
                left_set = improve_cut(G, left_set)

            if len(left_set) == 0 or len(left_set) == len(S):
                left_set = greedy_cut(self.samples, subset=S)
            right_set = set()
            for i in S:
                if not i in left_set:
                    right_set.add(i)
            # Must be computed before recursing: recursion adds more nodes.
            root = len(T.nodes) - len(subset) + len(self.samples)
            T.add_node(root)
            left_child = build_helper(left_set)
            right_child = build_helper(right_set)
            T.add_edge(root, left_child)
            T.add_edge(root, right_child)
            return root

        build_helper(subset)
        return T

    def triplets_correct(self, T, sample_size=1000):
        """Fraction of randomly sampled leaf triplets whose outgroup in T matches the true tree."""
        TC = 0
        sample_set = np.array([v for v in T.nodes() if T.in_degree(v) == 1 and T.out_degree(v) == 0])
        Tt = self.ground_truth_tree

        for a in range(sample_size):
            chosen = np.random.choice(sample_set, 3, replace=False)
            if outgroup(chosen[0], chosen[1], chosen[2], T) == outgroup(chosen[0], chosen[1], chosen[2], Tt):
                TC += 1
        return TC / sample_size

    def triplets_correct_at_depth(self, T, method='aggregate', sample_size=1000, sampling_depths=None):
        """Triplet accuracy of T, stratified by the depth of the triplet's true LCA.

        For each depth of the ground-truth tree, triplets are drawn whose
        lowest common ancestor lies at that depth (two leaves from one child
        subtree, one from the other) and T is checked for the same outgroup.

        method:
          'aggregate' - mean accuracy over sample_size draws at every depth;
          'geometric' - sample_size draws with depth d chosen with weight
                        1 / (max_depth - d + 1);
          'all'       - numpy array with one accuracy per sampled depth.
        sampling_depths defaults to every depth that has at least one triplet.

        Fixes relative to the earlier version: sampling probabilities are now
        passed to np.random.choice as `p=` (positionally they were taken as
        `replace` and ignored); the 'geometric' weights no longer reference an
        undefined name; depth buckets are a dict so deep trees with few
        sampled leaves cannot overflow a fixed-size list; and the leftover
        debug print of len(self.samples) is gone.
        """
        sample_set = set([v for v in T.nodes() if T.in_degree(v) == 1 and T.out_degree(v) == 0])
        Tt = self.ground_truth_tree
        children = {}       # true-tree node -> sampled leaves below it
        num_triplets = {}   # true-tree node -> number of triplets whose LCA it is
        nodes_at_depth = {}  # depth -> nodes with at least one triplet

        def find_children(node, depth):
            children[node] = []
            if Tt.out_degree(node) == 0:
                if node in sample_set:
                    children[node].append(node)
                return
            for n in Tt.neighbors(node):
                find_children(n, depth + 1)
                children[node] += children[n]
            L, R = list(Tt.neighbors(node))[0], list(Tt.neighbors(node))[1]
            num_triplets[node] = len(children[L]) * nCr(len(children[R]), 2) + len(children[R]) * nCr(len(children[L]), 2)
            if num_triplets[node] > 0:
                nodes_at_depth.setdefault(depth, []).append(node)

        # The ground-truth root is the first internal node, labelled len(samples).
        find_children(len(self.samples), 0)

        def sample_at_depth(d):
            candidates = nodes_at_depth[d]
            denom = sum([num_triplets[v] for v in candidates])
            # Pick the LCA proportionally to the number of triplets it owns.
            node = np.random.choice(candidates, 1, p=[num_triplets[v] / denom for v in candidates])[0]
            L, R = list(Tt.neighbors(node))[0], list(Tt.neighbors(node))[1]
            # Probability that the in-group pair comes from R:
            # |L|*C(|R|,2) / (|L|*C(|R|,2) + |R|*C(|L|,2)) = (|R|-1) / (|R|+|L|-2).
            if np.random.random() < (len(children[R]) - 1) / (len(children[R]) + len(children[L]) - 2):
                outgrp = np.random.choice(children[L], 1)[0]
                ingrp = np.random.choice(children[R], 2, replace=False)
            else:
                outgrp = np.random.choice(children[R], 1)[0]
                ingrp = np.random.choice(children[L], 2, replace=False)
            return outgroup(ingrp[0], ingrp[1], outgrp, T) == outgrp

        if not sampling_depths:
            sampling_depths = sorted(nodes_at_depth)
        if method == 'aggregate':
            score = 0
            for d in sampling_depths:
                for a in range(sample_size):
                    score += int(sample_at_depth(d))
            return score / (sample_size * len(sampling_depths))
        elif method == 'geometric':
            deepest = max(sampling_depths)
            w_list = [1 / (deepest - d + 1) for d in sampling_depths]
            w_total = sum(w_list)
            probs = [w / w_total for w in w_list]
            score = 0
            for a in range(sample_size):
                d = np.random.choice(sampling_depths, 1, p=probs)[0]
                score += int(sample_at_depth(d))
            return score / sample_size
        elif method == 'all':
            ret = []
            for d in sampling_depths:
                score = 0
                for a in range(sample_size):
                    score += int(sample_at_depth(d))
                ret.append(score / sample_size)
            return np.array(ret)

In [None]:
# Simulate a height-h lineage: 10 characters, mutation probability 0.1 each, 11 states.
h = 10
E = Experiment([0.1 for i in range(10)], 11, h)
# Keep each of the 2**h leaves independently with probability 0.2.
subsample = []
for i in range(2**h):
    if np.random.random() < 0.2:
        subsample.append(i)
# Reconstruct on the subsample with the greedy splitter and score it per depth.
T = E.build_tree('greedy', subset=subsample)
print(E. triplets_correct_at_depth(T, method='all'))
print('\n')
# Same scoring with the ground-truth tree passed in as the tree under test.
E. triplets_correct_at_depth(E.ground_truth_tree, method='all')

In [None]:
# Scratch cell: checks that list `+` builds a new list and leaves both operands untouched.
A = [1,2]
B = [3,4]
print(A+B)
print(A)
print(B)
int(0)
A.append(1)

In [None]:
# Scratch cell: note that it rebinds the top-level names `ret` and `score`.
ret = []
score = 0
ret.append(score)

In [7]:
def remove_duplicates(nodes, indices):
    """Keep one index per distinct character vector.

    `indices` are positions into `nodes`. They are sorted (stably) by
    `nodes[i].chars`, and the first index of every run of equal vectors is
    kept. Returns a set of indices; an empty `indices` now yields an empty
    set instead of raising IndexError.
    """
    ordered = sorted(indices, key=lambda i: nodes[i].chars)
    if not ordered:
        return set()
    final_set = {ordered[0]}
    # A new run starts wherever a vector differs from its predecessor.
    for previous, current in zip(ordered, ordered[1:]):
        if nodes[previous].chars != nodes[current].chars:
            final_set.add(current)
    return final_set
        
def find_tree_lineage(i, T):
    """Return the path of node labels from the root of T down to node i (inclusive).

    Follows the first predecessor at every step, so T is expected to be a
    tree (each node having at most one parent).
    """
    path = [i]
    parents = list(T.predecessors(i))
    while parents:
        parent = parents[0]
        path.append(parent)
        parents = list(T.predecessors(parent))
    path.reverse()
    return path

        
def outgroup(i, j, k, T):
    """Return which of the three distinct leaves i, j, k is the outgroup in tree T.

    The root-to-leaf paths are compared level by level. At the first level
    where they are not all equal, the two leaves that still share a node form
    the in-group and the third is returned. If all three paths diverge at
    once (an unresolved split) None is returned.

    The earlier version repeated this whole comparison a second time after
    the first one had already returned on every path; that unreachable copy
    has been removed.
    """
    assert i != j and i != k and j != k, str(i) + ' ' + str(j) + ' ' + str(k) + ' not distinct'

    Li = find_tree_lineage(i, T)
    Lj = find_tree_lineage(j, T)
    Lk = find_tree_lineage(k, T)

    # Skip the shared prefix of the three lineages.
    l = 0
    while Li[l] == Lj[l] and Lj[l] == Lk[l]:
        l += 1

    if Li[l] == Lj[l]:
        return k
    if Li[l] == Lk[l]:
        return j
    if Lj[l] == Lk[l]:
        return i
    # All three branches differ here: no single outgroup.
    return None
     
def evaluate_split(S, subset, T, sample_size=1000):
    """Compare a single bipartition against tree T on random triplets.

    S is the full collection of leaves being split (assumed to be leaves of
    T) and `subset` is one side of the split. For each of `sample_size`
    random triplets drawn from S, the outgroup implied by the split is
    compared with the outgroup in T.

    Returns (fraction agreeing, fraction disagreeing, fraction unresolved),
    where a triplet is unresolved if either source gives no outgroup.
    """
    def split_outgroup(i, j, k):
        # The outgroup is the element alone on its side of the split, if any.
        ij_together = not cut(i, j, subset)
        if ij_together and not cut(j, k, subset):
            return None
        if ij_together:
            return k
        if not cut(i, k, subset):
            return j
        return i

    pool = np.array(list(S))
    agree = 0
    disagree = 0
    unresolved = 0
    for _ in range(sample_size):
        a, b, c = np.random.choice(pool, 3, replace=False)
        by_split = split_outgroup(a, b, c)
        by_tree = outgroup(a, b, c, T)
        if by_split is None or by_tree is None:
            unresolved += 1
        elif by_split == by_tree:
            agree += 1
        else:
            disagree += 1
    return agree / sample_size, disagree / sample_size, unresolved / sample_size
        
                    
def remove_duplicates(nodes, indices):
    """Keep one index per distinct character vector.

    `indices` are positions into `nodes`. They are sorted (stably) by
    `nodes[i].chars`, and the first index of every run of equal vectors is
    kept. Returns a set of indices; an empty `indices` now yields an empty
    set instead of raising IndexError.
    """
    ordered = sorted(indices, key=lambda i: nodes[i].chars)
    if not ordered:
        return set()
    final_set = {ordered[0]}
    # A new run starts wherever a vector differs from its predecessor.
    for previous, current in zip(ordered, ordered[1:]):
        if nodes[previous].chars != nodes[current].chars:
            final_set.add(current)
    return final_set
            
def mult_chain(a,b):
    """Return the product a * (a+1) * ... * b; an empty range (a > b) gives 1."""
    product = 1
    for factor in range(a, b + 1):
        product = product * factor
    return product

def nCr(n, k):
    """Return the binomial coefficient C(n, k) as an exact integer.

    Returns 0 when k > n. Uses the symmetry C(n, k) = C(n, n-k) to keep the
    loop short. The computation stays in integer arithmetic throughout; the
    earlier float division silently lost precision once the result exceeded
    2**53.
    """
    if k > n:
        return 0
    if k > n/2:
        return nCr(n, n-k)
    result = 1
    for i in range(1, k + 1):
        # After step i, result == C(n-k+i, i), so the floor division is exact.
        result = result * (n - k + i) // i
    return result


In [29]:
# Ten independent runs: 20 characters (mutation probability 0.1), 10 states, height 10,
# 20% dropout; each run prints the per-depth triplet accuracy of the greedy reconstruction.
# NOTE: triplets_correct_at_depth_sep is defined in a later cell, so this cell
# fails on a fresh kernel unless that cell has been run first.
for i in range(10):
    E = Experiment([.1 for i in range(20)], 10, 10, dropout_rate=.2)
    network = E.build_tree('greedy')
#     network2 = E.build_tree('greedy+')
    print(triplets_correct_at_depth_sep(network, E.ground_truth_tree, "all"))#, E.triplets_correct_at_depth(network2))
# print(E.node_map)
# for i in E.samples:
# #     print(E.node_map[i])
#     print(i.chars)
# for i in E.ground_truth_tree.nodes():
#     print(i)
# # nx.draw(network)
# # print("=============================")
# leaves = [n for n in network if network.out_degree(n) == 0 and network.in_degree(n) == 1]
# for i in leaves:
#     print(i)



[0.903  0.8112 0.775  0.7324 0.6536 0.5762 0.5448 0.4722 0.4388]
[0.7722 0.7336 0.7624 0.696  0.6912 0.6734 0.6204 0.5736 0.5428]
[0.966  0.7864 0.7338 0.6976 0.6734 0.6796 0.6064 0.538  0.5296]
[0.9554 0.8582 0.7822 0.7826 0.7468 0.692  0.6136 0.5784 0.5752]
[0.8912 0.7724 0.7496 0.6126 0.7032 0.6742 0.6214 0.5484 0.4972]
[0.8598 0.7484 0.7514 0.7394 0.7084 0.6848 0.6228 0.547  0.5122]


KeyboardInterrupt: 

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call left dangling.
with open("/data/yosef2/users/richardz/projects/benchmarking/400htest/avg_h/dropout_cm6_avg_h.pkl", 'rb') as handle:
    avg = pic.load(handle)

In [None]:
# Display the object loaded from the pickle above.
avg

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call left dangling.
with open("/data/yosef2/users/richardz/projects/benchmarking/400htest/dropout_net6.pkl", 'rb') as handle:
    net = pic.load(handle)

In [None]:
# Pull the graph out of the loaded object and list its leaves
# (nodes with exactly one incoming and no outgoing edge).
network = net.network
leaves = [n for n in network if network.out_degree(n) == 0 and network.in_degree(n) == 1]
print(len(leaves))
for i in leaves:
    print(i.name)

In [None]:
# NOTE(review): this import rebinds the name `Node`, shadowing the simulation
# `Node` class defined near the top of the notebook. Any cell run afterwards that
# calls Node(char_list, num_states, ...) will get the cassiopeia class instead;
# consider importing under an alias.
from cassiopeia.TreeSolver.Node import Node
# Replace every node of the loaded network by a fresh node named "StateNode<i>"
# that carries the same char_vec.
rdict = {}
i = 0
for n in network.nodes:
    nn = Node("StateNode" + str(i), n.char_vec, pid = i, is_target=False)
    i += 1
    rdict[n] = nn
network = nx.relabel_nodes(network, rdict)
# NOTE(review): after relabelling, the graph's nodes are Node objects, so looking up
# the plain string 'StateNode8' presumably only works if Node hashes/compares by name; confirm.
network.predecessors('StateNode8')

In [28]:
# Load a tab-separated character matrix (first column is the index) and turn every entry into a string.
dropout_cm = pd.read_csv("/data/yosef2/users/richardz/projects/benchmarking/test_bench_imp/dropout_cm39.txt", sep = '\t', index_col = 0)
dropout_cm = dropout_cm.applymap(str)
# print(list(dropout_cm.iloc[0,:]))
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(dropout_cm)
# One Node per row; chars are strings here, and 1001 is passed as num_states.
samples = []
for index, row in dropout_cm.iterrows():
    node = Node(list(row), 1001, parent=None, left=None, right=None)
    samples.append(node)

# Collapse rows with identical character vectors down to one representative index each.
subset = list(range(len(samples)))
prune_samples = remove_duplicates(samples, subset)

# Map each surviving row position to its (string) character vector.
node_map = {}
for i in prune_samples:
    node_map[i] = list(dropout_cm.iloc[i,:])

print(len(prune_samples))

1004


In [29]:
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call left dangling.
with open("/data/yosef2/users/richardz/projects/benchmarking/test_bench_imp/dropout_net39.pkl", 'rb') as handle:
    net = pic.load(handle)
network = net.network
leaves = [n for n in network if network.out_degree(n) == 0 and network.in_degree(n) == 1]
for i in leaves:
    print(i.char_vec)
# Match every de-duplicated matrix row to the first ground-truth leaf with the
# same character vector, so the leaves can be relabelled with row positions.
ground_net_map = {}
for i in node_map:
    for j in leaves:
        if node_map[i] == j.char_vec:
            ground_net_map[j] = i
            break
print(ground_net_map)
network = nx.relabel_nodes(network, ground_net_map)
leaves = [n for n in network if network.out_degree(n) == 0 and network.in_degree(n) == 1]

print(len(ground_net_map))
# Number of leaves that were successfully relabelled to an integer row position.
sum([type(n) == int for n in leaves])

['8', '0', '9', '0', '0', '0', '0', '10', '3', '0', '6', '0', '5', '9', '8', '0', '10', '0', '10', '3', '10', '0', '5', '3', '1', '8', '10', '0', '5', '0', '0', '5', '1', '0', '0', '0', '1', '9', '3', '1']
['8', '0', '9', '0', '0', '0', '9', '10', '3', '0', '6', '0', '5', '9', '8', '0', '0', '6', '10', '3', '10', '0', '5', '3', '1', '8', '10', '0', '5', '0', '0', '5', '1', '0', '0', '0', '1', '9', '3', '1']
['0', '0', '9', '0', '0', '1', '3', '10', '3', '0', '6', '0', '5', '9', '8', '0', '2', '0', '10', '0', '10', '0', '5', '3', '1', '8', '10', '0', '5', '0', '0', '5', '1', '8', '0', '0', '1', '9', '3', '1']
['0', '0', '9', '0', '0', '0', '3', '10', '3', '0', '6', '8', '5', '9', '8', '0', '2', '0', '10', '0', '10', '0', '5', '3', '1', '8', '10', '0', '5', '0', '0', '5', '1', '8', '0', '0', '1', '9', '3', '1']
['4', '0', '9', '0', '0', '0', '4', '10', '3', '0', '6', '0', '5', '9', '8', '7', '0', '0', '10', '8', '10', '0', '5', '3', '1', '8', '10', '0', '5', '0', '0', '5', '1', '0', '7',

['0', '5', '3', '2', '6', '0', '0', '2', '0', '7', '0', '10', '4', '0', '8', '0', '2', '0', '5', '3', '4', '5', '0', '4', '1', '1', '2', '2', '8', '0', '0', '9', '2', '5', '10', '2', '0', '7', '0', '9']
['0', '0', '3', '2', '0', '0', '5', '2', '0', '7', '0', '10', '4', '0', '0', '0', '2', '0', '5', '3', '4', '5', '0', '4', '1', '1', '2', '2', '8', '0', '0', '9', '2', '5', '10', '2', '0', '7', '0', '9']
['0', '4', '3', '2', '0', '10', '0', '2', '8', '7', '0', '10', '4', '1', '0', '0', '2', '10', '0', '3', '4', '2', '0', '4', '1', '1', '2', '2', '8', '0', '8', '9', '2', '5', '10', '2', '0', '0', '1', '9']
['0', '4', '3', '2', '10', '10', '0', '2', '8', '7', '0', '10', '4', '0', '0', '0', '2', '10', '0', '3', '4', '2', '0', '4', '1', '1', '2', '2', '8', '0', '8', '9', '2', '5', '10', '2', '0', '6', '1', '9']
['4', '4', '3', '2', '0', '10', '0', '2', '0', '7', '10', '10', '4', '3', '0', '0', '2', '10', '0', '3', '4', '2', '0', '4', '1', '1', '2', '2', '8', '0', '0', '9', '2', '5', '10', '2

1004


1004

In [30]:
# Encode the "-" and "*" entries as -1, then convert every column to a numeric dtype.
# NOTE: this rebinds dropout_cm, so the cell is not idempotent with respect to the cell that loaded it.
dropout_cm = dropout_cm.replace("-", -1)
dropout_cm = dropout_cm.replace("*", -1)
dropout_cm = dropout_cm.apply(pd.to_numeric)

# Rebuild the sample list from the numeric matrix (one Node per row).
samples = []
for index, row in dropout_cm.iterrows():
    node = Node(list(row), 1001, parent=None, left=None, right=None)
    samples.append(node)
# Row position -> Node.
sample_map = {}
for i in range(len(samples)):
    sample_map[i] = samples[i]

In [None]:
# subset = list(range(len(samples)))
# prune_samples = remove_duplicates(samples, subset)

In [None]:
# Spot check: row 300 of the matrix, the ground-truth leaf mapped to it, and the
# stored string vector should all describe the same cell.
print(samples[300].chars)
def get_key(val): 
    # Reverse lookup in ground_net_map: first key whose value equals val.
    # Falls through (returns None) when no entry matches.
    for key, value in ground_net_map.items(): 
         if val == value: 
             return key
print(get_key(300).char_vec)
print(node_map[300])

In [8]:
def build_tree_sep(samples, method='greedy', subset = None):
    """Stand-alone variant of tree reconstruction by recursive bipartition.

    `samples` is a list of nodes with character vectors; `subset` restricts
    the reconstruction to those indices (default: all). Duplicate character
    vectors are collapsed first. `method` selects the splitter: 'greedy',
    'SDP' (max-cut heuristic) or 'greedy+' (greedy cut refined by hill
    climbing); a trivial cut falls back to the greedy cut.

    Returns a DiGraph whose leaves are sample indices and whose internal
    nodes are labelled len(samples) + (number of internal nodes so far).
    """
    candidates = list(subset) if subset else list(range(len(samples)))
    subset = remove_duplicates(samples, candidates)
    T = nx.DiGraph()
    T.add_nodes_from(subset)

    def build_helper(S):
        assert S, "error, S = "+ str(S)
        if len(S) == 1:
            return list(S)[0]

        if method == 'greedy':
            left_set = greedy_cut(samples, subset=S)
        elif method == 'SDP':
            G = construct_connectivity_graph(samples, subset=S)
            left_set = max_cut_heuristic(G, 3, 50)
        elif method == 'greedy+':
            G = construct_connectivity_graph(samples, subset=S)
            left_set = improve_cut(G, greedy_cut(samples, subset=S))
        else:
            left_set = set()

        # A splitter that puts everything on one side is replaced by the greedy cut.
        if len(left_set) in (0, len(S)):
            left_set = greedy_cut(samples, subset=S)
        right_set = {i for i in S if i not in left_set}

        # Label must be taken before recursing, since recursion adds nodes.
        root = len(T.nodes) - len(subset) + len(samples)
        T.add_node(root)
        left_child = build_helper(left_set)
        right_child = build_helper(right_set)
        T.add_edges_from([(root, left_child), (root, right_child)])
        return root

    build_helper(subset)
    return T

In [9]:
def triplets_correct(T, Tt, sample_size=5000):
    """Fraction of random leaf triplets of T whose outgroup agrees with tree Tt.

    Leaves are the nodes of T with one incoming and no outgoing edge; the
    outgroup of each triplet is determined with `outgroup2` in both trees.
    """
    leaves = np.array([v for v in T.nodes() if T.in_degree(v) == 1 and T.out_degree(v) == 0])
    hits = 0
    for _ in range(sample_size):
        a, b, c = np.random.choice(leaves, 3, replace=False)
        if outgroup2(a, b, c, T)[0] == outgroup2(a, b, c, Tt)[0]:
            hits += 1
    return hits / sample_size

def outgroup2(i, j, k, T):
    """Return (outgroup, index) for three distinct nodes of T using shared ancestors.

    For each pair the number of common ancestors is counted; the pair sharing
    strictly the most ancestors is the in-group and the remaining node is the
    outgroup. `index` is the smallest of the three pairwise counts.

    The outgroup is None when no pair strictly dominates: all three counts
    equal, or two pairs tied for the maximum (which cannot happen in a tree).
    Previously the tied case fell off the end of the function and returned a
    bare None, which broke callers that unpack the result as a pair.
    """
    assert i != j and i != k and j != k, str(i) + ' ' + str(j) + ' ' + str(k) + ' not distinct'

    anc_i = set(nx.ancestors(T, i))
    anc_j = set(nx.ancestors(T, j))
    anc_k = set(nx.ancestors(T, k))

    ij_common = len(anc_i & anc_j)
    ik_common = len(anc_i & anc_k)
    jk_common = len(anc_j & anc_k)
    index = min(ij_common, ik_common, jk_common)

    if ij_common > ik_common and ij_common > jk_common:
        return k, index
    if jk_common > ik_common and jk_common > ij_common:
        return i, index
    if ik_common > ij_common and ik_common > jk_common:
        return j, index
    # No pair shares strictly more ancestors than the other two: unresolved.
    return None, index

from collections import defaultdict
def triplets_correct_stratified(T, Tt, sample_size=5000, min_size_depth = 20):
    """Triplet accuracy averaged over strata instead of over triplets.

    Each sampled triplet is assigned to a stratum given by the `index`
    that `outgroup2` reports for it in T. Accuracy (agreement of the outgroup
    in T and Tt) is computed per stratum, and the unweighted mean over strata
    holding more than `min_size_depth` triplets is returned. Raises
    ZeroDivisionError when no stratum is large enough.
    """
    hits = defaultdict(int)
    counts = defaultdict(int)
    leaves = np.array([v for v in T.nodes() if T.in_degree(v) == 1 and T.out_degree(v) == 0])

    for _ in range(sample_size):
        a, b, c = np.random.choice(leaves, 3, replace=False)
        out_recon, stratum = outgroup2(a, b, c, T)
        out_truth, _unused = outgroup2(a, b, c, Tt)
        hits[stratum] += (out_recon == out_truth)
        counts[stratum] += 1

    rates = [hits[s] / counts[s] for s in hits if counts[s] > min_size_depth]
    return sum(rates) / len(rates)

In [None]:
# Compare the leaf sets of the ground-truth network and the reconstruction.
# NOTE: `recon` is created in a later cell, so this cell depends on out-of-order execution.
leaves = [n for n in network if network.out_degree(n) == 0 and network.in_degree(n) == 1]
rleaves = [n for n in recon if recon.out_degree(n) == 0 and recon.in_degree(n) == 1]
print(len(leaves))
print(len(rleaves))

# Ground-truth leaves that are absent from the reconstruction.
set(leaves) - set(rleaves)

In [34]:
# Reconstruct with the plain greedy splitter and with greedy + hill climbing,
# and print the triplet accuracy of each against the ground-truth network.
recon = build_tree_sep(samples, method='greedy')
print(triplets_correct(recon, network))
recon2 = build_tree_sep(samples, method='greedy+')
print(triplets_correct(recon2, network))

0.868
0.9934


In [10]:
def triplets_correct_at_depth_sep(T, Tt, method='all', sample_size=5000, sampling_depths=None):
    """Triplet accuracy of T against true tree Tt, stratified by depth of the true LCA.

    Leaves of T (one incoming, no outgoing edge) define the sampled set. For
    every depth of Tt, triplets are drawn whose lowest common ancestor lies at
    that depth (two leaves below one child, one below the other) and T is
    checked for the same outgroup.

    method:
      'aggregate' - mean accuracy over sample_size draws at every depth;
      'geometric' - sample_size draws with depth d chosen with weight
                    1 / (max_depth - d + 1);
      'all'       - numpy array with one accuracy per sampled depth.
    sampling_depths defaults to every depth holding at least one triplet,
    in increasing order.

    Fixes relative to the earlier version: probabilities are passed to
    np.random.choice as `p=` (positionally they were taken as `replace` and
    ignored); the 'geometric' weights no longer reference an undefined name;
    and default depths come from the dict keys, not range(len(dict)), which
    raised KeyError or skipped depths when the keys were not 0..n-1.
    """
    sample_set = set([v for v in T.nodes() if T.in_degree(v) == 1 and T.out_degree(v) == 0])
    children = {}        # Tt node -> sampled leaves below it
    num_triplets = {}    # Tt node -> number of triplets whose LCA it is
    nodes_at_depth = {}  # depth -> nodes with at least one triplet

    def find_children(node, depth):
        children[node] = []
        if Tt.out_degree(node) == 0:
            if node in sample_set:
                children[node].append(node)
            return

        for n in Tt.neighbors(node):
            find_children(n, depth + 1)
            children[node] += children[n]

        L, R = list(Tt.neighbors(node))[0], list(Tt.neighbors(node))[1]
        num_triplets[node] = len(children[L]) * nCr(len(children[R]), 2) + len(children[R]) * nCr(len(children[L]), 2)
        if num_triplets[node] > 0:
            nodes_at_depth.setdefault(depth, []).append(node)

    root = [n for n in Tt if Tt.in_degree(n) == 0][0]
    find_children(root, 0)

    def sample_at_depth(d):
        candidates = nodes_at_depth[d]
        denom = sum([num_triplets[v] for v in candidates])
        # Pick the LCA proportionally to the number of triplets it owns.
        node = np.random.choice(candidates, 1, p=[num_triplets[v] / denom for v in candidates])[0]
        L, R = list(Tt.neighbors(node))[0], list(Tt.neighbors(node))[1]
        # Probability that the in-group pair comes from R:
        # |L|*C(|R|,2) / (|L|*C(|R|,2) + |R|*C(|L|,2)) = (|R|-1) / (|R|+|L|-2).
        if np.random.random() < (len(children[R]) - 1) / (len(children[R]) + len(children[L]) - 2):
            outgrp = np.random.choice(children[L], 1)[0]
            ingrp = np.random.choice(children[R], 2, replace=False)
        else:
            outgrp = np.random.choice(children[R], 1)[0]
            ingrp = np.random.choice(children[L], 2, replace=False)
        return outgroup(ingrp[0], ingrp[1], outgrp, T) == outgrp

    if not sampling_depths:
        sampling_depths = sorted(nodes_at_depth)
    if method == 'aggregate':
        score = 0
        for d in sampling_depths:
            for a in range(sample_size):
                score += int(sample_at_depth(d))
        return score / (sample_size * len(sampling_depths))
    elif method == 'geometric':
        deepest = max(sampling_depths)
        w_list = [1 / (deepest - d + 1) for d in sampling_depths]
        w_total = sum(w_list)
        probs = [w / w_total for w in w_list]
        score = 0
        for a in range(sample_size):
            d = np.random.choice(sampling_depths, 1, p=probs)[0]
            score += int(sample_at_depth(d))
        return score / sample_size
    elif method == 'all':
        ret = []
        for d in sampling_depths:
            score = 0
            for a in range(sample_size):
                score += int(sample_at_depth(d))
            ret.append(score / sample_size)
        return np.array(ret)

In [49]:
def triplets_correct_at_time_sep(T, Tt, method='all', bin_size = 10, sample_size=5000, sampling_depths=None):
    """Estimate triplet accuracy of ``T`` against ``Tt``, stratified by time bin.

    Internal nodes of ``Tt`` are grouped by ``t // bin_size``, where ``t`` is the
    sum of the ``'parent_lifespan'`` node attributes from the root down to (and
    including) the node.  Within a bin, triplets are sampled by picking an
    internal node with probability proportional to the number of triplets it
    is the LCA of, then two leaves from one child subtree (ingroup) and one
    from the other (outgroup).  A sample is correct when
    ``outgroup(a, b, c, T)`` returns the sampled outgroup.

    Only leaves of ``Tt`` that are also leaves of ``T`` (in-degree 1,
    out-degree 0) are considered.  A bin is only scored if at least one of
    its nodes has more than 10 such leaves below it.

    Parameters
    ----------
    T : directed graph exposing ``nodes()``, ``in_degree`` and ``out_degree``
        The tree being evaluated (passed on to ``outgroup``).
    Tt : directed graph with a ``'parent_lifespan'`` attribute on every node
        The reference tree; internal nodes are assumed to have exactly two
        successors.
    method : {'all', 'aggregate'}
        ``'all'`` returns one score per entry of ``sampling_depths``;
        ``'aggregate'`` returns the mean over all scored bins.
    bin_size : number
        Width of a time bin.
    sample_size : int
        Number of triplets sampled per scored bin.
    sampling_depths : sequence of bin numbers, optional
        Bins to evaluate.  Defaults to every bin from 0 up to the largest
        populated bin.

    Returns
    -------
    ``'all'``: ``np.array`` with one entry per requested bin; bins that were
    not scored hold the string ``'NaN'`` (which makes numpy store the whole
    array as strings).  ``'aggregate'``: a float, or ``nan`` if no bin was
    scored.  Any other ``method`` returns ``None``.
    """
    sample_set = set([v for v in T.nodes() if T.in_degree(v) == 1 and T.out_degree(v) == 0])
    children = {}        # node -> sampled leaves below it
    num_triplets = {}    # internal node -> number of triplets whose LCA it is
    nodes_at_depth = {}  # time bin -> internal nodes with at least one triplet

    def find_children(node, total_time):
        t = total_time + Tt.nodes[node]['parent_lifespan']
        children[node] = []
        if Tt.out_degree(node) == 0:
            if node in sample_set:
                children[node].append(node)
            return

        for n in Tt.neighbors(node):
            find_children(n, t)
            children[node] += children[n]

        L, R = list(Tt.neighbors(node))[0], list(Tt.neighbors(node))[1]
        num_triplets[node] = len(children[L])*nCr(len(children[R]), 2) + len(children[R])*nCr(len(children[L]), 2)
        if num_triplets[node] > 0:
            nodes_at_depth.setdefault(t//bin_size, []).append(node)

    root = [n for n in Tt if Tt.in_degree(n) == 0][0]
    find_children(root, 0)

    def sample_at_depth(d):
        candidates = nodes_at_depth[d]
        weights = np.array([num_triplets[v] for v in candidates], dtype=float)
        # The weights must be passed as the keyword `p`; the third positional
        # argument of np.random.choice is `replace`, so a positional list is
        # silently ignored and the draw becomes uniform.
        node = candidates[np.random.choice(len(candidates), p=weights/weights.sum())]
        L, R = list(Tt.neighbors(node))[0], list(Tt.neighbors(node))[1]
        # Probability that the ingroup pair comes from R, given the LCA is `node`.
        if np.random.random() < (len(children[R])-1)/(len(children[R])+len(children[L])-2):
            outgrp = np.random.choice(children[L], 1)[0]
            ingrp = np.random.choice(children[R], 2, replace=False)
        else:
            outgrp = np.random.choice(children[R], 1)[0]
            ingrp = np.random.choice(children[L], 2, replace=False)
        return outgroup(ingrp[0], ingrp[1], outgrp, T) == outgrp

    def bin_is_scored(d):
        # A bin is scored only if it is populated and some node in it has
        # more than 10 sampled leaves underneath.
        if d not in nodes_at_depth:
            return False
        return max(len(children[v]) for v in nodes_at_depth[d]) > 10

    def score_bin(d):
        return sum(int(sample_at_depth(d)) for _ in range(sample_size))

    if sampling_depths is None or len(sampling_depths) == 0:
        # Cover every bin up to the deepest populated one; counting the
        # populated bins instead would drop the last bins whenever an
        # intermediate bin is empty.
        sampling_depths = list(range(int(max(nodes_at_depth)) + 1)) if nodes_at_depth else []

    if method == 'aggregate':
        score = 0
        freq = 0
        for d in sampling_depths:
            if bin_is_scored(d):
                freq += 1
                score += score_bin(d)
        if freq == 0:
            return float('nan')
        return score/(sample_size*freq)
    elif method == 'all':
        ret = ['NaN'] * len(sampling_depths)
        # Index by position, not by bin number, so arbitrary depth lists work.
        for pos, d in enumerate(sampling_depths):
            if bin_is_scored(d):
                ret[pos] = score_bin(d)/sample_size
        return np.array(ret)

In [12]:
def get_colless(network):
    """Return the Colless imbalance of ``network`` and a normalised variant.

    The raw index is accumulated by ``colless_helper`` starting from the first
    node with in-degree 0.  ``n`` counts nodes with out-degree 0 and
    in-degree 1.  The second return value subtracts
    ``n*log(n) + n*(euler_gamma - 1 - log(2))`` from the raw index and divides
    by ``n`` (presumably the Yule-model normalisation -- confirm).

    Returns
    -------
    (raw_index, normalised_index)
    """
    roots = [v for v in network if network.in_degree(v) == 0]
    total = [0]  # single-element list so the helper can update it in place
    colless_helper(network, roots[0], total)
    n = sum(1 for v in network if network.out_degree(v) == 0 and network.in_degree(v) == 1)
    raw = total[0]
    normalised = (raw - n * np.log(n) - n * (np.euler_gamma - 1 - np.log(2)))/n
    return raw, normalised
def colless_helper(network, node, colless):
    """Return the number of leaves below ``node`` while accumulating imbalance.

    For every internal node the absolute difference between the leaf counts
    of its first two successors is added to ``colless[0]``.  A node with
    out-degree 0 counts as one leaf.
    """
    if network.out_degree(node) != 0:
        subtree_sizes = [colless_helper(network, child, colless)
                         for child in network.successors(node)]
        colless[0] += abs(subtree_sizes[0] - subtree_sizes[1])
        return sum(subtree_sizes)
    return 1

In [51]:
# Benchmark loop: for every simulated run, rebuild a tree from the dropout
# character matrix and score it against the pickled ground-truth network.
for folder in ["test_bench_imp/20_chars_drop_sample"]:
#     folder = "400cells_" + drop + "_drop"
    path = "/data/yosef2/users/richardz/projects/benchmarking/" + folder + "/"
    nums = []
    triplets = []
    triplets_new = []
    colless = []
    types = []
    methods = []

    for method in ["SDP"]:
        for num in range(0, 10):
            dropout_cm = pd.read_csv(path + "dropout_cm" + str(num) + ".txt", sep = '\t', index_col = 0)
            dropout_cm = dropout_cm.applymap(str)

            # String-valued samples are only used to deduplicate rows and to
            # match them against the ground-truth leaves' char_vec.
            samples = []
            for index, row in dropout_cm.iterrows():
                node = Node(list(row), 1001, parent=None, left=None, right=None)
                samples.append(node)

            subset = list(range(len(samples)))
            prune_samples = remove_duplicates(samples, subset)

            node_map = {}
            for i in prune_samples:
                node_map[i] = list(dropout_cm.iloc[i,:])

            # NOTE(review): pickle.load can execute arbitrary code; only load
            # network files produced by a trusted pipeline.
            with open(path + "dropout_net" + str(num) + ".pkl", 'rb') as net_file:
                net = pic.load(net_file)
            ground = net.network
            leaves = [n for n in ground if ground.out_degree(n) == 0 and ground.in_degree(n) == 1]
            # Relabel ground-truth leaves with the row index of the matching sample.
            ground_net_map = {}
            for i in node_map:
                for j in leaves:
                    if node_map[i] == j.char_vec:
                        ground_net_map[j] = i
                        break
            ground = nx.relabel_nodes(ground, ground_net_map)

            # Missing-data markers become -1 before converting to numbers.
            dropout_cm = dropout_cm.replace("-", -1)
            dropout_cm = dropout_cm.replace("*", -1)
            dropout_cm = dropout_cm.apply(pd.to_numeric)

            samples = []
            for index, row in dropout_cm.iterrows():
                node = Node(list(row), 1001, parent=None, left=None, right=None)
                samples.append(node)

            recon = build_tree_sep(samples, method=method)
            trip = triplets_correct(recon, ground)
#             trip = triplets_correct_stratified(recon, ground)
            # trip2 must be computed here: it is appended to triplets_new
            # below and would otherwise be undefined on a fresh kernel.
            trip2 = triplets_correct_at_time_sep(recon, ground, 'all')
            print(num, trip, method)
            triplets.append(trip)
            triplets_new.append(trip2)
            nums.append(num)
            colless.append(get_colless(ground)[0])
            types.append(folder)
            methods.append(method)

# #     data = [triplets_new]
#     df = pd.DataFrame(triplets_new)
#     max_len = max([len(n) for n in triplets_new])
#     for i in range(len(triplets_new)):
#         triplets_new[i]
#         while(len(triplets_new[i]) < max_len):
#             print(len(triplets_new[i]))
#             triplets_new[i] = np.append(triplets_new[i], ['NaN'], axis = 0)
# #     df = df.T
# #     df = df.rename(columns = {0: 'Run', 1: 'TripletsCorrect', 2: 'TripletsCorrect2', 3:'Colless', 4:'Method',5: 'Type'})
#     df.to_csv(path + 'methods_triplets_time.txt', sep = '\t', index = False)

0 0.6331151357548984 ['0.7716' '0.8976' '0.8072' '0.774' '0.7048' '0.6624' '0.6126' 'NaN'] greedy
1 0.8698934535568952 ['1.0' '0.7986' '0.7674' '0.8362' '0.8584' '0.806' '0.761' '0.676'
 '0.5688' 'NaN'] greedy
2 0.9714285714285715 ['1.0' '1.0' '0.8198' '0.9892' '0.7754' '0.6918' '0.592' '0.6314' 'NaN'
 'NaN'] greedy
3 0.8655972948010735 ['1.0' '0.8672' '0.8356' '0.7432' '0.8954' '0.7324' '0.7272' '0.5446'
 'NaN' 'NaN' 'NaN'] greedy
4 0.7848113609820261 ['0.9304' '1.0' '1.0' '0.622' '0.713' '0.7726' '0.7502' '0.6628' '0.6558'
 'NaN' 'NaN' 'NaN'] greedy
5 0.9290527905226972 ['1.0' 'NaN' '1.0' '0.9532' '0.8796' '0.7492' '0.5986' '0.5768' '0.5324'
 '0.5992' '0.5758'] greedy
6 0.8910537460146291 ['1.0' '0.9136' '0.8252' '0.8172' '0.8174' '0.7376' '0.636' '0.5958' 'NaN'
 'NaN'] greedy
7 0.9254909707359275 ['1.0' '1.0' '1.0' 'NaN' '0.786' '1.0' '0.7802' '0.7306' '0.887' '0.7258'
 '0.6786' '0.539' '0.496'] greedy
8 0.8342975138699504 ['0.8922' '0.8058' '0.7634' '0.7736' '0.7218' '0.6748' 'NaN'



0 0.7527085347476721 ['0.8428' '0.8896' '0.822' '0.807' '0.8012' '0.6844' '0.6784' 'NaN'] SDP
1 0.8417703100359297 ['1.0' '0.8366' '0.8412' '0.7174' '0.8364' '0.8272' '0.8226' '0.7082'
 '0.7036' 'NaN'] SDP
2 0.9046270355057484 ['1.0' '0.7658' '0.808' '0.7496' '0.8054' '0.7262' '0.6394' '0.7072' 'NaN'
 'NaN'] SDP
3 0.7769692852437416 ['1.0' '0.7602' '0.8134' '0.7242' '0.8536' '0.7846' '0.7276' '0.5916'
 'NaN' 'NaN' 'NaN'] SDP
4 0.757351560137842 ['0.6952' '1.0' '0.9554' '0.6744' '0.7308' '0.7674' '0.8196' '0.6542'
 '0.7076' 'NaN' 'NaN' 'NaN'] SDP
5 0.89132213226175 ['0.8898' 'NaN' '1.0' '0.954' '0.8148' '0.8448' '0.7238' '0.6234' '0.679'
 '0.7342' '0.6218'] SDP
6 0.9333907003213333 ['0.5994' '1.0' '0.793' '0.8044' '0.792' '0.797' '0.7152' '0.6186' 'NaN'
 'NaN'] SDP
7 0.8230901820416702 ['1.0' '0.6398' '0.731' 'NaN' '0.784' '1.0' '0.7326' '0.646' '0.891'
 '0.7108' '0.6922' '0.5686' '0.5446'] SDP
8 0.804596563153417 ['0.7352' '0.7542' '0.808' '0.809' '0.7688' '0.7074' 'NaN' 'NaN'] SDP
9 0

In [53]:
# Pad every per-run score vector with 'NA' up to the longest one so the runs
# form a rectangular table, then write the table next to the benchmark data.
max_len = max([len(n) for n in triplets_new])
for i in range(len(triplets_new)):
    missing = max_len - len(triplets_new[i])
    if missing > 0:
        triplets_new[i] = np.append(triplets_new[i], ['NA'] * missing, axis = 0)
df = pd.DataFrame(triplets_new)
df.to_csv(path + 'methods_triplets_time.txt', sep = '\t', index = False)

In [42]:
# Scratch check of the padding call: appends one 'NA' entry to `n`.
# `n` is not defined in this cell and must already exist in the kernel.
np.append(n, ['NA'], axis = 0)

array(['0.772', '0.898', '0.8136', '0.7796', '0.6988', '0.6328', '0.6342',
       'NA', 'NA'], dtype='<U32')

In [79]:
def longest_path(network, node, ans):
    """Add up ``'parent_lifespan'`` along the first-successor chain from ``node``.

    Starting at ``node``, the node's ``'parent_lifespan'`` attribute is added
    to ``ans[0]`` and the walk continues with the first successor only, until
    a node without successors is reached.  Despite the name, no other branch
    is explored, so the total equals the longest root-to-leaf path only if
    all such paths have the same length (not verified here).

    ``ans`` is a one-element list that is updated in place; nothing is returned.
    """
    stop = object()  # sentinel marking "no successor"
    current = node
    while current is not stop:
        ans[0] += network.nodes[current]['parent_lifespan']
        current = next(iter(network.successors(current)), stop)



# Print the accumulated lifespan along the first-successor chain of each of
# the 50 pickled networks.
for num in range(0, 50):
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # from a trusted source.  The `with` block closes the handle after loading.
    with open("/data/yosef2/users/richardz/projects/benchmarking/400test_bench_imp/dropout_net" + str(num) + ".pkl", 'rb') as net_file:
        net = pic.load(net_file)
    root = [n for n in net.network if net.network.in_degree(n) == 0][0]
    ans = [0]
    longest_path(net.network, root, ans)
    print(ans)

# for i in net.network.nodes:
#     print(i)

[92.1069919988]
[137.826691225]
[96.40093020859999]
[82.56314202]
[111.25309705599999]
[95.07960426800001]
[81.30707096900001]
[120.673705029]
[92.779312745]
[94.62066679150001]
[96.905129276]
[106.213141713]
[95.337831573]
[101.149377817]
[101.52891695771]
[98.367339764]
[93.56223726619999]
[90.0580855415]
[113.012546507]
[98.928021614]
[106.94962046539999]
[122.35460945]
[91.3657421138]
[88.14028949939998]
[94.48670092100001]
[113.65798584199999]
[85.30693418420002]
[91.83862708619999]
[102.690220161]
[92.33377588629999]
[126.97482966200002]
[93.26909393026]
[89.45930803169999]
[113.09091454399999]
[99.01765042699999]
[135.227986235]
[134.118871643]
[89.6215393696]
[132.8581470978]
[95.371333771]
[108.6984798734]
[96.0178506746]
[109.6346875982]
[98.4474194974]
[97.173184097]
[102.932841325]
[98.284959998]
[99.362481543]
[96.68536755]
[101.673532922]


In [None]:
def TC_levels(sample_size=10, h=10, sample_rate=0.2, num_char=40, p=0.1, num_states=11, dr=0):
    """Plot per-depth triplet accuracy for three tree-building methods.

    Runs ``sample_size`` independent ``Experiment`` simulations.  In each one
    a random subset of the samples (each kept with probability
    ``sample_rate``) is reconstructed with the ``'SDP'``, ``'greedy'`` and
    ``'greedy+'`` builders, and each tree is scored with
    ``E.triplets_correct_at_depth(..., method='all')`` at depths
    ``0 .. int(h/2)+1``.  The mean per depth is plotted with error bars of
    std / sqrt(sample_size).

    Parameters
    ----------
    sample_size : int   number of simulated experiments
    h : int             passed to ``Experiment`` (presumably the tree height)
    sample_rate : float probability of keeping each sample
    num_char : int      number of characters, each with mutation probability ``p``
    p : float           per-character mutation probability
    num_states : int    passed to ``Experiment``
    dr : float          passed to ``Experiment`` as ``dropout_rate``
    """
    depths = list(range(int(h/2)+2))
    tc_mxc = []
    tc_g = []
    tc_gi = []
    for a in range(sample_size):
        print('experiment', a)
        E = Experiment([p for i in range(num_char)], num_states, h, dropout_rate=dr)
        subsample = set()
        for j in range(len(E.samples)):
            if np.random.random() < sample_rate:
                subsample.add(j)
        Tm = E.build_tree('SDP', subset=subsample)
        Tg = E.build_tree('greedy', subset=subsample)
        Tgi = E.build_tree('greedy+', subset=subsample)
        tc_mxc.append(E.triplets_correct_at_depth(Tm, method='all', sampling_depths=depths))
        tc_g.append(E.triplets_correct_at_depth(Tg, method='all', sampling_depths=depths))
        tc_gi.append(E.triplets_correct_at_depth(Tgi, method='all', sampling_depths=depths))
    plt.errorbar(depths, np.average(tc_mxc, axis=0), yerr=np.std(tc_mxc, axis=0)/np.sqrt(sample_size), marker='o', markersize='5', color='black', label='SDP Heuristic + HC')
    plt.errorbar(depths, np.average(tc_g, axis=0), yerr=np.std(tc_g, axis=0)/np.sqrt(sample_size), marker='o', markersize='5', color='red', label='greedy')
    # This curve comes from the 'greedy+' builder, so it must not be labelled as SDP.
    plt.errorbar(depths, np.average(tc_gi, axis=0), yerr=np.std(tc_gi, axis=0)/np.sqrt(sample_size), marker='o', markersize='5', color='blue', label='greedy + HC')
    plt.xlabel("depth")
    plt.ylabel("triplets correct")
    plt.legend()
    plt.show()

In [None]:
# Baseline run with every TC_levels default (dr=0, num_states=11).
TC_levels()

In [None]:
# Defaults except for a dropout rate of 0.2.
TC_levels(dr=0.2)

In [None]:
# Defaults except for num_states=4.
TC_levels(num_states=4)

In [None]:
# num_states=4 combined with a dropout rate of 0.2.
TC_levels(num_states=4, dr=0.2)

In [15]:
def test_ps(lst, sample_size=5, h=10, sample_rate=0.2, num_char=40, num_states=11, dr=0):
    """Plot triplets-correct against character mutation probability.

    For every probability in ``lst``, ``sample_size`` experiments are
    simulated; each keeps a sample with probability ``sample_rate``, builds
    trees with the 'SDP', 'greedy' and 'greedy+' methods and records
    ``E.triplets_correct`` for each.  Means are drawn with error bars of
    std / sqrt(sample_size).
    """
    print("testing p vals")
    builders = ('SDP', 'greedy', 'greedy+')
    data = np.zeros((3, len(lst), sample_size))
    for pos, p in enumerate(lst):
        for i in range(sample_size):
            E = Experiment([p for _ in range(num_char)], num_states, h, dropout_rate=dr)
            subsample = {j for j in range(len(E.samples)) if np.random.random() < sample_rate}
            # Build all three trees first, then score them, in a fixed order.
            trees = [E.build_tree(name, subset=subsample) for name in builders]
            for row, tree in enumerate(trees):
                data[row, pos, i] = E.triplets_correct(tree)
            print("pos = ", pos, "sample number", i)

    series = (('black', 'SDP-MXC'), ('red', 'greedy '), ('blue', 'greedy+ hill climb'))
    columns = range(len(lst))
    for row, (colour, label) in enumerate(series):
        means = [np.average(data[row, k, :]) for k in columns]
        errors = [np.std(data[row, k, :])/np.sqrt(sample_size) for k in columns]
        plt.errorbar(lst, means, yerr=errors, marker='o', markersize='5', color=colour, label=label)
    plt.xlabel("character mutation probability")
    plt.ylabel("triplets correct")
    plt.legend()
    plt.show()

def test_nc(lst, sample_size=5, h=10, sample_rate=0.2, p=0.1, num_states=11, dr=0):
    """Plot triplets-correct against the number of characters.

    For every character count in ``lst``, ``sample_size`` experiments are
    simulated; each keeps a sample with probability ``sample_rate``, builds
    trees with the 'SDP', 'greedy' and 'greedy+' methods and records
    ``E.triplets_correct`` for each.  Means are drawn with error bars of
    std / sqrt(sample_size).
    """
    print("testing number of chars")
    builders = ('SDP', 'greedy', 'greedy+')
    data = np.zeros((3, len(lst), sample_size))
    for pos, num_char in enumerate(lst):
        for i in range(sample_size):
            E = Experiment([p for _ in range(num_char)], num_states, h, dropout_rate=dr)
            subsample = {j for j in range(len(E.samples)) if np.random.random() < sample_rate}
            # Build all three trees first, then score them, in a fixed order.
            trees = [E.build_tree(name, subset=subsample) for name in builders]
            for row, tree in enumerate(trees):
                data[row, pos, i] = E.triplets_correct(tree)
            print("pos = ", pos, "sample number", i)

    series = (('black', 'SDP-MXC'), ('red', 'greedy '), ('blue', 'greedy+ hill climb'))
    columns = range(len(lst))
    for row, (colour, label) in enumerate(series):
        means = [np.average(data[row, k, :]) for k in columns]
        errors = [np.std(data[row, k, :])/np.sqrt(sample_size) for k in columns]
        plt.errorbar(lst, means, yerr=errors, marker='o', markersize='5', color=colour, label=label)
    plt.xlabel("number of characters")
    plt.ylabel("triplets correct")
    plt.legend()
    plt.show()

def test_drop_out(lst, sample_size=5, h=10, sample_rate=0.2, num_char=40, p=0.1, num_states=11):
    """Plot triplets-correct against the dropout rate.

    For every dropout rate in ``lst``, ``sample_size`` experiments are
    simulated; each keeps a sample with probability ``sample_rate``, builds
    trees with the 'SDP', 'greedy' and 'greedy+' methods and records
    ``E.triplets_correct`` for each.  Means are drawn with error bars of
    std / sqrt(sample_size).
    """
    print("testing dropout")
    builders = ('SDP', 'greedy', 'greedy+')
    data = np.zeros((3, len(lst), sample_size))
    for pos, dr in enumerate(lst):
        for i in range(sample_size):
            E = Experiment([p for _ in range(num_char)], num_states, h, dropout_rate=dr)
            subsample = {j for j in range(len(E.samples)) if np.random.random() < sample_rate}
            # Build all three trees first, then score them, in a fixed order.
            trees = [E.build_tree(name, subset=subsample) for name in builders]
            for row, tree in enumerate(trees):
                data[row, pos, i] = E.triplets_correct(tree)
            print("pos = ", pos, "sample number", i)

    series = (('black', 'SDP-MXC'), ('red', 'greedy '), ('blue', 'greedy+ hill climb'))
    columns = range(len(lst))
    for row, (colour, label) in enumerate(series):
        means = [np.average(data[row, k, :]) for k in columns]
        errors = [np.std(data[row, k, :])/np.sqrt(sample_size) for k in columns]
        plt.errorbar(lst, means, yerr=errors, marker='o', markersize='5', color=colour, label=label)
    plt.xlabel("drop_out_rate")
    plt.ylabel("triplets correct")
    plt.legend()
    plt.show()
    
def test_qs(lst, sample_size=5, h=10, sample_rate=0.2, num_char=40, p=0.1, dr=0):
    """Plot triplets-correct against the number of character states.

    For every state count in ``lst``, ``sample_size`` experiments are
    simulated; each keeps a sample with probability ``sample_rate``, builds
    trees with the 'SDP', 'greedy' and 'greedy+' methods and records
    ``E.triplets_correct`` for each.  Means are drawn with error bars of
    std / sqrt(sample_size); the figure title shows the value of ``p``.
    """
    print("testing number of states")
    builders = ('SDP', 'greedy', 'greedy+')
    data = np.zeros((3, len(lst), sample_size))
    for pos, num_states in enumerate(lst):
        for i in range(sample_size):
            E = Experiment([p for _ in range(num_char)], num_states, h, dropout_rate=dr)
            subsample = {j for j in range(len(E.samples)) if np.random.random() < sample_rate}
            # Build all three trees first, then score them, in a fixed order.
            trees = [E.build_tree(name, subset=subsample) for name in builders]
            for row, tree in enumerate(trees):
                data[row, pos, i] = E.triplets_correct(tree)

    series = (('black', 'SDP-MXC'), ('red', 'greedy '), ('blue', 'greedy+ hill climb'))
    columns = range(len(lst))
    for row, (colour, label) in enumerate(series):
        means = [np.average(data[row, k, :]) for k in columns]
        errors = [np.std(data[row, k, :])/np.sqrt(sample_size) for k in columns]
        plt.errorbar(lst, means, yerr=errors, marker='o', markersize='5', color=colour, label=label)
    plt.xlabel("number of states")
    plt.ylabel("triplets correct")
    plt.title("p = " + str(p))
    plt.legend()
    plt.show()


In [16]:
# Sweep the number of characters (40-120) with 6 states, dropout 0.2 and
# 20 experiments per setting.
test_nc([40,60,80,100,120], sample_size=20, num_states=6, h=10, dr=0.2)

testing number of chars
[[2, 3, 2, 0, 2, 4, 2, 2, 0, 4, 5, 4, 4, 0, 5, 0, 1, 0, 5, 3, 1, 0, 5, 5, 1, 4, 5, 0, 4, 3, 1, 3, 3, 0, 1, 0, 5, 5, 1, 4], [2, 3, 2, 0, 2, 4, 2, 2, 0, 4, 5, 4, 4, 5, 5, 0, 1, 0, 5, 3, 1, 0, 5, 5, 1, 4, 5, 0, 4, 3, 1, 3, 3, 0, 1, 0, 5, 5, 1, 4], [5, 3, 2, 0, 2, 4, 2, 2, 0, 4, 5, 4, 4, 2, 5, 0, 1, 0, 5, 3, 1, 0, 5, 5, 1, 4, 5, 0, 0, 0, 1, 3, 3, 0, 1, 0, 5, 5, 1, 4], [5, 3, 2, 0, 2, 4, 2, 2, 0, 4, 5, 4, 4, 2, 5, 0, 1, 0, 5, 3, 1, 0, 5, 5, 1, 4, 5, 0, 0, 0, 1, 3, 3, 0, 1, 0, 5, 5, 1, 4], [0, 3, 2, 0, 2, 5, 2, 2, 0, 4, 5, 4, 4, 0, 5, 5, 1, 0, 5, 3, 1, 0, 5, 5, 1, 4, 5, 1, 0, 0, 1, 0, 3, 2, 1, 0, 5, 5, 1, 4], [0, 3, 2, 5, 2, 3, 2, 2, 0, 4, 5, 4, 4, 0, 5, 5, 1, 0, 5, 3, 1, 2, 5, 5, 1, 4, 5, 1, 0, 0, 1, 0, 3, 2, 1, 0, 5, 5, 1, 4], [0, 3, 2, 0, 2, 0, 2, 2, 0, 4, 5, 4, 4, 4, 5, 5, 1, 0, 5, 3, 1, 0, 5, 5, 1, 4, 5, 1, 0, 0, 1, 0, 3, 4, 1, 0, 5, 5, 1, 4], [0, 3, 2, 0, 2, 0, 2, 2, 0, 4, 5, 4, 4, 0, 5, 5, 1, 0, 5, 3, 1, 5, 5, 5, 1, 4, 5, 1, 4, 0, 1, 0, 3, 4, 1, 0, 5, 5, 1, 4],

KeyboardInterrupt: 

In [None]:
# Sweep dropout rates 0-0.6 with 4 states and 20 experiments per setting.
test_drop_out([0,0.1,0.2,0.3,0.4,0.5,0.6], sample_size=20, num_states=4, h=10)

In [None]:
# Same dropout sweep with 13 states and mutation probability 0.1.
test_drop_out([0,0.1,0.2,0.3,0.4,0.5,0.6], sample_size=20, num_states=13, p=0.1, h=10)

In [None]:
# Mutation probabilities to sweep over.
ps = [0.05,0.1,0.15,0.2,0.25]
# Numbers of character states to sweep over.
ns = [2, 6, 11, 16, 21]

In [None]:
# One test_qs figure per mutation probability, each sweeping the state counts in `ns`.
for prob in ps:
    test_qs(ns, h=10, p=prob)

In [None]:
'''
num_experiments = 10
h = 9
sample_rate = 0.2
dr = 0.2
experiments = []
MXC_cut_scores = np.zeros(num_experiments)
MXC_TC = np.zeros((3, num_experiments), dtype=float)
G_cut_scores = np.zeros(num_experiments)
G_TC = np.zeros((3, num_experiments), dtype=float)
GI_cut_scores = np.zeros(num_experiments)
GI_TC = np.zeros((3, num_experiments), dtype=float)

def get_cut_data(G, B, superset, S, T, cut_scores, TC_scores, i):
    cut_score = evaluate_cut(G,B,S)
    s = ''
    s += "cut score = " + str(cut_score) + " "
    cut_scores[i] = cut_score
    TC, TI, UR = evaluate_split(superset, S, T)
    TC_scores[0][i], TC_scores[1][i], TC_scores[2][i] = TC, TI, UR
    s += "triplets correct = " + str(TC) + " "
    s += "triplets incorrect = " + str(TI) + " "
    s += "unresolved = "+ str(UR)
    return s

for index in range(num_experiments):
    E = Experiment([0.1 for i in range(40)], 11, h, dropout_rate=dr)
    experiments.append(E)
    subsample = []
    for i in range(len(E.samples)):
        if np.random.random() < sample_rate:
            subsample.append(i)
    G = construct_connectivity_graph(E.samples, subset=subsample)
    MX_cut = max_cut_heuristic(G,3,50)
    g_cut = greedy_cut(E.samples, subset=subsample)
    gI_cut = improve_cut(G, g_cut.copy())
    print("new sample")
    print("max cut")
    print(get_cut_data(G, B, subsample, MX_cut, E.ground_truth_tree, MXC_cut_scores, MXC_TC, index))
    print("greedy cut")
    print(get_cut_data(G, B, subsample, g_cut, E.ground_truth_tree, G_cut_scores, G_TC, index))
    print("greedy + hill climb cut")
    print(get_cut_data(G, B, subsample, gI_cut, E.ground_truth_tree, GI_cut_scores, GI_TC, index))
#print(MXC_TC)
-------------------------------------------

'''
def num_chars_expr(char_nums, h=10, sample_rate=0.2, num_trials=5):
    """Compare three splitting algorithms as the number of characters varies.

    For each entry of ``char_nums``, ``num_trials`` experiments are simulated
    (mutation probability 0.1 per character, 4 states).  Each experiment
    keeps a sample with probability ``sample_rate``, builds the connectivity
    graph and computes three cuts: ``max_cut_heuristic``, ``greedy_cut`` and
    ``improve_cut`` applied to a copy of the greedy cut.  The cut score
    (``evaluate_cut``) and the three values returned by ``evaluate_split``
    (plotted as proportions of triplets correct / incorrect / unresolved) are
    averaged over the trials.

    Four figures are shown; the cut-score figure is also saved as "mxc.pdf"
    in the current working directory.

    NOTE(review): the return shape of ``construct_connectivity_graph`` and the
    argument order of ``evaluate_cut`` are taken as-is from the original
    code -- confirm against their definitions.
    """
    num_experiments = len(char_nums)
    MXC_cut_scores = np.zeros(num_experiments)
    MXC_TC = np.zeros((3, num_experiments), dtype=float)
    G_cut_scores = np.zeros(num_experiments)
    G_TC = np.zeros((3, num_experiments), dtype=float)
    GI_cut_scores = np.zeros(num_experiments)
    GI_TC = np.zeros((3, num_experiments), dtype=float)

    for ind in range(len(char_nums)):
        cm = char_nums[ind]
        for trial in range(num_trials):
            # The experiment is not stored: the original appended it to a
            # global `experiments` list that is never defined.
            E = Experiment([0.1 for i in range(cm)], 4, h)
            subsample = []
            for jay in range(len(E.samples)):
                if np.random.random() < sample_rate:
                    subsample.append(jay)
            G = construct_connectivity_graph(E.samples, subset=subsample)
            MX_cut = max_cut_heuristic(G,3,50)
            g_cut = greedy_cut(E.samples, subset=subsample)
            gI_cut = improve_cut(G, g_cut.copy())
            MXC_cut_scores[ind] += evaluate_cut(MX_cut, G)
            G_cut_scores[ind] += evaluate_cut(g_cut,G)
            GI_cut_scores[ind] += evaluate_cut(gI_cut,G)
            T = E.ground_truth_tree
            TC, TI, UR = evaluate_split(subsample, MX_cut, T)
            MXC_TC[0][ind] += TC
            MXC_TC[1][ind] += TI
            MXC_TC[2][ind] += UR
            TC, TI, UR = evaluate_split(subsample, g_cut, T)
            G_TC[0][ind] += TC
            G_TC[1][ind] += TI
            G_TC[2][ind] += UR
            TC, TI, UR = evaluate_split(subsample, gI_cut, T)
            GI_TC[0][ind] += TC
            GI_TC[1][ind] += TI
            GI_TC[2][ind] += UR
            print("trial " + str(trial) + " of experiment " + str(ind))
    # Turn the accumulated sums into per-trial averages.
    MXC_cut_scores = MXC_cut_scores/num_trials
    MXC_TC = MXC_TC/num_trials
    G_cut_scores = G_cut_scores/num_trials
    G_TC = G_TC/num_trials
    GI_cut_scores = GI_cut_scores/num_trials
    GI_TC = GI_TC/num_trials

    plt.figure()
    plt.plot(char_nums, MXC_cut_scores,  marker='o', markersize='5', color='black', label='SDP Heuristic + HC')
    plt.plot(char_nums, G_cut_scores, ls='none', marker='o', markersize='5', color='red', label='Greedy')
    plt.plot(char_nums, GI_cut_scores, ls='none',  marker='o', markersize='5', color='blue', label='Greedy+HC')
    plt.ylabel("cut scores of splitting algorithms")
    plt.xlabel("number of characters")
    plt.legend()
    # Save before show(): once the figure has been shown, savefig may write
    # an empty canvas.
    plt.savefig("mxc.pdf")
    plt.show()

    # One figure per evaluate_split component, all drawn the same way.
    panels = (
        (0, "proportions of triplets correct"),
        (1, "proportions of triplets incorrect"),
        (2, "proportions of triplets unresolved"),
    )
    for row, ylabel in panels:
        plt.figure()
        plt.plot(char_nums, MXC_TC[row,:], marker='o', markersize='5', color='black', label='SDP Heuristic + HC')
        plt.plot(char_nums, G_TC[row,:], ls='none', marker='o', markersize='5', color='red', label='Greedy')
        plt.plot(char_nums, GI_TC[row,:], ls='none', marker='o', markersize='5', color='blue', label='Greedy+HC')
        plt.ylabel(ylabel)
        plt.xlabel("number of characters")
        plt.legend()
        plt.show()
    
#num_chars_expr([10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60], h=10, sample_rate=0.1, num_trials=5)

In [14]:
def compare_trees(num_experiments=10, h=10, sample_rate=0.2, p=0.1, num_char=20, num_states=4, dr=0):
    """Print triplets-correct for the three tree builders on fresh experiments.

    Each of the ``num_experiments`` iterations simulates an ``Experiment``,
    keeps every sample with probability ``sample_rate``, builds trees with
    the 'SDP', 'greedy' and 'greedy+' methods and prints their
    ``E.triplets_correct`` scores on one line.
    """
    for _ in range(num_experiments):
        E = Experiment([p for _ in range(num_char)], num_states, h, dropout_rate=dr)
        subsample = {j for j in range(len(E.samples)) if np.random.random() < sample_rate}
        # Build every tree before scoring any of them.
        Tm, Tg, Tgi = [E.build_tree(name, subset=subsample) for name in ('SDP', 'greedy', 'greedy+')]
        sdp_score = E.triplets_correct(Tm)
        greedy_score = E.triplets_correct(Tg)
        hill_climb_score = E.triplets_correct(Tgi)
        print("SDP TC: ", sdp_score, " greedy TC:", greedy_score, " greedy hill-climb TC:", hill_climb_score)

#compare_trees(10, h=10, sample_rate=0.2, p=0.1, num_char=20, num_states=4, dr=0)

In [None]:
'''
remove duplicates tests



n1 = Node([1,0,3], 5)
n2 = Node([0,3,3], 5)
n3 = Node([4,1,3], 5)
n4 = Node([4,2,3], 5)
n5 = Node([1,0,4], 5)
n6 = Node([0,3,0], 5)
n7 = Node([2,1,0], 5)
n8 = Node([2,2,2], 5)
nodes = [n1,n2,n2,n4,n1,n1,n4,n4,n1,n1,n5,n5]
indices = [0,1,2,3,6,8,9,10,11]

for i in indices:
    print(nodes[i])
remove_duplicates(nodes, indices)
-----------------------
node tests

n1 = Node([0,-1,1,4], 5)
n2 = Node([0,2,0,4], 4)
n3 = Node([2,2,0,4], 4)
n4 = Node([2,2,1,4], 4)
tsamples = [n1,n2,n3,n4]
Gt, Bt = construct_connectivity_graph(tsamples, subset={0,2,3})
print(nx.adjacency_matrix(Gt).todense())
print("\n")
print(nx.adjacency_matrix(Bt).todense())
'''
pass

In [None]:
# Scratch check: calls .copy() on a set literal.
{1,2,3}.copy()

In [None]:
# Scratch micro-benchmark: time one million iterations of four list-element
# equality checks and print the elapsed seconds.
# Note: this rebinds `start_time`, which the first cell of the notebook also sets.
start_time = time.time()
lst = [n for n in range(1000)]
for i in range(1000000):
    if lst[0] == lst[0]:
        z = "reeee"
    if lst[0] == lst[1]-1:
        z = "reeee"
    if lst[1]-1 == lst[0]:
        z = "reeee" 
    # Same condition as the previous check (repeated in the original).
    if lst[1]-1 == lst[0]:
        z = "reeee" 
print( (time.time() - start_time))