In [4]:
import csv
import os
import pickle as pic
import random
import subprocess
import sys

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
#import seaborn as sns
from tqdm import tqdm_notebook

import cassiopeia.TreeSolver.simulation_tools.dataset_generation as data_gen
import cassiopeia.TreeSolver.simulation_tools.simulation_utils as sim_utils
from cassiopeia.TreeSolver.Cassiopeia_Tree import Cassiopeia_Tree
from cassiopeia.TreeSolver.Node import Node

In [5]:
#Specify number of cells and the imputation algorithm used
recon_method = "lookahead"
NUM_CELLS = 1500

#Travel up the tree and grab a missing character from a node's first ancestor
#where it is no longer missing
def get_predecessor_val(tree, node, ind, verbose=False):
    """Return the first non-missing state of character ``ind`` among ``node``'s ancestors.

    Starting from ``node``, repeatedly steps to the first predecessor reported by
    ``tree.network.predecessors`` and inspects that ancestor's ``char_vec[ind]``.
    The states '-' and 'H' are treated as missing and skipped.

    The walk is iterative rather than recursive so that a long chain of ancestors
    with missing states cannot exhaust Python's recursion limit.

    Parameters
    ----------
    tree : object exposing ``network.predecessors(node)``
    node : node to start from (its own state is not inspected)
    ind : int
        Position in ``char_vec`` to look up.
    verbose : bool, optional
        When True, print the ``char_vec`` of every ancestor visited. Defaults to False.

    Returns
    -------
    The ancestor's state at ``ind``, or None if the walk reaches a node with no
    predecessors before finding a non-missing state.
    """
    current = node
    while True:
        # Only the first predecessor is followed, exactly as the original loop did
        # (it returned during its first iteration).
        parent = next(iter(tree.network.predecessors(current)), None)
        if parent is None:
            return None
        if verbose:
            print(parent.char_vec)
        state = parent.char_vec[ind]
        if state != '-' and state != 'H':
            return state
        current = parent

In [None]:
# Iterate over the reconstruction methods and the heritable dropout percentages. For every
# replicate, impute the missing entries of the post-dropout character matrix from the
# reconstructed tree and record the fraction that matches the ground truth.
# NOTE: the outer loop variable reuses the global name `recon_method`, so after this cell has
# run, later cells see the last method in the list rather than the value set in the config cell.
base_dir = "/data/yosef2/users/richardz/projects/dropout_testing/ground_truth_testing/" + str(NUM_CELLS) + "cells/"

for recon_method in ["avg", "lookahead", "knn"]:
    print(recon_method)

    for drop in range(6):

        hdropout_percent = str(drop)
        print("Percent: " + hdropout_percent)

        #Establish path
        path = base_dir + hdropout_percent + "percent"

        #Main loop
        accuracies = []
        for num in range(0, 50):
            #Load the tree, the character matrix, and post-process the tree
            # NOTE(security): pickle.load can execute arbitrary code; these files are assumed to be
            # trusted outputs of the reconstruction runs -- confirm before pointing this elsewhere.
            with open(path + "/" + recon_method + "/dropout_cm" + str(num) + "_" + recon_method + ".pkl", 'rb') as tree_file:
                test_recon = pic.load(tree_file)
            dropout_cm = pd.read_csv(path + '/dropout_cm' + str(num) + '.txt', sep = '\t', index_col = 0)
            test_ppg = test_recon.post_process(dropout_cm)

            #Create a dictionary to map the names of the nodes to the node objects
            names_to_leaves = {
                tree_node.name: tree_node
                for tree_node in test_ppg.network.nodes
                if tree_node.name != "state-node"
            }

            #For each named node in the character matrix (post dropout), look up its node object and go through
            #its row. For each missing value, grab the character from its first non-missing ancestor in
            #the reconstructed tree. If a character value is still missing at root, implicitly assume its value is 0.
            #Then add the reconstructed row to the list and construct a data frame.
            reconstructed_rows = []
            for index, row in dropout_cm.iterrows():
                node = names_to_leaves[index]
                reconstructed_row = []
                for i in range(len(row)):
                    # .iloc makes the positional lookup explicit; plain row[i] relies on a
                    # deprecated fallback when the column labels are not integers.
                    observed = row.iloc[i]
                    if observed == '-' or observed == 'H':
                        char = get_predecessor_val(test_ppg, node, i)
                        if char is None:
                            char = '0'
                        reconstructed_row.append(char)
                    else:
                        reconstructed_row.append(observed)
                reconstructed_rows.append(reconstructed_row)
            reconstructed_cm = pd.DataFrame(reconstructed_rows)

            #Read in the ground-truth character matrix, before dropout is introduced
            ground_truth_cm = pd.read_csv(path + "/ground_truth_cm" + str(num) + ".txt", sep='\t', index_col = 0)

            #For each missing character in the post-dropout character matrix, check to see if the imputed
            #character matrix matches the ground-truth character matrix at that position (compared as strings,
            #position by position). The proportion correct is matches / total dropouts.
            was_dropped = dropout_cm.isin(["-", "H"]).to_numpy()
            num_dropped = int(was_dropped.sum())
            if num_dropped:
                n_rows, n_cols = dropout_cm.shape
                truth = ground_truth_cm.iloc[:n_rows, :n_cols].to_numpy().astype(str)
                imputed = reconstructed_cm.to_numpy().astype(str)
                num_correct = int((was_dropped & (truth == imputed)).sum())
                accuracy = num_correct/num_dropped
            else:
                # Nothing was dropped in this replicate, so the proportion is undefined;
                # record NaN instead of raising ZeroDivisionError.
                num_correct = 0
                accuracy = float("nan")

            accuracies.append(accuracy)
            print(num, accuracy)

        #Write the proportions correctly imputed to CSV (the with-block closes the file)
        with open(base_dir + recon_method + '_accuracies' + hdropout_percent + '.csv', 'w', newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(accuracies)

avg
Percent: 0


In [None]:
# Debugging cell: repeat the imputation check for a single replicate (dropout level "2",
# replicate 3) with verbose printing, scoring only the top-left corner of the matrix.
# Uses whatever `recon_method` is currently bound to in the kernel.
hdropout_percent = str(2)

path = "/data/yosef2/users/richardz/projects/dropout_testing/ground_truth_testing/" + str(NUM_CELLS) + "cells/" + hdropout_percent + "percent"
accuracies = []

num = 3

# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open(path + "/" + recon_method + "/dropout_cm" + str(num) + "_" + recon_method + ".pkl", 'rb') as tree_file:
    test_recon = pic.load(tree_file)
dropout_cm = pd.read_csv(path + '/dropout_cm' + str(num) + '.txt', sep = '\t', index_col = 0)
test_ppg = test_recon.post_process(dropout_cm)

# Map node names to node objects, skipping the nodes named "state-node".
names_to_leaves = {}

for i in list(test_ppg.network.nodes):
    if i.name != "state-node":
        names_to_leaves[i.name] = i

# Rebuild every row, filling missing entries ('-' or 'H', the same markers the full sweep
# and get_predecessor_val treat as missing) from the first non-missing ancestor.
reconstructed_rows = []
for index, row in dropout_cm.iterrows():
    print(index)
    node = names_to_leaves[index]
    reconstructed_row = []
    for i in range(len(row)):
        # Explicit positional lookup; plain row[i] relies on a deprecated fallback.
        observed = row.iloc[i]
        if observed == '-' or observed == 'H':
            char = get_predecessor_val(test_ppg, node, i)
            if char is None:
                # Still missing at the root: assume the unedited state.
                char = '0'
            reconstructed_row.append(char)
        else:
            # Observed entries are copied from the matrix row, as in the full sweep.
            reconstructed_row.append(observed)
    reconstructed_rows.append(reconstructed_row)
reconstructed_cm = pd.DataFrame(reconstructed_rows)

ground_truth_cm = pd.read_csv(path + "/ground_truth_cm" + str(num) + ".txt", sep='\t', index_col = 0)

# Inspect at most a 10 x 10 corner; clamp to the matrix size so small matrices do not
# raise IndexError.
preview_rows = min(10, dropout_cm.shape[0])
preview_cols = min(10, dropout_cm.shape[1])

num_dropped = 0
num_correct = 0
for k in range(preview_rows):
    for j in range(preview_cols):
        print("new")
        print(dropout_cm.iloc[k,j])
        if dropout_cm.iloc[k,j] == "-" or dropout_cm.iloc[k,j] == "H":
            num_dropped += 1
            print(ground_truth_cm.iloc[k,j])
            print(reconstructed_cm.iloc[k,j])
            if str(ground_truth_cm.iloc[k,j]) == str(reconstructed_cm.iloc[k,j]):
                num_correct += 1
                print("match")

print(dropout_cm.shape)
print(ground_truth_cm.shape)
print(reconstructed_cm.shape)
# The inspected corner may contain no dropouts; report NaN rather than dividing by zero.
accuracy = num_correct/num_dropped if num_dropped else float("nan")
accuracies.append(accuracy)
print(num, accuracy)


In [None]:
# NOTE: this is a tracing variant of get_predecessor_val. Running this cell rebinds the name,
# so every later call in the kernel prints each visited ancestor unless verbose=False is passed.
def get_predecessor_val(tree, node, ind, verbose=True):
    """Return the first non-missing state of character ``ind`` among ``node``'s ancestors.

    Starting from ``node``, repeatedly steps to the first predecessor reported by
    ``tree.network.predecessors`` and inspects that ancestor's ``char_vec[ind]``.
    The states '-' and 'H' are treated as missing and skipped.

    The walk is iterative so that a long chain of ancestors with missing states
    cannot exhaust Python's recursion limit.

    Parameters
    ----------
    tree : object exposing ``network.predecessors(node)``
    node : node to start from (its own state is not inspected)
    ind : int
        Position in ``char_vec`` to look up.
    verbose : bool, optional
        When True (the default for this debugging variant), print the ``char_vec``
        of every ancestor visited.

    Returns
    -------
    The ancestor's state at ``ind``, or None if the walk reaches a node with no
    predecessors before finding a non-missing state.
    """
    current = node
    while True:
        # Only the first predecessor is followed, matching the original loop,
        # which returned during its first iteration.
        parent = next(iter(tree.network.predecessors(current)), None)
        if parent is None:
            return None
        if verbose:
            print(parent.char_vec)
        state = parent.char_vec[ind]
        if state != '-' and state != 'H':
            return state
        current = parent

In [None]:
# Debugging cell: rebuild the imputed character matrix while printing every step.
# Relies on `dropout_cm`, `names_to_leaves`, `test_ppg` and `get_predecessor_val` already
# being defined in the kernel by earlier cells.
reconstructed_rows = []
for index, row in dropout_cm.iterrows():
    node = names_to_leaves[index]
    reconstructed_row = []
    print(row)
    for i in range(len(row)):
        print(i)
        # Explicit positional lookup; plain row[i] relies on a deprecated fallback
        # when the column labels are not integers.
        observed = row.iloc[i]
        if observed == '-' or observed == 'H':
            char = get_predecessor_val(test_ppg, node, i)
            if char is None:
                # No ancestor carries a non-missing state: assume '0'.
                char = '0'
            reconstructed_row.append(char)
            print(char)
        else:
            reconstructed_row.append(observed)
    reconstructed_rows.append(reconstructed_row)
    print(reconstructed_row)
reconstructed_cm = pd.DataFrame(reconstructed_rows)
