In [14]:
using JLD
using FileIO
using SparseArrays
using CSV
using Distances
using Clustering
using DataFrames
using MultivariateStats
using PhyloClustering
using ParallelKMeans

# used for accuracy()
using Hungarian
using MLBase
using LinearAlgebra

"""
    ground_true(df_1, df_2)

Build the reference label vector for two stacked data sets: one `1` per row of
`df_1` followed by one `2` per row of `df_2`.
"""
function ground_true(df_1, df_2)
    first_labels = fill(1, size(df_1, 1))
    second_labels = fill(2, size(df_2, 1))
    return vcat(first_labels, second_labels)
end

"""
    accuracy(n, gt, pred)

Score predicted cluster labels `pred` against reference labels `gt` for `n`
classes.

Cluster ids are arbitrary, so the columns of the confusion matrix are permuted
with the Hungarian algorithm to the arrangement that maximizes the diagonal.

Returns `(matrix, x)`: the column-permuted confusion matrix and the fraction of
its mass on the diagonal (`tr(matrix) / sum(matrix)`).
"""
function accuracy(n, gt, pred)
    counts = confusmat(n, gt, pred)
    # `hungarian` minimizes cost, so flip the counts: large count => small cost.
    cost = maximum(counts) .- counts
    assignment = hungarian(cost)[1]
    matrix = counts[:, assignment]
    return matrix, tr(matrix) / sum(matrix)
end

accuracy (generic function with 1 method)

In [3]:
# Load the two example tree tables and convert each one to a plain matrix.
tree1 = Matrix(CSV.read("../../data/trees/4_diff_topo_1_1000_11.csv", DataFrame));
tree2 = Matrix(CSV.read("../../data/trees/4_diff_topo_12_1000_11.csv", DataFrame));

In [5]:
# Stack both tree sets, standardize them, and embed them with MDS (at most 5
# output dimensions) computed from the pairwise distance matrix.
# `gt`, `tree`, `distances`, `M` and `Y` stay global: later cells read them.
gt = ground_true(tree1, tree2);
tree = standardize_tree(vcat(tree1, tree2));
distances = distance(tree);
M = fit(MDS, distances; maxoutdim=5, distances=true);
Y = predict(M)

5×2000 Matrix{Float64}:
 -1.37519   -0.21631    1.02305   …  -1.72528   -1.9258    -1.30154
 -0.71968   -0.782244  -1.19323       0.390588   0.78688   -2.07822
 -0.548004   0.183745   0.871203     -1.17908   -0.295854   0.180827
  1.02889   -0.533747  -0.297327     -0.560565   1.22307    1.66759
  1.24852    0.116594  -0.568726     -0.290636  -1.33846    1.45876

In [6]:
# Hierarchical clustering into 2 groups, using Euclidean distances between the
# columns of the embedding `Y`, scored against the reference labels.
matrix = pairwise(Euclidean(), Y; dims=2);
hc = hc_label(matrix, 2);
accuracy(2, gt, hc)

([839 161; 21 979], 0.909)

In [7]:
# K-means with 2 clusters (Yinyang variant from ParallelKMeans) on the
# embedding `Y`, scored against the reference labels.
kmeans_pred = ParallelKMeans.kmeans(Yinyang(), Y, 2);
accuracy(2, gt, kmeans_pred.assignments)

([729 271; 10 990], 0.8595)

In [None]:
# Convert every 8- and 16-taxon tree file from JLD to JLD2.
#
# The original cell repeated the same loop eight times; the copies differed
# only in the taxon count (8, 16) and in the size tag that follows the
# topology index in the file name (100, 500, 1000, 5000). Both are loop
# variables here. Conversion order is unchanged: every 8-taxon size first,
# then every 16-taxon size, each over i = 1:100 and j = 1:15.
for taxa in (8, 16), n in (100, 500, 1000, 5000)
    for i in 1:100
        print("$i ")  # progress indicator, printed once per value of i
        for j in 1:15
            # Same file stem for source (.jld) and destination (.jld2).
            stem = string("data/", taxa, "_diff_topo_", j, "_", n, "_", i)
            tree = jldopen(stem * ".jld", "r") do file
                read(file, "tree")
            end
            save(stem * ".jld2", "tree", tree)
        end
    end
end

In [7]:
"""
    hc_matrix(trees, path)

Compute a pairwise hierarchical-clustering accuracy matrix and write it to
`path` as CSV.

For every unordered pair of entries in `trees`, the two tables are stacked,
standardized, embedded with MDS (at most 5 dimensions), split into 2 clusters
with `hc_label`, and scored with `accuracy` against the labels produced by
`ground_true`. The score is stored symmetrically; the diagonal stays `0.0`.

The CSV columns are named `"1"` … `"n"` where `n = length(trees)`. The return
value is whatever `CSV.write` returns.
"""
function hc_matrix(trees, path)
    n = length(trees)
    result = zeros(n, n)
    for i in 2:n, j in 1:(i - 1)
        labels = ground_true(trees[i], trees[j])
        pooled = standardize_tree(Matrix(vcat(trees[i], trees[j])))
        embedding = predict(fit(MDS, distance(pooled); maxoutdim=5, distances=true))
        # NOTE(review): the exploratory cell uses `pairwise(Euclidean(), Y, dims=2)`
        # here; confirm `distance` measures between columns of the embedding too.
        clusters = hc_label(distance(embedding), 2)
        _, score = accuracy(2, labels, clusters)
        result[i, j] = result[j, i] = score
    end
    return CSV.write(path, DataFrame(result, :auto); header=string.(1:n))
end

hc_matrix (generic function with 1 method)

In [None]:
# 4-taxon, same-topology data: one hierarchical-clustering accuracy matrix per
# value of i, each built from the 15 tree files j = 1:15.

# Size tags 50 and 100, read from and written to the repository data tree.
for n in (50, 100), i in 1:100
    trees = []
    for j in 1:15
        path = string("../../data/trees/4_taxa_", j, "_1_", n, "_", i, ".csv")
        tree = CSV.read(path, DataFrame)
        push!(trees, tree)
    end
    hc_matrix(
        trees,
        string("../../data/sd-result/4-taxon/same-topo/hc/hc_4taxa_", n, "_", i, ".csv"),
    )
end

# Size tag 500: working-directory-relative paths, iterations spread over threads.
Threads.@threads for i in 1:100
    print("$i ")
    trees = []
    for j in 1:15
        path = string("data/4_taxa_", j, "_1_500_", i, ".csv")
        tree = CSV.read(path, DataFrame)
        push!(trees, tree)
    end
    hc_matrix(trees, string("result/same-topo/hc/hc_4taxa_500_", i, ".csv"))
end

# TODO data transfer
for i in 1:100
    trees = []
    for j in 1:15
        path = string("../../data/trees/4_taxa_", j, "_1_1000_", i, ".csv")
        tree = CSV.read(path, DataFrame)
        push!(trees, tree)
    end
    hc_matrix(
        trees,
        string("../../data/sd-result/4-taxon/same-topo/hc/hc_4taxa_1000_", i, ".csv"),
    )
end

In [None]:
# 4-taxon, different-topology data: one hierarchical-clustering accuracy matrix
# per value of i, each built from the 15 tree files j = 1:15.

# Size tags 50 and 100, read from and written to the repository data tree.
for n in (50, 100), i in 1:100
    trees = []
    for j in 1:15
        path = string("../../data/trees/4_diff_topo_", j, "_", n, "_", i, ".csv")
        tree = CSV.read(path, DataFrame)
        push!(trees, tree)
    end
    hc_matrix(
        trees,
        string("../../data/sd-result/4-taxon/diff-topo/hc/hc_4_diff_topo_", n, "_", i, ".csv"),
    )
end

# Size tag 500: working-directory-relative paths, iterations spread over threads.
Threads.@threads for i in 1:100
    print("$i ")
    trees = []
    for j in 1:15
        path = string("data/4_diff_topo_", j, "_500_", i, ".csv")
        tree = CSV.read(path, DataFrame)
        push!(trees, tree)
    end
    hc_matrix(trees, string("result/hc/hc_4_diff_topo_500_", i, ".csv"))
end

In [None]:
# 8-taxon data: one hierarchical-clustering accuracy matrix per value of i,
# each built from the 15 tree files j = 1:15 (stored under the key "tree").

# Size tag 50, read from and written to the repository data tree.
for i in 1:100
    trees = []
    for j in 1:15
        path = string("../../data/trees/8_diff_topo_", j, "_50_", i, ".jld")
        tree = jldopen(file -> read(file, "tree"), path, "r")
        push!(trees, tree)
    end
    hc_matrix(
        trees,
        string("../../data/sd-result/8-taxon/hc/hc_8_diff_topo_50_", i, ".csv"),
    )
end

# Size tags 100, 500 and 1000: working-directory-relative paths, one threaded
# sweep over i per size tag, run one after another.
for n in (100, 500, 1000)
    Threads.@threads for i in 1:100
        print("$i ")
        trees = []
        for j in 1:15
            path = string("data/8_diff_topo_", j, "_", n, "_", i, ".jld")
            tree = jldopen(file -> read(file, "tree"), path, "r")
            push!(trees, tree)
        end
        hc_matrix(trees, string("result/hc/8/hc_8_diff_topo_", n, "_", i, ".csv"))
    end
end

# TODO
# Size tag 5000 is read from the converted .jld2 files.
Threads.@threads for i in 1:100
    trees = []
    print("$i ")
    for j in 1:15
        path = string("data/8_diff_topo_", j, "_5000_", i, ".jld2")
        tree = load(path, "tree")
        push!(trees, tree)
    end
    hc_matrix(trees, string("result/hc/8/hc_8_diff_topo_5000_", i, ".csv"))
end

In [9]:
# 16-taxon data: one hierarchical-clustering accuracy matrix per value of i,
# each built from the 15 tree files j = 1:15 (stored under the key "tree").

# Size tag 50: working-directory-relative paths, iterations spread over threads.
Threads.@threads for i in 1:100
    print("$i ")
    trees = []
    for j in 1:15
        path = string("data/16_diff_topo_", j, "_50_", i, ".jld")
        tree = jldopen(file -> read(file, "tree"), path, "r")
        push!(trees, tree)
    end
    hc_matrix(trees, string("result/hc/16/hc_16_diff_topo_50_", i, ".csv"))
end

# Size tag 100, read from and written to the repository data tree.
for i in 1:100
    trees = []
    for j in 1:15
        path = string("../../data/trees/16_diff_topo_", j, "_100_", i, ".jld")
        tree = jldopen(file -> read(file, "tree"), path, "r")
        push!(trees, tree)
    end
    hc_matrix(
        trees,
        string("../../data/sd-result/16-taxon/hc/hc_16_diff_topo_100_", i, ".csv"),
    )
end

# Size tags 500 and 1000: one threaded sweep over i per size tag.
for n in (500, 1000)
    Threads.@threads for i in 1:100
        print("$i ")
        trees = []
        for j in 1:15
            path = string("data/16_diff_topo_", j, "_", n, "_", i, ".jld")
            tree = jldopen(file -> read(file, "tree"), path, "r")
            push!(trees, tree)
        end
        hc_matrix(trees, string("result/hc/16/hc_16_diff_topo_", n, "_", i, ".csv"))
    end
end

#TODO 
Threads.@threads for i in 1:100
    print("$i ")
    trees = []
    for j in 1:15
        path = string("data/16_diff_topo_", j, "_5000_", i, ".jld")
        tree = jldopen(file -> read(file, "tree"), path, "r")
        push!(trees, tree)
    end
    hc_matrix(trees, string("result/hc/16/hc_16_diff_topo_5000_", i, ".csv"))
end

In [10]:
"""
    kmeans_matrix(trees, path)

Compute a pairwise k-means accuracy matrix and write it to `path` as CSV.

For every unordered pair of entries in `trees`, the two tables are stacked,
standardized, embedded with MDS (at most 5 dimensions), clustered once into 2
groups with `ParallelKMeans.kmeans(Yinyang(), …)`, and scored with `accuracy`
against the labels produced by `ground_true`. The score is stored
symmetrically; the diagonal stays `0.0`.

The CSV columns are named `"1"` … `"n"` where `n = length(trees)`. The return
value is whatever `CSV.write` returns.
"""
function kmeans_matrix(trees, path)
    n = length(trees)
    result = zeros(n, n)
    for i in 2:n, j in 1:(i - 1)
        labels = ground_true(trees[i], trees[j])
        pooled = standardize_tree(Matrix(vcat(trees[i], trees[j])))
        embedding = predict(fit(MDS, distance(pooled); maxoutdim=5, distances=true))
        clustering = ParallelKMeans.kmeans(Yinyang(), embedding, 2)
        _, score = accuracy(2, labels, clustering.assignments)
        result[i, j] = result[j, i] = score
    end
    return CSV.write(path, DataFrame(result, :auto); header=string.(1:n))
end

"""
    rep_kmeans_matrix(trees, path)

Compute a pairwise best-of-five k-means accuracy matrix and write it to `path`
as CSV.

The pipeline is the same as a single k-means run per pair (stack, standardize,
MDS to at most 5 dimensions, 2 clusters), except that k-means is run 5 times on
the same embedding and only the highest accuracy is kept. The score is stored
symmetrically; the diagonal stays `0.0`.

The CSV columns are named `"1"` … `"n"` where `n = length(trees)`. The return
value is whatever `CSV.write` returns.
"""
function rep_kmeans_matrix(trees, path)
    n = length(trees)
    result = zeros(n, n)
    for i in 2:n, j in 1:(i - 1)
        labels = ground_true(trees[i], trees[j])
        pooled = standardize_tree(Matrix(vcat(trees[i], trees[j])))
        embedding = predict(fit(MDS, distance(pooled); maxoutdim=5, distances=true))
        best = result[i, j]  # starts at 0.0; only a strictly better run replaces it
        for _ in 1:5
            clustering = ParallelKMeans.kmeans(Yinyang(), embedding, 2)
            _, score = accuracy(2, labels, clustering.assignments)
            if score > best
                best = score
            end
        end
        result[i, j] = result[j, i] = best
    end
    return CSV.write(path, DataFrame(result, :auto); header=string.(1:n))
end

rep_kmeans_matrix (generic function with 1 method)

In [11]:
# 4-taxon, same-topology data: one repeated-k-means accuracy matrix per value
# of i, each built from the 15 tree files j = 1:15, for size tags 50 and 100.
for n in (50, 100), i in 1:100
    trees = []
    for j in 1:15
        path = string("../../data/trees/4_taxa_", j, "_1_", n, "_", i, ".csv")
        tree = CSV.read(path, DataFrame)
        push!(trees, tree)
    end
    rep_kmeans_matrix(
        trees,
        string(
            "../../data/sd-result/4-taxon/same-topo/repKmeans/kmeans_4taxa_",
            n, "_", i, ".csv",
        ),
    )
end

In [15]:
# 4-taxon, different-topology data: one repeated-k-means accuracy matrix per
# value of i, each built from the 15 tree files j = 1:15.

# Size tags 50 and 100, read from and written to the repository data tree.
for n in (50, 100), i in 1:100
    trees = []
    for j in 1:15
        path = string("../../data/trees/4_diff_topo_", j, "_", n, "_", i, ".csv")
        tree = CSV.read(path, DataFrame)
        push!(trees, tree)
    end
    rep_kmeans_matrix(
        trees,
        string(
            "../../data/sd-result/4-taxon/diff-topo/repKmeans/kmeans_4_diff_topo_",
            n, "_", i, ".csv",
        ),
    )
end

# Size tag 500: working-directory-relative paths, iterations spread over threads.
Threads.@threads for i in 1:100
    print("$i ")
    trees = []
    for j in 1:15
        path = string("data/4_diff_topo_", j, "_500_", i, ".csv")
        tree = CSV.read(path, DataFrame)
        push!(trees, tree)
    end
    rep_kmeans_matrix(trees, string("result/4/kmeans_4_diff_topo_500_", i, ".csv"))
end

In [16]:
# 8-taxon data: one repeated-k-means accuracy matrix per value of i, each built
# from the 15 tree files j = 1:15 (stored under the key "tree").

# Size tag 50, read from and written to the repository data tree.
for i in 1:100
    trees = []
    for j in 1:15
        path = string("../../data/trees/8_diff_topo_", j, "_50_", i, ".jld")
        tree = jldopen(path, "r") do file
            read(file, "tree")
        end
        push!(trees, tree)
    end
    rep_kmeans_matrix(
        trees,
        string("../../data/sd-result/8-taxon/repKmeans/kmeans_8_diff_topo_50_", i, ".csv"),
    )
end

# Size tags 100 and 500, working-directory-relative paths.
# The original wrapped the loading loop in a second, identical `for j in 1:15`,
# so every file was pushed 15 times and `trees` held 225 entries instead of 15.
# Each file is now loaded exactly once, like in the other drivers.
for n in (100, 500), i in 1:100
    print("$i ")
    trees = []
    for j in 1:15
        path = string("data/8_diff_topo_", j, "_", n, "_", i, ".jld")
        tree = jldopen(path, "r") do file
            read(file, "tree")
        end
        push!(trees, tree)
    end
    rep_kmeans_matrix(
        trees,
        string("result/8-taxon/repKmeans/kmeans_8_diff_topo_", n, "_", i, ".csv"),
    )
end

#TODO
# Starts at i = 17 (presumably resuming an interrupted run — confirm before
# resetting the range to 1:100). The duplicated inner loop is removed here too.
for i in 17:100
    print("$i ")
    trees = []
    for j in 1:15
        path = string("data/8_diff_topo_", j, "_1000_", i, ".jld")
        tree = jldopen(path, "r") do file
            read(file, "tree")
        end
        push!(trees, tree)
    end
    rep_kmeans_matrix(
        trees,
        string("result/8-taxon/repKmeans/kmeans_8_diff_topo_1000_", i, ".csv"),
    )
end

In [17]:
# 16-taxon data: one repeated-k-means accuracy matrix per value of i, each
# built from the 15 tree files j = 1:15 (stored under the key "tree").

# Size tag 50, working-directory-relative paths.
# The original wrapped the loading loop in a second, identical `for j in 1:15`,
# so every file was pushed 15 times and `trees` held 225 entries instead of 15.
# Each file is now loaded exactly once.
for i in 1:100
    print("$i ")
    trees = []
    for j in 1:15
        path = string("data/16_diff_topo_", j, "_50_", i, ".jld")
        tree = jldopen(path, "r") do file
            read(file, "tree")
        end
        push!(trees, tree)
    end
    rep_kmeans_matrix(trees, string("result/16taxa/kmeans_16_diff_topo_50_", i, ".csv"))
end

# TODO
# Size tag 100, read from and written to the repository data tree.
for i in 1:100
    trees = []
    for j in 1:15
        path = string("../../data/trees/16_diff_topo_", j, "_100_", i, ".jld")
        tree = jldopen(path, "r") do file
            read(file, "tree")
        end
        push!(trees, tree)
    end
    rep_kmeans_matrix(
        trees,
        string(
            "../../data/sd-result/16-taxon/repKmeans/kmeans_16_diff_topo_100_",
            i, ".csv",
        ),
    )
end

