# Adjacency-constrained hierarchical clustering

In [1]:
# load packages for this tutorial
using Revise
using Knockoffs
using LinearAlgebra
using Random
using StatsBase
using Statistics
using ToeplitzMatrices
using Distributions
using Clustering
using ProgressMeter
using LowRankApprox
using Test
using RCall
# using Plots
# gr(fmt=:png);

# some helper functions to compute power and empirical FDR
"""
    TP(correct_groups, signif_groups)

Return the number of distinct elements shared by `signif_groups` and `correct_groups`,
divided by `length(correct_groups)`. The denominator is clamped to at least 1, so an
empty `correct_groups` gives `0.0`.
"""
function TP(correct_groups, signif_groups)
    n_hits = length(intersect(signif_groups, correct_groups))
    n_correct = max(1, length(correct_groups))
    return n_hits / n_correct
end
"""
    TP(correct_groups, β̂, groups)

Derive the significant groups from the nonzero entries of `β̂` (via
`get_signif_groups(β̂, groups)`) and forward to `TP(correct_groups, signif_groups)`.
"""
function TP(correct_groups, β̂, groups)
    return TP(correct_groups, get_signif_groups(β̂, groups))
end
"""
    power(correct_snps, discovered_snps)

Return the fraction of `correct_snps` that also appear in `discovered_snps`.

The denominator is clamped to at least 1, so an empty `correct_snps` yields `0.0`
instead of `NaN` (from `0 / 0`), matching how `TP` and `FDR` guard their denominators.
"""
function power(correct_snps, discovered_snps)
    n_found = length(discovered_snps ∩ correct_snps)
    # guard against 0 / 0 when there are no true signals
    return n_found / max(1, length(correct_snps))
end
"""
    FDR(correct_groups, signif_groups)

Return the empirical false discovery proportion: the number of entries of
`signif_groups` not accounted for by its overlap with `correct_groups`, divided by
`length(signif_groups)`. The denominator is clamped to at least 1, so an empty
`signif_groups` gives `0.0`.
"""
function FDR(correct_groups, signif_groups)
    n_signif = length(signif_groups)
    n_true = length(intersect(signif_groups, correct_groups))
    n_false = n_signif - n_true # number of false positives
    return n_false / max(1, n_signif)
end
"""
    FDR(correct_groups, β̂, groups)

Derive the significant groups from the nonzero entries of `β̂` (via
`get_signif_groups(β̂, groups)`) and forward to `FDR(correct_groups, signif_groups)`.
"""
function FDR(correct_groups, β̂, groups)
    return FDR(correct_groups, get_signif_groups(β̂, groups))
end
"""
    get_signif_groups(β, groups)

Return, as a `Vector{Int}`, the group labels `groups[i]` of every index `i` at which
`β` is nonzero. Each label is listed once, in order of first appearance.
"""
function get_signif_groups(β, groups)
    nonzero_idx = findall(!iszero, β)
    labels = Int[groups[idx] for idx in nonzero_idx]
    return unique!(labels)
end


┌ Info: Precompiling Knockoffs [878bf26d-0c49-448a-9df5-b057c815d613]
└ @ Base loading.jl:1423


get_signif_groups (generic function with 1 method)

## Simulate distance matrix

In [83]:
m = 1 # NOTE(review): not used in this cell — confirm it is needed later
p = 510
k = 10 # number of causal groups
n = 250 # sample size

# draw n samples from a zero-mean multivariate normal with covariance Σ,
# store them as an n × p matrix, and standardize each column in place
Σ = simulate_AR1(p, a=3, b=1)
μ = zeros(p)
X = Matrix(rand(MvNormal(μ, Σ), n)')
zscore!(X, mean(X, dims=1), std(X, dims=1))

# distance between two columns of X is 1 - |correlation|
distmat = 1 .- abs.(cor(X))
distmat

510×510 Matrix{Float64}:
 0.0       0.167823  0.731613   0.756078   …  0.941255   0.995377  0.995266
 0.167823  0.0       0.697667   0.722166      0.944739   0.998593  0.984422
 0.731613  0.697667  0.0        0.0652971     0.906421   0.946203  0.952265
 0.756078  0.722166  0.0652971  0.0           0.882778   0.915279  0.94765
 0.759292  0.746016  0.227585   0.181946      0.869516   0.92456   0.924731
 0.795561  0.807483  0.434723   0.414544   …  0.863424   0.935694  0.967145
 0.884693  0.879195  0.5801     0.569747      0.897584   0.970291  0.959074
 0.89157   0.921525  0.623816   0.619271      0.914606   0.990103  0.964012
 0.891972  0.91165   0.687795   0.675397      0.908559   0.993061  0.984018
 0.908683  0.918382  0.705699   0.692846      0.921397   0.992739  0.948852
 0.945385  0.967093  0.776381   0.745749   …  0.944651   0.96732   0.93935
 0.94617   0.986658  0.858884   0.81102       0.983365   0.945028  0.976951
 0.951971  0.99347   0.8629     0.813121      0.984755   0.948925

In [104]:
# native implementation: time Knockoffs.jl's adjacency-constrained hierarchical
# clustering on `distmat` and store the result in `groups`
# NOTE(review): `h=0.3` is presumably the cut height — confirm against Knockoffs.jl docs
@time groups = Knockoffs.adj_constrained_hclust(distmat, h=0.3)
groups

clusters = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49], [50], [51], [52], [53], [54], [55], [56], [57], [58], [59], [60], [61], [62], [63], [64], [65], [66], [67], [68], [69], [70], [71], [72], [73], [74], [75], [76], [77], [78], [79], [80], [81], [82], [83], [84], [85], [86], [87], [88], [89], [90], [91], [92], [93], [94], [95], [96], [97], [98], [99], [100], [101], [102], [103], [104], [105], [106], [107], [108], [109], [110], [111], [112], [113], [114], [115], [116], [117], [118], [119], [120], [121], [122], [123], [124], [125], [126], [127], [128], [129], [130], [131], [132], [133], [134], [135], [136], [137], [138], [139], [140], [141], [142], [143], [144], [145], [146], [147], [148], [149], [150], [151], [152], [153], [154], [155], [156], [157

Excessive output truncated after 526925 bytes.


clusters = [[1], [2], [3, 4], [5], [6], [7], [8], [9], [10], [11], [12, 13, 14], [15, 16, 17], [18], [19], [20], [21], [22, 23], [24], [25], [26], [27, 28], [29], [30, 31], [32], [33], [34, 35], [36], [37], [38, 39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49, 50], [51], [52], [53, 54], [55], [56], [57], [58], [59, 60], [61, 62], [63, 64, 65], [66], [67], [68, 69], [70, 71, 72], [73, 74], [75], [76], [77], [78], [79], [80], [81], [82], [83], [84], [85], [86], [87], [88], [89, 90], [91, 92], [93, 94, 95, 96], [97], [98], [99], [100], [101, 102, 103, 104], [105, 106], [107], [108], [109], [110], [111], [112, 113], [114, 115, 116], [117], [118], [119], [120], [121], [122], [123], [124], [125, 126], [127, 128, 129], [130], [131, 132], [133], [134], [135, 136, 137], [138], [139, 140, 141], [142], [143], [144], [145], [146], [147, 148], [149], [150, 151, 152], [153], [154, 155, 156], [157], [158], [159, 160], [161], [162, 163], [164], [165], [166], [167], [168], [169], [170],

510-element Vector{Int64}:
   1
   1
   2
   2
   2
   2
   2
   2
   2
   2
   2
   2
   2
   ⋮
 153
 153
 154
 154
 154
 154
 154
 154
 154
 154
 155
 156

In [101]:
# hierarchical clustering using Clustering.jl package:
# build a single-linkage tree from `distmat`, then cut it at height h=0.3
# to obtain one cluster label per column; both steps are timed
@time cluster_result = hclust(distmat; linkage=:single)
@time groups2 = cutree(cluster_result, h=0.3)

  0.004791 seconds (415 allocations: 2.132 MiB)
  0.000098 seconds (302 allocations: 41.062 KiB)


510-element Vector{Int64}:
   1
   1
   2
   2
   2
   2
   2
   2
   2
   2
   2
   2
   2
   ⋮
 154
 154
 155
 155
 155
 155
 155
 155
 155
 155
 156
 157

In [102]:
# show both label vectors side by side, one column per vector
hcat(groups, groups2)

510×2 Matrix{Int64}:
   1    1
   1    1
   2    2
   2    2
   2    2
   2    2
   2    2
   2    2
   2    2
   2    2
   2    2
   2    2
   2    2
   ⋮  
 153  154
 153  154
 154  155
 154  155
 154  155
 154  155
 154  155
 154  155
 154  155
 154  155
 155  156
 156  157