# Package and Imports

In [87]:
using Pkg
Pkg.add("Clustering")
Pkg.add("CSV")
Pkg.add("DataFrames")
Pkg.add("Distances")
Pkg.add("GaussianMixtures")
Pkg.add("LinearAlgebra")
Pkg.add("Plots")
Pkg.add("ScikitLearn")
Pkg.add("Statistics")

[32m[1m   Updating[22m[39m registry at `~/.julia/registries/General`
######################################################################### 100.0%
[32m[1m  Resolving[22m[39m package versions...
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Project.toml`
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Manifest.toml`
[32m[1m  Resolving[22m[39m package versions...
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Project.toml`
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Manifest.toml`
[32m[1m  Resolving[22m[39m package versions...
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Project.toml`
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Manifest.toml`
[32m[1m  Resolving[22m[39m package versions...
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Project.toml`
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Manifest.toml`
[32m[1m  Resolving[22m[39m package

In [88]:
using Clustering, CSV, DataFrames, Distances, GaussianMixtures, LinearAlgebra, Plots, ScikitLearn, Statistics

In [89]:
@sk_import preprocessing : LabelEncoder;
@sk_import preprocessing : MinMaxScaler;



# Pre-Processing

In [176]:
# Read cancer data file
data = CSV.read("./01_hdp.csv", DataFrame);
data = convert(Array, data);

In [177]:
# Encode categorical attributes
enc = LabelEncoder();
data[:, 12] = enc.fit_transform(data[:, 12]);
data[:, 13] = enc.fit_transform(data[:, 13]);
data[:, 14] = enc.fit_transform(data[:, 14]);
data[:, 15] = enc.fit_transform(data[:, 15]);
data[:, 24] = enc.fit_transform(data[:, 24]);
data = convert(Array{Float64, 2}, data);

In [178]:
# Normalize columns
scaler = MinMaxScaler();
normalized_data = scaler.fit_transform(data);

In [179]:
# Transpose data
transposed_data = transpose(normalized_data);

In [180]:
# Calculate distances
distances = pairwise(SqEuclidean(), transposed_data, dims=2);

# K-Means

In [232]:
# Initialize dicts
scores = Dict();
results = Dict();
sils = Dict();

In [233]:
# Loop possible cluster sizes
for clust_count = 2 : 40
    # Cache kmeans
    result = kmeans(transposed_data, clust_count);
    results[clust_count] = result;

    # Cache silhouettes
    sil = silhouettes(result, distances);
    sils[clust_count] = sil

    # Cache silhouette score
    scores[clust_count] = mean(sil);
end

In [234]:
# Get cluster size with highest score
max = findmax(scores);
println("Highest silhouette score: $(max[2]) clusters (score: $(max[1]))");

Highest silhouette score: 35 clusters (score: 0.24572098632897574)


In [235]:
# Get outliers indices
s = sils[max[2]];
index1 = findmin(s)[2];
s[index1] = 25;
index2 = findmin(s)[2];
s[index2] = 25;
index3 = findmin(s)[2];

# Find in transposed data
result = results[max[2]];
a = assignments(result);
println("Outliers located in clusters $(a[index1]), $(a[index2]), and $(a[index3])");

Outliers located in clusters 25, 25, and 25


# K-Medoids

In [237]:
# Initialize dicts
scores = Dict();
results = Dict();
sils = Dict();

In [238]:
# Loop possible cluster sizes
for clust_count = 2 : 40
    # Get and cache kmedoids
    result = kmedoids(distances, clust_count);
    results[clust_count] = result;

    # Cache silhouettes
    sil = silhouettes(result, distances);
    sils[clust_count] = sil

    # Cache silhouette score
    scores[clust_count] = mean(silhouettes(result, distances));
end

In [239]:
# Get cluster size with highest score
max = findmax(scores);
println("Highest silhouette score: $(max[2]) clusters (Score: $(max[1]))");

Highest silhouette score: 15 clusters (Score: 0.1935728804576682)


In [240]:
# Get outliers indices
s = sils[max[2]];
index1 = findmin(s)[2];
s[index1] = 25;
index2 = findmin(s)[2];
s[index2] = 25;
index3 = findmin(s)[2];

# Find in transposed data
result = results[max[2]];
a = assignments(result);
println("Outliers located in clusters $(a[index1]), $(a[index2]), and $(a[index3])");

Outliers located in clusters 7, 7, and 7


# Affinity Propagation

In [207]:
# Propagate
result = affinityprop(distances);

# Get silhouettes
sil = silhouettes(result, distances);

# Calculate score
score = mean(sil);

println("Number of clusters: $(size(result.exemplars)[1]) (Silhouette score: $(score))")


Number of clusters: 90 (Silhouette score: 0.025714445769022744)


In [208]:
s = sil
index1 = findmin(s)[2];
s[index1] = 25;
index2 = findmin(s)[2];
s[index2] = 25;
index3 = findmin(s)[2];

# Find in transposed data
r = result;
a = assignments(r);
println("Outliers located in clusters $(a[index1]), $(a[index2]), and $(a[index3])");

Outliers located in clusters 9, 7, and 52


# Hierarchical Clustering

In [209]:
# Initialize dicts
scores = Dict();
sils = Dict();
as = Dict();

In [210]:
# Create tree
result = hclust(distances);

for clust_count = 2 : 40
    # Cache assignments
    assignments = cutree(result, k=clust_count);
    as[clust_count] = assignments

    # Cache silhouettes
    s = silhouettes(assignments, distances);
    sils[clust_count] = s;

    # Cache score
    score = mean(s);
    scores[clust_count] = score;
end

In [211]:
# Get cluster size with highest score
max = findmax(scores);
println("Highest silhouette score: $(max[2]) clusters (Score: $(max[1]))");

Highest silhouette score: 2 clusters (Score: 0.2258533173365853)


In [212]:
# Get outliers indices
s = sils[max[2]];
index1 = findmin(s)[2];
s[index1] = 25;
index2 = findmin(s)[2];
s[index2] = 25;
index3 = findmin(s)[2];

# Find in transposed data
result = results[max[2]];
a = assignments(result);
println("Outliers located in clusters $(a[index1]), $(a[index2]), and $(a[index3])");

Outliers located in clusters 1, 2, and 1


# GMM Model

In [221]:
# Initialize dicts
scores = Dict();
models = Dict();

In [222]:
# Loop cluster counts
for clust_count = 2 : 40
    # Cache model
    model = GMM(clust_count, 27);
    em!(model, normalized_data);
    models[clust_count] = model
    
    # Cache score of model
    score = avll(model, normalized_data);
    scores[clust_count] = score;
end

In [223]:
# Get cluster size with highest score
max = findmax(scores);
println("Highest silhouette score: $(max[2]) clusters (Log-likelihood: $(max[1]))");

Highest silhouette score: 35 clusters (Log-likelihood: 0.11334248049847166)


In [225]:
# Get outliers indices
ll = llpg(models[max[2]], normalized_data)
index1 = findmin(ll)[2];
ll[index1] = 25;
index2 = findmin(ll)[2];
ll[index2] = 25;
index3 = findmin(ll)[2];

# Find in transposed data
println("Outliers located at $(index1), $(index2), and $(index3)");

Outliers located at CartesianIndex(8051, 35), CartesianIndex(8051, 21), and CartesianIndex(8051, 22)
