In [None]:
include("../src/NotesData.jl")
using .NotesData

In [None]:
# ================
using DataFrames
using Clustering
using Plots
using StatsPlots
using MultivariateStats
using Plots.PlotMeasures
using StatsBase
using Statistics
# ================

In [None]:
# Load the notes table through the project-local NotesData module (included above).
# The result is used as a DataFrame by every later cell.
# NOTE(review): `drop=true` presumably discards unusable rows/columns — confirm in NotesData.jl.
df = NotesData.get_notes_dataframe(drop=true);

In [None]:
# Column span of the individual assessment variables and the number of principal
# components requested from the PCA further down.
# NOTE(review): assumes column 1 is an identifier and the last column is the exam mark
# (EXAM) — confirm against NotesData.
firstvarindex = 2;
lastvarindex = ncol(df) - 1;
maxoutdim = 3;
# CONT: per-row mean of the assessment variables, rounded to one decimal.
df.CONT = [
    round(mean(row), digits=1) for row in eachrow(df[:, firstvarindex:lastvarindex])
];

In [None]:
# Build an integer-valued copy of the assessment columns (firstvarindex..lastvarindex of
# df), keyed by column name.
# Note: DataFrame(::Dict) orders its columns by sorted key, so the columns of xdf are in
# alphabetical order, not in df's original order.
xdict = Dict{String,Vector{Int64}}();
for name in names(df)[firstvarindex:lastvarindex]
    # Round each mark to the nearest whole number and store it as Int64.
    xdict[name] = Int64.(round.(df[:, name], digits=0));
end
xdf = DataFrame(xdict);
xdict = nothing;  # the dictionary was only a staging area
# Recode every column with the project-local NotesData.discretize_notes (see NotesData.jl
# for the rule it applies). `xdf[:, col] = ...` writes in place, so each column keeps its
# Int64 element type.
for col in names(xdf)
    xdf[:, col] = NotesData.discretize_notes(xdf[:, col])
end

In [None]:
# Variables-by-observations matrix: MultivariateStats expects one observation per column,
# hence the permutedims of the (observations x variables) table.
X = permutedims(Matrix{Float64}(xdf));
# Release the discretized table xdf; only the matrix is needed from here on.
xdf = nothing;
# PCA analysis.
# With the default `pratio=0.99`, `fit` may keep fewer than `maxoutdim` components when the
# leading ones already explain 99 % of the variance, and the PC columns below would then
# fail with a BoundsError. `pratio=1.0` leaves `maxoutdim` as the only cut-off.
pca = fit(PCA, X; maxoutdim=maxoutdim, pratio=1.0);
# Scores: centre the data on the PCA mean and project it onto the principal axes.
X_transformed = projection(pca)' * (X .- mean(pca));
size(X_transformed, 1) == maxoutdim ||
    error("PCA returned $(size(X_transformed, 1)) components, expected $maxoutdim");
# Append the scores, rounded to 3 digits, as columns PC1..PC<maxoutdim>. istart/iend
# bracket the appended columns so they can be read back as one block.
istart = ncol(df) + 1;
for k in 1:maxoutdim
    df[!, "PC$k"] = round.(X_transformed[k, :], digits=3);
end
iend = ncol(df);
# ==================
# Free the intermediates.
pca = nothing;
X = nothing;
X_transformed = nothing;
# ================
# Components-by-observations matrix used for clustering; it is rebuilt from the rounded
# score columns stored in df.
X_pca = permutedims(Matrix{Float64}(df[:, istart:iend]));

In [None]:
# Model selection: run k-means on the PCA scores for every candidate cluster count and
# plot five clustering-quality indices against the number of clusters.
hard_nclusters = 2:7;
clusterings = kmeans.(Ref(X_pca), hard_nclusters);
let
    # One panel per quality index, each evaluated on every clustering above.
    quality_panels = map(
        [:silhouettes, :dunn, :calinski_harabasz, :xie_beni, :davies_bouldin]
    ) do qidx
        scores = clustering_quality.(Ref(X_pca), clusterings, quality_index=qidx)
        Plots.plot(hard_nclusters, scores, marker=:circle, title=":$qidx", label=nothing)
    end
    # The let block is the last expression of the cell, so the figure is displayed.
    Plots.plot(
        quality_panels...,
        layout=(2, 3),
        xaxis="N clusters", yaxis="Quality",
        plot_title="\"Hard\" clustering quality indices",
        size=(1000, 600), left_margin=10pt
    )
end

In [None]:
# Drop the model-selection intermediates so the fitted clusterings can be collected.
clusterings = nothing;
hard_nclusters = nothing;

In [None]:
# Number of clusters for the final k-means run on the PCA scores.
# NOTE(review): presumably chosen by reading the quality-index plots above — confirm.
k_pca = 4;

In [None]:
# Final k-means clustering of the PCA scores; the raw labels go into df.PCA_CLUST.
R_pca = kmeans(X_pca, k_pca; maxiter=200);
df.PCA_CLUST = assignments(R_pca);
R_pca = nothing;
X_pca = nothing;

# k-means labels are arbitrary, so the clusters are relabelled by increasing mean CONT.
# INDSX[r] is the raw label of the cluster with the r-th smallest mean CONT.
INDSX = sortperm([mean(df.CONT[df.PCA_CLUST .== c]) for c in 1:k_pca]);

"""
    xmap_category(x)

Return the rank (1 = lowest mean CONT) of the raw cluster label `x`, i.e. the position of
`x` in the global `INDSX`. Only usable while `INDSX` holds the ordering; it is cleared
right after `PCA_CATEG` has been computed.
"""
function xmap_category(x)
    return findfirst(INDSX .== x)
end
df.PCA_CATEG = map(xmap_category, df.PCA_CLUST);
INDSX = nothing;

# Per-category samples for the plots that follow. Boolean masks keep the row order and,
# unlike a top-level loop assigning to `xdf`, do not overwrite that (already freed) global.
categs = ["Cluster $i" for i in 1:k_pca];
vres_exam = [df.EXAM[df.PCA_CATEG .== i] for i in 1:k_pca];
vres_cont = [df.CONT[df.PCA_CATEG .== i] for i in 1:k_pca];

In [None]:
# Side-by-side box plots of CONT and EXAM, one box per PCA category.
p1, p2 = let ticks = (1:k_pca, categs)
    (
        boxplot(vres_cont, legend=false, title="CONT vs PCA_CATEG", xticks=ticks),
        boxplot(vres_exam, legend=false, title="EXAM vs PCA_CATEG", xticks=ticks),
    )
end;
plot(p1, p2, layout=(1, 2), size=(1200, 400))

In [None]:
# Release the per-category samples and tick labels used by the box plots.
categs = nothing;
vres_exam = nothing;
vres_cont = nothing;

In [None]:
# Density curves of CONT (left panel) and EXAM (right panel): one curve per PCA category
# plus a dashed curve for the whole sample. Every `density!` names the panel it draws on
# instead of relying on the implicit "current plot".
p1 = @df df density(
    :CONT[:PCA_CATEG.==1],
    label="Cluster 1", legend=:topleft, linewidth=3,
    title="Density of CONT vs PCA_CATEG", xrange=(0, 20), yrange=(0, 0.35),
)
for k in 2:k_pca
    @df df density!(p1, :CONT[:PCA_CATEG.==k], label="Cluster $k", linewidth=3)
end
@df df density!(p1, :CONT, label="All", linewidth=3, linestyle=:dash)
# Same column-symbol syntax as the CONT panel (the original mixed `df.EXAM` into `@df`).
p2 = @df df density(
    :EXAM[:PCA_CATEG.==1],
    legend=:topleft, label="Cluster 1", linewidth=3,
    title="Density of EXAM vs PCA_CATEG", xrange=(0, 20), yrange=(0, 0.35),
)
for k in 2:k_pca
    @df df density!(p2, :EXAM[:PCA_CATEG.==k], label="Cluster $k", linewidth=3)
end
@df df density!(p2, :EXAM, label="All", linewidth=3, linestyle=:dash)
plot(p1, p2, layout=(1, 2), size=(1200, 400))

In [None]:
# Scatter plots coloured by PCA category: the three pairings of the principal-component
# scores (axes drawn through the origin) and EXAM against CONT.
p1 = @df df scatter(
    :PC1, :PC2,
    group=:PCA_CATEG,
    xlabel="PC1", ylabel="PC2", title="PC2 vs PC1 by PCA_CATEG",
    markersize=6, legend=:topleft, framestyle=:zerolines,
)
p2 = @df df scatter(
    :PC3, :PC2,
    group=:PCA_CATEG,
    xlabel="PC3", ylabel="PC2", title="PC2 vs PC3 by PCA_CATEG",
    markersize=6, legend=:topleft, framestyle=:zerolines,
)
p3 = @df df scatter(
    :PC1, :PC3,
    group=:PCA_CATEG,
    xlabel="PC1", ylabel="PC3", title="PC3 vs PC1 by PCA_CATEG",
    markersize=6, legend=:topleft, framestyle=:zerolines,
)
p4 = @df df scatter(
    :CONT, :EXAM,
    group=:PCA_CATEG,
    xlabel="CONT", ylabel="EXAM", title="EXAM vs CONT by PCA_CATEG",
    markersize=5, legend=:topleft,
)
plot(p1, p2, p3, p4, layout=(2, 2), size=(1400, 800))