# ItaData2024

## URTI

Propositional analysis.

In [1]:
using Pkg
Pkg.activate(".")
using MLJ, ModalDecisionTrees
using SoleDecisionTreeInterface, Sole, SoleData
using CategoricalArrays
using DataFrames, JLD2, CSV
using Audio911
using Random
using StatsBase, Catch22
using Test
using Plots

[32m[1m  Activating[22m[39m project at `~/Documents/Aclai/audio-rules2024`


### Settings

In [2]:
# Notebook-wide settings. Every name assigned in this cell is a global that the
# cells below read; commented-out assignments are alternative values.

# Condition whose recordings are compared against the "Healthy" ones.
# experiment = :Pneumonia
# experiment = :Bronchiectasis
# experiment = :COPD
experiment = :URTI
# experiment = :Bronchiolitis

avail_exp = [:Pneumonia, :Bronchiectasis, :COPD, :URTI, :Bronchiolitis]

# Explicit check rather than `@assert`: assertions may be disabled, and this
# guards a value the user edits by hand.
experiment in avail_exp ||
    throw(ArgumentError("Unknown type of experiment: $experiment."))

# NOTE(review): `features` is not read by any cell visible in this notebook.
features = :catch9
# features = :minmax
# features = :custom

# `true`: reuse the test set and model saved under `destpath`;
# `false`: rebuild the dataset and retrain.
# loadset = false
loadset = true

scale = :semitones
# scale = :mel_htk

# Sample rate forwarded to the audio feature extractor.
sr = 8000

# Optional extra feature groups (on top of the mel bands).
featset = ()
# featset = (:mfcc,)
# featset = (:f0,)
# featset = (:mfcc, :f0)

# Keyword arguments splatted into every `afe` call.
audioparams = (
    sr = sr,
    nfft = 512,
    mel_scale = scale, # :mel_htk, :mel_slaney, :erb, :bark, :semitones, :tuned_semitones
    mel_nbands = scale == :semitones ? 14 : 26,
    mfcc_ncoeffs = scale == :semitones ? 7 : 13,
    mel_freqrange = (300, round(Int, sr / 2)),
    # dB scaling is switched off whenever MFCCs are requested.
    mel_dbscale = !(:mfcc in featset),
    audio_norm = true,
)

# When `memguard` is true only `n_elems` samples per half of the dataset are kept.
memguard = false
# memguard = true
n_elems = 15

# Index finders for the two class labels.
findhealthy = y -> findall(==("Healthy"), y)
findsick = y -> findall(==(String(experiment)), y)

# Input locations (relative to the notebook directory, hence the leading "/").
ds_path = "/datasets/respiratory_Healthy_$(experiment)"
filename = "/datasets/itadata2024_$(experiment)_files"
if memguard
    filename *= "_memguard"
end

# Output locations.
destpath = "results/propositional"
jld2file = "$(destpath)/itadata2024_$(experiment)_$(scale).jld2"
dsfile = "$(destpath)/ds_test_$(experiment)_$(scale).jld2"

# ANSI colour codes used to tag variable names, and the regex that strips the
# colour escape sequences again (capturing the plain name).
color_code = Dict(:red => 31, :green => 32, :yellow => 33, :blue => 34, :magenta => 35, :cyan => 36)
r_select = r"\e\[\d+m(.*?)\e\[0m";

### Prepare dataset for analysis

In [4]:
# Load the validated `(x, y)` pair for the selected experiment. The `do` form
# closes the JLD2 file even when reading throws, and before the type check below
# can fail, so no handle is left open.
x, y = jldopen(string((@__DIR__), ds_path, ".jld2")) do file
    file["dataframe_validated"]
end
@assert x isa DataFrame

# Memory guard: keep `n_elems` rows from the start of the dataset and `n_elems`
# rows starting at its midpoint.
if memguard
    cat2 = round(Int, length(y) / 2)
    indices = [1:n_elems; cat2:cat2+n_elems-1]
    x = x[indices, :]
    y = y[indices]
end

# Per-band frequencies (rounded to integer Hz) taken from the first recording;
# they only serve to label the mel features below.
# NOTE(review): `(:get_only_freqs)` is a bare Symbol, not a 1-tuple like the other
# `featset` values in this notebook — confirm this is what `afe` expects.
freq = round.(Int, afe(x[1, :audio]; featset=(:get_only_freqs), audioparams...))

# Short labels of the summary statistics, in the order they are applied.
catch9_f = ["max", "min", "mean", "med", "std", "bsm", "bsd", "qnt", "3ac"]

# ANSI-coloured display name of every feature column: for each statistic, the mel
# bands (yellow), then the optional MFCCs (red) and f0 (green), then the spectral
# descriptors (cyan).
variable_names = let
    paint(color, label) = "\e[$(color_code[color])m$(label)\e[0m"
    spectral = (
        "cntrd", "crest", "entrp", "flatn", "flux", "kurts",
        "rllff", "skwns", "decrs", "slope", "sprd",
    )

    labels = String[]
    for stat in catch9_f
        for band in 1:audioparams.mel_nbands
            push!(labels, paint(:yellow, "$(stat)(mel$(band)=$(freq[band])Hz)"))
        end
        if :mfcc in featset
            for coeff in 1:audioparams.mfcc_ncoeffs
                push!(labels, paint(:red, "$(stat)(mfcc$(coeff))"))
            end
        end
        if :f0 in featset
            push!(labels, paint(:green, "$(stat)(f0)"))
        end
        for descriptor in spectral
            push!(labels, paint(:cyan, "$(stat)($(descriptor))"))
        end
    end
    labels
end
    
# Summary functions applied to each feature column when the dataset is built.
# They are listed in the same order as the labels in `catch9_f` (one label per
# function), which is what keeps column names and values aligned.
catch9 = [
    maximum,                                        # "max"
    minimum,                                        # "min"
    StatsBase.mean,                                 # "mean"
    median,                                         # "med"
    std,                                            # "std"
    Catch22.SB_BinaryStats_mean_longstretch1,       # "bsm"
    Catch22.SB_BinaryStats_diff_longstretch0,       # "bsd"
    Catch22.SB_MotifThree_quantile_hh,              # "qnt"
    Catch22.SB_TransitionMatrix_3ac_sumdiagcov,     # "3ac"
]

# Build the propositional dataset from scratch (skipped when a saved one is reused).
if !loadset
    @info("Build dataset...")

    # One Float64 column per feature, named with the plain label that `r_select`
    # captures from the coloured name.
    X = DataFrame([name => Float64[] for name in [match(r_select, v)[1] for v in variable_names]])
    audiofeats = [afe(row[:audio]; featset=featset, audioparams...) for row in eachrow(x)]
    # One row per recording: every statistic in `catch9` applied to every feature
    # column, concatenated statistic by statistic (same order as `variable_names`).
    for feats in audiofeats
        push!(X, vcat([map(func, eachcol(feats)) for func in catch9]...))
    end

    yc = CategoricalArray(y)

    train_ratio = 0.8

    # NOTE(review): the shuffle is not seeded, so the split differs between runs.
    train, test = partition(eachindex(yc), train_ratio, shuffle=true)
    X_train, y_train = X[train, :], yc[train]
    X_test, y_test = X[test, :], yc[test]

    # Persist the held-out split; create the output directory first so the save
    # does not fail on a fresh checkout.
    mkpath(dirname(dsfile))
    save(dsfile, Dict("X_test" => X_test, "y_test" => y_test))

    println("Training set size: ", size(X_train), " - ", length(y_train))
    println("Test set size: ", size(X_test), " - ", length(y_test))
end

┌ Info: Build dataset...
└ @ Main /home/paso/Documents/Aclai/audio-rules2024/jl_notebook_cell_df34fa98e69747e1a8f8a730347b8e2f_X10sZmlsZQ==.jl:42


Training set size: (342, 225) - 342
Test set size: (86, 225) - 86


### Train a model

In [5]:
# Fit a decision tree on the training split (skipped when a saved model is reused).
if !loadset
    Tree = MLJ.@load DecisionTreeClassifier pkg=DecisionTree
    # NOTE(review): `max_depth = -1` presumably means "no depth limit" — confirm
    # against the DecisionTree documentation.
    model = Tree(max_depth = -1)
    mach = machine(model, X_train, y_train)
    fit!(mach)
    # The fitted tree is also the value of this cell, so the notebook displays it.
    learned_dt_tree = fitted_params(mach).tree
end

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main /home/paso/.julia/packages/MLJModels/8W54X/src/loading.jl:159


import MLJDecisionTreeInterface ✔


┌ Info: Training machine(DecisionTreeClassifier(max_depth = -1, …), …).
└ @ MLJBase /home/paso/.julia/packages/MLJBase/7nGJF/src/machines.jl:499


mean(sprd) < 336.6
├─ min(mel9=1419Hz) < -6.463
│  ├─ min(mel2=424Hz) < -5.698
│  │  ├─ URTI (3/3)
│  │  └─ Healthy (9/9)
│  └─ 3ac(mel3=504Hz) < 0.1458
│     ├─ qnt(mel10=1687Hz) < 2.147
│     │  ├─ URTI (64/64)
│     │  └─ bsm(mel1=357Hz) < 13.0
│     │     ├─ Healthy (2/2)
│     │     └─ URTI (2/2)
│     └─ std(mel4=599Hz) < 0.7608
│        ├─ URTI (3/3)
│        └─ Healthy (5/5)
└─ med(cntrd) < 1169.0
   ├─ max(kurts) < 309.5
   │  ├─ bsm(mel3=504Hz) < 9.5
   │  │  ├─ min(mel8=1194Hz) < -5.629
   │  │  │  ├─ max(mel6=845Hz) < -3.627
   │  │  │  │  ⋮
   │  │  │  │  
   │  │  │  └─ min(skwns) < 0.06045
   │  │  │     ⋮
   │  │  │     
   │  │  └─ mean(mel14=3366Hz) < -4.572
   │  │     ├─ mean(rllff) < 1694.0
   │  │     │  ⋮
   │  │     │  
   │  │     └─ URTI (6/6)
   │  └─ Healthy (16/16)
   └─ Healthy (26/26)


### Model inspection & rule study

In [6]:
if !loadset
    # Convert the freshly trained tree into a Sole model.
    sole_dt = solemodel(learned_dt_tree)
    # Make test instances flow into the model, so that test metrics can then be computed.
    apply!(sole_dt, X_test, y_test)
    # Save the Sole model to disk (creating the output directory if needed).
    mkpath(dirname(jld2file))
    save(jld2file, Dict("sole_dt" => sole_dt))
else
    @info("Load dataset...")
    # The `do` form closes each JLD2 file even if a key is missing or reading throws.
    X_test, y_test = jldopen(dsfile) do file
        file["X_test"], file["y_test"]
    end
    sole_dt = jldopen(jld2file) do file
        file["sole_dt"]
    end
end
# Print the Sole model together with its metrics.
printmodel(sole_dt; show_metrics = true, variable_names_map = variable_names);

[34m▣[0m [1m[36mmean(sprd)[0m [1m<[0m[0m 336.63940662391843
├✔ [1m[33mmin(mel9=1419Hz)[0m [1m<[0m[0m -6.463391337513174
│├✔ [1m[33mmin(mel2=424Hz)[0m [1m<[0m[0m -5.698344532462192
││├✔ URTI : (ninstances = 0, ncovered = 0, confidence = NaN, lift = NaN)
││└✘ Healthy : (ninstances = 0, ncovered = 0, confidence = NaN, lift = NaN)
│└✘ [1m[33m3ac(mel3=504Hz)[0m [1m<[0m[0m 0.14583333333333337
│ ├✔ [1m[33mqnt(mel10=1687Hz)[0m [1m<[0m[0m 2.1470638625858367
│ │├✔ URTI : (ninstances = 22, ncovered = 22, confidence = 0.73, lift = 1.0)
│ │└✘ [1m[33mbsm(mel1=357Hz)[0m [1m<[0m[0m 13.0
│ │ ├✔ Healthy : (ninstances = 0, ncovered = 0, confidence = NaN, lift = NaN)
│ │ └✘ URTI : (ninstances = 0, ncovered = 0, confidence = NaN, lift = NaN)
│ └✘ [1m[33mstd(mel4=599Hz)[0m [1m<[0m[0m 0.7608194009407704
│  ├✔ URTI : (ninstances = 1, ncovered = 1, confidence = 1.0, lift = 1.0)
│  └✘ Healthy : (ninstances = 0, ncovered = 0, confidence = NaN, lift = NaN)
└✘ [1m[36mm

### Extract rules that are at least as good as a random baseline model

In [7]:
# Keep the rules with lift >= 1.0, then print each one with its metrics.
interesting_rules = listrules(sole_dt; min_lift = 1.0, min_ninstances = 0)
foreach(interesting_rules) do rule
    printmodel(rule; show_metrics = true, variable_names_map = variable_names)
end

[34m▣[0m ([1m[36mmean(sprd)[0m [1m<[0m[0m 336.63940662391843) ∧ (¬([1m[33mmin(mel9=1419Hz)[0m [1m<[0m[0m -6.463391337513174)) ∧ ([1m[33m3ac(mel3=504Hz)[0m [1m<[0m[0m 0.14583333333333337) ∧ ([1m[33mqnt(mel10=1687Hz)[0m [1m<[0m[0m 2.1470638625858367)  ↣  URTI : (ninstances = 86, ncovered = 22, coverage = 0.26, confidence = 0.73, lift = 1.33, natoms = 4)
[34m▣[0m ([1m[36mmean(sprd)[0m [1m<[0m[0m 336.63940662391843) ∧ (¬([1m[33mmin(mel9=1419Hz)[0m [1m<[0m[0m -6.463391337513174)) ∧ (¬([1m[33m3ac(mel3=504Hz)[0m [1m<[0m[0m 0.14583333333333337)) ∧ ([1m[33mstd(mel4=599Hz)[0m [1m<[0m[0m 0.7608194009407704)  ↣  URTI : (ninstances = 86, ncovered = 1, coverage = 0.01, confidence = 1.0, lift = 1.83, natoms = 4)
[34m▣[0m (¬([1m[36mmean(sprd)[0m [1m<[0m[0m 336.63940662391843)) ∧ ([1m[36mmed(cntrd)[0m [1m<[0m[0m 1169.0875935533527) ∧ ([1m[36mmax(kurts)[0m [1m<[0m[0m 309.54313425171983) ∧ ([1m[33mbsm(mel3=504Hz)[0m [1m<[0m[0m 

### Simplify rules while extracting and prettify result

In [8]:
# Same selection as before, with `normalize = true`; thresholds are printed with
# two digits.
interesting_rules = listrules(sole_dt; min_lift = 1.0, min_ninstances = 0, normalize = true)
foreach(interesting_rules) do rule
    printmodel(
        rule;
        show_metrics = true,
        syntaxstring_kwargs = (; threshold_digits = 2),
        variable_names_map = variable_names,
    )
end

[34m▣[0m ([1m[36mmean(sprd)[0m [1m<[0m[0m 336.64) ∧ ([1m[33mmin(mel9=1419Hz)[0m [1m≥[0m[0m -6.46) ∧ ([1m[33m3ac(mel3=504Hz)[0m [1m<[0m[0m 0.15) ∧ ([1m[33mqnt(mel10=1687Hz)[0m [1m<[0m[0m 2.15)  ↣  URTI : (ninstances = 86, ncovered = 22, coverage = 0.26, confidence = 0.73, lift = 1.33, natoms = 4)
[34m▣[0m ([1m[36mmean(sprd)[0m [1m<[0m[0m 336.64) ∧ ([1m[33mmin(mel9=1419Hz)[0m [1m≥[0m[0m -6.46) ∧ ([1m[33m3ac(mel3=504Hz)[0m [1m≥[0m[0m 0.15) ∧ ([1m[33mstd(mel4=599Hz)[0m [1m<[0m[0m 0.76)  ↣  URTI : (ninstances = 86, ncovered = 1, coverage = 0.01, confidence = 1.0, lift = 1.83, natoms = 4)
[34m▣[0m ([1m[36mmean(sprd)[0m [1m≥[0m[0m 336.64) ∧ ([1m[36mmed(cntrd)[0m [1m<[0m[0m 1169.09) ∧ ([1m[36mmax(kurts)[0m [1m<[0m[0m 309.54) ∧ ([1m[33mbsm(mel3=504Hz)[0m [1m<[0m[0m 9.5) ∧ ([1m[33mmin(mel8=1194Hz)[0m [1m<[0m[0m -5.63) ∧ ([1m[33mmax(mel6=845Hz)[0m [1m<[0m[0m -3.63)  ↣  Healthy : (ninstances = 86, ncovered = 

### Directly access rule metrics

In [9]:
# Metrics of every rule with lift >= 1.0; the resulting vector is the cell's value.
map(readmetrics, listrules(sole_dt; min_lift = 1.0, min_ninstances = 0))

16-element Vector{@NamedTuple{ninstances::Int64, ncovered::Int64, coverage::Float64, confidence::Float64, lift::Float64, natoms::Int64}}:
 (ninstances = 86, ncovered = 22, coverage = 0.2558139534883721, confidence = 0.7272727272727273, lift = 1.330754352030948, natoms = 4)
 (ninstances = 86, ncovered = 1, coverage = 0.011627906976744186, confidence = 1.0, lift = 1.8297872340425534, natoms = 4)
 (ninstances = 86, ncovered = 5, coverage = 0.05813953488372093, confidence = 1.0, lift = 2.2051282051282053, natoms = 6)
 (ninstances = 86, ncovered = 1, coverage = 0.011627906976744186, confidence = 1.0, lift = 2.2051282051282053, natoms = 7)
 (ninstances = 86, ncovered = 3, coverage = 0.03488372093023256, confidence = 1.0, lift = 1.8297872340425534, natoms = 7)
 (ninstances = 86, ncovered = 7, coverage = 0.08139534883720931, confidence = 0.7142857142857143, lift = 1.3069908814589666, natoms = 6)
 (ninstances = 86, ncovered = 1, coverage = 0.011627906976744186, confidence = 1.0, lift = 1.829787

### Show rules with an additional metric (syntax height of the rule's antecedent)

In [10]:
# Print the rules ordered by their metrics, with unrounded values and an extra
# `height` metric computed from each rule's antecedent.
let metric_options = (;
        round_digits = nothing,
        additional_metrics = (; height = rule -> SoleLogics.height(antecedent(rule))),
    )
    foreach(sort(interesting_rules; by = readmetrics)) do rule
        printmodel(rule; show_metrics = metric_options, variable_names_map = variable_names)
    end
end

[34m▣[0m ([1m[36mmean(sprd)[0m [1m<[0m[0m 336.63940662391843) ∧ ([1m[33mmin(mel9=1419Hz)[0m [1m≥[0m[0m -6.463391337513174) ∧ ([1m[33m3ac(mel3=504Hz)[0m [1m≥[0m[0m 0.14583333333333337) ∧ ([1m[33mstd(mel4=599Hz)[0m [1m<[0m[0m 0.7608194009407704)  ↣  URTI : (ninstances = 86, ncovered = 1, coverage = 0.011627906976744186, confidence = 1.0, lift = 1.8297872340425534, natoms = 4, height = 3)
[34m▣[0m ([1m[36mmean(sprd)[0m [1m≥[0m[0m 336.63940662391843) ∧ ([1m[36mmed(cntrd)[0m [1m<[0m[0m 1169.0875935533527) ∧ ([1m[36mmax(kurts)[0m [1m<[0m[0m 309.54313425171983) ∧ ([1m[33mbsm(mel3=504Hz)[0m [1m<[0m[0m 9.5) ∧ ([1m[33mmin(mel8=1194Hz)[0m [1m≥[0m[0m -5.6292906888019525) ∧ ([1m[36mmin(skwns)[0m [1m≥[0m[0m 0.060454056192696226) ∧ ([1m[33mmed(mel1=357Hz)[0m [1m<[0m[0m -2.3892659624167787) ∧ ([1m[36mmax(cntrd)[0m [1m<[0m[0m 1068.6700214056882)  ↣  URTI : (ninstances = 86, ncovered = 1, coverage = 0.011627906976744186, confide

### Pretty table of rules and their metrics

In [11]:
# Tabulate the rules with unrounded metrics plus the antecedent `height`.
metricstable(
    interesting_rules;
    variable_names_map = variable_names,
    metrics_kwargs = (;
        round_digits = nothing,
        additional_metrics = (; height = rule -> SoleLogics.height(antecedent(rule))),
    ),
)

┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────┬────────────┬──────────┬───────────┬────────────┬─────────┬────────┬────────┐
│[33;1m                                                                                                                                                                                                                                                                                                                                                                                          Antecedent [0m│[33;1m Consequent [0m│[33;1m ninstances [0m│[33;1m ncovered [0m│[33;1m  coverage [0m│[33;1m confidence [0m

# Inspect features

In [12]:
# Narrow the selection to rules that also reach a coverage of at least 0.10.
interesting_rules = listrules(
    sole_dt;
    min_lift = 1.0,
    # min_lift = 2.0,
    min_ninstances = 0,
    min_coverage = 0.10,
    normalize = true,
)
# NOTE(review): the (consequent, metrics) pairs built here are discarded, since this
# is not the last expression of the cell — confirm whether they were meant to be shown.
[(consequent(rule), readmetrics(rule)) for rule in interesting_rules]
foreach(interesting_rules) do rule
    printmodel(
        rule;
        show_metrics = true,
        syntaxstring_kwargs = (; threshold_digits = 2),
        variable_names_map = variable_names,
    )
end

[34m▣[0m ([1m[36mmean(sprd)[0m [1m<[0m[0m 336.64) ∧ ([1m[33mmin(mel9=1419Hz)[0m [1m≥[0m[0m -6.46) ∧ ([1m[33m3ac(mel3=504Hz)[0m [1m<[0m[0m 0.15) ∧ ([1m[33mqnt(mel10=1687Hz)[0m [1m<[0m[0m 2.15)  ↣  URTI : (ninstances = 86, ncovered = 22, coverage = 0.26, confidence = 0.73, lift = 1.33, natoms = 4)


In [13]:
# Collect the distinct features referenced by the atoms of the selected rules'
# antecedents.
interesting_features = let
    atoms_per_rule = [SoleLogics.atoms(rule.antecedent) for rule in interesting_rules]
    all_atoms = vcat(atoms_per_rule...)
    conditions = SoleLogics.value.(all_atoms)
    unique(SoleData.feature.(conditions))
end
# Sorted variable identifiers of those features (the cell's displayed value).
interesting_variables = sort(SoleData.i_variable.(interesting_features))

4-element Vector{Symbol}:
 Symbol("3ac(mel3=504Hz)")
 Symbol("mean(sprd)")
 Symbol("min(mel9=1419Hz)")
 Symbol("qnt(mel10=1687Hz)")