# ItaData2024
modal example

In [5]:
using Pkg
Pkg.activate(".")
Pkg.resolve()
Pkg.instantiate()
Pkg.dev("SoleModels")
Pkg.dev("Audio911")
Pkg.dev("SoleData")
using MLJ, ModalDecisionTrees
using SoleDecisionTreeInterface, Sole, SoleData
using CategoricalArrays
using DataFrames, JLD2, CSV
using Audio911
using Random
using StatsBase, Catch22

[32m[1m  Activating[22m[39m project at `~/Desktop/audio-rules2024`
[32m[1m  No Changes[22m[39m to `~/Desktop/audio-rules2024/Project.toml`
[32m[1m  No Changes[22m[39m to `~/Desktop/audio-rules2024/Manifest.toml`
[32m[1mPrecompiling[22m[39m project...
[32m  ✓ [39m[90mPackageExtensionCompat[39m
[32m  ✓ [39m[90mTupleTools[39m
[32m  ✓ [39m[90mOffsetArrays → OffsetArraysAdaptExt[39m
[32m  ✓ [39m[90mStridedViews[39m
[32m  ✓ [39m[90mGR_jll[39m
[32m  ✓ [39m[90mStrideArraysCore[39m
[32m  ✓ [39m[90mNNlib → NNlibFFTWExt[39m
[32m  ✓ [39mStrided
[32m  ✓ [39m[90mLatexify → DataFramesExt[39m
[32m  ✓ [39mPolyester
[32m  ✓ [39m[90mGR[39m
[32m  ✓ [39mSoleModels
[32m  ✓ [39mSole
[32m  ✓ [39mSoleDecisionTreeInterface
[32m  ✓ [39mModalDecisionTrees
[32m  ✓ [39mPlots
[32m  ✓ [39m[90mPlots → FileIOExt[39m
[32m  ✓ [39m[90mPlots → UnitfulExt[39m
  18 dependencies successfully precompiled in 150 seconds. 371 already precompiled.


LoadError: InitError: PyError ($(Expr(:escape, :(ccall(#= /home/gio/.julia/packages/PyCall/1gn3u/src/pyeval.jl:38 =# @pysym(:PyEval_EvalCode), PyPtr, (PyPtr, PyPtr, PyPtr), o, globals, locals))))) <class 'ModuleNotFoundError'>
ModuleNotFoundError("No module named 'librosa'")
  File "/home/gio/.julia/packages/PyCall/1gn3u/src/pyeval.jl", line 1, in <module>
    const Py_single_input = 256  # from Python.h

during initialization of module Audio911

### Open .jld2 file
the file contains 504 samples of respiratory sound, labeled with 2 classes: healty and pneumonia

In [None]:
ds_path = "/datasets/respiratory_Healthy_Pneumonia"

d = jldopen(string((@__DIR__), ds_path, ".jld2"))
x, y = d["dataframe_validated"]
@assert x isa DataFrame
close(d)

### Audio features extraction function
This function is called for every audio sample and extract 51 features:
26 bands of the mel spectrogram,
13 coefficients of the mfcc
12 spectral features: centroid, crest, entropy, f0, flatness, flux, kurtosis, rolloff, skewness, decrease, slope, spread

In [None]:
nan_replacer!(x::AbstractArray{Float64}) = replace!(x, NaN => 0.0)

function afe(x::AbstractVector{Float64}; get_only_melfreq=false)
    # -------------------------------- parameters -------------------------------- #
    # audio module
    sr = 8000
    norm = true
    speech_detection = false
    # stft module
    # stft_length = 256
    # win_type = (:hann, :periodic)
    # win_length = 256
    # overlap_length = 128
    stft_length = 1024
    win_type = (:hann, :periodic)
    win_length = 1024
    overlap_length = 512
    stft_norm = :power                      # :power, :magnitude, :pow2mag
    # mel filterbank module
    nbands = 26
    scale = :mel_htk                        # :mel_htk, :mel_slaney, :erb, :bark
    melfb_norm = :bandwidth                 # :bandwidth, :area, :none
    freq_range = (300, round(Int, sr / 2))
    # mel spectrogram module
    db_scale = false
    # mfcc module
    ncoeffs = 13
    rectification = :log                    # :log, :cubic_root
    dither = true
    # f0 module
    method = :nfc
    f0_range = (50, 400)

    # --------------------------------- functions -------------------------------- #
    # audio module
    audio = load_audio(
        file=x,
        sr=sr,
        norm=norm,
    );

    stftspec = get_stft(
        audio=audio,
        stft_length=stft_length,
        win_type=win_type,
        win_length=win_length,
        overlap_length=overlap_length,
        norm=stft_norm
    );

    # mel filterbank module
    melfb = get_melfb(
        stft=stftspec,
        nbands=nbands,
        scale=scale,
        norm=melfb_norm,
        freq_range=freq_range
    );

    if get_only_melfreq
        return melfb.freq
    end

    # mel spectrogram module
    melspec =  get_melspec(
        stft=stftspec,
        fbank=melfb,
        db_scale=db_scale
    );

    # mfcc module
    mfcc = get_mfcc(
        source=melspec,
        ncoeffs=ncoeffs,
        rectification=rectification,
        dither=dither,
    );

    # f0 module
    f0 = get_f0(
        source=stftspec,
        method=method,
        freq_range=f0_range
    );

    # spectral features module
    spect = get_spectrals(
        source=stftspec,
        freq_range=freq_range
    );

    x_features = hcat(
        melspec.spec',
        mfcc.mfcc',
        f0.f0,
        spect.centroid,
        spect.crest,
        spect.entropy,
        spect.flatness,
        spect.flux,
        spect.kurtosis,
        spect.rolloff,
        spect.skewness,
        spect.decrease,
        spect.slope,
        spect.spread
    );

    nan_replacer!(x_features)

    return x_features
end

afe (generic function with 1 method)

### Compute DataFrame of features

In [16]:
color_code = Dict(:red => 31, :green => 32, :yellow => 33, :blue => 34, :magenta => 35, :cyan => 36)
freq = round.(Int, afe(x[1, :audio]; get_only_melfreq=true))
r_select = r"\e\[\d+m(.*?)\e\[0m"
variable_names = [
    ["\e[$(color_code[:yellow])mmel$i=$(freq[i])Hz\e[0m" for i in 1:26]...,
    ["\e[$(color_code[:red])mmfcc$i\e[0m" for i in 1:13]...,
    "\e[$(color_code[:green])mf0\e[0m", "\e[$(color_code[:cyan])mcntrd\e[0m", "\e[$(color_code[:cyan])mcrest\e[0m",
    "\e[$(color_code[:cyan])mentrp\e[0m", "\e[$(color_code[:cyan])mflatn\e[0m", "\e[$(color_code[:cyan])mflux\e[0m",
    "\e[$(color_code[:cyan])mkurts\e[0m", "\e[$(color_code[:cyan])mrllff\e[0m", "\e[$(color_code[:cyan])mskwns\e[0m",
    "\e[$(color_code[:cyan])mdecrs\e[0m", "\e[$(color_code[:cyan])mslope\e[0m", "\e[$(color_code[:cyan])msprd\e[0m"
]

X = DataFrame([name => Vector{Float64}[] for name in [match(r_select, v)[1] for v in variable_names]])


for f_name in [getnames(catch22)..., :std]
    @eval (function $(Symbol(string(f_name)*"+"))(channel)
        val = $(catch22[f_name])(channel)

        if isnan(val)
            SoleData.aggregator_bottom(SoleData.existential_aggregator(≥), eltype(channel))
        else
            eltype(channel)(val)
        end
    end)
    @eval (function $(Symbol(string(f_name)*"-"))(channel)
        val = $(catch22[f_name])(channel)

        if isnan(val)
            SoleData.aggregator_bottom(SoleData.existential_aggregator(≤), eltype(channel))
        else
            eltype(channel)(val)
        end
    end)
end

function get_patched_feature(f::Callable, polarity::Symbol)
    if f in [minimum, maximum, StatsBase.mean, median]
        f
    else
        @eval $(Symbol(string(f)*string(polarity)))
    end
end

function mean_longstretch1(x) Catch22.SB_BinaryStats_mean_longstretch1((x)) end
function diff_longstretch0(x) Catch22.SB_BinaryStats_diff_longstretch0((x)) end
function quantile_hh(x) Catch22.SB_MotifThree_quantile_hh((x)) end
function sumdiagcov(x) Catch22.SB_TransitionMatrix_3ac_sumdiagcov((x)) end
features = [
    maximum,
    minimum,
    StatsBase.mean,
    median,
	(≥, get_patched_feature(std, :+)),                (≤, get_patched_feature(std, :-)),
	(≥, get_patched_feature(mean_longstretch1, :+)),  (≤, get_patched_feature(mean_longstretch1, :-)),
	(≥, get_patched_feature(diff_longstretch0, :+)),  (≤, get_patched_feature(diff_longstretch0, :-)),
	(≥, get_patched_feature(quantile_hh, :+)),        (≤, get_patched_feature(quantile_hh, :-)),
	(≥, get_patched_feature(sumdiagcov, :+)),         (≤, get_patched_feature(sumdiagcov, :-)),
]

for i in 1:nrow(x)
    push!(X, collect(eachcol(afe(x[i, :audio]))))
end

yc = CategoricalArray(y);

### Data compression for modal analysis

In [17]:
train_ratio = 0.8
train, test = partition(eachindex(yc), train_ratio, shuffle=true)
# train, test = partition(eachindex(yc), train_ratio, shuffle=false) ### Debug
X_train, y_train = X[train, :], yc[train]
X_test, y_test = X[test, :], yc[test]

println("Training set size: ", size(X_train), " - ", length(y_train))
println("Test set size: ", size(X_test), " - ", length(y_test))

Training set size: (403, 51) - 403
Test set size: (101, 51) - 101


### Train a model

In [18]:
learned_dt_tree = begin
    model = ModalDecisionTree(; relations = :IA7, features = features)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
    mach = machine(model, X_train, y_train) |> fit!
end

report(learned_dt_tree).printmodel(variable_names_map=variable_names);

┌ Info: Precomputing logiset...
└ @ ModalDecisionTrees.MLJInterface /home/paso/.julia/dev/ModalDecisionTrees.jl/src/interfaces/MLJ/wrapdataset.jl:135
┌ Info: Training machine(ModalDecisionTree(max_depth = nothing, …), …).
└ @ MLJBase /home/paso/.julia/packages/MLJBase/7nGJF/src/machines.jl:499


[34m▣[0m ⟨G⟩([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627)
├✔ ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ([1mmax[[36msprd[0m] [1m≥[0m[0m 711.2412109356288))
│├✔ Healthy
│└✘ ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ⟨D̅B̅E̅⟩([1mmax[[31mmfcc7[0m] [1m<[0m[0m -0.5152085686365828))
│ ├✔ Healthy
│ └✘ ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ⟨G⟩([1mmax[[31mmfcc9[0m] [1m≥[0m[0m 0.6113784530484502))
│  ├✔ Healthy
│  └✘ ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ⟨G⟩([1mmax[[31mmfcc13[0m] [1m≥[0m[0m 0.15322413941387783))
│   ├✔ ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ⟨G⟩(([1mmax[[31mmfcc13[0m] [1m≥[0m[0m 0.15322413941387783) ∧ ⟨A̅O̅⟩([1mmax[[36mskwns[0m] [1m<[0m[0m 1.502371047997501)))
│   │├✔ Pneumonia
│   │└✘ ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ⟨G⟩(([1mmax[[31mmfcc13[0m] [1m≥[0m[0m 0.153224

### Model inspection & rule study

In [19]:
_, mtree = report(mach).sprinkle(X_test, y_test)
sole_dt = ModalDecisionTrees.translate(mtree)

printmodel(sole_dt; show_metrics = true, variable_names_map=variable_names);

[34m▣[0m ⟨G⟩([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627)
├✔ ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ([1mmax[[36msprd[0m] [1m≥[0m[0m 711.2412109356288))
│├✔ Healthy : (ninstances = 26, ncovered = 26, confidence = 0.96, lift = 1.0)
│└✘ ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ⟨D̅B̅E̅⟩([1mmax[[31mmfcc7[0m] [1m<[0m[0m -0.5152085686365828))
│ ├✔ ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ⟨D̅B̅E̅⟩(([1mmax[[31mmfcc7[0m] [1m<[0m[0m -0.5152085686365828) ∧ ([1m#47[[33mmel23=3037Hz[0m] [1m<[0m[0m 2.4304048485858085e-8)))
│ │├✔ Healthy : (ninstances = 0, ncovered = 0, confidence = NaN, lift = NaN)
│ │└✘ Healthy : (ninstances = 6, ncovered = 6, confidence = 0.83, lift = 1.0)
│ └✘ ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ⟨G⟩([1mmax[[31mmfcc9[0m] [1m≥[0m[0m 0.6113784530484502))
│  ├✔ Healthy : (ninstances = 4, ncovered = 4, confidence = 1.0, lift = 1.0)
│  

### Extract rules that are at least as good as a random baseline model

In [20]:
interesting_rules = listrules(sole_dt, min_lift = 1.0, min_ninstances = 0);
printmodel.(interesting_rules; show_metrics = true, variable_names_map=variable_names);

[34m▣[0m ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ([1mmax[[36msprd[0m] [1m≥[0m[0m 711.2412109356288))  ↣  Healthy : (ninstances = 101, ncovered = 26, coverage = 0.26, confidence = 0.96, lift = 2.02, natoms = 2)
[34m▣[0m ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ⟨D̅B̅E̅⟩([1mmax[[31mmfcc7[0m] [1m<[0m[0m -0.5152085686365828)) ∧ [G](([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) → ([1mmax[[36msprd[0m] [1m<[0m[0m 711.2412109356288)) ∧ [G](([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) → [D̅B̅E̅](([1mmax[[31mmfcc7[0m] [1m<[0m[0m -0.5152085686365828) → ([1m#47[[33mmel23=3037Hz[0m] [1m≥[0m[0m 2.4304048485858085e-8)))  ↣  Healthy : (ninstances = 101, ncovered = 6, coverage = 0.06, confidence = 0.83, lift = 1.75, natoms = 7)
[34m▣[0m ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ⟨G⟩([1mmax[[31mmfcc9[0m] [1m≥[0m[0m 0.6113784530484502)) ∧ [G](([1mmax[[31mmfcc3

### Simplify rules while extracting and prettify result

In [21]:
interesting_rules = listrules(sole_dt, min_lift = 1.0, min_ninstances = 0, normalize = true);
printmodel.(interesting_rules; show_metrics = true, syntaxstring_kwargs = (; threshold_digits = 2), variable_names_map=variable_names);

[34m▣[0m ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02) ∧ ([1mmax[[36msprd[0m] [1m≥[0m[0m 711.24))  ↣  Healthy : (ninstances = 101, ncovered = 26, coverage = 0.26, confidence = 0.96, lift = 2.02, natoms = 2)
[34m▣[0m ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02) ∧ ⟨D̅B̅E̅⟩([1mmax[[31mmfcc7[0m] [1m<[0m[0m -0.52)) ∧ [G](([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02) → ([1mmax[[36msprd[0m] [1m<[0m[0m 711.24)) ∧ [G](([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02) → [D̅B̅E̅](([1mmax[[31mmfcc7[0m] [1m<[0m[0m -0.52) → ([1m#47[[33mmel23=3037Hz[0m] [1m≥[0m[0m 0.0)))  ↣  Healthy : (ninstances = 101, ncovered = 6, coverage = 0.06, confidence = 0.83, lift = 1.75, natoms = 7)
[34m▣[0m ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02) ∧ ⟨G⟩([1mmax[[31mmfcc9[0m] [1m≥[0m[0m 0.61)) ∧ [G](([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02) → ([1mmax[[36msprd[0m] [1m<[0m[0m 711.24)) ∧ [G](([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02) → [D̅B̅E̅]([1mmax[[31mmfcc7[0

### Directly access rule metrics

In [22]:
readmetrics.(listrules(sole_dt; min_lift=1.0, min_ninstances = 0))

12-element Vector{@NamedTuple{ninstances::Int64, ncovered::Int64, coverage::Float64, confidence::Float64, lift::Float64, natoms::Int64}}:
 (ninstances = 101, ncovered = 26, coverage = 0.25742574257425743, confidence = 0.9615384615384616, lift = 2.0232371794871797, natoms = 2)
 (ninstances = 101, ncovered = 6, coverage = 0.0594059405940594, confidence = 0.8333333333333334, lift = 1.7534722222222223, natoms = 7)
 (ninstances = 101, ncovered = 4, coverage = 0.039603960396039604, confidence = 1.0, lift = 2.104166666666667, natoms = 6)
 (ninstances = 101, ncovered = 10, coverage = 0.09900990099009901, confidence = 0.6, lift = 1.1433962264150943, natoms = 9)
 (ninstances = 101, ncovered = 2, coverage = 0.019801980198019802, confidence = 1.0, lift = 1.9056603773584906, natoms = 13)
 (ninstances = 101, ncovered = 1, coverage = 0.009900990099009901, confidence = 1.0, lift = 2.104166666666667, natoms = 16)
 (ninstances = 101, ncovered = 3, coverage = 0.0297029702970297, confidence = 1.0, lift = 

### Show rules with an additional metric (syntax height of the rule's antecedent)

In [23]:
printmodel.(sort(interesting_rules, by = readmetrics); show_metrics = (; round_digits = nothing, additional_metrics = (; height = r->SoleLogics.height(antecedent(r)))), variable_names_map=variable_names);

[34m▣[0m ⟨G⟩([1mmax[[33mmel2=421Hz[0m] [1m≥[0m[0m 0.10474932720736443) ∧ [G]([1mmax[[31mmfcc3[0m] [1m≥[0m[0m 0.02108224501437627) ∧ [G]([1mmax[[31mmfcc2[0m] [1m≥[0m[0m 2.0456837533901515) ∧ [G]([1mmax[[36mentrp[0m] [1m<[0m[0m 0.6343648752658164)  ↣  Pneumonia : (ninstances = 101, ncovered = 1, coverage = 0.009900990099009901, confidence = 1.0, lift = 1.9056603773584906, natoms = 4, height = 4)
[34m▣[0m ⟨G⟩(([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) ∧ ⟨G⟩(([1mmax[[31mmfcc13[0m] [1m≥[0m[0m 0.15322413941387783) ∧ ([1mmin[[31mmfcc7[0m] [1m<[0m[0m -0.28787600284298753))) ∧ [G](([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) → ([1mmax[[36msprd[0m] [1m<[0m[0m 711.2412109356288)) ∧ [G](([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) → [D̅B̅E̅]([1mmax[[31mmfcc7[0m] [1m≥[0m[0m -0.5152085686365828)) ∧ [G](([1mmax[[31mmfcc3[0m] [1m<[0m[0m 0.02108224501437627) → [G]([1mmax[[31mmfcc9[0m] [1m<[0m[

### Pretty table of rules and their metrics

In [24]:
metricstable(interesting_rules; variable_names_map=variable_names, metrics_kwargs = (; round_digits = nothing, additional_metrics = (; height = r->SoleLogics.height(antecedent(r)))))

┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────┬────────────┬──────────┬────────────┬────────────┬─────────┬────────┬────────┐
│[33;1m                                                                                                                                                                                                                                           