# Real World Applications

In [None]:
# Set up the package environment used by this notebook.
using Pkg
# Activate the project located in the parent directory.
Pkg.activate("..")
# Install the dependencies recorded for that project.
Pkg.instantiate()
# NOTE(review): updating on every run may move packages away from the versions
# that were just instantiated — confirm this is intended.
Pkg.update()

## Identifying academically vulnerable learners in first-year science programmes 
[
    Identifying academically vulnerable learners in first-year science
    programmes at a South African higher-education institution
](
    https://sacj.cs.uct.ac.za/index.php/sacj/article/view/832
)

In [None]:
using ARFFFiles
using DataFrames

# Location of the ARFF dataset, relative to this notebook.
dataset_path = joinpath("..", "datasets", "academically-vulnerable-learners.arff")

# Read the ARFF file straight into a DataFrame.
data = ARFFFiles.load(DataFrame, dataset_path)

# Summary of every attribute of the table.
describe(data)

Oh no! Some attributes have many missing values!

In [None]:
# Collect (attribute name, number of missing values) for every attribute that
# has at least one missing value.
attributes_with_missings = Tuple{String, Int}[
    (attribute_name, count(ismissing, column))
    for (attribute_name, column) in zip(names(data), eachcol(data))
    if any(ismissing, column)
]

# Attributes with the most missing values come first.
sort!(attributes_with_missings; by = last, rev = true)

Some preprocessing is required: let's remove the columns with missing values.

In [None]:
using Impute

# Filter `data` column-wise (`dims=:cols`) to discard the attributes that
# contain missing values.
data_nomissing = Impute.filter(data; dims=:cols)

# Summary of the remaining attributes.
describe(data_nomissing)

In [None]:
using MLJ

# Inspect the column names and types of the cleaned table.
schema(data_nomissing)

Let's see which kind of models we could use...

In [None]:
# Separate the "Risk Status" column (`y`) from the rest of the table (`X`).
y, X = unpack(data_nomissing, colname -> colname == Symbol("Risk Status"))

# List the models that can handle data of this kind.
models(matching(X, y))

Too bad! Most models don't work with categorical values out of the box...

This includes the `DecisionTreeClassifier` from `DecisionTree.jl`!

Hence, we first need to encode these values as numerical values.

One possibility is to convert the type of the associated features from
`Multiclass` to `Continuous` or `OrderedFactor`.

In [None]:
# The target is coerced to `OrderedFactor` first (inner call), so that the
# subsequent `Multiclass => Continuous` coercion (outer call) only affects the
# columns that are still `Multiclass`.
data_preprocessed = coerce(
    coerce(data_nomissing, "Risk Status" => OrderedFactor),
    Multiclass => Continuous,
)

schema(data_preprocessed)

Let's have a look at the data...

In [None]:
# Separate the "Risk Status" target `y` from the features `X` of the
# preprocessed table.
y, X = unpack(data_preprocessed, ==(Symbol("Risk Status")))

Great! We can now use a `DecisionTreeClassifier` like in our example!

In [None]:
# List the models compatible with `X` and `y` now that the features are encoded.
models(matching(X,y))

Let's first choose a random sample from our dataset: we will use it later to
evaluate our model.

In [None]:
# Shuffle the rows so that the split below is a random sample.
data_shuffled = shuffle(data_preprocessed)
y, X = unpack(data_shuffled, ==(Symbol("Risk Status")))

# Train on the first 600 shuffled rows and keep rows 601-800 for evaluation.
# NOTE(review): rows beyond 800, if any, are left unused.
rows_train, rows_test = 1:600, 601:800
X_train, X_test = X[rows_train, :], X[rows_test, :]
y_train, y_test = y[rows_train], y[rows_test];

Let's try to work following the pipeline we learned this week!

In [None]:
# Load the DecisionTreeClassifier model code from the DecisionTree package.
# Re-running this cell may fail once the symbol has already been imported, so
# the failure is tolerated; the caught exception is reported instead of being
# silently discarded, so that genuine problems (e.g. the package is not
# installed) are not mistaken for a harmless re-import.
try
    DecisionTreeClassifier = @load DecisionTreeClassifier pkg=DecisionTree
catch err
    @warn(
        "Loading DecisionTreeClassifier failed; the symbol may already have been imported.",
        exception = (err, catch_backtrace()),
    )
end

In [None]:
# Instantiate the classifier through its module-qualified name.
# NOTE(review): the qualified name is presumably needed because the binding
# assigned inside the `try` block is not visible here — confirm.
model = MLJDecisionTreeInterface.DecisionTreeClassifier()

In [None]:
# Bind the model to the training data in an MLJ machine.
mach = machine(model, X_train, y_train)

In [None]:
# Train the machine on the data it was bound to.
fit!(mach)

In [None]:
# Inspect the `tree` field of the fitted parameters.
fitted_params(mach).tree

In [None]:
# Predict on the held-out test features.
y_predict_probabilities = predict(mach, X_test)
# Reduce every prediction to its mode.
y_predict = mode.(y_predict_probabilities)
# Confusion matrix: predictions are passed first, ground truth second.
cm = confusion_matrix(y_predict, y_test)

In [None]:
# Accuracy computed from the confusion matrix above.
accuracy(cm)

## Interpretable land cover classification with modal decision trees
[
    Interpretable land cover classification with modal decision trees
](
    https://www.tandfonline.com/doi/pdf/10.1080/22797254.2023.2262738
)

To run this section of the notebook, you first need to download the following
datasets and place them in the `datasets/paviaU` folder of the repository
(`../datasets/paviaU` relative to this notebook):
- [Pavia University](https://www.ehu.eus/ccwintco/uploads/e/ee/PaviaU.mat)
- [Pavia University GT](https://www.ehu.eus/ccwintco/uploads/5/50/PaviaU_gt.mat)


In [None]:
# Load the land-cover helper script; presumably it defines `LandCoverDataset` — confirm.
include("../scripts/land-cover.jl")
# NOTE(review): `data_dir` is not passed to the call below; presumably the
# script reads it as a global — confirm.
data_dir = "../datasets/"

# Build the "Pavia University" dataset; the call returns the features (`X_df`)
# and the labels (`y`). The trailing `;` suppresses the cell output.
X_df, y = LandCoverDataset(
    "Pavia University";
    window_size          = 3,
    ninstances_per_class = 40,
    pad_window_size      = 5,
);

In [None]:
# Count how many instances carry each label.
countmap(y)

In [None]:
# Number of elements of every entry of `X_df`.
length.(X_df)

In [None]:
# Convert every entry of `X_df` to a `Matrix{Float64}`.
X_df = Matrix{Float64}.(X_df)

In [None]:
# Let's unwind the spatial axes: every entry of `X_df` is a window (a matrix),
# and each pixel of that window becomes a scalar column of its own.
X_df_static = Matrix(X_df)

# The window shape and element type are read from the data instead of being
# hard-coded.
window_height, window_width = size(first(X_df_static))
cols = Vector{eltype(first(X_df_static))}[]

for i_var in axes(X_df_static, 2)
    # Stack the windows of this variable along a third (instance) axis.
    var_unroll = cat(X_df_static[:, i_var]...; dims = 3)
    # `reshape` is column-major: row `k` of `pixels` holds pixel (i, j) with
    # k = i + window_height * (j - 1), i.e. the first index `i` varies fastest.
    pixels = reshape(var_unroll, (window_height * window_width, nrow(X_df)))
    append!(cols, collect.(eachrow(pixels)))
end

# The names must be generated in the same order as the columns above, that is
# with `i` varying fastest; otherwise off-diagonal pixels get swapped labels
# (e.g. the column named "[1][2]" would actually hold pixel (2, 1)).
# NOTE(review): assumes "[i][j]" means first index i, second index j — confirm.
colnames = [
    "$n[$i][$j]"
    for n in names(X_df) for j in 1:window_width for i in 1:window_height
]

# `cols` already owns freshly collected vectors, so no extra copy is needed.
X_df_static = DataFrame(cols, colnames; copycols = false)

In [None]:
using MultiData

# Combine the windowed table (`X_df`) and the flattened table (`X_df_static`)
# into a single multimodal dataset.
X_multimodal = MultiModalDataset([X_df, X_df_static])

In [None]:
using ModalDecisionTrees

# Modal decision tree configured with the `:RCC8` relations.
model = ModalDecisionTree(; relations = :RCC8)

In [None]:
# Train in cross-validation!
# Stratified 2-fold cross-validation, shuffled with a fixed seed; `@time`
# reports how long the whole evaluation takes. The measure compatibility check
# is switched off (`check_measure=false`).
e = @time evaluate!(machine(model, X_multimodal, y);
    resampling=StratifiedCV(rng = Random.Xoshiro(1), shuffle=true, nfolds = 2),
    measures=[accuracy],
    verbosity=0,
    check_measure=false
)

In [None]:
# Test accuracies per fold (one value for each of the two folds)
e.per_fold

In [None]:
# For every fold, call the fold report's `sprinkle` on the test instances of
# that fold and keep the tree it returns (the second returned value).
dtrees = map(zip(e.train_test_rows, e.report_per_fold)) do (fold_rows, fold_report)
    # Only the test rows of the fold are needed here.
    _, held_out_rows = fold_rows
    _, tree_test = fold_report.sprinkle(
        slicedataset(X_multimodal, held_out_rows),
        y[held_out_rows];
        simplify = true,
    )
    return tree_test
end

In [None]:
using SoleModels

# List the rules of every tree and concatenate them into a single vector.
ruleslist = vcat(listrules.(dtrees)...)

In [None]:
# Every symbolic model (including the rules in `ruleslist`) can have additional
# information attached
println(first(ruleslist))

# Show which pieces of information are attached to the first rule.
ruleinfo = SoleModels.info(first(ruleslist))
println(keys(ruleinfo))

In [None]:
# Number of supporting predictions attached to the first rule.
length(ruleinfo[:supporting_predictions])

In [None]:
# Metrics of every rule, from the highest to the lowest coverage.
sort(map(readmetrics, ruleslist); by = metrics -> metrics[:coverage], rev = true)

In [None]:
# Optional pre-filter, currently disabled:
# goodrules = filter(r->readmetrics(r)[:ninstances] > 1, ruleslist)

# Rank the rules from the highest to the lowest coverage...
goodrules = sort(ruleslist; by = rule -> readmetrics(rule)[:coverage], rev = true)
# ...and print each of them together with its metrics.
printmodel.(goodrules; show_metrics = true, threshold_digits = 4);