In [None]:
using CSV
using DataFrames
using MLJ
using Plots

#Pkg.add("MLJDecisionTreeInterface")

In [None]:
# PURPOSE: TEST DIFFERENT MLJ MODELS ON PREPROCESSED DATA

In [None]:
# Basic configuration ---
# Root directory of the project. It can be overridden per machine through the
# MATURITE_DENTAIRE_DIR environment variable; when that variable is unset the
# original hard-coded location is used, so existing setups keep working.
base_path = get(
    ENV,
    "MATURITE_DENTAIRE_DIR",
    "/home/camilodlt/Downloads/CM_ML/TP5/Maturite_dentaire",
)

In [None]:
# Report the current working directory and switch to `base_path` when the
# notebook was started from somewhere else.
cwd = pwd()
# `println` (not `print`) so each status message ends up on its own line.
println("Current wd: ", cwd)
if cwd != base_path
    println("Changing cwd to:$base_path")
    cd(base_path);
end

In [None]:
# Data configuration ---
# Location of the imputed CSV, relative to `base_path` (the two are joined when
# the file is read).
imputed_path= "data/ordinalEncoder_imputed_knn/knn_imputed.csv";

In [None]:
# Resolve the CSV location against `base_path`, then load it as a DataFrame.
csv_file = joinpath(base_path, imputed_path)
imputed = CSV.read(csv_file, DataFrame)

In [None]:
# Print the detailed per-column summary of the dataset.
# all columns are numeric. Have been imputed before
summary_table = describe(imputed, :detailed)
print(summary_table)

In [None]:
# Report the dataset dimensions as (rows, columns).
n_0, n_1 = size(imputed)
print("Shape: ($n_0,$n_1)")

In [None]:
# Show the MLJ schema of the loaded table.
imputed |> MLJ.schema

In [None]:
# Columns that are coerced to the OrderedFactor scientific type.
to_coerce_ordered = ["PAT_SEX"]

# Coerce each listed column in place, then display its levels and the
# resulting schema of the whole table.
foreach(to_coerce_ordered) do col
    display("Coercing column $col to OrderedFactor")
    raw_values = imputed[!, col]
    imputed[!, col] = MLJ.coerce(raw_values, OrderedFactor)

    display("--- Levels of the converted column ---")
    display(levels(imputed[!, col]))

    display("--- New MLJ schema ---")
    display(MLJ.schema(imputed))
end

### TRAIN TEST SPLIT 

In [None]:
# Separate the target column (PAT_AGE) from the remaining feature columns,
# shuffling the rows with a fixed seed.
y, X = unpack(imputed, ==(:PAT_AGE); rng=1234, shuffle=true);

# Split features and target together, with 0.8 of the rows going to the first part.
(Xtrain, Xtest), (ytrain, ytest) = partition(
    (X, y), 0.8; multi=true, rng=1234, shuffle=true
);

# Train shape
n_0, n_1 = size(Xtrain)
display("Train Shape: ($n_0,$n_1)")

# Test shape
n_0, n_1 = size(Xtest)
display("Test Shape: ($n_0,$n_1)")

### MODEL SEARCH 

List the MLJ models whose input and target requirements match this data (`X`, `y`).

In [None]:
# Display the name of every model matching (X, y), padded to 30 characters,
# followed by the package that provides it.
foreach(models(matching(X, y))) do meta
    display(string(rpad(meta.name, 30), "Package : $(meta.package_name)"))
end

### Load models 

In [None]:
# Look up the load path of the DecisionTree random-forest regressor.
MLJ.load_path("RandomForestRegressor"; pkg="DecisionTree")

In [None]:
# Look up the load path of the DecisionTree decision-tree regressor.
MLJ.load_path("DecisionTreeRegressor"; pkg="DecisionTree")

#### Train Decision tree 

In [None]:
# Load the decision-tree regressor from the DecisionTree package and bind the
# result to `Tree`.
Tree = @load DecisionTreeRegressor pkg=DecisionTree
# Instantiate it without arguments, i.e. with its default hyperparameters.
tree = Tree()

# Bind the model to the training split; nothing is trained at this point.
mach = machine(tree, Xtrain, ytrain)

In [None]:
# Train the decision tree on the data bound to `mach`.
MLJ.fit!(mach)

In [None]:
# Inspect what the trained decision tree learned.
MLJ.fitted_params(mach)

In [None]:
# Cross-validate the decision tree with 5 folds on the data bound to `mach`,
# reporting both `rms` and `mae`.
cv = CV(; nfolds=5)
evaluate!(mach; resampling=cv, measure=[rms, mae], verbosity=0)

#### Train RF 

In [None]:
# Load the random-forest regressor from the DecisionTree package and bind the
# result to `RF`.
RF = @load RandomForestRegressor pkg=DecisionTree
# Instantiate it without arguments, i.e. with its default hyperparameters.
rf = RF()
# Show the model's documentation (lists the available hyperparameters).
@doc RF

In [None]:
# Options ---
# One search range per random-forest hyperparameter. Every range must target a
# distinct field of `rf`; in particular the leaf-size range is built on
# :min_samples_leaf (not :max_depth), otherwise that hyperparameter is never
# tuned and :max_depth ends up with two conflicting ranges.
param_max_depth = range(rf, :max_depth; upper=20, lower=-1)
param_min_samples_leaf = range(rf, :min_samples_leaf; upper=200, lower=1)
param_min_samples_split = range(rf, :min_samples_split; upper=30, lower=2)
param_n_subfeatures = range(rf, :n_subfeatures; upper=0, lower=-1)
param_n_trees = range(rf, :n_trees; upper=4000, lower=50)
param_sampling_fraction = range(rf, :sampling_fraction; upper=0.9, lower=0.5)

# Ranges ---
# Collected in a single vector so they can be handed to the tuning wrapper together.
params_rf = [
    param_max_depth,
    param_min_samples_leaf,
    param_min_samples_split,
    param_n_subfeatures,
    param_n_trees,
    param_sampling_fraction
    ]
#mach_rf = machine(rf, Xtrain, ytrain)

In [None]:
# Wrap the random forest in a self-tuning model: grid search (`resolution=2`)
# over `params_rf`, scored with `mae` under 2-fold cross-validation.
tuning_forest = TunedModel(;
    model=rf,
    range=params_rf,
    tuning=Grid(; resolution=2),
    resampling=CV(; nfolds=2, rng=123),
    measure=mae,
);

# Bind the tuning wrapper to the training split.
mach_tuned = machine(tuning_forest, Xtrain, ytrain);


In [None]:
# Run the hyperparameter search and train the tuned random forest.
MLJ.fit!(mach_tuned)

In [None]:
# Plot the outcome of the tuning run.
Plots.plot(mach_tuned)

In [None]:
#fitted_params(mach_rf) 

In [None]:
#evaluate!(mach_rf, resampling=cv,measure=[rms, mae], verbosity=1)

In [None]:
# Show the `history` field of the tuning report.
tuning_report = MLJ.report(mach_tuned)
tuning_report.history