In [1]:
"""Testing script for integration."""

import sys
from pathlib import Path
import os

from matilda import InstanceSpace
from matilda.data import metadata, options
from matilda.stages.cloister import CloisterStage
from matilda.stages.pilot import PilotStage
from matilda.stages.prelim import PrelimStage
from matilda.stages.preprocessing import PreprocessingStage
from matilda.stages.pythia import PythiaStage
from matilda.stages.sifted import SiftedStage
from matilda.stages.trace import TraceStage


In [2]:
script_dir = Path(os.path.abspath('')) / "tests" / "test_data" / "demo"

metadata_path = script_dir / "metadata.csv"
options_path = script_dir / "options.json"

metadata_object = metadata.from_csv_file(metadata_path)
options_object = options.from_json_file(options_path)

if metadata_object is None or options_object is None:
    print("ERR: File reading failed!")
    sys.exit()

In [3]:
instance_space = InstanceSpace(
    metadata_object,
    options_object,
    stages=[
        PreprocessingStage,
        PrelimStage,
        SiftedStage,
        PilotStage,
        PythiaStage,
        CloisterStage,
        TraceStage,
    ],
)

print(instance_space._runner._stage_order)  # noqa: SLF001
print(instance_space._runner.run_stage[0])
instance_space.build()



[[<class 'matilda.stages.preprocessing.PreprocessingStage'>], [<class 'matilda.stages.prelim.PrelimStage'>], [<class 'matilda.stages.sifted.SiftedStage'>], [<class 'matilda.stages.pilot.PilotStage'>], [<class 'matilda.stages.cloister.CloisterStage'>, <class 'matilda.stages.pythia.PythiaStage'>], [<class 'matilda.stages.trace.TraceStage'>]]
[DEBUG]: running PreprocessingStage
---------------------------------------------------
---------------------------------------------------
[DEBUG]: running PrelimStage
-------------------------------------------------------------------------
-> Calculating the binary measure of performance
-> Minimizing performance.
An algorithm is good if its performance is less than 0.2
-> For 11 % of the instances there is more than one best algorithm.
Random selection is used to break ties.
beta_threshold: 0.55
nalgos: 10
num_good_algos: [ 0  8 10  0  6  8  0  0  0  0  0  0  1  0  0  0  1  4  8  7  5  8  3  0
  0  0 10 10 10  1 10 10  0  8  8  8  9  9  9  1  0 1



Cost value of the GA algorithm is:  0.3541528239202658
-> Keeping 6 out of 10 features (clustering).
Bydensity value is : False
[DEBUG]: running PilotStage
  -> PILOT is using random starting points for BFGS.
-------------------------------------------------------------------------
  -> PILOT is solving numerically the projection problem.
  -> This may take a while. Trials will not be run sequentially.
-------------------------------------------------------------------------
Pilot has completed trial 1
Pilot has completed trial 2


[32m2024-10-04 14:28:41.143[0m | [1mINFO    [0m | [36mmatilda.stages.cloister[0m:[36mcloister[0m:[36m180[0m - [1m  -> CLOISTER is using correlation to estimate a boundary for the space.[0m


Pilot has completed trial 3
Pilot has completed trial 4
Pilot has completed trial 5
-------------------------------------------------------------------------
  -> PILOT has completed. The projection matrix A is:
[[ 0.2195   0.3127  -0.3413   0.2668  -0.04034 -0.3083 ]
 [ 0.2517   0.03836 -0.4438  -0.05133  0.4368  -0.06384]]
[DEBUG]: running CloisterStage


[32m2024-10-04 14:28:41.177[0m | [1mINFO    [0m | [36mmatilda.stages.cloister[0m:[36mcloister[0m:[36m195[0m - [1m-----------------------------------------------------------------[0m
[32m2024-10-04 14:28:41.179[0m | [1mINFO    [0m | [36mmatilda.stages.cloister[0m:[36mcloister[0m:[36m196[0m - [1m  -> CLOISTER has completed.[0m


[DEBUG]: running PythiaStage
-> Summoning PYTHIA to train the prediction models.
  -> Initializing PYTHIA.
 => PYTHIA is using gaussian kernel
-------------------------------------------------------------------------
 -> PYTHIA is using grid search for hyper-parameter optimization.
 -> PYTHIA is not using cost-sensitive classification.
-------------------------------------------------------------------------
  -> Using a 5-fold stratified cross-validation experiment to evaluate the SVMs.
-------------------------------------------------------------------------
  -> Training has started. PYTHIA may take a while to complete...
    -> PYTHIA has trained a model for 'algo_NB',there are 9 models left to train.
      -> Elapsed time: 1.81s
    -> PYTHIA has trained a model for 'algo_LDA',there are 8 models left to train.
      -> Elapsed time: 1.27s
    -> PYTHIA has trained a model for 'algo_QDA',there are 7 models left to train.
      -> Elapsed time: 1.26s
    -> PYTHIA has trained a mode

  if (good_elements / elements) < self.opts.purity:


    -> Best performance footprint for 'algo_QDA'
        -> There are not enough instances to calculate a footprint.
        -> The subset of instances used is too small.
    -> Algorithm 'algo_QDA' completed. Elapsed time: 0.60s
    -> Best performance footprint for 'algo_poly_SVM'
        -> There are not enough instances to calculate a footprint.
        -> The subset of instances used is too small.
    -> Algorithm 'algo_poly_SVM' completed. Elapsed time: 2.80s
    -> Best performance footprint for 'algo_NB'


  if (good_elements / elements) < self.opts.purity:


    -> Algorithm 'algo_NB' completed. Elapsed time: 4.73s
    -> Best performance footprint for 'algo_LDA'
        -> There are not enough instances to calculate a footprint.
        -> The subset of instances used is too small.
    -> Algorithm 'algo_LDA' completed. Elapsed time: 4.75s
    -> Best performance footprint for 'algo_RBF_SVM'
        -> There are not enough instances to calculate a footprint.
        -> The subset of instances used is too small.
    -> Algorithm 'algo_RBF_SVM' completed. Elapsed time: 4.83s
    -> Best performance footprint for 'algo_KNN'
    -> Best performance footprint for 'algo_L_SVM'
    -> Best performance footprint for 'algo_J48'
    -> Best performance footprint for 'algo_RandF'
        -> There are not enough instances to calculate a footprint.
        -> The subset of instances used is too small.
    -> Algorithm 'algo_RandF' completed. Elapsed time: 4.88s
    -> Best performance footprint for 'algo_CART'


  if (good_elements / elements) < self.opts.purity:


    -> Algorithm 'algo_KNN' completed. Elapsed time: 4.99s
    -> Algorithm 'algo_J48' completed. Elapsed time: 5.01s
    -> Algorithm 'algo_CART' completed. Elapsed time: 5.02s
    -> Algorithm 'algo_L_SVM' completed. Elapsed time: 5.10s
------------------------------------------------------------------------
  -> TRACE is detecting and removing contradictory sections of the footprints.
  -> Base algorithm 'algo_NB'
      -> TRACE is comparing 'algo_NB' with 'algo_LDA'
      -> Test algorithm 'algo_LDA' completed. Elapsed time: 0.00s
      -> TRACE is comparing 'algo_NB' with 'algo_QDA'
      -> Test algorithm 'algo_QDA' completed. Elapsed time: 0.00s
      -> TRACE is comparing 'algo_NB' with 'algo_CART'
      -> Test algorithm 'algo_CART' completed. Elapsed time: 0.01s
      -> TRACE is comparing 'algo_NB' with 'algo_J48'
      -> Test algorithm 'algo_J48' completed. Elapsed time: 0.01s
      -> TRACE is comparing 'algo_NB' with 'algo_KNN'
      -> Test algorithm 'algo_KNN' complete

  if (good_elements / elements) < self.opts.purity:


Model(data=Data(inst_labels=0            abalone
1        abalone_ori
2       appendicitis
3           asbestos
4      audiology_std
           ...      
207     wine_quality
208       winsconsin
209             wpbc
210      wpbc_no_Nas
211              zoo
Name: Instances, Length: 212, dtype: object, feat_labels=['feature_Max_Normalized_Entropy_attributes', 'feature_Mean_Mutual_Information_Attribute_Class', 'feature_WeightedDist_StdDev', 'feature_Max_Feature_Efficiency_F3', 'feature_Training_Error_Linear_Classifier_L2', 'feature_Nonlinearity_Nearest_Neighbor_Classifier_N4'], algo_labels=['algo_NB', 'algo_LDA', 'algo_QDA', 'algo_CART', 'algo_J48', 'algo_KNN', 'algo_L_SVM', 'algo_poly_SVM', 'algo_RBF_SVM', 'algo_RandF'], x=array([[ 0.29595574,  0.88648305,  0.70802719, -0.89257442,  1.05554782,
         1.27513603],
       [ 0.29595574,  1.39841088,  0.70802719,  0.85462078, -1.16349627,
         1.46082763],
       [-0.50239617, -0.00661317, -0.7012761 ,  0.17193697,  0.60055933,
    

In [7]:
model = instance_space.model

In [12]:
model.data.x

array([[ 0.29595574,  0.88648305,  0.70802719, -0.89257442,  1.05554782,
         1.27513603],
       [ 0.29595574,  1.39841088,  0.70802719,  0.85462078, -1.16349627,
         1.46082763],
       [-0.50239617, -0.00661317, -0.7012761 ,  0.17193697,  0.60055933,
         0.46039867],
       ...,
       [-0.34534952, -1.17389672, -0.60505205, -0.33965518,  0.89856907,
         0.70864563],
       [-0.31895999, -1.18196258, -0.60505205, -0.32163725,  0.89671491,
         0.72684568],
       [-0.31954114,  1.47581912, -0.45035181,  1.39240367, -1.12804226,
        -1.37872698]])