In [None]:
%load_ext autoreload 
%autoreload 2
import numpy as np
import benchml as bml
# NOTE(review): presumably fixes the seed/state used by benchml's split
# generators so the random splits below are reproducible — confirm in benchml.
bml.splits.synchronize(0)
# Short alias for benchml's `log` object; handed to timer.report() further down.
log = bml.log

## Defining new models

In [None]:
def build_linear():
    """Assemble the "ExttLinear" pipeline.

    Wiring: ``ExttInput`` -> ``CleanMatrix`` -> ``LinearRegression``.
    The regression reads its design matrix from ``CleanMatrix.X`` and its
    targets from ``ExttInput.Y``; its ``y`` field is exposed as the module
    output ``"y"``.

    Returns:
        The object produced by ``bml.pipeline.Module(...)``.
    """
    source = bml.transforms.ExttInput()
    cleaner = bml.transforms.CleanMatrix(inputs={"X": "ExttInput.X"})
    regressor = bml.transforms.LinearRegression(
        inputs={"X": "CleanMatrix.X", "y": "ExttInput.Y"}
    )
    return bml.pipeline.Module(
        tag="ExttLinear",
        transforms=[source, cleaner, regressor],
        outputs={"y": "LinearRegression.y"},
    )

def build_ridge():
    """Assemble the "ExttRidge" pipeline with a hyperparameter search on alpha.

    Wiring: ``ExttInput`` -> ``CleanMatrix`` -> ``Ridge``; ``Ridge.y`` is
    exposed as the module output ``"y"``.

    The searched quantity is the base-10 exponent of ``Ridge.alpha``: the grid
    holds 15 evenly spaced exponents from -7 to 7, and the ``convert`` entry
    maps an exponent ``p`` to ``10**p``.

    Returns:
        The object produced by ``bml.pipeline.Module(...)``.
    """
    source = bml.transforms.ExttInput()
    cleaner = bml.transforms.CleanMatrix(inputs={"X": "ExttInput.X"})
    regressor = bml.transforms.Ridge(
        inputs={"X": "CleanMatrix.X", "y": "ExttInput.Y"}
    )

    # Exponent grid first, then the search wrapper that converts it to alpha.
    exponent_grid = bml.transforms.Hyper({"Ridge.alpha": np.linspace(-7, 7, 15)})
    search = bml.transforms.BayesianHyper(
        exponent_grid,
        convert={"Ridge.alpha": (lambda exponent: 10**exponent)},
    )

    return bml.pipeline.Module(
        tag="ExttRidge",
        transforms=[source, cleaner, regressor],
        hyper=search,
        outputs={"y": "Ridge.y"},
    )

def build_mp():
    """Assemble the "ExttMarchenkoLinear" pipeline.

    Wiring: ``ExttInput`` -> ``CleanMatrix`` -> ``MarchenkoPasturFilter`` ->
    ``LinearRegression``. The regression is fed the filtered matrix
    (``MarchenkoPasturFilter.X``) and the targets ``ExttInput.Y``; its ``y``
    field is exposed as the module output ``"y"``.

    No ``hyper`` argument is passed to the module.

    Returns:
        The object produced by ``bml.pipeline.Module(...)``.
    """
    source = bml.transforms.ExttInput()
    cleaner = bml.transforms.CleanMatrix(
        args={"axis": 0, "std_threshold": 1e-10},
        inputs={"X": "ExttInput.X"},
    )
    mp_filter = bml.transforms.MarchenkoPasturFilter(
        args={"monomials": [1, 2]},
        inputs={"X": "CleanMatrix.X"},
    )
    regressor = bml.transforms.LinearRegression(
        inputs={"X": "MarchenkoPasturFilter.X", "y": "ExttInput.Y"}
    )
    return bml.pipeline.Module(
        tag="ExttMarchenkoLinear",
        transforms=[source, cleaner, mp_filter, regressor],
        outputs={"y": "LinearRegression.y"},
    )

def build_rf():
    """Assemble the "ExttRandomForest" pipeline with a grid search on depth.

    Wiring: ``ExttInput`` -> ``RandomForestRegressor``. The regressor is
    given the tag ``"predictor"``, so the hyperparameter key and the module
    output address it under that name (``predictor.max_depth``,
    ``predictor.y``). It is constructed with ``n_estimators=100`` and the grid
    covers ``max_depth`` in ``[2, 4, 8]``.

    Returns:
        The object produced by ``bml.pipeline.Module(...)``.
    """
    source = bml.transforms.ExttInput()
    forest = bml.transforms.RandomForestRegressor(
        tag="predictor",
        args={"n_estimators": 100},
        inputs={"X": "ExttInput.X", "y": "ExttInput.Y"},
    )
    depth_grid = bml.transforms.Hyper({"predictor.max_depth": [2, 4, 8]})
    search = bml.transforms.GridHyper(depth_grid)
    return bml.pipeline.Module(
        tag="ExttRandomForest",
        transforms=[source, forest],
        hyper=search,
        broadcast={},
        outputs={"y": "predictor.y"},
    )

def build_models():
    """Return freshly built instances of all four pipelines.

    Order of the returned list: linear, ridge, random forest,
    Marchenko-Pastur linear.
    """
    builders = (build_linear, build_ridge, build_rf, build_mp)
    return [make() for make in builders]


## Inspecting a model

In [None]:
# Build the Marchenko-Pastur pipeline; `model` is reused by the cells below.
model = build_mp()
print(model)

In [None]:
# List every entry returned by model.compileArgs(), one "name = value" row each.
args = model.compileArgs()
print("Args =")
# A plain loop (rather than a list comprehension bound to `_`) avoids building
# a throwaway list of None values and leaves IPython's `_` last-output
# variable untouched.
for arg_name, arg_value in args.items():
    print("  %-40s = %-10s" % (arg_name, arg_value))

In [None]:
# NOTE(review): presumably prints usage/help text for this pipeline — confirm.
model.showHelpMessage()

## Fit and hyperfit

In [None]:
def fit_evaluate_model(
        model,
        dataset,
        metrics=["mae", "r2", "rhop"],
        split=dict(method="random", n_splits=10, train_fraction=0.9)):
    """Fit `model` on repeated train/test splits and accumulate metrics.

    For every (train, test) pair yielded by the stream's ``split``, the model
    is fitted on the train part and then mapped over both parts. Predictions
    (output field ``"y"``) are collected against the reference values read
    from ``"ExttInput.Y"`` under the channels ``"train"`` and ``"test"``.

    Args:
        model: Pipeline object providing ``fit`` and ``map``.
        dataset: Dataset handed to ``bml.stream`` together with the model.
        metrics: Metric names passed to ``bml.Accumulator``.
        split: Keyword arguments forwarded to ``stream.split``.

    Returns:
        The result of ``Accumulator.evaluateAll(log=bml.log, bootstrap=100)``.

    Side effects:
        Prints "Accumulated metrics:" before evaluating.
    """
    # NOTE(review): `metrics` and `split` are mutable defaults shared across
    # calls; this function only reads them.
    scores = bml.Accumulator(metrics=metrics)
    with bml.stream(model, dataset) as data:
        for train, test in data.split(**split):
            model.fit(train)
            # Map both partitions first, then record them (train before test).
            predicted = {"train": model.map(train), "test": model.map(test)}
            for channel, reference in (("train", train), ("test", test)):
                scores.append(
                    channel, predicted[channel]["y"], reference["ExttInput.Y"])
        print("Accumulated metrics:")
        res = scores.evaluateAll(log=bml.log, bootstrap=100)
    return res

In [None]:
# Load the dataset once; later cells reuse `dataset`.
dataset = bml.load_dataset("ising.extt")
# `model` is the build_mp() pipeline created in the "Inspecting a model" section.
res = fit_evaluate_model(model, dataset)

In [None]:
# Build fresh instances of every pipeline and evaluate each with plain fitting,
# timing each one under a stage named after the model's tag.
models = build_models()
timer = bml.utils.StagedTimer()
for m in models:
    print("Evaluate model '%s'" % m.tag)
    with timer.time(m.tag):
        # `res` is overwritten on every iteration; only the last model's result survives.
        res = fit_evaluate_model(m, dataset)
print("Wall times:")
timer.report(log)

In [None]:
def hyperfit_evaluate_model(
        model,
        dataset,
        metrics=["mae", "r2", "rhop"],
        split=dict(method="random", n_splits=10, train_fraction=0.9),
        hypersplit=dict(method="random", n_splits=10, train_fraction=0.8)):
    """Evaluate `model` on repeated splits, hyperfitting when it has a search.

    For every (train, test) pair yielded by the stream's ``split``:
    if ``model.hyper`` is ``None`` the model is fitted directly on the train
    part; otherwise ``model.hyperfit`` is run on the train part, using
    ``hypersplit`` as its ``split_args`` and the first entry of ``metrics`` as
    its selection metric. The model is then mapped over both parts and the
    ``"y"`` predictions are collected against ``"ExttInput.Y"`` under the
    channels ``"train"`` and ``"test"``.

    Args:
        model: Pipeline object providing ``hyper``, ``fit``, ``hyperfit``, ``map``.
        dataset: Dataset handed to ``bml.stream`` together with the model.
        metrics: Metric names passed to ``bml.Accumulator``; ``metrics[0]`` is
            also passed to ``hyperfit`` via ``accu_args``.
        split: Keyword arguments forwarded to the outer ``stream.split``.
        hypersplit: Passed to ``model.hyperfit`` as ``split_args``.

    Returns:
        The result of ``Accumulator.evaluateAll(log=bml.log, bootstrap=100)``.

    Side effects:
        Prints "Accumulated metrics:" before evaluating.
    """
    # NOTE(review): the list/dict defaults are shared across calls; this
    # function only reads them.
    scores = bml.Accumulator(metrics=metrics)
    with bml.stream(model, dataset) as data:
        for train, test in data.split(**split):
            if model.hyper is None:
                # No search attached: plain fit on the training partition.
                model.fit(stream=train)
            else:
                model.hyperfit(
                    stream=train,
                    split_args=hypersplit,
                    accu_args={"metric": metrics[0]},
                    target="y",
                    target_ref="ExttInput.Y",
                    log=None,
                )
            # Map both partitions first, then record them (train before test).
            predicted = {"train": model.map(train), "test": model.map(test)}
            for channel, reference in (("train", train), ("test", test)):
                scores.append(
                    channel, predicted[channel]["y"], reference["ExttInput.Y"])
        print("Accumulated metrics:")
        res = scores.evaluateAll(log=bml.log, bootstrap=100)
    return res

In [None]:
# Rebuild every pipeline (rebinding `models` and `timer`) and evaluate each
# through hyperfit_evaluate_model, timing each one under its tag.
models = build_models()
timer = bml.utils.StagedTimer()
for m in models:
    print("Evaluate model '%s'" % m.tag)
    with timer.time(m.tag):
        # `res` is overwritten on every iteration; only the last model's result survives.
        res = hyperfit_evaluate_model(m, dataset)

In [None]:
# Report the stages recorded on the current `timer`, using the `log` alias of bml.log.
timer.report(log)

In [None]:
# Materialize whatever bml.transforms.get_all() yields into a list.
# NOTE(review): presumably every transform available in benchml — confirm.
trafos = list(bml.transforms.get_all())

In [None]:
# Print each collected transform, then how many there are.
# A plain loop (rather than a list comprehension bound to `_`) avoids building
# a throwaway list of None values and leaves IPython's `_` last-output
# variable untouched.
for trafo in trafos:
    print(trafo)
print(len(trafos))

In [None]:
# Final model: rebuild the Marchenko-Pastur pipeline and train it on the whole
# dataset stream (no outer train/test split in this cell).
model = build_mp()
with bml.stream(model, dataset) as stream:
    if model.hyper is not None:
        # Hyperparameter search with 10-fold splitting, selected on "mae".
        model.hyperfit(
            stream=stream,
            split_args=dict(method="kfold", k=10),
            accu_args=dict(metric="mae"),
            target="y",
            target_ref="ExttInput.Y",
            log=None)
    else:
        # NOTE(review): build_mp() passes no `hyper=` argument, so this branch
        # is presumably the one taken — confirm Module's default for `hyper`.
        model.fit(stream)
# Persist the trained model under the relative path "model.arch".
bml.save("model.arch", model)

In [None]:
# Round trip: reload the model from "model.arch" (rebinding `model`) and apply
# it to the whole dataset stream.
# NOTE(review): if bml.load deserializes via pickle, only load archives from
# trusted sources — confirm how benchml stores models.
model = bml.load("model.arch")
with bml.stream(model, dataset) as stream:
    output = model.map(stream)
print(output)