In [1]:
from mandala.imports import *
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from typing import Any
import numpy as np
# Seed NumPy's global (legacy) RNG before any model is trained.
np.random.seed(42)

# Single Storage instance shared by every cell below.
# NOTE(review): `deps_path='__main__'` presumably scopes mandala's dependency
# tracking to code defined in this notebook — confirm against the mandala docs.
storage = Storage(deps_path='__main__')

def exit_hook(storage: Storage):
    """Draw and display the computation frame for the furthest pipeline stage reached.

    The stage is inferred from which op names are present in
    ``storage.ops.cache``: the first "sentinel" op that is still missing
    decides which ops are visualised. When every sentinel is present, the
    frame is built from ``eval_ensemble`` and ``eval_model``.

    Ops selected for a stage but not (yet) present in the cache are skipped
    instead of raising ``KeyError``; if none of them is present the hook does
    nothing.

    Side effects:
        Writes the drawing to ``demo.svg`` and calls ``display`` on the
        resulting dataframe, with bulky array columns replaced by ``"..."``.

    Args:
        storage: the storage whose cached ops are inspected.
    """
    ops = storage.ops.cache

    # (sentinel op, ops to show while that sentinel has not been cached yet),
    # listed in pipeline order.
    stages = (
        ("scale_data", ("get_data",)),
        ("get_train_test_split", ("scale_data", "get_data")),
        ("train_svc", ("get_train_test_split",)),
        ("eval_model", ("train_svc", "train_random_forest")),
        ("eval_ensemble", ("eval_model",)),
    )
    # Default: every sentinel is cached, i.e. the whole pipeline has run.
    wanted = ("eval_ensemble", "eval_model")
    for sentinel, names in stages:
        if sentinel not in ops:
            wanted = names
            break

    # A stage may be only partially populated (e.g. `train_svc` ran but
    # `train_random_forest` has not): use only what is actually cached.
    available = [name for name in wanted if name in ops]
    if not available:
        return

    # Union the per-op frames left to right, preserving the listed order.
    cf = storage.cf(ops[available[0]])
    for name in available[1:]:
        cf = cf | storage.cf(ops[name])

    cf = cf.expand_all()
    cf.draw(path='demo.svg', verbose=True, show_how="none")
    df = cf.df(values='objs', include_calls=False)

    # Columns holding whole datasets are masked so the table stays readable.
    bad_cols = ['X', 'y', 'X_train', 'X_test', 'y_train', 'y_test', 'X_scaled']
    for col in bad_cols:
        if col in df:
            df[col] = "..."
    display(df)
# Register the hook on the storage's private `_exit_hooks` list.
# NOTE(review): presumably mandala invokes these hooks when a `with storage:`
# block exits — confirm; this relies on a private attribute.
storage._exit_hooks.append(exit_hook)

@op(output_names=["X", "y"])
def get_data():
    """Generate the two-moons toy dataset.

    Uses 1000 samples, noise 0.3 and a fixed ``random_state`` of 42.

    Returns:
        The ``(X, y)`` pair produced by ``make_moons``.
    """
    features, labels = make_moons(n_samples=1000, noise=0.3, random_state=42)
    return features, labels

@op(output_names=["X_train", "X_test", "y_train", "y_test"])
def get_train_test_split(X, y):
    """Split features and labels into train and test parts.

    20% of the rows go to the test part; the split uses a fixed
    ``random_state`` of 42.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test

@op(output_names=["X_scaled"])
def scale_data(X):
    """Standardize ``X`` with a freshly fitted ``StandardScaler``.

    The scaler is fitted on ``X`` itself and is not returned.

    Returns:
        The transformed features.
    """
    return StandardScaler().fit_transform(X)

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

@op(output_names=["svc_model"])
def train_svc(X_train, y_train, C: float = 1.0, kernel: str = "linear"):
    """Fit a support-vector classifier on the training data.

    Args:
        X_train: training features.
        y_train: training labels.
        C: forwarded to ``SVC`` as ``C``.
        kernel: forwarded to ``SVC`` as ``kernel``.

    Returns:
        The fitted ``SVC`` instance.
    """
    # `fit` returns the estimator itself, so the fitted model can be returned directly.
    return SVC(C=C, kernel=kernel).fit(X_train, y_train)

@op(output_names=["rf_model"])
def train_random_forest(X_train, y_train, n_estimators: int = 5, max_depth: int = 5):
    """Fit a random-forest classifier on the training data.

    Args:
        X_train: training features.
        y_train: training labels.
        n_estimators: forwarded to ``RandomForestClassifier``.
        max_depth: forwarded to ``RandomForestClassifier``.

    Returns:
        The fitted ``RandomForestClassifier`` instance.
    """
    # No `random_state` is passed to the classifier here.
    forest = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    return forest.fit(X_train, y_train)

@op(output_names=["accuracy",])
def eval_model(model, X_test, y_test):
    """Score a single fitted model on the test split.

    Args:
        model: object exposing ``predict(X_test)``.
        X_test: test features.
        y_test: true test labels.

    Returns:
        The value of ``accuracy_score(y_test, model.predict(X_test))``.
    """
    return accuracy_score(y_test, model.predict(X_test))

@op(output_names=["accuracy"])
def eval_ensemble(models: MList[Any], X_test, y_test):
    """Score the averaged prediction of several models on the test split.

    Each model predicts on ``X_test``; the predictions are averaged per
    sample and a sample is predicted positive only when that average is
    strictly greater than 0.5 (an average of exactly 0.5 counts as negative).

    Args:
        models: iterable of objects exposing ``predict(X_test)``.
        X_test: test features.
        y_test: true test labels.

    Returns:
        The value of ``accuracy_score`` for the thresholded average.
    """
    votes = np.asarray([member.predict(X_test) for member in models])
    majority = votes.mean(axis=0) > 0.5
    return accuracy_score(y_test, majority)

In [8]:
# Run the whole pipeline twice — once with standardized features, once with the
# raw ones — inside the `storage` context.
# NOTE(review): presumably the context is what makes mandala record the @op
# calls and fire the registered exit hook on exit — confirm.
with storage:
    for scale in (True, False):
        X, y = get_data()
        if scale:
            # only the features are rescaled; `y` is passed on unchanged
            X = scale_data(X=X)
        X_train, X_test, y_train, y_test = get_train_test_split(X=X, y=y)

        # One SVC per kernel, each scored individually. `svc_acc` is not read
        # again in this cell; only the models are collected.
        svc_models = []
        for kernel in ('linear', 'rbf', 'poly'):
            svc_model = train_svc(X_train=X_train, y_train=y_train, kernel=kernel)
            svc_acc = eval_model(model=svc_model, X_test=X_test, y_test=y_test)
            svc_models.append(svc_model)
        
        # One random forest per ensemble size, each scored individually.
        # `rf_acc` is likewise not read again in this cell.
        rf_models = []
        for n_estimators in (5, 10, 20):
            rf_model = train_random_forest(X_train=X_train, y_train=y_train, n_estimators=n_estimators)
            rf_acc = eval_model(model=rf_model, X_test=X_test, y_test=y_test)
            rf_models.append(rf_model)
        
        # Score all six models together (three SVCs followed by three forests).
        ensemble_acc = eval_ensemble(models=svc_models + rf_models, X_test=X_test, y_test=y_test)

Unnamed: 0,n_estimators,max_depth,kernel,y,X,X_train,y_test,y_train,X_test,C,model,models,accuracy
0,,,poly,...,...,...,...,...,...,1.0,SVC(kernel='poly'),,0.82
1,,,rbf,...,...,...,...,...,...,1.0,SVC(),,0.915
2,5,5.0,,...,...,...,...,...,...,,"(DecisionTreeClassifier(max_depth=5, max_featu...",,0.9
3,,,poly,...,...,...,...,...,...,1.0,SVC(kernel='poly'),,0.835
4,5,5.0,,...,...,...,...,...,...,,"(DecisionTreeClassifier(max_depth=5, max_featu...",,0.88
5,20,5.0,,...,...,...,...,...,...,,"(DecisionTreeClassifier(max_depth=5, max_featu...",,0.9
6,,,linear,...,...,...,...,...,...,1.0,SVC(kernel='linear'),,0.82
7,"ValueCollection([20, 10, 5])",5.0,"ValueCollection(['linear', 'rbf', 'poly'])",...,...,...,...,...,...,1.0,"ValueCollection([SVC(), RandomForestClassifier...","[SVC(kernel='linear'), SVC(), SVC(kernel='poly...",0.9
8,"ValueCollection([20, 10, 5])",5.0,"ValueCollection(['linear', 'rbf', 'poly'])",...,...,...,...,...,...,1.0,ValueCollection([RandomForestClassifier(max_de...,"[SVC(kernel='linear'), SVC(), SVC(kernel='poly...",0.89
9,20,5.0,,...,...,...,...,...,...,,"(DecisionTreeClassifier(max_depth=5, max_featu...",,0.9
