In [None]:
# if you are running this notebook on Google Colab, uncomment the following lines
# !pip install git+https://github.com/amakelov/mandala
# !pip install scikit-learn

In [None]:
from mandala.imports import *
from typing import List, Tuple
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification, load_digits
from pathlib import Path
import numpy as np
from numpy import ndarray
# Global mandala settings for this notebook.
# NOTE(review): `enable_ref_magics` presumably lets values returned by ops be used
# directly in Python expressions (e.g. `acc > 0.8` further down) — confirm.
Config.enable_ref_magics = True
Config.warnings = False

# Database file in the current working directory. Any file left over from a
# previous run is deleted, so the storage below always starts out empty.
OUTPUT_ROOT = Path().absolute() / "03_advanced.db"
OUTPUT_ROOT.unlink(missing_ok=True)

# Alternative storage configurations, kept for reference:
# storage = Storage()
# storage = Storage(db_path=OUTPUT_ROOT)
# Active configuration: storage backed by OUTPUT_ROOT, with `deps_root` set to the
# current working directory.
# NOTE(review): `deps_root` presumably scopes dependency tracking to code under
# that directory — confirm against the mandala documentation.
storage = Storage(db_path=OUTPUT_ROOT, deps_root=Path().absolute())

In [None]:
@op
def generate_data() -> Tuple[ndarray, ndarray]:
    """Load the two-class subset of the scikit-learn digits dataset.

    Returns:
        A `(features, labels)` pair, exactly as produced by
        `load_digits(n_class=2, return_X_y=True)`.
    """
    features, labels = load_digits(n_class=2, return_X_y=True)
    return features, labels

@op
def train_and_eval_tree(X, y, seed,
                        max_depth=1) -> Tuple[DecisionTreeClassifier, float]:
    """Fit one decision tree on (X, y) and score it on that same data.

    Args:
        X: feature matrix used for both fitting and scoring.
        y: labels matching `X`.
        seed: passed to the classifier as `random_state`.
        max_depth: maximum depth of the tree.

    Returns:
        The fitted classifier and its accuracy on (X, y), rounded to two decimals.
    """
    # Each split considers a single feature (max_features=1).
    classifier = DecisionTreeClassifier(
        random_state=seed,
        max_depth=max_depth,
        max_features=1,
    )
    classifier.fit(X, y)
    # Accuracy is measured on the training data itself, not on a held-out set.
    train_accuracy = accuracy_score(y_true=y, y_pred=classifier.predict(X))
    return classifier, round(train_accuracy, 2)
    
@op
def eval_forest(trees:List[DecisionTreeClassifier], X, y) -> float:
    """Score the majority vote of a collection of fitted trees on (X, y).

    The per-tree predictions are averaged sample by sample and thresholded at
    0.5, so an exact tie is resolved in favour of the positive class.
    NOTE(review): this voting rule assumes binary labels encoded as 0/1 — confirm
    before using it with other label encodings.

    Args:
        trees: fitted classifiers that make up the forest; must be non-empty.
        X: feature matrix to predict on.
        y: true labels matching `X`.

    Returns:
        Accuracy of the majority vote, rounded to two decimals.

    Raises:
        ValueError: if `trees` is empty.
    """
    # An empty forest is reachable when an upstream accuracy filter rejects every
    # tree. Without this guard the mean below would be NaN (with a RuntimeWarning)
    # and scikit-learn would fail with an unrelated-looking error.
    if len(trees) == 0:
        raise ValueError(
            "eval_forest needs at least one tree, but got an empty forest"
        )
    votes = np.array([tree.predict(X) for tree in trees])
    majority_vote = votes.mean(axis=0) >= 0.5
    return round(accuracy_score(y_true=y, y_pred=majority_vote), 2)


In [None]:
# Baseline workflow: train ten trees (seeds 0-9) on the same data, then score the
# majority vote of all of them.
with storage.run(): # memoization context manager
    X, y = generate_data()
    trees = []
    for seed in range(10): # can't grow trees without seeds
        # The per-tree accuracy `acc` is not used in this cell.
        tree, acc = train_and_eval_tree(X, y, seed=seed)
        trees.append(tree)
    forest_acc = eval_forest(trees, X, y)
    print(forest_acc)


In [None]:
# Same workflow for three forest sizes. Seeds always start at 0, so the
# (X, y, seed) calls made for a smaller forest are repeated for every larger one.
# NOTE(review): inside storage.run() those repeated calls are presumably looked up
# rather than recomputed — confirm against mandala's memoization semantics.
with storage.run(): 
    X, y = generate_data()
    for n_trees in (5, 10, 15):
        # Start a fresh forest for each size.
        trees = []
        for seed in range(n_trees): 
            tree, acc = train_and_eval_tree(X, y, seed=seed)
            trees.append(tree)
        forest_acc = eval_forest(trees, X, y)
        print(forest_acc)

In [None]:
# Variant of the forest-size loop that only keeps trees whose accuracy on (X, y)
# exceeds 0.8, so a forest may end up with fewer than `n_trees` members.
with storage.run(): 
    X, y = generate_data()
    for n_trees in (5, 10, 15):
        trees = []
        for seed in range(n_trees): 
            tree, acc = train_and_eval_tree(X, y, seed=seed)
            # Control flow depends on a value computed by an op.
            if acc > 0.8:
                trees.append(tree)
        # NOTE: if no tree passes the filter, `trees` is empty here.
        forest_acc = eval_forest(trees, X, y)
        print(forest_acc)

In [None]:
@superop
def train_forest(X, y, n_trees) -> List[DecisionTreeClassifier]:
    """Train `n_trees` trees and keep the ones that are accurate enough.

    One tree is trained per seed in `range(n_trees)` via `train_and_eval_tree`.
    A tree is kept only if its reported accuracy is strictly greater than 0.8.

    Args:
        X: feature matrix forwarded to `train_and_eval_tree`.
        y: labels forwarded to `train_and_eval_tree`.
        n_trees: number of candidate trees (seeds 0 .. n_trees - 1).

    Returns:
        The accepted trees, in seed order; possibly fewer than `n_trees`.
    """
    accepted = []
    for seed in range(n_trees):
        candidate, train_acc = train_and_eval_tree(X, y, seed=seed)
        if not (train_acc > 0.8):
            # Below the accuracy bar: leave this tree out of the forest.
            continue
        accepted.append(candidate)
    return accepted

In [None]:
# The filtered-forest workflow again, with the inner training loop replaced by a
# single call to the `train_forest` superop.
with storage.run(): 
    X, y = generate_data()
    for n_trees in (5, 10, 15):
        trees = train_forest(X, y, n_trees)
        forest_acc = eval_forest(trees, X, y)
        print(forest_acc)

In [None]:
# Same superop-based workflow, extended with one additional forest size (20).
# The sizes 5, 10 and 15 repeat calls already made in the previous cell.
with storage.run(): 
    X, y = generate_data()
    for n_trees in (5, 10, 15, 20):
        trees = train_forest(X, y, n_trees)
        forest_acc = eval_forest(trees, X, y)
        print(forest_acc)

In [None]:
# Declarative query: the calls below have the same shape as the computational
# code, but `n_trees` is a wildcard instead of a concrete value.
with storage.query() as q: # context manager for declarative queries
    n_trees = Q() # a wildcard query variable
    X, y = generate_data() # copy-paste computational code
    trees = train_forest(X, y, n_trees)
    forest_acc = eval_forest(trees, X, y)
    # Request a table with one column per named variable.
    # NOTE(review): `df` is presumably a pandas DataFrame — confirm.
    df = q.get_table(n_trees.named('n_trees'),
                     forest_acc.named('forest_acc'))
# Bare expression: displayed as the cell's output.
df

In [None]:
# Mix the two modes: obtain one concrete tree (seed=2) in a run context, then
# query for forest evaluations whose list of trees contains it.
with storage.run():
    X, y = generate_data()
    tree, acc = train_and_eval_tree(X, y, seed=2)
    with storage.query() as q: # contexts can be nested
        trees = Q([tree, ...]) # matches a list containing `tree`
        forest_acc = eval_forest(trees, X, y)
        # Request a table with one column per named variable.
        df = q.get_table(trees.named('trees'), 
                         forest_acc.named('forest_acc'))
# Bare expression: displayed as the cell's output.
df

In [None]:
@op
def train_and_eval_tree(X, y, seed, max_features=1,
                        max_depth=1) -> Tuple[DecisionTreeClassifier, float]:
    """Fit one decision tree on (X, y) and score it on that same data.

    Args:
        X: feature matrix used for both fitting and scoring.
        y: labels matching `X`.
        seed: passed to the classifier as `random_state`.
        max_features: number of features considered per split; defaults to 1.
        max_depth: maximum depth of the tree.

    Returns:
        The fitted classifier and its accuracy on (X, y), rounded to two decimals.
    """
    classifier = DecisionTreeClassifier(
        random_state=seed,
        max_depth=max_depth,
        max_features=max_features,
    )
    classifier.fit(X, y)
    # Accuracy is measured on the training data itself, not on a held-out set.
    train_accuracy = accuracy_score(y_true=y, y_pred=classifier.predict(X))
    return classifier, round(train_accuracy, 2)
    
@superop
def train_forest(X, y, n_trees, max_features=1) -> List[DecisionTreeClassifier]:
    """Train `n_trees` trees and keep the ones that are accurate enough.

    One tree is trained per seed in `range(n_trees)` via `train_and_eval_tree`.
    A tree is kept only if its reported accuracy is strictly greater than 0.8.

    Args:
        X: feature matrix forwarded to `train_and_eval_tree`.
        y: labels forwarded to `train_and_eval_tree`.
        n_trees: number of candidate trees (seeds 0 .. n_trees - 1).
        max_features: forwarded unchanged to `train_and_eval_tree`.

    Returns:
        The accepted trees, in seed order; possibly fewer than `n_trees`.
    """
    accepted = []
    for seed in range(n_trees):
        candidate, train_acc = train_and_eval_tree(
            X, y, seed=seed, max_features=max_features
        )
        if not (train_acc > 0.8):
            # Below the accuracy bar: leave this tree out of the forest.
            continue
        accepted.append(candidate)
    return accepted

In [None]:
# Grid over forest size and `max_features`: one forest is trained and evaluated
# for each of the 4 x 2 combinations.
with storage.run(): 
    X, y = generate_data()
    for n_trees in (5, 10, 15, 20):
        for max_features in (1, 2):
            trees = train_forest(X, y, n_trees, max_features=max_features)
            forest_acc = eval_forest(trees, X, y)
            print(forest_acc)

In [None]:
# Query over the grid: both `n_trees` and `max_features` are wildcards, and the
# calls mirror the computational code of the grid cell.
with storage.query() as q:
    n_trees = Q() 
    X, y = generate_data() 
    max_features = Q()
    trees = train_forest(X, y, n_trees, max_features=max_features)
    forest_acc = eval_forest(trees, X, y)
    # Request a table with one column per named variable.
    df = q.get_table(n_trees.named('n_trees'), max_features.named('max_features'),
                     forest_acc.named('forest_acc'))
# Bare expression: displayed as the cell's output.
df

In [None]:
@superop
def train_forest(X, y, n_trees, max_features=1) -> List[DecisionTreeClassifier]:
    """Train `n_trees` trees and keep the ones that are accurate enough.

    One tree is trained per seed in `range(n_trees)` via `train_and_eval_tree`.
    A tree is kept only if its reported accuracy is strictly greater than 0.9.

    Args:
        X: feature matrix forwarded to `train_and_eval_tree`.
        y: labels forwarded to `train_and_eval_tree`.
        n_trees: number of candidate trees (seeds 0 .. n_trees - 1).
        max_features: forwarded unchanged to `train_and_eval_tree`.

    Returns:
        The accepted trees, in seed order; possibly fewer than `n_trees`.
    """
    accepted = []
    for seed in range(n_trees):
        candidate, train_acc = train_and_eval_tree(
            X, y, seed=seed, max_features=max_features
        )
        if not (train_acc > 0.9):
            # Below the accuracy bar: leave this tree out of the forest.
            continue
        accepted.append(candidate)
    return accepted

In [None]:
# Re-run of the (n_trees, max_features) grid. The code is unchanged; what differs
# is that `train_forest` was redefined just above with a 0.9 accuracy threshold.
with storage.run(): 
    X, y = generate_data()
    for n_trees in (5, 10, 15, 20):
        for max_features in (1, 2):
            trees = train_forest(X, y, n_trees, max_features=max_features)
            forest_acc = eval_forest(trees, X, y)
            print(forest_acc)