In [1]:
from mandala.imports import *
from typing import List, Tuple
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification, load_digits
from pathlib import Path
import numpy as np
from numpy import ndarray
Config.enable_ref_magics = True
Config.warnings = False

OUTPUT_ROOT = Path().absolute() / "03_advanced.db"
OUTPUT_ROOT.unlink(missing_ok=True)

# storage = Storage()
# storage = Storage(db_path=OUTPUT_ROOT)
storage = Storage(db_path=OUTPUT_ROOT, deps_root=Path().absolute())

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
@op
def generate_data() -> Tuple[ndarray, ndarray]:
    return load_digits(n_class=2, return_X_y=True)

@op
def train_and_eval_tree(X, y, seed,
                        max_depth=1) -> Tuple[DecisionTreeClassifier, float]:
    tree = DecisionTreeClassifier(random_state=seed, 
                                  max_depth=max_depth,
                                  max_features=1).fit(X, y)
    return tree, round(accuracy_score(y_true=y, y_pred=tree.predict(X)), 2)
    
@op
def eval_forest(trees:List[DecisionTreeClassifier], X, y) -> float:
    majority_vote = np.array([tree.predict(X) for tree in trees]).mean(axis=0) >= 0.5
    return round(accuracy_score(y_true=y, y_pred=majority_vote), 2)


In [3]:
with storage.run(): # memoization context manager
    X, y = generate_data()
    trees = []
    for seed in range(10): # can't grow trees without seeds
        tree, acc = train_and_eval_tree(X, y, seed=seed)
        trees.append(tree)
    forest_acc = eval_forest(trees, X, y)
    print(forest_acc)


ValueRef(0.98, uid=f26...)


In [4]:
with storage.run(): 
    X, y = generate_data()
    for n_trees in (5, 10, 15):
        trees = []
        for seed in range(n_trees): 
            tree, acc = train_and_eval_tree(X, y, seed=seed)
            trees.append(tree)
        forest_acc = eval_forest(trees, X, y)
        print(forest_acc)

ValueRef(0.97, uid=ecd...)
ValueRef(in_memory=False, uid=f26...)
ValueRef(0.97, uid=ecd...)


In [5]:
with storage.run(): 
    X, y = generate_data()
    for n_trees in (5, 10, 15):
        trees = []
        for seed in range(n_trees): 
            tree, acc = train_and_eval_tree(X, y, seed=seed)
            if acc > 0.8:
                trees.append(tree)
        forest_acc = eval_forest(trees, X, y)
        print(forest_acc)

ValueRef(0.94, uid=dbb...)
ValueRef(0.99, uid=65e...)
ValueRef(0.96, uid=f7d...)


In [6]:
@superop
def train_forest(X, y, n_trees) -> List[DecisionTreeClassifier]:
    trees = []
    for i in range(n_trees):
        tree, acc = train_and_eval_tree(X, y, seed=i) 
        if acc > 0.8:
            trees.append(tree)
    return trees

In [7]:
with storage.run(): 
    X, y = generate_data()
    for n_trees in (5, 10, 15):
        trees = train_forest(X, y, n_trees)
        forest_acc = eval_forest(trees, X, y)
        print(forest_acc)

ValueRef(in_memory=False, uid=dbb...)
ValueRef(in_memory=False, uid=65e...)
ValueRef(in_memory=False, uid=f7d...)


In [8]:
with storage.run(): 
    X, y = generate_data()
    for n_trees in (5, 10, 15, 20):
        trees = train_forest(X, y, n_trees)
        forest_acc = eval_forest(trees, X, y)
        print(forest_acc)

ValueRef(in_memory=False, uid=dbb...)
ValueRef(in_memory=False, uid=65e...)
ValueRef(in_memory=False, uid=f7d...)
ValueRef(0.97, uid=ecd...)


In [9]:
with storage.query() as q: # context manager for declarative queries
    n_trees = Q() # a wildcard query variable
    X, y = generate_data() # copy-paste computational code
    trees = train_forest(X, y, n_trees)
    forest_acc = eval_forest(trees, X, y)
    df = q.get_table(n_trees.named('n_trees'),
                     forest_acc.named('forest_acc'))
df

Unnamed: 0,n_trees,forest_acc
0,10,0.99
1,5,0.94
2,15,0.96
3,20,0.97


In [10]:
with storage.run():
    X, y = generate_data()
    tree, acc = train_and_eval_tree(X, y, seed=2)
    with storage.query() as q: # contexts can be nested
        trees = Q([tree, ...]) # matches a list containing `tree`
        forest_acc = eval_forest(trees, X, y)
        df = q.get_table(trees.named('trees'), 
                         forest_acc.named('forest_acc'))
df

Unnamed: 0,trees,forest_acc
0,"[DecisionTreeClassifier(max_depth=1, max_featu...",0.98
1,"[DecisionTreeClassifier(max_depth=1, max_featu...",0.97
2,"[DecisionTreeClassifier(max_depth=1, max_featu...",0.97
3,"[DecisionTreeClassifier(max_depth=1, max_featu...",0.99
4,"[DecisionTreeClassifier(max_depth=1, max_featu...",0.94
5,"[DecisionTreeClassifier(max_depth=1, max_featu...",0.96
6,"[DecisionTreeClassifier(max_depth=1, max_featu...",0.97


In [11]:
@op
def train_and_eval_tree(X, y, seed, max_features=1,
                        max_depth=1) -> Tuple[DecisionTreeClassifier, float]:
    tree = DecisionTreeClassifier(random_state=seed, 
                                  max_depth=max_depth,
                                  max_features=max_features).fit(X, y)
    return tree, round(accuracy_score(y_true=y, y_pred=tree.predict(X)), 2)
    
@superop
def train_forest(X, y, n_trees, max_features=1) -> List[DecisionTreeClassifier]:
    trees = []
    for i in range(n_trees):
        tree, acc = train_and_eval_tree(X, y, seed=i, max_features=max_features) 
        if acc > 0.8:
            trees.append(tree)
    return trees

In [12]:
with storage.run(): 
    X, y = generate_data()
    for n_trees in (5, 10, 15, 20):
        for max_features in (1, 2):
            trees = train_forest(X, y, n_trees, max_features=max_features)
            forest_acc = eval_forest(trees, X, y)
            print(forest_acc)

FOUND CHANGES IN MODULE __main__:
  CHANGED dependency train_and_eval_tree for memoized functions [train_forest, train_and_eval_tree]
     @op
    [31m-def train_and_eval_tree(X, y, seed,[0m
    [32m+def train_and_eval_tree(X, y, seed, max_features=1,[0m
                             max_depth=1) -> Tuple[DecisionTreeClassifier, float]:
         tree = DecisionTreeClassifier(random_state=seed, 
                                       max_depth=max_depth,
    [31m-                                  max_features=1).fit(X, y)[0m
    [32m+                                  max_features=max_features).fit(X, y)[0m
         return tree, round(accuracy_score(y_true=y, y_pred=tree.predict(X)), 2)
Choose an action to apply to all functions: [i]gnore, [n]ew version, [a]bort? 
Choice: ignore for memoized functions [train_forest, train_and_eval_tree]
  CHANGED dependency train_forest for memoized functions [train_forest]
     @superop
    [31m-def train_forest(X, y, n_trees) -> List[DecisionTr

In [13]:
with storage.query() as q:
    n_trees = Q() 
    X, y = generate_data() 
    max_features = Q()
    trees = train_forest(X, y, n_trees, max_features=max_features)
    forest_acc = eval_forest(trees, X, y)
    df = q.get_table(n_trees.named('n_trees'), max_features.named('max_features'),
                     forest_acc.named('forest_acc'))
df

Unnamed: 0,n_trees,max_features,forest_acc
0,10,1,0.99
1,5,1,0.94
2,15,1,0.96
3,20,1,0.97
4,10,2,0.99
5,15,2,0.98
6,20,2,0.98
7,5,2,0.94


In [14]:
@superop
def train_forest(X, y, n_trees, max_features=1) -> List[DecisionTreeClassifier]:
    trees = []
    for i in range(n_trees):
        tree, acc = train_and_eval_tree(X, y, seed=i, max_features=max_features) 
        if acc > 0.9:
            trees.append(tree)
    return trees

In [15]:
with storage.run(): 
    X, y = generate_data()
    for n_trees in (5, 10, 15, 20):
        for max_features in (1, 2):
            trees = train_forest(X, y, n_trees, max_features=max_features)
            forest_acc = eval_forest(trees, X, y)
            print(forest_acc)

FOUND CHANGES IN MODULE __main__:
  CHANGED dependency train_forest for memoized functions [train_forest]
         for i in range(n_trees):
             tree, acc = train_and_eval_tree(X, y, seed=i, max_features=max_features) 
    [31m-        if acc > 0.8:[0m
    [32m+        if acc > 0.9:[0m
                 trees.append(tree)
         return trees
Choose an action to apply to all functions: [i]gnore, [n]ew version, [a]bort? 
Choice: new version for memoized functions [train_forest]
  Created new version of train_forest
ValueRef(in_memory=False, uid=dbb...)
ValueRef(in_memory=False, uid=dbb...)
ValueRef(in_memory=False, uid=65e...)
ValueRef(0.99, uid=65e...)
ValueRef(in_memory=False, uid=65e...)
ValueRef(0.99, uid=65e...)
ValueRef(in_memory=False, uid=65e...)
ValueRef(0.98, uid=f26...)
