In [None]:
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    # run this cell ONLY if you are running this in Google Colab
    !pip install git+https://github.com/amakelov/mandala
    !pip install scikit-learn

In [None]:
from mandala.imports import *
from typing import List, Tuple
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification, load_digits
from pathlib import Path
import numpy as np
from numpy import ndarray
# mandala configuration used by the rest of the notebook
Config.warnings = False
Config.enable_ref_magics = True

# Database file lives in the current working directory; remove any copy left
# over from an earlier run so the notebook starts from an empty storage.
OUTPUT_ROOT = Path.cwd() / "03_advanced.db"
try:
    OUTPUT_ROOT.unlink()
except FileNotFoundError:
    pass

# Alternative storage setups:
# storage = Storage() # use this for an in-memory storage without dependency tracking
# storage = Storage(db_path=OUTPUT_ROOT) # use this for a persistent storage without dependency tracking
storage = Storage(db_path=OUTPUT_ROOT, deps_path='__main__')

In [None]:
@op
def generate_data() -> Tuple[ndarray, ndarray]:
    """Load the two-class subset of the scikit-learn digits dataset.

    Returns:
        The ``(X, y)`` pair produced by ``load_digits(n_class=2, return_X_y=True)``.
    """
    features, labels = load_digits(n_class=2, return_X_y=True)
    return features, labels

# Fit a single decision tree on (X, y) and score it on that same data.
#   X, y      -- passed straight to `DecisionTreeClassifier.fit` / `.predict`
#   seed      -- forwarded as the classifier's `random_state`
#   max_depth -- forwarded as the classifier's `max_depth` (default 1)
# Returns the fitted tree and its accuracy on (X, y) -- i.e. on the data it was
# fit on, not on a held-out set -- rounded to 2 decimals.
# NOTE(review): this op is redefined later in the notebook to demonstrate
# versioning, and the storage is created with `deps_path='__main__'`. Presumably
# mandala compares the function's source between definitions, so these notes sit
# above the decorator and the body is left untouched -- confirm before editing it.
@op
def train_and_eval_tree(X, y, seed,
                        max_depth=1) -> Tuple[DecisionTreeClassifier, float]:
    tree = DecisionTreeClassifier(random_state=seed, 
                                  max_depth=max_depth,
                                  max_features=1).fit(X, y)
    return tree, round(accuracy_score(y_true=y, y_pred=tree.predict(X)), 2)
    
@op
def eval_forest(trees:List[DecisionTreeClassifier], X, y) -> float:
    """Score a majority-vote ensemble of fitted trees on ``(X, y)``.

    Each tree predicts on ``X``; the per-sample mean of those predictions is
    thresholded at 0.5, so an exact tie counts as a positive vote. The result
    is compared against ``y`` with ``accuracy_score``.

    Args:
        trees: fitted classifiers; must contain at least one element.
        X: samples passed to every tree's ``predict``.
        y: ground-truth labels. The mean/threshold vote assumes labels are
            0/1, as produced by the two-class digits data used here.

    Returns:
        Ensemble accuracy rounded to 2 decimals.

    Raises:
        ValueError: if ``trees`` is empty.
    """
    # An empty forest would yield a NaN mean and a scalar `False` "prediction",
    # which then fails deep inside scikit-learn with an unrelated-looking error.
    if len(trees) == 0:
        raise ValueError("eval_forest requires at least one tree; got an empty list")
    votes = np.array([tree.predict(X) for tree in trees])
    majority_vote = votes.mean(axis=0) >= 0.5
    return round(accuracy_score(y_true=y, y_pred=majority_vote), 2)


In [None]:
with storage.run():  # memoization context manager
    X, y = generate_data()
    # one tree per seed (0..9); only the fitted model is kept, the accuracy is dropped
    results = (train_and_eval_tree(X, y, seed=seed) for seed in range(10))
    trees = [tree for tree, _acc in results]
    forest_acc = eval_forest(trees, X, y)
    print(forest_acc)

In [None]:
with storage.run():
    X, y = generate_data()
    # evaluate forests of increasing size; seeds 0..n_trees-1 are used for each
    for n_trees in (5, 10, 15):
        trees = [
            tree
            for tree, _acc in (
                train_and_eval_tree(X, y, seed=seed) for seed in range(n_trees)
            )
        ]
        forest_acc = eval_forest(trees, X, y)
        print(forest_acc)

In [None]:
with storage.run():
    X, y = generate_data()
    for n_trees in (5, 10, 15):
        # train one tree per seed and keep only those whose accuracy exceeds 0.8
        candidates = (train_and_eval_tree(X, y, seed=seed) for seed in range(n_trees))
        trees = [tree for tree, acc in candidates if acc > 0.8]
        forest_acc = eval_forest(trees, X, y)
        print(forest_acc)

In [None]:
@superop
def train_forest(X, y, n_trees) -> List[DecisionTreeClassifier]:
    """Train ``n_trees`` trees (seeds ``0..n_trees-1``) and keep the accurate ones.

    A tree is kept only when the accuracy reported by ``train_and_eval_tree``
    is strictly greater than 0.8, so the returned list may hold fewer than
    ``n_trees`` elements.
    """
    kept = []
    for seed in range(n_trees):
        candidate, score = train_and_eval_tree(X, y, seed=seed)
        if score > 0.8:
            kept.append(candidate)
    return kept

In [None]:
with storage.run():
    X, y = generate_data()
    # same experiment as before, with training + filtering wrapped in `train_forest`
    for n_trees in (5, 10, 15):
        forest = train_forest(X, y, n_trees)
        forest_acc = eval_forest(forest, X, y)
        print(forest_acc)

In [None]:
with storage.run():
    X, y = generate_data()
    # extends the previous sweep with one additional forest size (20)
    for n_trees in (5, 10, 15, 20):
        forest_acc = eval_forest(train_forest(X, y, n_trees), X, y)
        print(forest_acc)

In [None]:
with storage.run():
    X, y = generate_data()
    # nothing is printed here; `forest_acc` keeps the value from the last size (10)
    for n_trees in (5, 10):
        forest = train_forest(X, y, n_trees)
        forest_acc = eval_forest(forest, X, y)

In [None]:
# `forest_acc` is whatever the most recently executed cell left bound to that name.
# NOTE(review): `similar(..., context=True)` presumably lists stored results that
# were computed the same way as `forest_acc`, together with the values they
# depend on -- confirm against the mandala documentation.
storage.similar(forest_acc, context=True)

In [None]:
with storage.run():
    X, y = generate_data()
    for n_trees in (10, 15, 20):
        trees = train_forest(X, y, n_trees)
        half = n_trees // 2
        # evaluate only the first `n_trees // 2` elements of the returned list
        forest_acc = eval_forest(trees[:half], X, y)

In [None]:
# NOTE(review): presumably renders the graph of calls and values that led to
# `forest_acc` (as left by the last executed cell) -- confirm against mandala docs.
storage.draw_graph(forest_acc)

In [None]:
# Same call as the plain `draw_graph(forest_acc)`, but with `project=True`.
# NOTE(review): presumably this draws a condensed ("projected") version of the
# graph -- confirm against mandala docs.
storage.draw_graph(forest_acc, project=True)

In [None]:
# NOTE(review): presumably the textual counterpart of `draw_graph(..., project=True)`,
# printing the projected graph for `forest_acc` instead of drawing it -- confirm.
storage.print_graph(forest_acc, project=True)

In [None]:
# Declarative query over the stored computations: the statements below build a
# pattern out of placeholders (`Q()`, `ListQ`) and op calls rather than computing
# anything. NOTE(review): exact matching semantics are defined by mandala -- confirm.
with storage.query():
    idx = Q() # index into list
    X, y = generate_data()
    n_trees = Q() # input to computation; can match anything
    trees = train_forest(X=X, y=y, n_trees=n_trees)
    a0 = trees[idx] # a0 will match any element of a match for trees at an index matching idx
    a1 = ListQ(elts=[a0], idxs=[idx]) # a1 will match any list containing a match for a0 at index idx
    forest_acc = eval_forest(trees=a1, X=X, y=y)
# request the matches for the `n_trees` and `forest_acc` placeholders
storage.df(n_trees, forest_acc)

In [None]:
@superop
def train_forest(X, y, n_trees, threshold = 0.8) -> List[DecisionTreeClassifier]:
    """Train ``n_trees`` trees (seeds ``0..n_trees-1``) and keep the accurate ones.

    A tree is kept only when the accuracy reported by ``train_and_eval_tree``
    is strictly greater than ``threshold`` (default 0.8), so the returned list
    may hold fewer than ``n_trees`` elements.
    """
    kept = []
    for seed in range(n_trees):
        candidate, score = train_and_eval_tree(X, y, seed=seed)
        if score > threshold:
            kept.append(candidate)
    return kept

In [None]:
with storage.run():
    X, y = generate_data()
    # `threshold` is not passed, so `train_forest` uses its default of 0.8
    for n_trees in (5, 10, 15, 20):
        forest = train_forest(X, y, n_trees)
        forest_acc = eval_forest(forest, X, y)
        print(forest_acc)

In [None]:
with storage.run():
    X, y = generate_data()
    # same sweep, now keeping every tree whose accuracy exceeds 0.5
    for n_trees in (5, 10, 15, 20):
        forest_acc = eval_forest(train_forest(X, y, n_trees, threshold=0.5), X, y)
        print(forest_acc)

In [None]:
# notice we changed `max_features` to 2
# Redefinition of the `train_and_eval_tree` op: identical to the earlier
# definition except for `max_features=2` in the classifier constructor.
# Fits one tree on (X, y) and returns it with its accuracy on that same data,
# rounded to 2 decimals.
@op
def train_and_eval_tree(X, y, seed,
                        max_depth=1) -> Tuple[DecisionTreeClassifier, float]:
    tree = DecisionTreeClassifier(random_state=seed, 
                                  max_depth=max_depth,
                                  max_features=2).fit(X, y)
    return tree, round(accuracy_score(y_true=y, y_pred=tree.predict(X)), 2)
    

In [None]:
with storage.run():
    X, y = generate_data()
    # re-run the threshold=0.5 sweep with the currently defined `train_and_eval_tree`
    for n_trees in (10, 15, 20):
        forest = train_forest(X, y, n_trees, threshold=0.5)
        forest_acc = eval_forest(forest, X, y)
        print(forest_acc)

In [None]:
# now we change it back to 1 - the old memoized calls are used!
# This definition restores `max_features=1`, matching the first definition of
# `train_and_eval_tree` in this notebook.
# NOTE(review): reuse of the old calls presumably relies on this body matching
# the first definition, so keep the two in sync -- confirm against mandala docs.
@op
def train_and_eval_tree(X, y, seed,
                        max_depth=1) -> Tuple[DecisionTreeClassifier, float]:
    tree = DecisionTreeClassifier(random_state=seed, 
                                  max_depth=max_depth,
                                  max_features=1).fit(X, y)
    return tree, round(accuracy_score(y_true=y, y_pred=tree.predict(X)), 2)


with storage.run():
    X, y = generate_data()
    # same threshold=0.5 sweep as before, now against the reverted op definition
    for n_trees in (10, 15, 20):
        forest_acc = eval_forest(train_forest(X, y, n_trees, threshold=0.5), X, y)
        print(forest_acc)

In [None]:
# look at the versions of a single dependency
# (here the `train_and_eval_tree` op, which this notebook defines three times:
# with `max_features=1`, then `2`, then `1` again)
storage.sources(train_and_eval_tree)