In [None]:
if 'google.colab' in str(get_ipython()):
    !pip install -r https://raw.githubusercontent.com/abbbe/eye-on-stick/main/requirements.txt
    !git clone https://github.com/abbbe/eye-on-stick
    %cd eye-on-stick

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np

import os, urllib
import mlflow, git

from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.vec_env import VecNormalize
from stable_baselines import PPO2, SAC

import matplotlib.pyplot as plt
%matplotlib inline

from lib import eos
from lib.eos import EyeOnStickEnv

In [None]:
from PIL import Image, ImageDraw
from IPython import display
from io import BytesIO

def showarray(img_array):
    """Display an image array inline in the notebook output.

    The array is cast to ``uint8``, encoded as PNG into an in-memory buffer
    and handed to IPython's display machinery.

    Args:
        img_array: array-like image data accepted by ``PIL.Image.fromarray``
            once converted to ``uint8``.
    """
    pixels = np.uint8(img_array)
    png_stream = BytesIO()
    Image.fromarray(pixels).save(png_stream, 'png')
    png_image = display.Image(data=png_stream.getvalue())
    display.display(png_image)

In [None]:
# Record which code produced the results: the branch name and an abbreviated
# commit hash are later embedded in the MLflow run names.
with git.Repo() as repo:
    # Refuse to run from a working tree with uncommitted changes; otherwise
    # the recorded hash would not describe the code that actually ran.
    assert not repo.is_dirty()
    git_branch_name = repo.active_branch.name
    _head_sha = repo.head.object.hexsha
    # Ask git for an abbreviated form of the HEAD hash (requested length: 4).
    git_short_hash = repo.git.rev_parse(_head_sha, short=4)

In [None]:
# Experiment schedule used by run():
#   * a training session consists of N_ERAS eras, each logged as a nested MLflow run;
#   * every era first evaluates the current model for N_STEPS environment steps
#     and reports the collected metrics to MLflow;
#   * the model then trains via model.learn(N_LEARN_EPOCHS).
# NOTE(review): despite its name, N_LEARN_EPOCHS is passed straight to
# model.learn(), which presumably interprets it as a number of timesteps — confirm.
N_ERAS = 25
N_STEPS = 1000
N_LEARN_EPOCHS = 10000

# Default number of environment copies handed to make_vec_env() by run().
N_ENVS = 1

# Settings passed to EyeOnStickEnv and logged as MLflow params by run().
# The names suggest they weight the individual reward terms — see lib.eos.
params = {
    'REWARD_AIM_WEIGHT': 1,
    'REWARD_LEVEL_WEIGHT': 1,
    'REWARD_ACTION_WEIGHT': 0.1,
}

In [None]:
def run(n_joints, policy_class, model_name, N_ENVS=N_ENVS, name=None):
    """Train and evaluate one model on EyeOnStickEnv, tracking it in MLflow.

    A parent MLflow run is opened for the whole session. For each of the
    N_ERAS eras a nested child run is opened, the current model is evaluated
    for N_STEPS steps (rendering every frame inline), summary metrics are
    logged with ``step=era``, and the model then trains via
    ``model.learn(N_LEARN_EPOCHS)``. N_ERAS, N_STEPS, N_LEARN_EPOCHS and
    ``params`` are read from module globals.

    Args:
        n_joints: forwarded to ``EyeOnStickEnv`` and used as the run-name
            prefix (``'<n_joints>J'``).
        policy_class: class instantiated as
            ``policy_class(model_name, env, verbose=1, tensorboard_log=...)``;
            the notebook passes an algorithm class such as ``SAC``.
        model_name: first argument for ``policy_class``; the notebook passes
            a policy name such as ``'MlpPolicy'``.
        N_ENVS: number of environment copies given to ``make_vec_env``.
        name: optional suffix appended to the MLflow run name.

    Raises:
        FileExistsError: if the run's ``tensorboard_log`` directory already exists.
    """
    # `import urllib` alone does not guarantee that the `request` and `parse`
    # submodules are loaded; import them explicitly instead of relying on
    # some other package having imported them as a side effect.
    import urllib.parse
    import urllib.request

    run_name = f'{n_joints}J'
    if name is not None:
        run_name += f' {name}'

    env = make_vec_env(lambda: EyeOnStickEnv(n_joints, params), n_envs=N_ENVS)
    try:
        env = VecNormalize(env)

        with mlflow.start_run(run_name=run_name):
            # log params
            mlflow.log_param("policy_class", policy_class.__name__)
            mlflow.log_param("model_name", model_name)
            for key, value in params.items():
                mlflow.log_param(key, value)

            # arrange tensorboard logs inside this run's artifact directory
            artifact_path = urllib.parse.urlparse(mlflow.get_artifact_uri()).path
            mlflow_artifacts_dir = urllib.request.url2pathname(artifact_path)
            tensorboard_logdir = os.path.join(mlflow_artifacts_dir, "tensorboard_log")
            # exist_ok=False: fail instead of mixing logs from different runs
            os.makedirs(tensorboard_logdir, exist_ok=False)

            model = policy_class(model_name, env, verbose=1, tensorboard_log=tensorboard_logdir)

            for era in range(N_ERAS):
                with mlflow.start_run(run_name=f'era={era}', nested=True):
                    all_alphas, all_rewards = [], []

                    obs = env.reset()

                    # Evaluate the current model before it trains in this era.
                    for _ in range(N_STEPS):
                        display.clear_output(wait=True)
                        showarray(env.render(mode='rgb_array'))

                        actions, _ = model.predict(obs, deterministic=True)
                        obs, rewards, _dones, infos = env.step(actions)

                        all_alphas.append([info['alpha'] for info in infos])
                        # NOTE(review): rewards come from the VecNormalize wrapper,
                        # so they are presumably normalized rather than raw — confirm.
                        all_rewards.append(rewards)

                    era_metrics = {
                        "alpha_mean": np.mean(all_alphas),
                        "alpha_std": np.std(all_alphas),
                        "reward_total": np.sum(all_rewards),
                        "reward_mean": np.mean(all_rewards),
                        "reward_std": np.std(all_rewards),
                    }
                    for metric_name, metric_value in era_metrics.items():
                        mlflow.log_metric(key=metric_name, value=metric_value, step=era)

                    model.learn(N_LEARN_EPOCHS)
    finally:
        # Always release the environment, even if the run is interrupted or fails.
        env.close()

In [None]:
# Repeat the same 4-joint SAC experiment five times; each repetition becomes
# its own MLflow parent run labelled with the git branch and commit hash.
run_label = f'{git_branch_name} {git_short_hash}'
for _ in range(5):
    run(n_joints=4, policy_class=SAC, model_name='MlpPolicy', name=run_label)