In [1]:
import minari
import gymnasium as gym
import numpy as np
from pyoperon.sklearn import SymbolicRegressor
from sklearn.preprocessing import StandardScaler

import os
os.environ['MUJOCO_GL']='egl'

In [94]:
dataset = minari.load_dataset("Minimal-Hopper-Expert-v5")
episode = dataset[0]


def _fit_segment(start, stop):
    """Build scaled inputs and per-dimension targets for one time window.

    The same ``start:stop`` slice is applied to the episode's observations
    and actions, so the feature matrix and every target vector are guaranteed
    to have the same number of rows.

    Args:
        start: First time step of the window (inclusive).
        stop: Last time step of the window (exclusive).

    Returns:
        A tuple ``(scaler, X, y_0, y_1, y_2)``: the ``StandardScaler`` fitted
        on the window's observations, the scaled observations, and the three
        action columns for the same rows.

    Raises:
        AssertionError: If the sliced observations and actions differ in
            length (e.g. the window runs past the end of the episode).
    """
    window = slice(start, stop)
    observations = episode.observations[window]
    actions = episode.actions[window]
    assert len(observations) == len(actions), (
        f"window {start}:{stop} yields {len(observations)} observations "
        f"but {len(actions)} actions"
    )
    scaler = StandardScaler()
    X = scaler.fit_transform(observations)
    return scaler, X, actions[:, 0], actions[:, 1], actions[:, 2]


# Overlapping windows of the expert episode; one scaler and one set of
# targets per window.
scaler0, X0, y0_0, y0_1, y0_2 = _fit_segment(0, 88)
scaler1, X1, y1_0, y1_1, y1_2 = _fit_segment(70, 140)
scaler2, X2, y2_0, y2_1, y2_2 = _fit_segment(135, 180)
# NOTE(review): this window used to slice observations as 160:250 but the
# targets as 160:200, which gave X3 and y3_* different row counts. It now uses
# the range both slices had in common; change the stop value here if the
# longer window was the intended one.
scaler3, X3, y3_0, y3_1, y3_2 = _fit_segment(160, 200)

In [72]:
import time

past = time.time()

# Segment 0: one identically seeded regressor per action column.
i0_action_0 = SymbolicRegressor(random_state=24012000)
i0_action_1 = SymbolicRegressor(random_state=24012000)
i0_action_2 = SymbolicRegressor(random_state=24012000)

# Fit the regressors in order on the shared inputs, reporting progress
# after each one finishes.
for progress_tag, regressor, target in (
    ("S1", i0_action_0, y0_0),
    ("S2", i0_action_1, y0_1),
    ("S3", i0_action_2, y0_2),
):
    regressor.fit(X0, target)
    print(progress_tag)

# Wall-clock seconds spent on the three fits.
now = time.time()
print(now-past)

S1
S2
S3
14.880643606185913


In [73]:
past = time.time()

# Segment 1: one identically seeded regressor per action column.
i1_action_0 = SymbolicRegressor(random_state=24012000)
i1_action_1 = SymbolicRegressor(random_state=24012000)
i1_action_2 = SymbolicRegressor(random_state=24012000)

# Fit the regressors in order on the shared inputs, reporting progress
# after each one finishes.
for progress_tag, regressor, target in (
    ("S1", i1_action_0, y1_0),
    ("S2", i1_action_1, y1_1),
    ("S3", i1_action_2, y1_2),
):
    regressor.fit(X1, target)
    print(progress_tag)

# Wall-clock seconds spent on the three fits.
now = time.time()
print(now-past)

S1
S2
S3
14.938016176223755


In [76]:
past = time.time()

# Segment 2: one identically seeded regressor per action column.
i2_action_0 = SymbolicRegressor(random_state=24012000)
i2_action_1 = SymbolicRegressor(random_state=24012000)
i2_action_2 = SymbolicRegressor(random_state=24012000)

# Fit the regressors in order on the shared inputs, reporting progress
# after each one finishes.
for progress_tag, regressor, target in (
    ("S1", i2_action_0, y2_0),
    ("S2", i2_action_1, y2_1),
    ("S3", i2_action_2, y2_2),
):
    regressor.fit(X2, target)
    print(progress_tag)

# Wall-clock seconds spent on the three fits.
now = time.time()
print(now-past)

S1
S2
S3
14.809717178344727


In [95]:
past = time.time()

# Segment 3: one identically seeded regressor per action column.
i3_action_0 = SymbolicRegressor(random_state=24012000)
i3_action_1 = SymbolicRegressor(random_state=24012000)
i3_action_2 = SymbolicRegressor(random_state=24012000)

# Fit the regressors in order on the shared inputs, reporting progress
# after each one finishes.
for progress_tag, regressor, target in (
    ("S1", i3_action_0, y3_0),
    ("S2", i3_action_1, y3_1),
    ("S3", i3_action_2, y3_2),
):
    regressor.fit(X3, target)
    print(progress_tag)

# Wall-clock seconds spent on the three fits.
now = time.time()
print(now-past)

S1
S2
S3
14.879687786102295


In [96]:
def _controller_for_step(t):
    """Return ``(scaler, regressors)`` for the controller active at step ``t``.

    Schedule: steps 0-87 use segment 0, 88-140 segment 1, 141-170 segment 2,
    and every later step uses segment 3.
    """
    if t < 88:
        return scaler0, (i0_action_0, i0_action_1, i0_action_2)
    if t <= 140:
        return scaler1, (i1_action_0, i1_action_1, i1_action_2)
    if t <= 170:
        return scaler2, (i2_action_0, i2_action_1, i2_action_2)
    return scaler3, (i3_action_0, i3_action_1, i3_action_2)


def benchmark_models(env_name, seed):
    """Roll out the fitted per-segment regressors in a Gymnasium environment.

    At every step the observation is scaled with the active segment's scaler,
    each of that segment's regressors predicts one action component, and the
    action is clipped to [-1, 1] before being applied. The observation, the
    predicted action and (while available) the recorded episode's action for
    the same step are printed for side-by-side comparison. Every episode is
    recorded to the ``videos`` directory.

    Args:
        env_name: Environment id passed to ``gym.make``.
        seed: Seed passed to ``env.reset``.

    Returns:
        The sum of rewards collected until the episode terminated or was
        truncated.
    """
    env = gym.make(env_name, render_mode='rgb_array')
    env = gym.wrappers.RecordVideo(env, "videos", episode_trigger=lambda x: True)

    total_reward = 0.0
    # Close the environment even if a step fails, so it is always released.
    try:
        obs = env.reset(seed=seed)[0]

        action = np.zeros(env.action_space.shape[0])
        terminated, truncated = False, False
        t = 0

        while not (terminated or truncated):
            scaler, regressors = _controller_for_step(t)
            # Scale once per step; all regressors of a segment share the input.
            scaled_obs = scaler.transform(obs.reshape(1, -1))
            for component, regressor in enumerate(regressors):
                action[component] = regressor.predict(scaled_obs)[0]

            action = np.clip(action, -1, 1)
            print("      OBS:", [round(o, 2) for o in obs])
            print("   OPERON: ", [round(a, 2) for a in action])
            # The rollout can outlast the recorded episode; only index the
            # reference actions while they exist.
            if t < len(episode.actions):
                print("     TRUE: ", [round(a, 2) for a in episode.actions[t]])
            else:
                print("     TRUE: ", "n/a")
            print()

            obs, reward, terminated, truncated, _ = env.step(action)

            t += 1
            total_reward += reward

            print(t)

        print(f"Total reward: {total_reward:.3f}")
    finally:
        env.close()

    return total_reward

In [97]:
# Run one rollout of the fitted controllers on Hopper-v5 with a fixed reset seed.
benchmark_models("Hopper-v5", seed=24012000)

  logger.warn(


      OBS: [1.25, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, 0.0, 0.0, -0.0, -0.0]
   OPERON:  [-1.0, 0.01, 0.88]
     TRUE:  [-1.0, 0.99, 1.0]

1
      OBS: [1.25, -0.0, -0.01, -0.0, 0.0, -0.04, -0.11, -1.18, -1.48, 0.07, 1.28]
   OPERON:  [-0.66, -0.13, 0.98]
     TRUE:  [-0.25, -0.99, 1.0]

2
      OBS: [1.25, -0.01, -0.02, -0.0, 0.02, -0.08, -0.21, -2.02, -2.43, -0.09, 2.7]
   OPERON:  [0.61, -0.22, 1.0]
     TRUE:  [-0.34, -1.0, 1.0]

3
      OBS: [1.24, -0.03, -0.04, -0.01, 0.05, -0.04, -0.32, -1.4, -1.51, -0.53, 4.12]
   OPERON:  [0.04, -0.38, 1.0]
     TRUE:  [-0.24, -1.0, 1.0]

4
      OBS: [1.24, -0.04, -0.05, -0.01, 0.09, -0.06, -0.43, -1.54, -1.4, -1.11, 5.54]
   OPERON:  [0.34, -0.23, 0.93]
     TRUE:  [0.91, 0.93, 1.0]

5
      OBS: [1.24, -0.05, -0.06, -0.02, 0.14, -0.04, -0.54, -1.27, -0.89, -1.52, 6.84]
   OPERON:  [0.48, -0.15, 0.73]
     TRUE:  [0.7, -0.27, 1.0]

6
      OBS: [1.23, -0.06, -0.06, -0.04, 0.19, -0.01, -0.63, -0.78, -0.17, -1.81, 7.85]
   OPERON:  [0.32, -0.19, 

In [93]:
# Import from the public IPython.display module; importing display classes
# from IPython.core.display is deprecated.
from IPython.display import Video, display

# Presumably the first episode recorded into the "videos" directory by the
# rollout above -- confirm the file name if the recorder settings change.
display(Video(url='videos/rl-video-episode-0.mp4'))