In [None]:
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from coin_toss import Coin_toss
from transform import TS_VS
import numpy as np

# Get initial trajectory and learn transformation

In [None]:
# Roll out one untransformed game, always submitting the same action, and
# record the observation returned after every step.
n_steps = 100
game = Coin_toss(ergodic=False)
wealths = np.zeros(n_steps + 1)
wealths[0] = 100  # presumably the game's starting wealth -- confirm against Coin_toss
for step in range(1, n_steps + 1):
    obs, _reward, _done, _info = game.step(np.array([1]))
    wealths[step] = obs

# Build the TS_VS transformation from the recorded trajectory.
transformation = TS_VS(wealths)

# Train 3 models: with logarithmic transformation, without transformation, and with learned transformation

In [None]:
# Training budget shared by all three agents. `learn` documents
# `total_timesteps` as an int, so use an integer literal rather than the
# float `1e7`.
num_time_steps = 10_000_000

# Agent trained on the environment built with ergodic=True
# (the "logarithmic transformation" variant named in the section heading).
model_logarithm = PPO('MlpPolicy', Coin_toss(ergodic=True)).learn(
    total_timesteps=num_time_steps, progress_bar=True
)

# Agent trained on the environment built with ergodic=False (no transformation).
model_standard = PPO('MlpPolicy', Coin_toss(ergodic=False)).learn(
    total_timesteps=num_time_steps, progress_bar=True
)

# Agent trained on the environment that is handed the learned `transformation`.
model_transform = PPO('MlpPolicy', Coin_toss(ergodic=True, trans=transformation)).learn(
    total_timesteps=num_time_steps, progress_bar=True
)

# Play 1000 games with trained agents

In [None]:
def _play_episodes(model, game, num_episodes, episode_len, preprocess=None, start_value=100):
    """Roll out `model` on `game` and record `game.cum_reward` after every step.

    Parameters
    ----------
    model : object exposing ``predict(observation) -> (action, state)``.
    game : environment exposing ``reset() -> observation``,
        ``step(action) -> (observation, reward, done, info)`` and a
        ``cum_reward`` attribute with an ``.item()`` method.
    num_episodes : number of episodes to play.
    episode_len : number of steps played in every episode.
    preprocess : optional callable applied to each observation before it is
        passed to ``model.predict``; ``None`` passes the raw observation.
    start_value : value stored in column 0 of every trajectory
        (presumably the game's starting wealth -- confirm against Coin_toss).

    Returns
    -------
    numpy.ndarray of shape ``(num_episodes, episode_len + 1)``.
    """
    trajectories = np.zeros((num_episodes, episode_len + 1))
    trajectories[:, 0] = start_value
    for episode in range(num_episodes):
        obs = game.reset()
        for step in range(episode_len):
            # Predict right before stepping so that no action is computed
            # and then thrown away after the final step of an episode.
            policy_input = obs if preprocess is None else preprocess(obs)
            action, _ = model.predict(policy_input)
            obs, _, _, _ = game.step(action)
            trajectories[episode, step + 1] = game.cum_reward.item()
    return trajectories


num_exp = 1000
ep_len = 1000

# Every agent is evaluated on its own instance of the untransformed game.
game_logarithm = Coin_toss(ergodic=False)
game_standard = Coin_toss(ergodic=False)
game_transform = Coin_toss(ergodic=False)

# The logarithmic agent is fed the log of the raw observation.
traj_logarithm = _play_episodes(model_logarithm, game_logarithm, num_exp, ep_len, preprocess=np.log)
traj_standard = _play_episodes(model_standard, game_standard, num_exp, ep_len)
# NOTE(review): model_transform was trained on
# Coin_toss(ergodic=True, trans=transformation) but is fed raw observations
# here, whereas model_logarithm receives np.log(obs). If the training
# environment exposes transformed observations, this input probably has to be
# passed through the learned transformation as well -- confirm against
# coin_toss / transform before trusting the comparison.
traj_transform = _play_episodes(model_transform, game_transform, num_exp, ep_len)

# Plot first 10 trajectories

In [None]:
# One panel per agent, showing the first 10 recorded trajectories on a
# logarithmic y-axis.
panels = [
    (traj_logarithm, 'logarithmic transform'),
    (traj_standard, 'standard learning'),
    (traj_transform, 'learned transform'),
]
fig, axes = plt.subplots(len(panels), 1, sharex=True)
for ax, (traj, title) in zip(axes, panels):
    # Transpose so that every row (episode) is drawn as its own line.
    ax.plot(traj[0:10, :].T)
    ax.set_title(title)
    ax.set_yscale('log')
    ax.set_ylabel('cumulative reward')
axes[-1].set_xlabel('step')
# Without a layout pass the stacked panels' titles collide with the tick
# labels of the panel above.
fig.tight_layout()
plt.show()

# Compute statistics

In [None]:
# Mean of the last recorded value across episodes, printed in the order:
# logarithmic, standard, learned transform.
for final_values in (traj_logarithm[:, -1], traj_standard[:, -1], traj_transform[:, -1]):
    print(np.mean(final_values))

In [None]:
# Median of the last recorded value across episodes, printed in the order:
# logarithmic, standard, learned transform.
for final_values in (traj_logarithm[:, -1], traj_standard[:, -1], traj_transform[:, -1]):
    print(np.median(final_values))