In [1]:
import numpy as np

# from gym.envs.toy_text import BlackjackEnv
import gym
from typing import Literal, List, Tuple, cast, Dict, Optional, Callable, Protocol, Union
import plotly.graph_objects as go
from copy import deepcopy
from abc import abstractmethod, ABC
import math
import sys
from tqdm.autonotebook import tqdm
import plotly.express as px

import io


In [2]:
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)


In [3]:
env = gym.make('MountainCar-v0')
env.seed(RANDOM_SEED)
env


<TimeLimit<MountainCarEnv<MountainCar-v0>>>

In [4]:
env.reset()

array([-0.589128,  0.      ], dtype=float32)

In [5]:
env.render()

True

In [6]:
Position = float
Velocity = float
State = Tuple[Position, Velocity]

Action = Literal[0, 1, 2]

StateAction = Tuple[State, Action]
Observation = State

"""
- 0: Accelerate to the Left
- 1: Don't accelerate
- 2: Accelerate to the Right
"""
Reward = float
Step = Tuple[State, Optional[Action], Optional[Reward]]
Episode = List[Step]

# all_states: List[State] = list(range(1000))
all_actions: List[Action] = [0, 1, 2]
# nums_of_all_state = len(all_states)
# nums_of_all_state_action = len(all_states) * len(all_actions)
# allowed_actions: List[List[Action]] = [
#     all_actions for _ in range(nums_of_all_state)]


Feature = np.ndarray
Weight = np.ndarray


In [7]:
class FeatureInterface(Protocol):
    len: int

    @abstractmethod
    def to_feature(self, sa: StateAction) -> Feature:
        raise NotImplementedError()


class TestFeature(FeatureInterface):
    def __init__(self):
        self.len = 7

    def to_feature(self, sa: StateAction) -> Feature:
        ((pos, vel), a) = sa
        v = [pos, vel, pos + vel,  pos *
             vel,  *self.one_hot_encode(a)]
        assert len(v) == self.len, f"unexpected length encountered: {len(v)}"
        return np.asarray(v)

    def one_hot_encode(self, a: Action) -> List[int]:
        assert a in [0, 1, 2], f"unexpected a encountered: {a}"
        if a == 0:
            return [1, 0, 0]
        elif a == 1:
            return [0, 1, 0]
        else:
            return [0, 0, 1]


class AppxInterface(Protocol):
    feature_algorithm: FeatureInterface

    @abstractmethod
    def predict(self, sa: StateAction, w: Weight) -> float:
        raise NotImplemented()

    @abstractmethod
    def gradient(self, sa: StateAction, w: Weight) -> np.ndarray:
        raise NotImplemented()


class Linear(AppxInterface):
    def __init__(self, feature_algorithm: FeatureInterface):
        self.feature_algorithm = feature_algorithm

    def predict(self, sa: StateAction, w: Weight) -> float:
        return np.inner(self.feature_algorithm.to_feature(sa), w)

    def gradient(self, sa: StateAction, w: Weight) -> np.ndarray:
        return self.feature_algorithm.to_feature(sa)


class PolicyInterface(Protocol):
    appx_algorithm: AppxInterface

    @abstractmethod
    def allowed_actions(self, s: State) -> List[Action]:
        raise NotImplementedError()

    @abstractmethod
    def take_action(self, s: State, w: Weight) -> Action:
        raise NotImplementedError()


class SigmaGreddy(PolicyInterface):
    def __init__(self, sigma: float, appx_algorithm: AppxInterface):
        self.sigma = sigma
        self.appx_algorithm = appx_algorithm

    def allowed_actions(self, s: State) -> List[Action]:
        return all_actions

    def take_action(self, s: State, w: Weight) -> Action:
        rand = np.random.random()
        all_actions = self.allowed_actions(s)
        if rand < self.sigma:
            return np.random.choice(all_actions)
        else:
            maxi = np.argmax(
                [self.appx_algorithm.predict((s, a), w) for a in all_actions]
            )
            return all_actions[maxi]


class AlwaysRight(PolicyInterface):

    def __init__(self, appx_algorithm: AppxInterface):
        self.appx_algorithm = appx_algorithm

    def allowed_actions(self, s: State) -> List[Action]:
        return all_actions

    def take_action(self, s: State, w: Weight) -> Action:
        return 2


class AlgorithmInterface(Protocol):
    n_of_omega: int
    policy_algorithm: PolicyInterface

    @abstractmethod
    def after_step(
        self, cur_state_action: StateAction, episode: Episode, omega: np.ndarray
    ):
        raise NotImplementedError()

    @abstractmethod
    def on_termination(self, episode: Episode, omega: np.ndarray):
        raise NotImplementedError()

    def allowed_actions(self, s: State) -> List[Action]:
        return self.policy_algorithm.allowed_actions(s)

    def is_terminal_state(self, s: State) -> bool:
        (pos, _) = s
        if pos >= 0.5:
            return True
        return False

    def take_action(self, s: State, omega: Weight) -> Action:
        if self.is_terminal_state(s):
            return np.random.choice(self.allowed_actions(s))

        return self.policy_algorithm.take_action(s, omega)

    def predict(self, sa: StateAction, w: Weight) -> float:
        # assert -1 <= s <= len(all_states), f"unexpected state encounter: {s}"
        (s, _) = sa
        if self.is_terminal_state(s):
            return 0

        # if s == -1 or s == len(all_states):
        #     return 0

        return self.policy_algorithm.appx_algorithm.predict(sa, w)

    def gradient(self, sa: StateAction, w: Weight) -> np.ndarray:
        # assert 0 <= s < len(all_states), f"unexpected state encounter: {s}"

        return self.policy_algorithm.appx_algorithm.gradient(sa, w)


class Sarsa(AlgorithmInterface):
    def __init__(
        self,
        alpha: float,
        policy_algorithm=PolicyInterface,
        gamma: float = 1.0,
    ):
        self.alpha = alpha
        self.gamma = gamma
        self.policy_algorithm = policy_algorithm

        self.n_of_omega = self.policy_algorithm.appx_algorithm.feature_algorithm.len

    def after_step(
        self, this_state_action: StateAction, episode: Episode, omega: np.ndarray
    ):
        # (this_s, this_a) = this_state_action
        history = episode[-1:]
        gamma = self.gamma

        # if len(history) != (1 + n):
        #     return
        assert len(history) == (
            1
        ), f"unexpected history length encountered: {len(history)}"

        (old_s, old_a, r) = history[0]

        rwd = cast(Reward, r) + gamma * self.predict(this_state_action, omega)

        omega += (
            self.alpha
            * (rwd - self.predict((old_s, cast(Action, old_a)), omega))
            * self.gradient((old_s, cast(Action, old_a)), omega)
        )

    def on_termination(self, episode: Episode, omega: Weight):
        pass


In [8]:
class Agent:
    def __init__(
        self,
        env: gym.Env,
        algm: AlgorithmInterface,
    ):
        self.env = env
        self.algm = algm
        self.clear()

    def reset(self):
        self.cur_state: State = self.env.reset()
        self.ready_act: Optional[Action] = None
        self.end = False
        self.episode: Episode = []

    def clear(self):
        self.reset()

        self.omega = np.asarray(
            # [np.random.random() for _ in range(self.algm.n_of_omega)]
            [0.0 for _ in range(self.algm.n_of_omega)]
        )
        # self.episodes: List[Episode] = []

    def step(self) -> Tuple[Observation, bool]:
        assert not self.end, "cannot step on a ended agent"

        act = self.ready_act or self.algm.take_action(
            self.cur_state, self.omega)
        (obs, rwd, stop, _) = self.env.step(act)
        obs = cast(Observation, obs)

        self.episode.append((self.cur_state, act, rwd))

        self.cur_state = obs

        self.ready_act = self.algm.take_action(self.cur_state, self.omega)

        self.algm.after_step(
            (self.cur_state, self.ready_act), self.episode, self.omega)

        if stop:
            self.episode.append((self.cur_state, None, None))
            # self.episodes.append(self.episode)
            self.end = True
            self.algm.on_termination(self.episode, self.omega)
            self.episode = []

        return (obs, stop)

    def render(self):
        self.env.render()

    def close(self):
        self.clear()
        self.env.close()

    def predict(self, s: State) -> float:
        return np.max(
            [
                self.algm.predict((s, a), self.omega)
                for a in self.algm.allowed_actions(s)
            ]
        )


In [9]:
tF = TestFeature()
tF.to_feature(((52, 12), 0))


array([ 52,  12,  64, 624,   1,   0,   0])

In [10]:
TOTAL_TRAINING_EPISODES = 1_0000

agent = Agent(
    cast(gym.Env, env),
    # Sarsa(2e-2, SigmaGreddy(0.1, Linear(TestFeature())))
    Sarsa(2e-2, SigmaGreddy(0.05, Linear(TestFeature())))
    # TDN(9, 2e-4, Linear(), Tiling(5))
)


training = tqdm(range(TOTAL_TRAINING_EPISODES))

# last_omega: Optional[np.ndarray] = None

for run in training:
    agent.reset()
    end = False
    while not end:
        _, end = agent.step()

        agent.render()

    # if run > 1:
    #     progress.set_postfix_str(
    #         str(np.linalg.norm(agent.omega - last_omega)))

    # last_omega = deepcopy(agent.omega)




  3%|▎         | 317/10000 [2:55:27<89:19:34, 33.21s/it]  


KeyboardInterrupt: 

In [11]:

TOTAL_EVALUATE_TIMES = 100
success_times = 0
evaluate = tqdm(range(TOTAL_EVALUATE_TIMES))
for run in evaluate:
    agent.reset()
    end = False
    i = 0
    while not end:
        _, end = agent.step()

        agent.render()
        i += 1

    if i < 200:
        success_times += 1
        print('success!')


 28%|██▊       | 28/100 [04:12<10:48,  9.01s/it]


In [24]:
agent.close()

1000

In [26]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=[i + 1 for i in range(len(omegas) - 1)],
               y=[np.linalg.norm(omegas[i] - omegas[i-1]) for i in range(1, len(omegas))], mode="lines", name="omegas")
)
# fig.add_trace(
#     go.Scatter(
#         x=[i + 1 for i in s],
#         y=[agent.predict(i) for i in s],
#         mode="lines",
#         name="monte-carlo prediction",
#     )
# )
fig.show()


In [None]:
# agent.episodes[:20]


In [None]:
agent.omega

In [None]:
true_values = np.load("./true_values_arr.npy", allow_pickle=False)
true_values


In [None]:
# fig = go.Figure()
# s = list(range(1000))
# fig.add_trace(
#     go.Scatter(x=[i + 1 for i in s], y=true_values, mode="lines", name="true values")
# )
# fig.add_trace(
#     go.Scatter(
#         x=[i + 1 for i in s],
#         y=[agent.predict(i) for i in s],
#         mode="lines",
#         name="monte-carlo prediction",
#     )
# )
# fig.show()
