# 시간차 학습 (Temporal-difference Learning)

* 시간차 학습(TD learning) 역시 MC와 마찬가지로 환경에 대한 완전한 정보를 요구하지 않고 경험 데이터로부터 직접 학습한다.
* 하지만 시간차 학습은 MC와는 달리 한 에피소드가 끝나기를 기다렸다 상태 값들을 업데이트할 필요가 없다.
* 왜냐하면 시간차 학습은 자신의 다음 상태에 대한 예측값을 현재 가치함수를 이용하여 획득해 활용하기 때문이다 (bootstrapping).

## 가치 함수들

<img src="./val_fs.PNG" width="600" height="400" align="left">


## TD-error 와 TD-target

<img src="./td_err.PNG" width=600 height=400 align=left>

## TD 학습을 이용한 policy evaluation

## 1. Driving Home 예제: MC vs TD
<img src="driving.PNG" width=700 height=400>
<br/><br/>
<img src="driving2.PNG" width=700 height=400>

## 2. 랜덤 워크 예제
<img src="random_walk.PNG" width=700 height=400>

In [None]:
import random
import matplotlib.pyplot as plt
import os
import math
from collections import defaultdict
import numpy as np
from IPython.display import clear_output
from racetrack_env import RacetrackEnv, Map, REWARD_SUCCESS
%matplotlib inline

In [None]:
def gen_episode(states, terminal_states=None):
    """Generate one random-walk episode over ``states``.

    The walk starts at the middle index (``len(states) // 2``) and moves one
    position left or right with equal probability until it lands on a
    terminal state.

    Args:
        states: Ordered sequence of state labels.
        terminal_states: Collection of labels that end the episode. When
            omitted, the module-level ``terminals`` is used (the previous
            behaviour).

    Returns:
        List of the visited states in order, ending with the terminal state.

    Raises:
        IndexError: If the walk steps past either end of ``states`` without
            reaching a terminal state.
    """
    if terminal_states is None:
        terminal_states = terminals

    episode = []
    k = len(states) // 2  # start node index
    while states[k] not in terminal_states:
        episode.append(states[k])
        # Same draw as before: 1 -> step left, 0 -> step right.
        k += -1 if random.randint(0, 1) else 1
        # Without this check k == -1 would silently wrap around to the last
        # element and "teleport" the walk to the opposite end of the chain.
        if not 0 <= k < len(states):
            raise IndexError(
                "random walk left the state list without reaching a terminal state")
    episode.append(states[k])
    return episode

def gen_episodes(states, trials):
    """Return a list of ``trials`` episodes, each produced by ``gen_episode(states)``."""
    return [gen_episode(states) for _ in range(trials)]

def initialize(states, terminal_states=None, init_value=0.5):
    """Build the initial state-value table.

    Args:
        states: Iterable of state labels.
        terminal_states: Collection of labels treated as terminal. When
            omitted, the module-level ``terminals`` is used (the previous
            behaviour).
        init_value: Starting estimate for every non-terminal state.

    Returns:
        Dict mapping each state to ``init_value``, or to 0 if it is terminal.
    """
    if terminal_states is None:
        terminal_states = terminals
    return {
        state: (0 if state in terminal_states else init_value)
        for state in states
    }

def value_update(eps, vals_dic, alpha):
    """Apply undiscounted TD(0) updates along one episode, in place.

    For every consecutive pair (s, s') in ``eps`` the estimate of s is moved
    by ``alpha`` times the TD error ``reward(s') + V(s') - V(s)``.

    Args:
        eps: List of visited states, in order.
        vals_dic: Dict of state-value estimates; modified in place.
        alpha: Step size.
    """
    for current, successor in zip(eps, eps[1:]):
        td_error = reward(successor) + vals_dic[successor] - vals_dic[current]
        vals_dic[current] = vals_dic[current] + alpha * td_error

def reward(state):
    """Return 1.0 for the right terminal ``'rterm'`` and 0.0 for any other state."""
    return 1. if state == 'rterm' else 0.

In [None]:
# Random-walk chain: five inner states flanked by a terminal on each side.
states = ['lterm', 'A', 'B', 'C', 'D', 'E', 'rterm']
terminals = ('lterm', 'rterm')

# Three independent value tables, one per experiment (the numeric suffix
# matches the episode batches they are trained on in the next cell).
values_dic1, values_dic10, values_dic100 = (initialize(states) for _ in range(3))

In [None]:
# Sample the three batches first so the random draws happen in a fixed order.
episodes1 = gen_episodes(states, 1)
episodes10 = gen_episodes(states, 10)
episodes100 = gen_episodes(states, 100)

# Train each value table on its own batch with step size 0.1.
for batch, table in ((episodes1, values_dic1),
                     (episodes10, values_dic10),
                     (episodes100, values_dic100)):
    for episode in batch:
        value_update(episode, table, 0.1)

In [None]:
# Compare the TD(0) estimates after 1, 10 and 100 episodes with the reference
# curve 1/6 ... 5/6 (the true values of this random walk, per Sutton & Barto
# Example 6.2).
inner_states = states[1:-1]
plt.plot([1/6., 2/6., 3/6., 4/6., 5/6.], 'ko-', label='true')
# The colour is given once, in the format string; combining 'ko-' with a
# `color=` keyword defines it twice and makes matplotlib emit a warning.
plt.plot([values_dic1[x] for x in inner_states], 'ro-', label='1')
plt.plot([values_dic10[x] for x in inner_states], 'go-', label='10')
plt.plot([values_dic100[x] for x in inner_states], 'bo-', label='100')
# Tick positions and labels must have the same length (5 inner states);
# range(6) with 5 labels raises ValueError on current matplotlib.
plt.xticks(range(len(inner_states)), inner_states)
plt.ylim([0, 1])
plt.xlim([-1, 6])
plt.legend(loc='best')

## TD 학습을 이용한 Control 문제
<br/>
<img src="sarsa.PNG" width=700 height=400>
<br/><br/>
<img src="q_learning.PNG" width=700 height=400>
<br/>

## Racetrack problem
<br/><br/>
<img src="./race.PNG" width=700 height=400>

In [None]:
# Hyper-parameters for the racetrack control experiments below.
MAX_EPISODE = 10000  # Recommend: 10000 for E-Greedy
MAX_STEP = 70  # per-episode step cap in learn_Q(); also passed to RacetrackEnv in make_env()
EGREEDY_EPS = 0.1  # starting epsilon; egreedy_policy() scales it down linearly with the episode number
GAMMA = 0.99  # discount factor applied to the next-state value in the TD target
ALPHA = 0.1  # step size of the TD update
SHOW_TERM = 5001  # progress is printed on every SHOW_TERM-th episode
SPOLICY = "EGreedy"  # label used in the progress header and in the save-file name
SAVE_FILENM = "Racetrack_sarsa{}.sav".format(SPOLICY)  # NOTE(review): not referenced in the visible code

In [None]:
def make_env():
    """Build the racetrack environment from the map file 'racetrack_map_4.txt'.

    Returns:
        A ``RacetrackEnv`` constructed from the parsed map, the velocity
        limits defined below and ``MAX_STEP``.
    """
    with open('racetrack_map_4.txt', 'r') as map_file:
        map_text = map_file.read()
    track = Map(map_text)

    # Velocity limits, packed as (vx min, vx max, vy min, vy max).
    vx_min, vx_max = 0, 3
    vy_min, vy_max = -3, 3
    return RacetrackEnv(track, (vx_min, vx_max, vy_min, vy_max), MAX_STEP)

def egreedy_policy(env, Q, state, e_no, test_action=None):
    """Return epsilon-greedy action probabilities for ``state``.

    Epsilon is ``EGREEDY_EPS * (1 - e_no / MAX_EPISODE)``, i.e. it shrinks
    linearly as the episode number grows.

    Args:
        env: Environment; only ``env.action_space.n`` is read.
        Q: Mapping from state to an array of action values.
        state: State to build the distribution for.
        e_no: Episode number used for the epsilon decay.
        test_action: If given, this action is treated as the greedy one
            instead of the argmax of ``Q[state]``.

    Returns:
        Array of length ``env.action_space.n``: every action gets
        ``eps / nA`` and the greedy action additionally gets ``1 - eps``.
    """
    q_values = Q[state]
    if test_action is None:
        # Break ties between equally good actions uniformly at random.
        best = np.flatnonzero(q_values == q_values.max())
        greedy = np.random.choice(best)
    else:
        greedy = test_action

    n_actions = env.action_space.n
    epsilon = EGREEDY_EPS * (1 - float(e_no) / MAX_EPISODE)
    probs = np.full(n_actions, epsilon / n_actions)
    probs[greedy] += 1.0 - epsilon
    return probs

def egreedy_action(aprobs, nA):
    """Sample an action index from ``range(nA)`` with probabilities ``aprobs``."""
    candidates = range(nA)
    return np.random.choice(candidates, p=aprobs)

def test_egreedy_policy():
    """Sanity-check ``egreedy_policy`` with a forced greedy action.

    Verifies that the returned probabilities match the epsilon-greedy
    formula for the decayed epsilon, and that sampling from them picks the
    forced action most of the time.
    """
    env = make_env()
    nA = env.action_space.n
    Q = defaultdict(lambda: np.zeros(nA))
    best_action = 1
    e_no = 1
    aprobs = egreedy_policy(env, Q, None, e_no, best_action)

    # Derive the expected distribution from the same decay schedule the
    # policy uses; hard-coded 0.02 / 0.92 values only hold for an undecayed
    # epsilon with exactly five actions, and exact float equality is brittle.
    eps = EGREEDY_EPS * (1 - float(e_no) / MAX_EPISODE)
    expected = np.full(nA, eps / nA)
    expected[best_action] += 1.0 - eps
    assert np.allclose(aprobs, expected)
    assert np.isclose(aprobs.sum(), 1.0)

    TRY_CNT = 100
    acnt = defaultdict(int)
    for _ in range(TRY_CNT):
        acnt[np.random.choice(range(nA), p=aprobs)] += 1
    # Statistical check: non-greedy picks should stay well below twice the
    # exploration budget for TRY_CNT draws.
    EPS_CNT = TRY_CNT * EGREEDY_EPS
    assert TRY_CNT - acnt[best_action] < 2 * EPS_CNT

def make_greedy_policy(Q):
    """Return a policy function that puts all probability on the argmax of ``Q[state]``.

    The returned callable maps a state to a float array shaped like
    ``Q[state]`` that is 1.0 at the first maximal action and 0.0 elsewhere.
    """
    def greedy_probs(state):
        q_values = Q[state]
        probs = np.zeros_like(q_values, dtype=float)
        probs[np.argmax(q_values)] = 1.0
        return probs

    return greedy_probs

def _run_step(env, Q, N, state, action, nA, n_episode, n_step):
    """Take one environment step and apply the SARSA update to ``Q`` in place.

    Args:
        env: Environment providing ``step(action)``.
        Q: Mapping from state to an array of action values; updated in place.
        N: Unused here.
        state: Current state.
        action: Action to execute in ``state``.
        nA: Number of actions.
        n_episode: Zero-based episode index (``n_episode + 1`` drives epsilon).
        n_step: Unused here.

    Returns:
        Tuple ``(next_state, next_action, reward, done)``.
    """
    next_state, reward, done, _ = env.step(action)

    # On-policy: the action used for bootstrapping is sampled from the same
    # epsilon-greedy policy and is handed back to the caller as the next action.
    next_probs = egreedy_policy(env, Q, next_state, n_episode + 1)
    next_action = np.random.choice(range(nA), p=next_probs)  # SARSA

    q_sa = Q[state][action]
    target = reward + GAMMA * Q[next_state][next_action]
    Q[state][action] += ALPHA * (target - q_sa)
    return next_state, next_action, reward, done

def _print_policy_progress(Q, state, N, action):
    print("  ", state, Q[state], action)

def _print_done_msg(reward):
    """Print the end-of-episode message, distinguishing ``REWARD_SUCCESS`` from any other reward."""
    message = "   SUCCESS!!" if reward == REWARD_SUCCESS else "   DONE"
    print(message)

def learn_Q(env):
    """Learn an action-value table by running ``MAX_EPISODE`` episodes of at most ``MAX_STEP`` steps.

    The first action of each episode is drawn from the epsilon-greedy policy;
    every following action is the one returned by ``_run_step``. Progress is
    printed on every ``SHOW_TERM``-th episode. At the end, the number of steps
    that returned ``REWARD_SUCCESS`` divided by ``MAX_EPISODE`` is printed.

    Args:
        env: Environment providing ``action_space.n``, ``reset()`` and ``step()``.

    Returns:
        The learned ``Q`` table (defaultdict of state -> action-value array).
    """
    nA = env.action_space.n
    Q = defaultdict(lambda: np.zeros(nA))
    N = defaultdict(lambda: np.ones(nA))
    rewards_list = []

    for episode_no in range(1, MAX_EPISODE + 1):
        state = env.reset()
        first_probs = egreedy_policy(env, Q, state, episode_no)
        action = np.random.choice(range(nA), p=first_probs)

        show = episode_no % SHOW_TERM == 0
        if show:
            print("========== Policy: {}, Episode: {} / {} ==========".
                  format(SPOLICY, episode_no, MAX_EPISODE))

        for n_step in range(MAX_STEP):
            # _run_step expects the zero-based episode index.
            state, action, reward, done = _run_step(env, Q, N, state, action, nA,
                                                    episode_no - 1, n_step)
            if reward == REWARD_SUCCESS:
                rewards_list.append(1.)
            if show:
                _print_policy_progress(Q, state, N, action)
            if not done:
                continue
            if show:
                _print_done_msg(reward)
            break

    print("The average of rewards: {}".format(np.sum(rewards_list)/MAX_EPISODE))
    return Q

def run():
    """Build the environment, learn Q, then replay the greedy policy via ``env.play``."""
    env = make_env()
    greedy_policy = make_greedy_policy(learn_Q(env))
    env.play(greedy_policy, 2)

In [None]:
# Train and replay. Assuming the cells are executed in order, `_run_step` and
# `learn_Q` are still the SARSA versions defined above at this point.
run()

```python
# SARSA step, repeated here for side-by-side comparison with the Q-learning
# version in the next cell.
def _run_step(env, Q, N, state, action, nA, n_episode, n_step):
    nstate, reward, done, _ = env.step(action)

    # The next action is sampled from the epsilon-greedy policy and is used
    # both for the bootstrap value and as the returned action.
    naprobs = egreedy_policy(env, Q, nstate, n_episode + 1)
    naction = np.random.choice(range(nA), p=naprobs)  # SARSA

    v = Q[state][action]
    nv = Q[nstate][naction]
    td_target = reward + GAMMA * nv
    td_delta = td_target - v
    Q[state][action] += ALPHA * td_delta
    return nstate, naction, reward, done  
```

In [None]:
def _run_step(env, Q, N, state, nA, n_episode, n_step):
    """Choose an action, take one environment step and apply the Q-learning update in place.

    Args:
        env: Environment providing ``step(action)``.
        Q: Mapping from state to an array of action values; updated in place.
        N: Unused here.
        state: Current state.
        nA: Number of actions.
        n_episode: Episode index passed to the epsilon schedule.
        n_step: Unused here.

    Returns:
        Tuple ``(next_state, greedy_next_action, reward, done)``. The second
        item is the argmax action of the next state, not the action taken.
    """
    # NOTE(review): the SARSA `_run_step` earlier in this file passes
    # `n_episode + 1` to egreedy_policy; here it is `n_episode`.
    behaviour_probs = egreedy_policy(env, Q, state, n_episode)
    action = np.random.choice(range(nA), p=behaviour_probs)

    next_state, reward, done, _ = env.step(action)
    # Off-policy: bootstrap from the greedy action of the next state.
    greedy_next = np.argmax(Q[next_state])  # Q-learning

    q_sa = Q[state][action]
    target = reward + GAMMA * Q[next_state][greedy_next]
    Q[state][action] += ALPHA * (target - q_sa)
    return next_state, greedy_next, reward, done

def learn_Q(env):
    """Learn an action-value table with the Q-learning ``_run_step`` of this cell.

    Runs ``MAX_EPISODE`` episodes of at most ``MAX_STEP`` steps. Progress is
    printed on every ``SHOW_TERM``-th episode. At the end, the number of steps
    that returned ``REWARD_SUCCESS`` divided by ``MAX_EPISODE`` is printed.

    Args:
        env: Environment providing ``action_space.n``, ``reset()`` and ``step()``.

    Returns:
        The learned ``Q`` table (defaultdict of state -> action-value array).
    """
    nA = env.action_space.n
    Q = defaultdict(lambda: np.zeros(nA))
    N = defaultdict(lambda: np.ones(nA))
    rewards_list = []

    for n_episode in range(MAX_EPISODE):
        state = env.reset()
        # No action is pre-sampled here: the Q-learning `_run_step` selects
        # its own behaviour action from `state`, so a SARSA-style initial
        # draw would be dead code that only consumes random numbers.

        show = (n_episode + 1) % SHOW_TERM == 0
        if show:
            print("========== Policy: {}, Episode: {} / {} ==========".
                  format(SPOLICY, n_episode + 1, MAX_EPISODE))

        for n_step in range(MAX_STEP):
            # `action` is the greedy action of the new state as returned by
            # `_run_step`; it is only used for the progress output below.
            state, action, reward, done = _run_step(env, Q, N, state, nA,
                                                    n_episode, n_step)
            if reward == REWARD_SUCCESS:
                rewards_list.append(1.)
            if show:
                _print_policy_progress(Q, state, N, action)
            if done:
                if show:
                    _print_done_msg(reward)
                break

    print("The average of rewards: {}".format(np.sum(rewards_list)/MAX_EPISODE))
    return Q

In [None]:
# Train and replay again. Assuming the cells are executed in order, `run()` now
# picks up the Q-learning `_run_step` / `learn_Q` redefined in the previous cell.
run()