<a href="https://colab.research.google.com/github/aslestia/ACS_2025/blob/main/ACS_Week03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Jika perlu:
# !pip install gymnasium==0.29.1 matplotlib numpy

import numpy as np
import random
import matplotlib.pyplot as plt
import gymnasium as gym
# == Environment & fixed policy (used for evaluation) ==
# Module-level Blackjack environment; the functions in this cell read it as a global.
env = gym.make("Blackjack-v1", sab=True)  # episodic, no counting cards

def fixed_policy(observation):
    """Threshold policy: stick on 20 or more, otherwise hit.

    Args:
        observation: 3-tuple ``(player_sum, dealer_showing, usable_ace)``.
            Only ``player_sum`` influences the decision.

    Returns:
        0 (stick) when ``player_sum >= 20``, else 1 (hit).
    """
    total, _dealer_card, _has_usable_ace = observation
    if total >= 20:
        return 0  # stick
    return 1  # hit
#== Generate Episode (untuk MC)
def generate_episode(policy):
    """Play one full episode on the module-level ``env`` following ``policy``.

    Args:
        policy: Callable mapping an observation to an action.

    Returns:
        List of ``(observation, action, reward)`` tuples, one per step, in
        the order the steps were taken. The observation stored for a step is
        the one the action was chosen from.
    """
    trajectory = []
    state, _ = env.reset()
    while True:
        action = policy(state)
        following, reward, terminated, truncated, _info = env.step(action)
        trajectory.append((state, action, reward))
        if terminated or truncated:
            return trajectory
        state = following
#== First-Visit MC Policy Evaluation (menghitung V dan Q)
def mc_prediction_first_visit(policy, num_episodes=100000, gamma=1.0):
    """Estimate action values of ``policy`` with first-visit Monte Carlo.

    Each episode is produced by ``generate_episode(policy)``. For every
    (state, action) pair, the discounted return that follows its FIRST
    occurrence in the episode is averaged over all episodes containing it.

    Args:
        policy: Callable mapping an observation to an action.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        Dict mapping ``(state, action)`` to the running mean of its
        first-visit returns. Pairs never visited are absent.
    """
    returns_sum = {}    # sum of first-visit returns G per (s, a)
    returns_count = {}  # number of episodes in which (s, a) was visited
    Q = {}

    for _ in range(num_episodes):
        episode = generate_episode(policy)

        # Record the earliest time step of every (s, a). The return is built
        # backwards below, so a plain "already seen" set would credit the
        # LAST occurrence instead of the first one.
        first_visit = {}
        for t, (s, a, _) in enumerate(episode):
            first_visit.setdefault((s, a), t)

        # Accumulate the return G_t from the end of the episode.
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            s, a, r = episode[t]
            G = gamma * G + r

            key = (s, a)
            if first_visit[key] != t:
                continue  # a later visit: G is not the first-visit return
            returns_sum[key] = returns_sum.get(key, 0.0) + G
            returns_count[key] = returns_count.get(key, 0) + 1
            Q[key] = returns_sum[key] / returns_count[key]

    return Q
#== Visualisasi Nilai Policy
def value_from_Q(Q, usable_ace=False):
    """Build a state-value grid ``V[player_sum, dealer]`` from a Q table.

    For every state with ``player_sum`` in 12..21 and ``dealer`` in 1..10
    (and the requested ``usable_ace`` flag) the value is the larger of the
    recorded Q values for actions 0 and 1. States with no recorded action
    stay NaN.

    Args:
        Q: Dict keyed by ``((player_sum, dealer, usable_ace), action)``.
        usable_ace: Which usable-ace slice of the state space to extract.

    Returns:
        ``numpy`` array of shape (32, 12) filled with NaN outside the
        populated cells.
    """
    V = np.full((32, 12), np.nan)  # indexed by player_sum 0..31, dealer 0..11
    for player_sum in range(12, 22):  # the usually relevant sums
        for dealer_card in range(1, 11):
            state = (player_sum, dealer_card, usable_ace)
            q_pair = [Q.get((state, action)) for action in (0, 1)]
            if all(q is None for q in q_pair):
                continue
            # A missing action must never win the max.
            filled = [-np.inf if q is None else q for q in q_pair]
            V[player_sum, dealer_card] = np.nanmax(filled)
    return V

# Evaluate the fixed policy with first-visit MC, then derive one value grid
# per usable-ace setting.
Q_eval = mc_prediction_first_visit(fixed_policy, num_episodes=50000, gamma=1.0)
V_noace = value_from_Q(Q_eval, usable_ace=False)
V_ace   = value_from_Q(Q_eval, usable_ace=True)

# Heatmaps: the grids are transposed so player_sum runs along the x axis and
# dealer_showing along the y axis.
plt.figure(); plt.imshow(V_noace.T, origin='lower', aspect='auto')
plt.title("V(s) MC Evaluation – usable_ace=False"); plt.xlabel("player_sum"); plt.ylabel("dealer_showing"); plt.colorbar(); plt.show()

plt.figure(); plt.imshow(V_ace.T, origin='lower', aspect='auto')
plt.title("V(s) MC Evaluation – usable_ace=True"); plt.xlabel("player_sum"); plt.ylabel("dealer_showing"); plt.colorbar(); plt.show()

#== MC Control (ε-greedy) – Mencari Policy yang Lebih Baik
def mc_control_epsilon_greedy(num_episodes=500000, gamma=1.0, epsilon_start=1.0, epsilon_end=0.05):
    """On-policy first-visit Monte Carlo control with an ε-greedy policy.

    Episodes are played on the module-level ``env``. ε decays linearly with
    the episode index from ``epsilon_start`` towards ``epsilon_end`` and is
    clamped so it never drops below ``epsilon_end``.

    Args:
        num_episodes: Number of training episodes.
        gamma: Discount factor applied to future rewards.
        epsilon_start: Exploration rate at the start of training.
        epsilon_end: Lower bound of the exploration rate.

    Returns:
        Tuple ``(Q, greedy_policy)``: ``Q`` maps ``(state, action)`` to the
        mean first-visit return, ``greedy_policy(state)`` returns the action
        (0 = stick, 1 = hit) with the larger Q value. Unseen pairs count as
        0.0 and ties resolve to action 0.
    """
    Q = {}
    returns_sum, returns_count = {}, {}

    def epsilon_greedy_action(s, eps):
        # actions: 0 stick, 1 hit
        q0 = Q.get((s, 0), 0.0)
        q1 = Q.get((s, 1), 0.0)
        if random.random() < eps:
            return random.choice([0, 1])
        return 0 if q0 >= q1 else 1

    for i in range(1, num_episodes + 1):
        # Linear ε decay, clamped at epsilon_end.
        eps = max(epsilon_end, epsilon_start - (epsilon_start - epsilon_end) * (i / num_episodes))

        # Generate one episode with the current ε-greedy policy.
        episode = []
        s, _ = env.reset()
        done = False
        while not done:
            a = epsilon_greedy_action(s, eps)
            s_next, r, terminated, truncated, _ = env.step(a)
            episode.append((s, a, r))
            s = s_next
            done = terminated or truncated

        # Earliest time step of every (s, a). The return is accumulated
        # backwards, so a plain "already seen" set would credit the LAST
        # occurrence rather than the first.
        first_visit = {}
        for t, (s, a, _) in enumerate(episode):
            first_visit.setdefault((s, a), t)

        # First-visit MC updates.
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            s, a, r = episode[t]
            G = gamma * G + r
            key = (s, a)
            if first_visit[key] != t:
                continue
            returns_sum[key] = returns_sum.get(key, 0.0) + G
            returns_count[key] = returns_count.get(key, 0) + 1
            Q[key] = returns_sum[key] / returns_count[key]

    # Extract the greedy policy; it reads Q at call time.
    def greedy_policy(s):
        q0 = Q.get((s, 0), 0.0)
        q1 = Q.get((s, 1), 0.0)
        return 0 if q0 >= q1 else 1

    return Q, greedy_policy
#== Quick evaluation of the policy produced by MC control
# Train with 200000 episodes; pi_mc is the greedy policy derived from Q_mc.
Q_mc, pi_mc = mc_control_epsilon_greedy(num_episodes=200000)

def evaluate_policy(policy, n_episodes=10000):
    """Average undiscounted episode return of ``policy`` on the global ``env``.

    Args:
        policy: Callable mapping an observation to an action.
        n_episodes: Number of evaluation episodes to play.

    Returns:
        Mean over episodes of the summed rewards (``np.mean`` result).
    """
    episode_returns = []
    for _ in range(n_episodes):
        state, _ = env.reset()
        total = 0
        finished = False
        while not finished:
            action = policy(state)
            state, reward, terminated, truncated, _ = env.step(action)
            total += reward
            finished = terminated or truncated
        episode_returns.append(total)
    return np.mean(episode_returns)

# Compare the hand-written threshold policy with the learned greedy policy,
# 5000 evaluation episodes each.
avg_return_fixed = evaluate_policy(fixed_policy, 5000)
avg_return_mc    = evaluate_policy(pi_mc, 5000)
print("Average return Fixed:", avg_return_fixed)
print("Average return MC-control greedy:", avg_return_mc)


In [None]:
# !pip install gymnasium==0.29.1 numpy matplotlib

import numpy as np, random
import matplotlib.pyplot as plt
import gymnasium as gym

# Module-level Blackjack environment; used as the default `env` argument of
# the functions defined in this cell.
env = gym.make("Blackjack-v1", sab=True)  # episodic
# Action set sampled from during exploration.
ACTIONS = [0, 1]  # 0=stick, 1=hit
def evaluate_policy(policy_fn, n_episodes=5000, env=env):
    """Return the average episode reward obtained by ``policy_fn``.

    Args:
        policy_fn: Callable ``policy_fn(state) -> action``.
        n_episodes: Number of evaluation episodes to play.
        env: Environment to evaluate on; defaults to the module-level one.

    Returns:
        Mean undiscounted episode return as a Python ``float``.
    """
    episode_totals = []
    for _ in range(n_episodes):
        state, _ = env.reset()
        total = 0
        finished = False
        while not finished:
            action = policy_fn(state)
            state, reward, terminated, truncated, _ = env.step(action)
            total += reward
            finished = terminated or truncated
        episode_totals.append(total)
    mean_return = np.mean(episode_totals)
    return float(mean_return)

def greedy_from_Q(Q):
    """Return a greedy policy function backed by the Q table ``Q``.

    The returned function looks ``Q`` up at call time, so later changes to
    ``Q`` are reflected. Unseen pairs count as 0.0; ties pick action 0.

    Args:
        Q: Dict keyed by ``(state, action)`` with actions 0 and 1.

    Returns:
        Callable ``pi(state) -> action``.
    """
    def pi(s):
        stick_value = Q.get((s, 0), 0.0)
        hit_value = Q.get((s, 1), 0.0)
        if stick_value >= hit_value:
            return 0
        return 1
    return pi

def eps_greedy_action(Q, s, eps):
    """Pick an action for state ``s`` with an ε-greedy rule.

    With probability ``eps`` an action is drawn uniformly from ``ACTIONS``;
    otherwise the action with the larger Q value is returned (unseen pairs
    count as 0.0, ties pick action 0).

    Args:
        Q: Dict keyed by ``(state, action)``.
        s: Current state.
        eps: Exploration probability.

    Returns:
        The chosen action, 0 or 1.
    """
    explore = random.random() < eps
    if explore:
        return random.choice(ACTIONS)
    stick_value, hit_value = Q.get((s, 0), 0.0), Q.get((s, 1), 0.0)
    if stick_value >= hit_value:
        return 0
    return 1
def mc_control_eps_greedy_first_visit(
    num_episodes=200_000, gamma=1.0, eps_start=1.0, eps_end=0.05, env=env, eval_every=10_000
):
    """ε-greedy Monte Carlo control using first-visit return averaging.

    ε decays linearly with the episode index from ``eps_start`` towards
    ``eps_end`` (never below ``eps_end``). Every ``eval_every`` episodes the
    current greedy policy is scored over 3000 evaluation episodes.

    Args:
        num_episodes: Number of training episodes.
        gamma: Discount factor applied to future rewards.
        eps_start: Initial exploration rate.
        eps_end: Lower bound of the exploration rate.
        env: Environment used for both training and evaluation.
        eval_every: Evaluation period, in episodes.

    Returns:
        Tuple ``(Q, (eval_points, eval_scores))``: the Q dict keyed by
        ``(state, action)``, the episode indices at which evaluation ran,
        and the matching average returns.
    """
    Q = {}
    ret_sum, ret_cnt = {}, {}
    eval_points, eval_scores = [], []

    for ep in range(1, num_episodes + 1):
        eps = max(eps_end, eps_start - (eps_start - eps_end) * ep / num_episodes)

        # Generate one episode with the current ε-greedy policy.
        episode = []
        s, _ = env.reset()
        done = False
        while not done:
            a = eps_greedy_action(Q, s, eps)
            s2, r, term, trunc, _ = env.step(a)
            episode.append((s, a, r))
            s = s2
            done = term or trunc

        # Earliest time step of every (s, a). The return is accumulated
        # backwards, so a "visited" set filled during that pass would select
        # the LAST occurrence of a pair instead of the first.
        first_visit = {}
        for t, (s, a, _) in enumerate(episode):
            first_visit.setdefault((s, a), t)

        # First-visit updates.
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            s, a, r = episode[t]
            G = gamma * G + r
            key = (s, a)
            if first_visit[key] != t:
                continue
            ret_sum[key] = ret_sum.get(key, 0.0) + G
            ret_cnt[key] = ret_cnt.get(key, 0) + 1
            Q[key] = ret_sum[key] / ret_cnt[key]

        if ep % eval_every == 0:
            pi_greedy = greedy_from_Q(Q)
            score = evaluate_policy(pi_greedy, n_episodes=3000, env=env)
            eval_points.append(ep)
            eval_scores.append(score)

    return Q, (eval_points, eval_scores)
def mc_control_eps_greedy_every_visit(
    num_episodes=200_000, gamma=1.0, eps_start=1.0, eps_end=0.05, env=env, eval_every=10_000
):
    """ε-greedy Monte Carlo control using every-visit return averaging.

    Every occurrence of a (state, action) pair contributes its return to the
    running mean. ε decays linearly with the episode index from
    ``eps_start`` towards ``eps_end`` (never below ``eps_end``). Every
    ``eval_every`` episodes the current greedy policy is scored over 3000
    evaluation episodes.

    Args:
        num_episodes: Number of training episodes.
        gamma: Discount factor applied to future rewards.
        eps_start: Initial exploration rate.
        eps_end: Lower bound of the exploration rate.
        env: Environment used for both training and evaluation.
        eval_every: Evaluation period, in episodes.

    Returns:
        Tuple ``(Q, (eval_points, eval_scores))``.
    """
    Q = {}
    totals, counts = {}, {}
    checkpoints, scores = [], []

    for ep in range(1, num_episodes + 1):
        eps = max(eps_end, eps_start - (eps_start - eps_end) * ep / num_episodes)

        # Roll out one episode under the current ε-greedy policy.
        trajectory = []
        state, _ = env.reset()
        while True:
            action = eps_greedy_action(Q, state, eps)
            following, reward, term, trunc, _ = env.step(action)
            trajectory.append((state, action, reward))
            if term or trunc:
                break
            state = following

        # Every-visit updates: no visited set, each step contributes.
        G = 0.0
        for state, action, reward in reversed(trajectory):
            G = gamma * G + reward
            key = (state, action)
            totals[key] = totals.get(key, 0.0) + G
            counts[key] = counts.get(key, 0) + 1
            Q[key] = totals[key] / counts[key]

        if ep % eval_every == 0:
            score = evaluate_policy(greedy_from_Q(Q), n_episodes=3000, env=env)
            checkpoints.append(ep)
            scores.append(score)

    return Q, (checkpoints, scores)
# Train both MC control variants with the same budget and evaluation period.
EPISODES = 100_000
Q_fv, (x_fv, y_fv) = mc_control_eps_greedy_first_visit(num_episodes=EPISODES, eval_every=10_000)
Q_ev, (x_ev, y_ev) = mc_control_eps_greedy_every_visit(num_episodes=EPISODES, eval_every=10_000)

# Greedy policies derived from the final Q tables.
pi_fv = greedy_from_Q(Q_fv)
pi_ev = greedy_from_Q(Q_ev)

# Final score of each greedy policy over 10,000 evaluation episodes.
avg_fv = evaluate_policy(pi_fv, n_episodes=10_000)
avg_ev = evaluate_policy(pi_ev, n_episodes=10_000)

print("Average return (greedy) – First-visit:", round(avg_fv, 4))
print("Average return (greedy) – Every-visit:", round(avg_ev, 4))

# Learning curves: periodic evaluation score against training episodes, with a
# dashed reference line at an average return of 0.
plt.figure()
plt.plot(x_fv, y_fv, marker='o', label='First-visit MC')
plt.plot(x_ev, y_ev, marker='s', label='Every-visit MC')
plt.axhline(0.0, linestyle='--', linewidth=1)
plt.xlabel("Episodes")
plt.ylabel("Avg return (eval 3k eps)")
plt.title("MC Control ε-greedy: First-visit vs Every-visit")
plt.legend(); plt.show()


In [None]:
# Jika diperlukan, buka komentar baris berikut untuk memasang dependensi:
# !pip install gymnasium==0.29.1 numpy matplotlib

import numpy as np
import matplotlib.pyplot as plt
import random
import gymnasium as gym

# Module-level FrozenLake environment; the functions in this cell read it as a global.
env = gym.make("FrozenLake-v1", is_slippery=True)
# Sizes of the discrete state and action spaces; they fix the Q-table shape.
n_states = env.observation_space.n
n_actions = env.action_space.n
# Bare expression: only displays the pair when run as a notebook cell.
n_states, n_actions

def epsilon_greedy(Q, s, epsilon):
    """Choose an action for state ``s`` with an ε-greedy rule on table ``Q``.

    - With probability ``epsilon``: a uniformly random action (exploration).
    - Otherwise: the argmax of ``Q[s]`` (exploitation).

    Args:
        Q: Array indexed as ``Q[state]`` giving one value per action.
        s: Current state index.
        epsilon: Exploration probability.

    Returns:
        The selected action as a Python ``int``.
    """
    explore = random.random() < epsilon
    if not explore:
        greedy_action = np.argmax(Q[s])
        return int(greedy_action)
    return random.randrange(n_actions)

def evaluate_policy(Q, episodes=500):
    """Evaluate the greedy policy of ``Q`` on the global ``env``.

    An episode counts as a success when its final step yields a positive
    reward.

    Args:
        Q: Array indexed as ``Q[state]`` giving one value per action.
        episodes: Number of evaluation episodes.

    Returns:
        Fraction of successful episodes (success rate).
    """
    successes = 0
    for _ in range(episodes):
        state, _ = env.reset()
        while True:
            action = int(np.argmax(Q[state]))
            state, reward, terminated, truncated, _ = env.step(action)
            if terminated or truncated:
                if reward > 0:
                    successes += 1
                break
    return successes / episodes

def rolling_mean(x, w=50):
    """Moving average of ``x`` over a window of ``w`` samples.

    Args:
        x: Sequence of numbers.
        w: Window length.

    Returns:
        Array of ``len(x) - w + 1`` window means. When ``x`` is shorter than
        ``w`` the input is returned unchanged as a float array.
    """
    if len(x) >= w:
        # Prefix sums with a leading 0 turn each window sum into a difference.
        prefix = np.cumsum(np.insert(x, 0, 0))
        window_sums = prefix[w:] - prefix[:-w]
        return window_sums / float(w)
    return np.array(x, dtype=float)

#=================

def train_sarsa(
    num_episodes=5000,
    alpha=0.1,
    gamma=0.99,
    eps_start=1.0,
    eps_end=0.05,
):
    """Train a tabular SARSA agent on the module-level ``env``.

    The action used in the TD target is the same action that is executed on
    the next step, which is what makes the update on-policy. A step that
    ends the episode by truncation only (time limit) still bootstraps from
    the next state; only a true terminal transition uses ``target = r``.

    Args:
        num_episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        eps_start: Initial exploration rate.
        eps_end: Lower bound of the linearly decayed exploration rate.

    Returns:
        Tuple ``(Q, eps_history, perf_history)``: the learned
        ``(n_states, n_actions)`` table, the ε used in each episode, and the
        greedy success rate measured every 200 episodes (300 evaluation
        episodes each).
    """
    Q = np.zeros((n_states, n_actions), dtype=float)
    eps_history = []
    perf_history = []  # periodic evaluation success rate

    for ep in range(1, num_episodes + 1):
        # Linear ε decay, clamped at eps_end.
        epsilon = max(eps_end, eps_start - (eps_start - eps_end) * ep / num_episodes)
        eps_history.append(epsilon)

        s, _ = env.reset()
        a = epsilon_greedy(Q, s, epsilon)
        done = False
        while not done:
            s2, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            if terminated:
                a2 = None  # no action is taken from a terminal state
                target = r
            else:
                # Sample the next action once and reuse it for both the
                # target and the next environment step.
                a2 = epsilon_greedy(Q, s2, epsilon)
                target = r + gamma * Q[s2, a2]
            Q[s, a] += alpha * (target - Q[s, a])
            s, a = s2, a2

        # Periodic evaluation (every 200 episodes).
        if ep % 200 == 0:
            perf_history.append(evaluate_policy(Q, episodes=300))

    return Q, eps_history, perf_history
#=============================
def train_q_learning(
    num_episodes=5000,
    alpha=0.1,
    gamma=0.99,
    eps_start=1.0,
    eps_end=0.05,
):
    """Train a tabular Q-learning agent on the module-level ``env``.

    Behaviour is ε-greedy with linearly decayed ε; the target bootstraps
    from ``max_a Q[s2, a]``. The bootstrap is dropped only on a true terminal
    transition; a step that merely hits the time limit (truncation) still
    bootstraps from the next state.

    Args:
        num_episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        eps_start: Initial exploration rate.
        eps_end: Lower bound of the exploration rate.

    Returns:
        Tuple ``(Q, eps_history, perf_history)``: the learned
        ``(n_states, n_actions)`` table, the ε used in each episode, and the
        greedy success rate measured every 200 episodes (300 evaluation
        episodes each).
    """
    Q = np.zeros((n_states, n_actions), dtype=float)
    eps_history = []
    perf_history = []

    for ep in range(1, num_episodes + 1):
        epsilon = max(eps_end, eps_start - (eps_start - eps_end) * ep / num_episodes)
        eps_history.append(epsilon)

        s, _ = env.reset()
        done = False
        while not done:
            a = epsilon_greedy(Q, s, epsilon)
            s2, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            # Truncation ends the rollout but not the task, so the next
            # state's value still counts.
            best_next = 0.0 if terminated else np.max(Q[s2])
            target = r + gamma * best_next
            Q[s, a] += alpha * (target - Q[s, a])
            s = s2

        if ep % 200 == 0:
            perf_history.append(evaluate_policy(Q, episodes=300))

    return Q, eps_history, perf_history
#==============

# Hyperparameters shared by both TD-control runs.
EPISODES = 10000
ALPHA = 0.1
GAMMA = 0.99
EPS_START = 1.0
EPS_END = 0.05

# Train SARSA first, then Q-learning, on the same environment instance.
Q_sarsa, eps_sarsa, perf_sarsa = train_sarsa(EPISODES, ALPHA, GAMMA, EPS_START, EPS_END)
Q_q, eps_q, perf_q = train_q_learning(EPISODES, ALPHA, GAMMA, EPS_START, EPS_END)

# Final greedy success rate of each table over 1000 episodes.
avg_sarsa = evaluate_policy(Q_sarsa, episodes=1000)
avg_q = evaluate_policy(Q_q, episodes=1000)
# Bare expression: only displays the pair when run as a notebook cell.
avg_sarsa, avg_q

#===============

# x positions match the training loops, which evaluate every 200 episodes.
x = np.arange(200, EPISODES+1, 200)
plt.figure()
plt.plot(x, perf_sarsa, marker='o', label='SARSA')
plt.plot(x, perf_q, marker='s', label='Q-learning')
plt.xlabel('Episodes')
plt.ylabel('Success rate (eval 300 eps)')
plt.title('FrozenLake TD Control: SARSA vs Q-learning')
plt.legend(); plt.show()

In [None]:
# Jika perlu, hapus komentar berikut:
# !pip install gymnasium==0.29.1 numpy matplotlib

import numpy as np
import matplotlib.pyplot as plt
import random
import gymnasium as gym

# Module-level CartPole environment; the functions in this cell read it as a global.
env = gym.make("CartPole-v1")
# Number of discrete actions; sizes the last axis of the Q table.
n_actions = env.action_space.n
# Bare expression: only displays the value when run as a notebook cell.
n_actions

#===
# Number of bins for each state dimension
NUM_BINS = (6, 6, 12, 12)  # cart_pos, cart_vel, pole_angle, pole_vel

# Clipping bounds for each dimension so the discretisation stays bounded
STATE_BOUNDS = np.array([
    [-2.4, 2.4],        # cart position (env termination bound)
    [-3.0, 3.0],        # cart velocity (clipped to a sensible range)
    [-0.2095, 0.2095],  # pole angle (~12 degrees)
    [-3.5, 3.5]         # pole velocity at tip (clipped)
], dtype=float)

def create_bins(low, high, bins):
    """Return the interior bin edges that split ``[low, high]`` evenly.

    The result is meant for ``np.digitize``: with ``bins - 1`` interior edges
    a value in ``[low, high]`` maps to an index in ``0 .. bins - 1`` and each
    of the ``bins`` intervals has the same width. The outer bounds ``low``
    and ``high`` are deliberately not edges: putting them in the edge list
    would leave index 0 unreachable for values clipped to ``>= low`` and
    reserve the top index for ``high`` alone.

    Args:
        low: Lower bound of the value range.
        high: Upper bound of the value range.
        bins: Number of bins wanted.

    Returns:
        1-D array of ``bins - 1`` increasing edges strictly inside
        ``(low, high)``; empty when ``bins`` is 1.
    """
    # bins + 1 evenly spaced points delimit `bins` intervals; drop both ends.
    return np.linspace(low, high, bins + 1)[1:-1]

# Precompute the bin edges of every state dimension (one array per dimension)
BIN_EDGES = [create_bins(STATE_BOUNDS[i,0], STATE_BOUNDS[i,1], NUM_BINS[i]) for i in range(4)]

def discretize_state(state):
    """Map a continuous state to a tuple of discrete indices ``(i0, i1, i2, i3)``.

    Each of the four components is clipped to its ``STATE_BOUNDS`` range,
    binned with ``np.digitize`` against ``BIN_EDGES`` and finally clamped to
    ``[0, NUM_BINS[i] - 1]``.

    Args:
        state: Sequence of four floats.

    Returns:
        Tuple of four ``int`` bin indices.
    """
    values = np.array(state, dtype=float)
    lower, upper = STATE_BOUNDS[:, 0], STATE_BOUNDS[:, 1]
    clipped = np.clip(values, lower, upper)

    indices = []
    for dim in range(4):
        raw = int(np.digitize(clipped[dim], BIN_EDGES[dim]))
        # Keep the index inside [0, bins - 1].
        bounded = min(NUM_BINS[dim] - 1, max(0, raw))
        indices.append(bounded)
    return tuple(indices)

def q_shape():
    """Shape of the Q table: one axis per state dimension plus the action axis.

    Returns:
        ``NUM_BINS`` followed by ``n_actions``, e.g. ``(6, 6, 12, 12, 2)``.
    """
    state_axes = tuple(NUM_BINS)
    return state_axes + (n_actions,)

q_shape()

#===
def epsilon_greedy(Q, state_idx, epsilon):
    """Choose an action for a discretised state with an ε-greedy rule.

    Args:
        Q: Array whose leading axes are indexed by ``state_idx`` and whose
            last axis holds one value per action.
        state_idx: Tuple of discrete state indices.
        epsilon: Probability of picking a uniformly random action.

    Returns:
        The selected action as a Python ``int``.
    """
    explore = random.random() < epsilon
    if not explore:
        action_values = Q[state_idx]
        return int(np.argmax(action_values))
    return random.randrange(n_actions)

def evaluate(Q, episodes=20):
    """Evaluate the greedy policy of ``Q`` on the global ``env``.

    Args:
        Q: Q table indexed by a discretised state tuple.
        episodes: Number of evaluation episodes.

    Returns:
        Average total reward per episode.
    """
    grand_total = 0.0
    for _ in range(episodes):
        observation, _ = env.reset()
        state_idx = discretize_state(observation)
        episode_reward = 0.0
        while True:
            action = int(np.argmax(Q[state_idx]))
            observation, reward, terminated, truncated, _ = env.step(action)
            state_idx = discretize_state(observation)
            episode_reward += reward
            if terminated or truncated:
                break
        grand_total += episode_reward
    return grand_total / episodes

def rolling_mean(x, w=50):
    """Sliding-window mean of ``x`` with window length ``w``.

    Args:
        x: Sequence of numbers.
        w: Window length.

    Returns:
        Array with one mean per full window (``len(x) - w + 1`` values). An
        input shorter than ``w`` is returned as-is, converted to a float
        array.
    """
    too_short = len(x) < w
    if too_short:
        return np.array(x, dtype=float)
    # Cumulative sums with a leading zero: window sum = difference of two entries.
    cumulative = np.cumsum(np.insert(x, 0, 0))
    return (cumulative[w:] - cumulative[:-w]) / float(w)

#===
def train_sarsa(
    episodes=2000,
    alpha=0.1,
    gamma=0.99,
    eps_start=1.0,
    eps_end=0.05,
):
    """Train SARSA on the discretised state space of the module-level ``env``.

    The TD target drops the bootstrap term only on a true terminal
    transition. When an episode stops by truncation alone (step limit), the
    next action is still sampled and its value bootstrapped, so that reaching
    the limit is not learned as a failure.

    Args:
        episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        eps_start: Initial exploration rate.
        eps_end: Lower bound of the linearly decayed exploration rate.

    Returns:
        Tuple ``(Q, rewards, eval_hist)``: the Q table of shape
        ``q_shape()``, the total training reward of every episode, and the
        greedy evaluation score taken every 50 episodes (10 evaluation
        episodes each).
    """
    Q = np.zeros(q_shape(), dtype=float)
    rewards = []
    eval_hist = []

    for ep in range(1, episodes + 1):
        epsilon = max(eps_end, eps_start - (eps_start - eps_end) * ep / episodes)
        s, _ = env.reset()
        s_idx = discretize_state(s)
        a = epsilon_greedy(Q, s_idx, epsilon)

        done = False
        ep_reward = 0.0
        while not done:
            s_next, r, term, trunc, _ = env.step(a)
            ep_reward += r
            done = term or trunc
            s_next_idx = discretize_state(s_next)
            if term:
                a_next = None  # no action is taken from a terminal state
                target = r
            else:
                a_next = epsilon_greedy(Q, s_next_idx, epsilon)
                target = r + gamma * Q[s_next_idx + (a_next,)]
            # s_idx + (a,) is the full index tuple of the (state, action) cell.
            Q[s_idx + (a,)] += alpha * (target - Q[s_idx + (a,)])
            s_idx, a = s_next_idx, a_next

        rewards.append(ep_reward)
        if ep % 50 == 0:
            eval_hist.append(evaluate(Q, episodes=10))

    return Q, rewards, eval_hist

#===
def train_q_learning(
    episodes=2000,
    alpha=0.1,
    gamma=0.99,
    eps_start=1.0,
    eps_end=0.05,
):
    """Train Q-learning on the discretised state space of the module-level ``env``.

    Behaviour is ε-greedy with linearly decayed ε. The target bootstraps from
    the best next-state value unless the transition is truly terminal; an
    episode cut short by truncation (step limit) still bootstraps.

    Args:
        episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        eps_start: Initial exploration rate.
        eps_end: Lower bound of the exploration rate.

    Returns:
        Tuple ``(Q, rewards, eval_hist)``: the Q table of shape
        ``q_shape()``, the total training reward of every episode, and the
        greedy evaluation score taken every 50 episodes (10 evaluation
        episodes each).
    """
    Q = np.zeros(q_shape(), dtype=float)
    rewards = []
    eval_hist = []

    for ep in range(1, episodes + 1):
        epsilon = max(eps_end, eps_start - (eps_start - eps_end) * ep / episodes)
        s, _ = env.reset()
        s_idx = discretize_state(s)
        done = False
        ep_reward = 0.0
        while not done:
            a = epsilon_greedy(Q, s_idx, epsilon)
            s_next, r, term, trunc, _ = env.step(a)
            ep_reward += r
            done = term or trunc
            s_next_idx = discretize_state(s_next)
            # Only a real terminal state has zero future value.
            best_next = 0.0 if term else np.max(Q[s_next_idx])
            target = r + gamma * best_next
            # s_idx + (a,) is the full index tuple of the (state, action) cell.
            Q[s_idx + (a,)] += alpha * (target - Q[s_idx + (a,)])
            s_idx = s_next_idx

        rewards.append(ep_reward)
        if ep % 50 == 0:
            eval_hist.append(evaluate(Q, episodes=10))

    return Q, rewards, eval_hist

#===
# Hyperparameters shared by both CartPole runs.
EPISODES = 4000
ALPHA = 0.1
GAMMA = 0.99
EPS_START = 1.0
EPS_END = 0.05

# Train SARSA first, then Q-learning, on the same environment instance.
Q_sarsa, rew_sarsa, eval_sarsa = train_sarsa(EPISODES, ALPHA, GAMMA, EPS_START, EPS_END)
Q_q, rew_q, eval_q = train_q_learning(EPISODES, ALPHA, GAMMA, EPS_START, EPS_END)

# Final greedy score of each table, averaged over 30 episodes.
print('Greedy eval (avg reward) SARSA:', evaluate(Q_sarsa, episodes=30))
print('Greedy eval (avg reward) Q-learn:', evaluate(Q_q, episodes=30))

#===
# Raw per-episode training reward overlaid with its 50-episode rolling mean.
plt.figure()
plt.plot(rew_sarsa)
plt.plot(rolling_mean(rew_sarsa, 50))
plt.title('SARSA: Reward per Episode (CartPole)')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.show()

plt.figure()
plt.plot(rew_q)
plt.plot(rolling_mean(rew_q, 50))
plt.title('Q-learning: Reward per Episode (CartPole)')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.show()