# Improving the Plinko DQN algorithm using Double Q-Learning

In [1]:
import random
from collections import defaultdict, deque
import copy # deep copying Q-table
from collections import namedtuple
from grid_utils import unmark_block
from state_utils import choose_action, find_ledge_state_key, get_valid_diagonal_moves, handle_blocks, drop_ball, initialize_trackers
from board_builder import build_board
from visualization import print_training_stats

### Part 1: Double Q-Learning

#### Motivation: 
>The original Plinko code uses standard Q-learning. Q-learning is known for maximization bias, leading to overestimation of action values. Our standard Q-learning algorithm uses one Q-table to select both the best next action and to evaluate the value of that action. If some action's value is overestimated our max operation will likely select it therefore distributing the overestimation. Double Q-learning ensures that our selection and evaluation are separate. We will use the online Q-table to select the best next action while using the target Q-table to evaluate the value of that chosen action. This will reduce the chance of consistently selecting actions based on overestimated values.

#### Expectation: 
>We expect more accurate Q-value estimates, which will hopefully result in a more stable learning process and convergence to a better final policy to ensure a higher success rate for the target bucket. It might also prevent our agent from getting stuck favouring sub-optimal paths due to early overestimations.

### Double Q-Learning and Experience Replay

In [2]:
# declaration of tracker dictionaries
trackers = initialize_trackers()

# DDQN learning initialization
learning_rate = 0.1
discount_factor = 0.99 # higher discount factor for potentially long paths
exploration_rate = 1.0  # start fully exploratory
exploration_decay = 0.995  # slow decay
min_exploration = 0.01  # smallest possible exploration rate
episodes = 1000  # number of training episodes

update_frequency = 4 # learn every 4 steps
target_update_frequency = 100 # update target table every 100 steps
soft_update_alpha = 0.01 # soft update parameter

# Experience Replay
Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
replay_buffer = deque(maxlen=10000) # store last 10k transitions
batch_size = 64

# training setup
target_bucket = 2  # the bucket the agent should aim for

# Q-Learning Specific
# Two Q-tables for Double DQN
q_table_online = defaultdict(lambda: defaultdict(float))
q_table_target = defaultdict(lambda: defaultdict(float))

### Training Loop (DDQN)

In [3]:
def learn(grid, width, learning_rate, discount_factor):
    if len(replay_buffer) < batch_size:
        return # not enough samples yet

    mini_batch = random.sample(replay_buffer, batch_size)

    for experience in mini_batch:
        state, action, reward, next_state, done = experience
        # check state and action exist in online table
        if state not in q_table_online or action not in q_table_online[state]:
             q_table_online[state][action] = 0.0
        if state not in q_table_target or action not in q_table_target[state]:
             q_table_target[state][action] = 0.0
            
        current_q_online = q_table_online[state][action]

        # target calculation
        if done:
            target_q = reward # use the final reward directly
        else:
            next_state_type = next_state[0]
            if isinstance(next_state_type, str) and next_state_type == 'block':
                next_available_actions = list(range(width))
            elif isinstance(next_state_type, tuple):
                ledge_y_next = next_state_type[1]
                next_available_actions = [col for col in range(width) if grid.get((col, ledge_y_next)) in {'_', '⬒', '⤓', '↥'}]
            else:
                print(f"Error: Unknown next state format in learn: {next_state}")
                next_available_actions = []
                
            if next_state not in q_table_online:
                 q_table_online[next_state] = defaultdict(float)
                 for act in next_available_actions:
                    q_table_online[next_state][act] = 0.0
            if next_state not in q_table_target:
                 q_table_target[next_state] = defaultdict(float)
                 for act in next_available_actions:
                    q_table_target[next_state][act] = 0.0
            
            # best action in next state using online table
            online_q_next = q_table_online[next_state]
            if not online_q_next or not next_available_actions: # if no actions available or state just initialized
                 best_next_action = None
                 max_online_q_next = 0.0
            else:
                 max_online_q_next = -float('inf')
                 best_actions_next = []
                 for act in next_available_actions: # check only valid next actions
                    q_val = online_q_next.get(act, 0.0) 
                    if q_val > max_online_q_next:
                        max_online_q_next = q_val
                        best_actions_next = [act]
                    elif q_val == max_online_q_next:
                        best_actions_next.append(act)
                 if not best_actions_next:
                      best_next_action = random.choice(next_available_actions)
                 else:
                     best_next_action = random.choice(best_actions_next)

            # get Q-value for that best action using target table
            q_target_next = q_table_target[next_state].get(best_next_action, 0.0) # 0 if action missing
                 
            # reward for intermediate steps is 0
            target_q = discount_factor * q_target_next # R_i=0 + gamma * Q_target(s', argmax Q_online(s',a'))
        # update online Q-table
        q_table_online[state][action] = current_q_online + learning_rate * (target_q - current_q_online)

# target table update        
def update_target_network(online_q_table, target_q_table, alpha):
    for state in online_q_table:
        if state not in target_q_table:
             target_q_table[state] = defaultdict(float)
             
        for action in online_q_table[state]:
            # initialize target action value if it doesn't exist
            if action not in target_q_table[state]:
                 target_q_table[state][action] = 0.0 
            target_q_table[state][action] = (1.0 - alpha) * target_q_table[state][action] + alpha * online_q_table[state][action]

# agent determines when to call learn() based on total steps
def should_learn():
     return total_decision_steps[0] % update_frequency == 0

In [4]:
total_decision_steps = [0]
steps_after_episode = 0
episode_rewards_history = []
most_recent_rewards = deque(maxlen=100)

# synchronize Q-tables initially
q_table_target = copy.deepcopy(q_table_online) 

# training loop
for episode in range(episodes):
    grid, buckets = build_board("default", 15, 30, trackers)

    width = max(x for (x, y) in grid.keys()) + 1
    height = max(y for (x, y) in grid.keys()) + 1
         
    start_x = random.randint(0, width - 1)
    
    episode_final_reward, final_bucket = drop_ball(
        grid=grid,
        width=width,
        height=height,
        start_x=start_x,
        buckets=buckets,
        target_bucket=target_bucket,
        mode="dqn",  # 👈 DQN mode
        exploration_rate=exploration_rate,
        q_table=q_table_online,
        trackers=trackers,
        extra={
            "replay_buffer": replay_buffer,
            "should_learn": should_learn,
            "learn": learn,
            "Experience": Experience,
            "q_table_target": q_table_target,
            "soft_update_alpha": soft_update_alpha,
            "update_target_network": update_target_network,
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "discount_factor": discount_factor,
            "total_decision_steps": total_decision_steps,  # pass as list for mutability
            "target_update_frequency": target_update_frequency
        }
    )
    steps_after_episode += total_decision_steps[0]
    
    # check if target table update is due based on step count progression
    if (total_decision_steps[0] // target_update_frequency) < (steps_after_episode // target_update_frequency):
       update_target_network(q_table_online, q_table_target, soft_update_alpha)
       # print(f"Target network updated after episode {episode + 1}")
    
    # perform learning steps from replay buffer
    if len(replay_buffer) > batch_size:
        for _ in range(10): # example 10 learning steps per episode end
             learn(grid, width, learning_rate, discount_factor) 
             
    episode_rewards_history.append(episode_final_reward) # save the final reward
    most_recent_rewards.append(episode_final_reward)
    
    # decay exploration rate
    exploration_rate = max(min_exploration, exploration_rate * exploration_decay)

    # print progress
    if (episode + 1) % 100 == 0:
        avg_reward = sum(most_recent_rewards) / len(most_recent_rewards)
        print(f"Episode {episode + 1} | Avg Reward (Last 100): {avg_reward:.2f} | Exploration Rate: {exploration_rate:.2f} | Buffer Size: {len(replay_buffer)} | Q-States: {len(q_table_online)}")

# final statistics summary
print_training_stats(
    trackers,
    q_table_online  # or q_table depending on version
)


Episode 100 | Avg Reward (Last 100): 0.89 | Exploration Rate: 0.61 | Buffer Size: 730 | Q-States: 17
Episode 200 | Avg Reward (Last 100): 0.97 | Exploration Rate: 0.37 | Buffer Size: 1255 | Q-States: 17
Episode 300 | Avg Reward (Last 100): 1.00 | Exploration Rate: 0.22 | Buffer Size: 1815 | Q-States: 17
Episode 400 | Avg Reward (Last 100): 0.98 | Exploration Rate: 0.13 | Buffer Size: 2304 | Q-States: 17
Episode 500 | Avg Reward (Last 100): 0.96 | Exploration Rate: 0.08 | Buffer Size: 2763 | Q-States: 17
Episode 600 | Avg Reward (Last 100): 0.94 | Exploration Rate: 0.05 | Buffer Size: 3303 | Q-States: 17
Episode 700 | Avg Reward (Last 100): 0.97 | Exploration Rate: 0.03 | Buffer Size: 3938 | Q-States: 17
Episode 800 | Avg Reward (Last 100): 0.98 | Exploration Rate: 0.02 | Buffer Size: 4571 | Q-States: 17
Episode 900 | Avg Reward (Last 100): 1.00 | Exploration Rate: 0.01 | Buffer Size: 5243 | Q-States: 17
Episode 1000 | Avg Reward (Last 100): 1.00 | Exploration Rate: 0.01 | Buffer Size: 

Unnamed: 0_level_0,Unnamed: 1_level_0,9,10,11,12,13,6,7,8,1,2,3,4,5,14,0
position,buttons_pressed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
"(2, 27)",(),,,,,,0.752623,,,,0.855214,0.830387,0.821098,0.767238,,
"(6, 24)",(),0.829509,,,,,0.729444,0.750045,0.608597,,,,,,,
"(1, 21)",(),,,,,,0.371865,,,0.882401,0.881815,0.873085,0.881983,0.643639,,
"(9, 19)",(),0.75214,0.876011,0.878144,0.879795,0.884361,,,,,,,,,,
"(3, 17)",(),0.871435,,,,,0.896523,0.870908,0.869706,,,0.887916,0.905644,0.912174,,
"(7, 15)",(),0.901526,0.90016,0.888804,0.888068,,,0.910566,0.908685,,,,,,,
"(0, 13)",(),,,,,,,,,0.9249,0.927315,0.929595,0.92715,,,0.926017
"(9, 11)",(),0.902466,0.906388,0.920162,0.922874,0.929629,,,,,,,,,0.929112,
"(0, 9)",(),,,,,,0.889655,0.889895,0.889857,0.928935,0.929305,0.929215,0.889454,0.890042,,0.954601
"(0, 9)","((0, 9),)",,,,,,0.971257,0.971415,0.971384,0.187649,0.0,0.808341,0.979675,0.971541,,0.0
