# Improving the Plinko DQN algorithm using Double Q-Learning

In [None]:
import random
import pandas as pd
from IPython.display import display
from collections import defaultdict, deque
import numpy as np
import copy # deep copying Q-table
from collections import namedtuple

### Part 1: Double Q-Learning

#### Motivation: 
>The original Plinko code uses standard Q-learning. Q-learning is known for maximization bias, leading to overestimation of action values. Our standard Q-learning algorithm uses one Q-table to select both the best next action and to evaluate the value of that action. If some action's value is overestimated our max operation will likely select it therefore distributing the overestimation. Double Q-learning ensures that our selection and evaluation are separate. We will use the online Q-table to select the best next action while using the target Q-table to evaluate the value of that chosen action. This will reduce the chance of consistently selecting actions based on overestimated values.

#### Expectation: 
>We expect more accurate Q-value estimates, which will hopefully result in a more stable learning process and convergence to a better final policy to ensure a higher success rate for the target bucket. It might also prevent our agent from getting stuck favouring sub-optimal paths due to early overestimations.

### Global Trackers and Dictionaries

In [24]:
pipes = {}  # maps (x, y) of pipe end -> (x, y) of connected destination
blocks = {}  # maps row_y -> {x: original tile} for restoring blocked rows

# Q-Learning Specific
# Two Q-tables for Double DQN
q_table_online = defaultdict(lambda: defaultdict(float))
q_table_target = defaultdict(lambda: defaultdict(float))

# Experience Replay
Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
replay_buffer = deque(maxlen=10000) # store last 10k transitions
batch_size = 64

# Statistics Trackers
bucket_tracker = {i: 0 for i in range(5)}  # maps bucket index -> number of landings
ledge_tracker = defaultdict(int)  # maps state tuple -> number of visits
block_row_tracker = defaultdict(int) # maps block state tuple -> number of visits
spike_tracker = defaultdict(int)  # maps spike row y -> number of hits
pipe_tracker = defaultdict(int)  # maps (x, y) of pipe entry/exit -> number of uses
button_tracker = defaultdict(int)  # maps (x, y) of button tile -> number of presses
episode_rewards = [] # track rewards per episode

### Board Functions

In [25]:
def generate_grid(width, height):
    grid = {}
    for y in range(height):
        for x in range(width):
            if (y % 2 == 0 and x % 2 == 1) or (y % 2 == 1 and x % 2 == 0):
                grid[(x, height - 1 - y)] = 'O'  # place pegs in a checkered pattern
            else:
                grid[(x, height - 1 - y)] = ' '  # empty spaces between pegs
    return grid

def mark_ledge(grid, start_x, length, ledge_y, button_x=None):
    # place a horizontal ledge starting at start_x on row ledge_y
    for x in range(start_x, start_x + length):
        if (x, ledge_y) not in grid or grid[(x, ledge_y)] == ' ': # Avoid overwriting pegs/other features unless empty
            if x == button_x:
                grid[(x, ledge_y)] = '⬒'  # mark a special button tile
                button_tracker[(x, ledge_y)]  # initialize button in tracker
            else:
                grid[(x, ledge_y)] = '_'  # normal ledge tile
    # Ledge tracker now tracks state visits, not just ledge definitions

def mark_spike(grid, start_x, length, spike_y):
    for x in range(start_x, start_x + length):
            if (x, spike_y) not in grid or grid[(x, spike_y)] == ' ':
                grid[(x, spike_y)] = '^'
    spike_tracker[spike_y]  # auto-initializes to 0 if not already set

def mark_pipe(grid, x, y1, y2):
    # mark a vertical pipe that connects y1 and y2 at column x
    top = max(y1, y2)
    bottom = min(y1, y2)

    for y in range(bottom, top + 1):
        if y == top:
            grid[(x, y)] = '⤓'  # down pipe entrance
        elif y == bottom:
            grid[(x, y)] = '↥'  # up pipe entrance
        else:
            tile = grid.get((x, y), ' ')
            grid[(x, y)] = 'Φ' if tile == 'O' else '|'  # middle of the pipe

    # connect both ends in the pipes map
    pipes[(x, top)] = (x, bottom)
    pipes[(x, bottom)] = (x, top)

    # start tracking usage of this pipe
    pipe_tracker[(x, top)]
    pipe_tracker[(x, bottom)]

def mark_slide(grid, start_x, start_y, length, direction):
    slide_char = '\\\\' if direction == "forward" else '/'
    x, y = start_x, start_y

    for _ in range(length):
        if (x, y) in grid and grid[(x, y)] == 'O':
            grid[(x, y)] = slide_char  # replace pegs with slides

        # move diagonally in the selected direction
        if direction == "forward":
            x += 1
            y -= 1
        else:
            x -= 1
            y -= 1

def mark_block(grid, width, row_y):
    if row_y in blocks:
        return  # skip if already marked

    blocks[row_y] = {}  # store original row tiles
    for x in range(width):
        current_tile = grid.get((x, row_y), ' ')
        if current_tile not in {'↥', 'Φ', '⤓', '|'}:  # skip if tile is part of a pipe
            blocks[row_y][x] = current_tile  # remember what was here
            grid[(x, row_y)] = '█'  # mark block tile

def unmark_block(grid, row_y):
    if row_y not in blocks:
        return  # nothing to unmark

    for x, original_char in blocks[row_y].items():
        grid[(x, row_y)] = original_char  # restore original tile
    del blocks[row_y]  # remove from block tracker

def mark_buckets(width, num_buckets):
    buckets = {}  # maps x to bucket index
    base_size = width // num_buckets  # base size for each bucket
    extra = width % num_buckets  # leftover columns
    middle_bucket = num_buckets // 2  # middle bucket index
    start_x = 0  # starting column for current bucket

    for i in range(num_buckets):
        # add 1 to size if extra columns remain and it's not the middle bucket
        size = base_size + (1 if extra > 0 and i != middle_bucket else 0)
        for x in range(start_x, start_x + size):
            buckets[x] = i  # map each column to bucket index
        start_x += size  # move to next start column
        if extra > 0 and i != middle_bucket:
            extra -= 1  # use up one extra column

    return buckets

def visualize_grid(grid, width, height, ball_position=None, buckets=None):
    x_labels = "   " + " ".join(str(i % 10) for i in range(width))  # x-axis labels
    print(x_labels)  # print top x-axis
    for y in range(height - 1, -1, -1):
        row = f"{y:2} "  # add y-axis label
        for x in range(width):
            if ball_position and (x, y) == ball_position:
                row += 'X'  # draw ball
            else:
                row += grid.get((x, y), ' ')  # draw tile
            row += " "
        print(row)  # print full row

    bucket_row = "   "
    if buckets:
        for x in range(width):
            bucket_row += str(buckets.get(x, ' ')) + " "
    else:
        bucket_row += "  " * width
    print(bucket_row)  # print bucket labels
    print(x_labels)  # print bottom x-axis
    print("===" + "=" * (2 * width))  # draw horizontal divider

### Game Logic: DQN and Experience Replay

In [65]:
Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) # keep track of action-states for now

def choose_action(state, q_table, epsilon, width, grid):
    # available actions based on the state type
    if isinstance(state[0], str) and state[0] == 'block':
        available_actions = list(range(width))
    elif isinstance(state[0], tuple): # ledge state
        ledge_start_x, ledge_y = state[0]
        # check if ledge_y is valid before accessing grid
        available_actions = [col for col in range(width) if grid.get((col, ledge_y)) in {'_', '⬒', '⤓', '↥'}]
        if not available_actions:
           available_actions = [col for col in range(width) if grid.get((col, ledge_y)) in {'_', '⬒', '⤓', '↥'}]
    else:
        print(f"Warning: Unrecognized state format for action selection: {state}")
        available_actions = list(range(width))
        
    # check state exists in the q_table (initialize if not)
    if state not in q_table:
        q_table[state] = defaultdict(float)
        for act in available_actions:
             q_table[state][act] = 0.0 
    current_q_actions = q_table[state]
    for act in available_actions:
        if act not in current_q_actions:
             current_q_actions[act] = 0.0
    
    # if no actions were available or initialized, fallback
    if not available_actions:
         print(f"Error: No available actions determined for state {state}. Choosing random column.")
         return random.choice(list(range(width)))

    if random.random() < epsilon:
        return random.choice(available_actions)  # explore
    else:
        # choose the action with the highest Q-value from available actions
        q_values = q_table[state]
        max_q = -float('inf')
        best_actions = []
        
        for act in available_actions: 
            q_val = q_values.get(act, 0.0) # set to 0 if action not seen before in this state
            if q_val > max_q:
                max_q = q_val
                best_actions = [act]
            elif q_val == max_q:
                best_actions.append(act)
        
        if not best_actions: # if all available actions have Q=-inf
             return random.choice(available_actions) # select random available action
             
        return random.choice(best_actions) # choose randomly among best actions

# find the state key ((start_x, y), frozenset(buttons)) for the ledge the ball is currently on
def find_ledge_state_key(x, y, width, grid, pressed_buttons):
    # search left from current x to find the start of the connected ledge segment
    ledge_start_x = x
    while ledge_start_x >= 0 and grid.get((ledge_start_x, y)) in {'_', '⬒', '⤓', '↥'}:
        ledge_start_x -= 1
    ledge_start_x += 1 # correct start position
    
    # check if the found start is actually a ledge tile
    if grid.get((ledge_start_x, y)) in {'_', '⬒', '⤓', '↥'}:
         return ((ledge_start_x, y), frozenset(pressed_buttons))
    else:
         # this is when the ball is on a pipe tile adjacent to a ledge and not technically on the ledge start itself
         # return None if we can't confirm the start point
         print(f"Debug: Could not confirm ledge start for ({x},{y}). Found start_x={ledge_start_x}, tile={grid.get((ledge_start_x, y))}")
         return None 

### Double Q-Learning and Experience Replay

In [66]:
# DDQN learning initialization
learning_rate = 0.1
discount_factor = 0.99 # higher discount factor for potentially long paths
exploration_rate = 1.0  # start fully exploratory
exploration_decay = 0.995  # slow decay
min_exploration = 0.01  # smallest possible exploration rate
episodes = 1000  # number of training episodes

update_frequency = 4 # learn every 4 steps
target_update_frequency = 100 # update target table every 100 steps
soft_update_alpha = 0.01 # soft update parameter

# training setup
target_bucket = 1  # the buckets the agent should aim for
total_timesteps = 0 # counter for triggering learn/target updates

In [67]:
def learn(grid, width, learning_rate, discount_factor):
    if len(replay_buffer) < batch_size:
        return # not enough samples yet

    mini_batch = random.sample(replay_buffer, batch_size)

    for experience in mini_batch:
        state, action, reward, next_state, done = experience
        # check state and action exist in online table
        if state not in q_table_online or action not in q_table_online[state]:
             q_table_online[state][action] = 0.0
        if state not in q_table_target or action not in q_table_target[state]:
             q_table_target[state][action] = 0.0
            
        current_q_online = q_table_online[state][action]

        # target calculation
        if done:
            target_q = reward # use the final reward directly
        else:
            next_state_type = next_state[0]
            if isinstance(next_state_type, str) and next_state_type == 'block':
                next_available_actions = list(range(width))
            elif isinstance(next_state_type, tuple):
                ledge_y_next = next_state_type[1]
                next_available_actions = [col for col in range(width) if grid.get((col, ledge_y_next)) in {'_', '⬒', '⤓', '↥'}]
            else:
                print(f"Error: Unknown next state format in learn: {next_state}")
                next_available_actions = []
                
            if next_state not in q_table_online:
                 q_table_online[next_state] = defaultdict(float)
                 for act in next_available_actions:
                    q_table_online[next_state][act] = 0.0
            if next_state not in q_table_target:
                 q_table_target[next_state] = defaultdict(float)
                 for act in next_available_actions:
                    q_table_target[next_state][act] = 0.0
            
            # best action in next state using online table
            online_q_next = q_table_online[next_state]
            if not online_q_next or not next_available_actions: # if no actions available or state just initialized
                 best_next_action = None
                 max_online_q_next = 0.0
            else:
                 max_online_q_next = -float('inf')
                 best_actions_next = []
                 for act in next_available_actions: # check only valid next actions
                    q_val = online_q_next.get(act, 0.0) 
                    if q_val > max_online_q_next:
                        max_online_q_next = q_val
                        best_actions_next = [act]
                    elif q_val == max_online_q_next:
                        best_actions_next.append(act)
                 if not best_actions_next:
                      best_next_action = random.choice(next_available_actions)
                 else:
                     best_next_action = random.choice(best_actions_next)

            # get Q-value for that best action using target table
            q_target_next = q_table_target[next_state].get(best_next_action, 0.0) # 0 if action missing
                 
            # reward for intermediate steps is 0
            target_q = discount_factor * q_target_next # R_i=0 + gamma * Q_target(s', argmax Q_online(s',a'))
        # update online Q-table
        q_table_online[state][action] = current_q_online + learning_rate * (target_q - current_q_online)

# target table update        
def update_target_network(online_q_table, target_q_table, alpha):
    for state in online_q_table:
        if state not in target_q_table:
             target_q_table[state] = defaultdict(float)
             
        for action in online_q_table[state]:
            # initialize target action value if it doesn't exist
            if action not in target_q_table[state]:
                 target_q_table[state][action] = 0.0 
            target_q_table[state][action] = (1.0 - alpha) * target_q_table[state][action] + alpha * online_q_table[state][action]

In [74]:
def drop_ball_episode(grid, width, height, start_x, buckets, exploration_rate, target_bucket, should_learn):
    x, y = start_x, height - 1
    pressed_buttons = set()
    last_state = None
    last_action = -1
    time_step = 0 # keep track of steps
    global total_decision_steps
    # visualize_grid(grid, width, height, None, buckets) # can be slow

    while y > 0:
        # visualize_grid(grid, width, height, (x,y), buckets)
        current_tile = grid.get((x, y))
        current_state_tuple = None
        
        # check for decision points, ledges or blocks
        is_ledge = current_tile in {'_', '⬒', '⤓', '↥'}
        is_block = current_tile == '█' or (current_tile in {'⤓', '↥'} and (grid.get((x - 1, y), '') == '█' or grid.get((x + 1, y), '') == '█'))
        if is_ledge or is_block:
            # find state
            current_pressed_buttons_frozen = frozenset(pressed_buttons)
            if is_ledge:
                state_key_info = find_ledge_state_key(x, y, width, grid, current_pressed_buttons_frozen)
                if state_key_info:
                    current_state_tuple = state_key_info
                    ledge_tracker[current_state_tuple] += 1
                else:
                    print(f"Error: Ledge state key not found at ({x},{y}) tile '{current_tile}'")
                    # treat as terminal error state for this episode
                    reward = -50.0 
                    done = True
                    if last_state is not None: # save final error transition
                       replay_buffer.append(Experience(last_state, last_action, reward, None, done))
                    break
            elif is_block:
                current_state_tuple = (('block', y), current_pressed_buttons_frozen)
                block_row_tracker[current_state_tuple] += 1
                
            # save previous transition
            if last_state is not None:
                # the reward for intermediate steps is 0 for now
                replay_buffer.append(Experience(last_state, last_action, 0, current_state_tuple, False))
                if should_learn() and len(replay_buffer) >= batch_size:
                    learn(grid, width, learning_rate, discount_factor)
                    if total_decision_steps % target_update_frequency == 0:
                        update_target_network(q_table_online, q_table_target, soft_update_alpha)
         
            # select action
            action = choose_action(current_state_tuple, q_table_online, exploration_rate, width, grid)
            last_state = current_state_tuple
            last_action = action
            time_step += 1
            total_decision_steps += 1
            done = False  # ensures 'done' is always defined
            
            # next action of button, pipe, or just moving x
            chosen_tile = grid.get((action, y))
            if chosen_tile == '⬒':
                grid[(action, y)] = '_' 
                unmark_block(grid, 5) 
                button_tracker[(action, y)] += 1
                pressed_buttons.add((action, y))
                x = action # move to the button's column or stay
                continue 
            elif (action, y) in pipes:
                pipe_tracker[(action, y)] += 1
                x, y = pipes[(action, y)]
                continue
            else:
                x = action # normal drop off, move to chosen column
                # fall straight down if empty space below
                while (x, y - 1) in grid and grid[(x, y - 1)] == ' ':
                    y -= 1
                y -= 1 # drop one more row 
                continue
        
        # slide tiles
        if current_tile in {'\\\\\\\\', '/'}:
            x += 1 if current_tile == '\\\\\\\\' else -1
            y -= 1
            # boundary check after slide move
            if not (0 <= x < width and 0 <= y < height):
                y = 0 # falling off bottom
                break
            continue
            
        # spike check
        if current_tile == '^':
            if y in spike_tracker:
                spike_tracker[y] += 1
            reward = 0 
            done = True
            if last_state is not None:
                 replay_buffer.append(Experience(last_state, last_action, reward, None, done))
                 if should_learn() and len(replay_buffer) >= batch_size:
                    learn(grid, width, learning_rate, discount_factor)
                    if total_decision_steps % target_update_frequency == 0:
                        update_target_network(q_table_online, q_table_target, soft_update_alpha)
            break

        # try falling diagonally
        possible_moves = []
        left_diag_coord = (x - 1, y - 1)
        right_diag_coord = (x + 1, y - 1)

        # check if left diagonal is valid (on board and not empty space ' ')
        if 0 <= left_diag_coord[0] < width and 0 <= left_diag_coord[1] < height and grid.get(left_diag_coord) != ' ':
            possible_moves.append(left_diag_coord)
        
        # check if right diagonal is valid
        if 0 <= right_diag_coord[0] < width and 0 <= right_diag_coord[1] < height and grid.get(right_diag_coord) != ' ':
            possible_moves.append(right_diag_coord)
            
        if possible_moves:
            chosen_move = random.choice(possible_moves)
            x, y = chosen_move
        else:
            # fall straight down if no diagonal options
            y -= 1

    # end of episode
    final_bucket = -1
    if not done: # when loop exited because y <= 0
        done = True
        final_bucket = buckets.get(x, -1) # -1 if out of bounds
        if final_bucket == target_bucket:
            reward = 1.0
        elif final_bucket != -1:
            reward = -1.0
        else:
             reward = -1.0
        
        if final_bucket != -1:
            bucket_tracker[final_bucket] += 1
            
        # save final transition
        if last_state is not None:
            replay_buffer.append(Experience(last_state, last_action, reward, None, done))
            if should_learn() and len(replay_buffer) >= batch_size:
                learn(grid, width, learning_rate, discount_factor)
                if total_decision_steps % target_update_frequency == 0:
                    update_target_network(q_table_online, q_table_target, soft_update_alpha)
    
    return reward, final_bucket

### Board Constructor

In [76]:
def build_board():
    width, height = 15, 30
    num_buckets = 5
    grid = generate_grid(width, height)
    # reset global blocks for the new board
    global blocks
    blocks = {}
    # reset pipes for the new board
    global pipes
    pipes = {}

    mark_ledge(grid, start_x=2, length=5, ledge_y=27)
    mark_ledge(grid, start_x=6, length=4, ledge_y=24)
    mark_ledge(grid, start_x=1, length=6, ledge_y=21)
    mark_ledge(grid, start_x=9, length=5, ledge_y=19)
    mark_ledge(grid, start_x=3, length=7, ledge_y=17)
    mark_ledge(grid, start_x=7, length=6, ledge_y=15)
    mark_ledge(grid, start_x=0, length=5, ledge_y=13)
    mark_ledge(grid, start_x=9, length=6, ledge_y=11)
    mark_ledge(grid, start_x=0, length=9, ledge_y=9, button_x=0)
    mark_ledge(grid, start_x=5, length=6, ledge_y=7)
    mark_ledge(grid, start_x=4, length=5, ledge_y=5)
    mark_ledge(grid, start_x=8, length=4, ledge_y=3)
    mark_ledge(grid, start_x=4, length=8, ledge_y=2)

    mark_slide(grid, start_x=0, start_y=28, length=4, direction="forward")
    mark_slide(grid, start_x=13, start_y=23, length=3, direction="backward")
    mark_slide(grid, start_x=14, start_y=16, length=4, direction="backward")
    mark_slide(grid, start_x=0, start_y=12, length=2, direction="forward")

    mark_spike(grid, start_x=5, length=4, spike_y=18)
    mark_spike(grid, start_x=7, length=2, spike_y=6)

    mark_pipe(grid, x=6, y1=27, y2=24)
    mark_pipe(grid, x=9, y1=24, y2=19)
    mark_pipe(grid, x=4, y1=9, y2=5)

    mark_block(grid, width, row_y=5)
    buckets = mark_buckets(width, num_buckets)

    return grid, buckets, width, height

### Training Loop (DDQN)

In [77]:
# reset trackers for this training run
bucket_tracker = {i: 0 for i in range(5)} 
ledge_tracker = defaultdict(int)  
block_row_tracker = defaultdict(int) 
spike_tracker = defaultdict(int)  
pipe_tracker = defaultdict(int) 
button_tracker = defaultdict(int) 
episode_rewards_history = [] 
most_recent_rewards = deque(maxlen=100)

In [79]:
total_decision_steps = 0
steps_after_episode = 0
# synchronize Q-tables initially
q_table_target = copy.deepcopy(q_table_online) 
 
# agent determines when to call learn() based on total steps
def should_learn():
     return total_decision_steps % update_frequency == 0

# training loop
for episode in range(episodes):
    grid, buckets, width, height = build_board()
    if (0,9) in grid and grid[(0,9)] == '_': 
        grid[(0,9)] = '⬒'
    if 5 not in blocks:
         mark_block(grid, width, 5)
         
    start_x = random.randint(0, width - 1)
    
    # drop_ball_episode now handles simulation and storing experiences
    # calls learn() internally based on the trigger function
    episode_final_reward, final_bucket = drop_ball_episode(grid, width, height, start_x, buckets, exploration_rate, target_bucket, should_learn)

    steps_after_episode += total_decision_steps
    
    # check if target table update is due based on step count progression
    if (total_decision_steps // target_update_frequency) < (steps_after_episode // target_update_frequency):
       update_target_network(q_table_online, q_table_target, soft_update_alpha)
       # print(f"Target network updated after episode {episode + 1}")
    
    # perform learning steps from replay buffer
    if len(replay_buffer) > batch_size:
        for _ in range(10): # example 10 learning steps per episode end
             learn(grid, width, learning_rate, discount_factor) 
             
    episode_rewards_history.append(episode_final_reward) # save the final reward
    most_recent_rewards.append(episode_final_reward)
    
    # decay exploration rate
    exploration_rate = max(min_exploration, exploration_rate * exploration_decay)

    # print progress
    if (episode + 1) % 100 == 0:
        avg_reward = sum(most_recent_rewards) / len(most_recent_rewards)
        print(f"Episode {episode + 1} | Avg Reward (Last 100): {avg_reward:.2f} | Exploration Rate: {exploration_rate:.2f} | Buffer Size: {len(replay_buffer)} | Q-States: {len(q_table_online)}")

# final statistics
print("\nLedge State Visit Statistics (Top 15):")
sorted_ledges = sorted(ledge_tracker.items(), key=lambda item: item[1], reverse=True)
for state, count in sorted_ledges[:15]:
    print(f"State {state} visited {count} times")

print("\nBlock Row State Visit Statistics (Top 15):")
sorted_blocks = sorted(block_row_tracker.items(), key=lambda item: item[1], reverse=True)
for state, count in sorted_blocks[:15]:
    print(f"State {state} visited {count} times")

print("\nBucket Landing Statistics:")
total_landings = sum(bucket_tracker.values())
for bucket_id, count in sorted(bucket_tracker.items()):
    if count > 0:
        percent = (count / total_landings * 100) if total_landings > 0 else 0
        print(f"Bucket {bucket_id}: {count} landings ({percent:.1f}%)")

print("\nSpike Hit Statistics:")
for y, count in sorted(spike_tracker.items()):
     if count > 0:
        print(f"Spike row {y} hit {count} times")

print("\nPipe Usage Statistics:")
for (x, y), count in sorted(pipe_tracker.items()):
    if count > 0:
        print(f"Pipe at ({x}, {y}) used {count} times")

print("\nButton Press Statistics:")
for (x, y), count in sorted(button_tracker.items()):
    if count > 0:
        print(f"Button at ({x}, {y}) pressed {count} times")

# show online Q-table
q_data_for_df = {}
for state, actions in q_table_online.items():
    state_display = (state[0], tuple(sorted(list(state[1]))))
    q_data_for_df[state_display] = actions

q_df = pd.DataFrame(q_data_for_df).T

if not q_df.empty:
    q_df.index = pd.MultiIndex.from_tuples(
        q_df.index,
        names=["position", "buttons_pressed"]
    )
    
    def get_y_sort_key(idx):
        y_keys = []
        for pos, buttons in idx:
            if isinstance(pos, tuple) and len(pos) == 2 and isinstance(pos[1], int):
                y_keys.append(-pos[1])
            elif isinstance(pos, tuple) and len(pos) == 2 and pos[0] == 'block':
                 y_keys.append(-pos[1])
            else:
                y_keys.append(0) 
        return y_keys
        
    q_df = q_df.sort_index(level=0, key=get_y_sort_key, sort_remaining=False)
    q_df = q_df.reindex(sorted(q_df.columns), axis=1)

    print("\nOnline Q-Table (Sample):")
    display(q_df.head(20)) 
else:
    print("\nQ-Table is empty.")

Episode 100 | Avg Reward (Last 100): -0.66 | Exploration Rate: 0.01 | Buffer Size: 10000 | Q-States: 10
Episode 200 | Avg Reward (Last 100): -0.32 | Exploration Rate: 0.01 | Buffer Size: 10000 | Q-States: 10
Episode 300 | Avg Reward (Last 100): -0.36 | Exploration Rate: 0.01 | Buffer Size: 10000 | Q-States: 10
Episode 400 | Avg Reward (Last 100): -0.68 | Exploration Rate: 0.01 | Buffer Size: 10000 | Q-States: 10
Episode 500 | Avg Reward (Last 100): -0.60 | Exploration Rate: 0.01 | Buffer Size: 10000 | Q-States: 10
Episode 600 | Avg Reward (Last 100): -0.46 | Exploration Rate: 0.01 | Buffer Size: 10000 | Q-States: 10
Episode 700 | Avg Reward (Last 100): -0.70 | Exploration Rate: 0.01 | Buffer Size: 10000 | Q-States: 10
Episode 800 | Avg Reward (Last 100): -0.56 | Exploration Rate: 0.01 | Buffer Size: 10000 | Q-States: 10
Episode 900 | Avg Reward (Last 100): -0.62 | Exploration Rate: 0.01 | Buffer Size: 10000 | Q-States: 10
Episode 1000 | Avg Reward (Last 100): -0.36 | Exploration Rate: 

Unnamed: 0_level_0,Unnamed: 1_level_0,0,2,4,6,7,8,9,10,12
position,buttons_pressed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"(9, 19)",(),,,,,,,0.0,0.0,0.0
"(block, 5)",(),,,0.0,,,,,,
"(4, 9)",(),-0.328918,0.0,0.0,0.0,,0.0,,,
"(4, 5)",(),,,0.0,,,,,,
"(0, 9)","((0, 9),)",-0.954872,-0.999998,-0.322791,-0.999905,,-0.999415,,,
"(4, 5)","((0, 9),)",,,-0.348771,-0.956602,,-0.962249,,,
"(9, 24)",(),,,,0.0,0.0,,0.0,,
"(6, 27)",(),,0.0,0.0,0.0,,,,,
"(6, 24)",(),,,,0.0,0.0,,0.0,,
"(4, 9)","((0, 9),)",-0.999965,-0.830845,-0.325547,-0.644982,,-1.0,,,
