In [1]:
import os
import gym
import sys
sys.path.append("environments")
import matplotlib.pyplot as plt
import numpy as np
import random
import math
import collections 
from collections import defaultdict, deque
import statistics

In [2]:
def differential_q_learning(env,
                            num_iterations=1000,
                            alpha=0.1,
                            beta=0.01,
                            epsilon=0.1,
                            rng=None):
    """Tabular differential Q-learning over one uninterrupted stream of experience.

    The environment is reset once and then stepped ``num_iterations`` times.
    After every step the TD error

        delta = reward - rho + max_a' Q(next_state, a') - Q(state, action)

    is used to update both the action value of the visited pair and the
    running average-reward estimate ``rho``.

    Parameters
    ----------
    env : object
        Must provide ``reset()`` returning an iterable state,
        ``get_actions(state)`` returning a non-empty sequence of actions for
        the current state, and ``step(action)`` returning the 4-tuple
        ``(next_state, reward, done, info)``. States are converted with
        ``tuple(...)`` and used as dictionary keys, so their elements must be
        hashable.
    num_iterations : int
        Number of environment steps to take.
    alpha : float
        Step size for the Q-value update.
    beta : float
        Step size for the average-reward (``rho``) update.
    epsilon : float
        Probability of choosing a uniformly random action instead of the
        greedy one.
    rng : random.Random, optional
        Source of randomness for the epsilon-greedy choice. Passing a seeded
        ``random.Random`` makes the agent's decisions reproducible without
        touching global state. ``None`` (the default) uses the module-level
        ``random`` functions, exactly as before.

    Returns
    -------
    tuple
        ``(Q, rho)`` where ``Q`` is a ``defaultdict(float)`` keyed by
        ``(state, action)`` and ``rho`` is the final average-reward estimate.
    """
    # The `random` module itself exposes random()/choice(), so it can stand in
    # for a Random instance when the caller does not supply one.
    rng = random if rng is None else rng

    Q = collections.defaultdict(float)
    rho = 0.0

    # start
    state = tuple(env.reset())

    for _ in range(num_iterations):
        # ε-greedy action selection; ties in the greedy branch resolve to the
        # first maximal action in the order returned by get_actions().
        actions = env.get_actions(state)
        if rng.random() < epsilon:
            a = rng.choice(actions)
        else:
            a = max(actions, key=lambda x: Q[(state, x)])

        # take action
        # NOTE(review): `done` and `info` are ignored and the environment is
        # never reset inside the loop, i.e. the task is treated as continuing.
        # Confirm that `env` tolerates being stepped after reporting done.
        next_raw, reward, _done, _info = env.step(a)
        next_state = tuple(next_raw)

        # compute max_a' Q(next_state, a'); a state with no actions counts as 0
        next_actions = env.get_actions(next_state)
        max_q_next = max((Q[(next_state, a2)] for a2 in next_actions), default=0.0)

        # TD error: δ = r − ρ + max Q(next) − Q(state, action)
        delta = reward - rho + max_q_next - Q[(state, a)]

        # updates (both driven by the same TD error)
        Q[(state, a)] += alpha * delta
        rho += beta * delta

        # advance
        state = next_state

    return Q, rho

In [None]:
from pathlib import Path

from explicit2py_converter import ExplicitMDPEnv

# Directory holding the exported model files (.tra / .tra.rew / .lab).
# Set the MDP_DATA_DIR environment variable to run this cell on a machine
# where the default location below does not exist.
MDP_DATA_DIR = Path(os.environ.get("MDP_DATA_DIR", "/Users/cancetinsoy/mdptest"))

# Single seed shared by the environment and the agent's exploration.
SEED = 0

env = ExplicitMDPEnv(
    tra_path=str(MDP_DATA_DIR / "stayvsburst.tra"),
    rew_path=str(MDP_DATA_DIR / "stayvsburst.tra.rew"),
    lab_path=str(MDP_DATA_DIR / "stayvsburst.lab"),
    seed=SEED,  # optional
)

# The agent's epsilon-greedy draws come from the global `random` module, so
# seed it as well; otherwise the printed estimate varies between runs.
random.seed(SEED)

# plug straight into your differential_q_learning(...)
Q, rho = differential_q_learning(env, num_iterations=200000, alpha=0.1, beta=0.001, epsilon=0.1)
print(rho)  # learned average reward estimate

0.9999999999999539
