In [6]:
import gym
import numpy as np

# Create CartPole environment
env = gym.make('CartPole-v1')

# Define parameters
num_episodes = 1000
max_steps_per_episode = 500
learning_rate = 0.1
discount_rate = 0.99
epsilon = 0.1
lambda_param = 0.9  # Eligibility Trace parameter

# Discretize the observation space
num_bins = (6, 12, 6, 12)  # (cart position, cart velocity, pole angle, pole velocity)
num_actions = env.action_space.n

# Initialize Q-table with zeros
q_table = {}
eligibility_traces = {}

# Initialize Q-values for all state-action pairs
def initialize_q_table():
    for i in range(num_bins[0]):
        for j in range(num_bins[1]):
            for k in range(num_bins[2]):
                for l in range(num_bins[3]):
                    state = (i, j, k, l)
                    q_table[state] = np.zeros(num_actions)

initialize_q_table()

def discretize(observation):
    discretized_values = tuple(int(obs * num_bins[i]) for i, obs in enumerate(observation))
    return discretized_values

# Helper function to select an action using epsilon-greedy policy
def choose_action(state):
    if state not in q_table:
        q_table[state] = np.zeros(num_actions)
    if np.random.uniform(0, 1) < epsilon:
        return np.random.choice(num_actions)
    else:
        return np.argmax(q_table[state])


# Initialize eligibility traces for all state-action pairs in q_table
def initialize_eligibility_traces():
    for state in q_table.keys():
        if state not in eligibility_traces:
            eligibility_traces[state] = np.zeros(num_actions)
        else:
            eligibility_traces[state] *= 0  # Reset eligibility traces to zero



# SARSA with Eligibility Traces algorithm
for episode in range(num_episodes):
    observation = env.reset()
    state = discretize(observation)
    done = False
    total_reward = 0

    initialize_eligibility_traces()

    action = choose_action(state)

    for step in range(max_steps_per_episode):
        # env.render()  # Render the environment

        next_observation, reward, done, _ = env.step(action)
        next_state = discretize(next_observation)
        next_action = choose_action(next_state)

        # Update eligibility traces
        eligibility_traces[state][action] += 1

        delta = reward + discount_rate * q_table[next_state][next_action] - q_table[state][action]

        # Update Q-values and eligibility traces for states encountered during the episode
        for s in set(q_table.keys()).union(set(eligibility_traces.keys())):
            if s in q_table:
                for a in range(num_actions):
                    if s in eligibility_traces:
                        q_table[s][a] += learning_rate * delta * eligibility_traces[s][a]
                        eligibility_traces[s][a] *= discount_rate * lambda_param
                    else:
                        q_table[s] = np.zeros(num_actions)  # Initialize Q-values for unseen states
                        eligibility_traces[s] = np.zeros(num_actions)  # Initialize eligibility traces for unseen states
            else:
                q_table[s] = np.zeros(num_actions)  # Initialize Q-values for unseen states
                eligibility_traces[s] = np.zeros(num_actions)  # Initialize eligibility traces for unseen states




        total_reward += reward
        state = next_state
        action = next_action

        if done:
            break

    if episode % 100 == 0:
        print(f"Episode: {episode}, Total Reward: {total_reward}")

print("Training finished.")
env.close()  # Close the environment after training


Episode: 0, Total Reward: 10.0
Episode: 100, Total Reward: 8.0
Episode: 200, Total Reward: 10.0
Episode: 300, Total Reward: 20.0
Episode: 400, Total Reward: 14.0
Episode: 500, Total Reward: 17.0
Episode: 600, Total Reward: 35.0
Episode: 700, Total Reward: 36.0
Episode: 800, Total Reward: 32.0
Episode: 900, Total Reward: 31.0
Training finished.
