In [4]:
import numpy as np
import random

In [5]:
# --- Environment -----------------------------------------------------------
# Five states laid out in a row; a state's position in this list is its index.
states = list('ABCDE')
# Action 1 moves one index to the right; any other action moves one to the left.
actions = [0, 1]

# --- Learning hyper-parameters ---------------------------------------------
gamma = 1.0          # discount applied to the bootstrapped next-state value
alpha = 0.8          # step size of each weight update
epsilon = 0.25       # probability of picking a uniformly random action
max_timesteps = 3

# Reward received on *entering* each state.
rewards = {'A': 1000, 'E': 10, **dict.fromkeys(['B', 'C', 'D'], -1)}
# How many episodes finished in each of the two end states.
count_terminal_state = dict.fromkeys(['A', 'E'], 0)

# Initial weights of the linear Q-function (one weight per feature).
w = np.array([1.0, 1.0, -1.0])

def feature_vector(state, action):
    """Build the three-component feature vector for a (state, action) pair.

    Components, in order:
      0. the index of ``state`` in the module-level ``states`` list,
      1. ``+1`` when ``action == 0``, ``-1`` for any other action,
      2. a constant ``1``.

    Raises ``ValueError`` (from ``list.index``) if ``state`` is not in ``states``.
    """
    position = states.index(state)
    direction = -1 if action != 0 else 1
    features = [position, direction, 1]
    return np.array(features)

def Q_value(state, action, w):
    """Return the linear estimate Q(state, action) = w . feature_vector(state, action)."""
    phi = feature_vector(state, action)
    estimate = np.dot(w, phi)
    return estimate

def epsilon_greedy(state, w, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random member of ``actions`` is
    returned (exploration). Otherwise the action with the largest
    ``Q_value(state, action, w)`` is returned (exploitation); ties go to the
    action that appears first in ``actions``.

    Args:
        state: current state label.
        w: weight vector forwarded to ``Q_value``.
        epsilon: exploration probability.

    Returns:
        An element of the module-level ``actions`` list.
    """
    if random.random() < epsilon:
        # Explore.
        return random.choice(actions)

    # Exploit. np.argmax yields a *position* in q_values, so map it back
    # through `actions`; this keeps the function correct even if the action
    # labels are ever something other than 0..n-1.
    q_values = [Q_value(state, action, w) for action in actions]
    return actions[int(np.argmax(q_values))]

# Update the weights of the linear Q-function
def update_weights(state, action, next_state, reward, alpha):
    """Apply one Q-learning update to the global weight vector ``w``.

    For a linear approximator Q(s, a) = w . phi(s, a), the gradient of Q with
    respect to ``w`` is the feature vector phi(s, a), so the update is

        w <- w + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a)) * phi(s, a)

    Without the ``phi(s, a)`` factor every weight would be shifted by the same
    scalar regardless of which features were active.

    Args:
        state: state the action was taken in.
        action: action that was taken.
        next_state: state reached after the action.
        reward: reward received for the transition.
        alpha: step size.

    ``w`` is modified in place; nothing is returned.
    """
    global w
    features = feature_vector(state, action)
    q_sa = Q_value(state, action, w)
    # NOTE(review): the target bootstraps from next_state even when it is an
    # end state; this function has no way to know which states are terminal.
    best_next_q = max(Q_value(next_state, next_action, w) for next_action in actions)
    td_error = reward + gamma * best_next_q - q_sa
    # NOTE(review): the features are not normalised, so a large alpha can make
    # this step overshoot - confirm the chosen step size is stable.
    w += alpha * td_error * features
    
def q_learning(max_timesteps):
    """Run one episode starting in state 'D' and return the state it ends in.

    Each step picks an action with ``epsilon_greedy``, moves one index right
    (action 1) or left (any other action) in ``states``, and calls
    ``update_weights`` with the reward of the state that was entered.

    The episode stops when 'A' or 'E' is reached, or after ``max_timesteps``
    steps, whichever comes first. The returned state may therefore be a
    non-terminal one when the step budget runs out.

    Args:
        max_timesteps: maximum number of steps to take in this episode.

    Returns:
        The label of the state the agent is in when the episode stops.
    """
    state = 'D'  # initial state
    steps_left = max_timesteps
    while state not in ('A', 'E') and steps_left > 0:
        action = epsilon_greedy(state, w, epsilon)
        offset = 1 if action == 1 else -1
        next_state = states[states.index(state) + offset]
        reward = rewards[next_state]
        update_weights(state, action, next_state, reward, alpha)

        state = next_state
        # Consume one step of the budget; without this the cap never applies.
        steps_left -= 1

    return state


def main():
    """Ask for an episode count, run that many episodes and print a summary.

    Episodes that end in 'A' or 'E' are tallied in ``count_terminal_state``;
    episodes cut short by the step limit are printed but not tallied.
    """
    episodes = int(input("Number of episodes "))
    for episode in range(episodes):
        end_state = q_learning(max_timesteps=max_timesteps)
        print(f'Episode {episode + 1} State ends in {end_state}')
        # A truncated episode can end in 'B', 'C' or 'D', which have no counter.
        if end_state in count_terminal_state:
            count_terminal_state[end_state] += 1

    print(f'Agent went to A {count_terminal_state["A"]} times and E {count_terminal_state["E"]} times')
    print(f'final weights: {w}')


if __name__ == "__main__":
    main()

Episode 1 State ends in E
Episode 2 State ends in A
Episode 3 State ends in A
Episode 4 State ends in A
Episode 5 State ends in A
Episode 6 State ends in A
Episode 7 State ends in A
Episode 8 State ends in A
Episode 9 State ends in A
Episode 10 State ends in A
Episode 11 State ends in A
Episode 12 State ends in A
Episode 13 State ends in A
Episode 14 State ends in A
Episode 15 State ends in A
Episode 16 State ends in A
Episode 17 State ends in A
Episode 18 State ends in E
Episode 19 State ends in A
Episode 20 State ends in A
Episode 21 State ends in A
Episode 22 State ends in A
Episode 23 State ends in E
Episode 24 State ends in A
Episode 25 State ends in A
Episode 26 State ends in A
Episode 27 State ends in A
Episode 28 State ends in A
Episode 29 State ends in E
Episode 30 State ends in A
Episode 31 State ends in A
Episode 32 State ends in A
Episode 33 State ends in A
Episode 34 State ends in A
Episode 35 State ends in A
Episode 36 State ends in A
Episode 37 State ends in A
Episode 38

In [6]:
def reset():
    """Rebind the global end-state tally to a fresh dict with both counts at zero."""
    global count_terminal_state
    count_terminal_state = dict.fromkeys(('A', 'E'), 0)

# Two independent weight vectors with identical starting values:
# w1 is updated after action 0, w2 after any other action.
w1, w2 = (np.array([1, 1, -1], dtype=float) for _ in range(2))

def update_weights_2_vectors(state, action, next_state, reward, alpha, w):
    """Apply one Q-learning update to the weight vector ``w`` passed in.

    For a linear approximator Q(s, a) = w . phi(s, a), the gradient of Q with
    respect to ``w`` is the feature vector phi(s, a), so the update is

        w <- w + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a)) * phi(s, a)

    Without the ``phi(s, a)`` factor every weight would be shifted by the same
    scalar regardless of which features were active.

    Args:
        state: state the action was taken in.
        action: action that was taken.
        next_state: state reached after the action.
        reward: reward received for the transition.
        alpha: step size.
        w: NumPy weight vector to update. It is modified **in place** so the
            caller's array reflects the update; nothing is returned.
    """
    features = feature_vector(state, action)
    q_sa = Q_value(state, action, w)
    # NOTE(review): the next-state values are all computed with this same `w`,
    # and the target bootstraps even when next_state is an end state.
    best_next_q = max(Q_value(next_state, next_action, w) for next_action in actions)
    td_error = reward + gamma * best_next_q - q_sa
    # In-place add on purpose: rebinding `w` would not reach the caller.
    w += alpha * td_error * features
    
def _select_action_2_vectors(state, weights_for_action, epsilon):
    """Epsilon-greedy choice where each action is scored with its own weights."""
    if random.random() < epsilon:
        return random.choice(actions)
    # max() keeps the first action on ties, like np.argmax over the Q-values.
    return max(actions, key=lambda a: Q_value(state, a, weights_for_action[a]))


def q_learning_2_vectors(max_timesteps):
    """Run one episode from state 'D' using one weight vector per action.

    ``w1`` scores and is trained on action 0; ``w2`` scores and is trained on
    action 1. Actions are chosen epsilon-greedily from those two vectors, so
    what has been learned actually drives behaviour (the single vector ``w``
    from the previous experiment is not consulted).

    The episode stops when 'A' or 'E' is reached, or after ``max_timesteps``
    steps, whichever comes first. The returned state may therefore be a
    non-terminal one when the step budget runs out.

    Args:
        max_timesteps: maximum number of steps to take in this episode.

    Returns:
        The label of the state the agent is in when the episode stops.
    """
    # The arrays are updated in place, so these references stay current.
    weights_for_action = {0: w1, 1: w2}
    state = 'D'  # initial state
    steps_left = max_timesteps
    while state not in ('A', 'E') and steps_left > 0:
        action = _select_action_2_vectors(state, weights_for_action, epsilon)
        offset = 1 if action == 1 else -1
        next_state = states[states.index(state) + offset]
        reward = rewards[next_state]
        update_weights_2_vectors(
            state, action, next_state, reward, alpha, weights_for_action[action]
        )
        state = next_state
        # Consume one step of the budget; without this the cap never applies.
        steps_left -= 1

    return state


def main():
    """Reset the tally, run the requested number of episodes and print a summary.

    Episodes that end in 'A' or 'E' are tallied in ``count_terminal_state``;
    episodes cut short by the step limit are printed but not tallied.
    """
    reset()
    episodes = int(input("Number of episodes "))
    for episode in range(episodes):
        end_state = q_learning_2_vectors(max_timesteps=max_timesteps)
        print(f'Episode {episode + 1} State ends in {end_state}')
        # A truncated episode can end in 'B', 'C' or 'D', which have no counter.
        if end_state in count_terminal_state:
            count_terminal_state[end_state] += 1

    print(f'Agent went to A {count_terminal_state["A"]} times and E {count_terminal_state["E"]} times')
    print(f'final weights: w1:{w1} || w2: {w2}')


if __name__ == "__main__":
    main()

Episode 1 State ends in A
Episode 2 State ends in A
Episode 3 State ends in A
Episode 4 State ends in A
Episode 5 State ends in A
Episode 6 State ends in E
Episode 7 State ends in A
Episode 8 State ends in A
Episode 9 State ends in A
Episode 10 State ends in A
Episode 11 State ends in A
Episode 12 State ends in A
Episode 13 State ends in A
Episode 14 State ends in E
Episode 15 State ends in A
Episode 16 State ends in E
Episode 17 State ends in A
Episode 18 State ends in A
Episode 19 State ends in A
Episode 20 State ends in A
Episode 21 State ends in A
Episode 22 State ends in A
Episode 23 State ends in A
Episode 24 State ends in A
Episode 25 State ends in A
Episode 26 State ends in A
Episode 27 State ends in A
Episode 28 State ends in A
Episode 29 State ends in A
Episode 30 State ends in A
Episode 31 State ends in A
Episode 32 State ends in A
Episode 33 State ends in A
Episode 34 State ends in A
Episode 35 State ends in A
Episode 36 State ends in A
Episode 37 State ends in E
Episode 38