In [None]:
# In this chapter we cover Monte Carlo (MC) methods for reinforcement learning.
# MC methods are RL methods that learn value functions and policies from experience by averaging the returns
# observed from each state across sampled episodes.
# Averaging the returns over the episodes converges to the value function of the particular policy being followed.
# The value function obtained from that evaluation (which in MC is called prediction) is then used to improve the policy.
# This is similar to DP, except that there is no full model of the environment. It still follows the generalized policy iteration (GPI) scheme.
# In Monte Carlo methods we compute complete returns over the whole episode, not partial returns.

In [1]:
# Slicing with [1:] returns a new list holding every element from index 1 onward.
my_list = [1, 2, 3]
my_list[1:]

[2, 3]

In [2]:
my_dict = {"hello": "there", "name": "kosi"}

# Iterating over a dict directly yields its keys (not its values).
for i in my_dict:
    print(i)

hello
name


In [None]:
import numpy as np

# compute_episode simulates an entire episode and returns all the states and rewards from that episode
def compute_episode(policy):
    """Simulate one episode under ``policy`` (placeholder implementation).

    The simulation itself is not implemented here: ``policy`` is ignored and
    the episode produced is always empty.

    Returns:
        A ``(states, rewards)`` pair of lists; both are currently empty and
        freshly created on every call.
    """
    states = []
    rewards = []
    return states, rewards

def compute_return(returns, discount):
    """Compute the discounted return of a reward sequence.

    Evaluates ``returns[0] + discount * returns[1] + discount**2 * returns[2] + ...``.

    Args:
        returns: Sequence of rewards, ordered from the current time step to
            the end of the episode.
        discount: Discount factor applied once per time step.

    Returns:
        The discounted sum of the rewards, or ``0`` for an empty sequence.
    """
    # Accumulate from the last reward backwards: G_t = r_t + discount * G_{t+1}.
    # Iterating (instead of recursing on returns[1:]) avoids hitting the
    # recursion limit on long episodes and the list copy made by each slice.
    total = 0
    for reward in reversed(returns):
        total = reward + discount * total
    return total

# My algorithm for first visit monte carlo methods
def first_visit_monte_carlo(num_episodes, policy, discount):
    """Estimate state values with first-visit Monte Carlo prediction.

    For every sampled episode, the discounted return following the first
    occurrence of each state is recorded; later occurrences of the same state
    within that episode are ignored. The value of a state is the mean of its
    recorded returns over all episodes.

    Args:
        num_episodes: Number of episodes to sample (an integer).
        policy: Policy handed to ``compute_episode`` to generate each episode.
        discount: Discount factor handed to ``compute_return``.

    Returns:
        dict mapping each visited state to the mean of its first-visit returns.
        Empty when no state was ever visited.
    """
    state_value_list = dict()
    for _ in range(num_episodes):
        states, returns = compute_episode(policy)
        # A set gives O(1) membership checks; states are already required to
        # be hashable because they are used as dictionary keys below.
        seen_states_this_episode = set()
        for index, (state, _reward) in enumerate(zip(states, returns)):
            # First-visit MC: only the first occurrence in an episode counts.
            if state in seen_states_this_episode:
                continue
            seen_states_this_episode.add(state)
            # The return for this visit is built from the rewards from this
            # time step through the end of the episode.
            state_value_list.setdefault(state, []).append(
                compute_return(returns[index:], discount)
            )

    # At the end of all the episodes, average each state's recorded returns
    # to get its estimated value.
    state_value = dict()
    for state, recorded_returns in state_value_list.items():
        state_value[state] = np.mean(recorded_returns)

    return state_value
            