<a href="https://colab.research.google.com/github/adityamavle/drl-huggingface/blob/master/learning_strategies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Monte Carlo method

In [1]:
import numpy as np

# Define the environment
n_states = 9
n_actions = 4
goal_state = 8

# Initialize state values
V = np.zeros(n_states)

# Define the policy (for simplicity, using a random policy)
policy = np.ones((n_states, n_actions)) / n_actions

# Function to generate an episode
def generate_episode():
    states = []
    actions = []
    rewards = []

    state = 0  # Start from the initial state
    while state != goal_state:
        action = np.random.choice(range(n_actions), p=policy[state])
        next_state = state + (action - 1) * (action % 2)  # Update based on action (left, right, up, down)
        reward = 0 if next_state != goal_state else 1  # +1 reward upon reaching the goal

        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = next_state

    return states, actions, rewards

# Monte Carlo algorithm
num_episodes = 1000
returns_sum = np.zeros(n_states)
returns_count = np.zeros(n_states)

for episode in range(num_episodes):
    states, actions, rewards = generate_episode()
    G = 0
    for t in range(len(states) - 1, -1, -1):
        G = rewards[t] + G
        state = states[t]
        if state not in states[:t]:  # First visit Monte Carlo
            returns_sum[state] += G
            returns_count[state] += 1
            V[state] = returns_sum[state] / returns_count[state]

# Print the learned state values
print("Monte Carlo State Values:")
print(V.reshape(3, 3))


Monte Carlo State Values:
[[1. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]


### Temporal Difference

In [2]:
import numpy as np

# Define the environment
n_states = 9
n_actions = 4
goal_state = 8

# Initialize Q-values
Q = np.zeros((n_states, n_actions))

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Epsilon-greedy exploration parameter

# Function to choose an action based on epsilon-greedy policy
def choose_action(state):
    if np.random.rand() < epsilon:
        return np.random.choice(range(n_actions))
    else:
        return np.argmax(Q[state])

# Q-learning algorithm
num_episodes = 1000
for episode in range(num_episodes):
    state = 0  # Start from the initial state
    while state != goal_state:
        action = choose_action(state)
        next_state = state + (action - 1) * (action % 2)  # Update based on action (left, right, up, down)
        reward = 0 if next_state != goal_state else 1  # +1 reward upon reaching the goal

        # Q-value update
        Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])

        state = next_state

# Print the learned Q-values
print("Temporal Difference (Q-learning) Q-values:")
print(Q.reshape(n_states, n_actions))


Temporal Difference (Q-learning) Q-values:
[[0.62916823 0.58683668 0.56727745 0.729     ]
 [0.         0.         0.         0.        ]
 [0.68078591 0.63935018 0.66795814 0.81      ]
 [0.         0.         0.         0.        ]
 [0.76830704 0.73562645 0.70058101 0.9       ]
 [0.         0.         0.         0.        ]
 [0.80151895 0.85288594 0.87042474 1.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]]
