In [2]:
# `np` is used below; import it explicitly so this cell does not depend on the
# wildcard import (or leftover kernel state) happening to expose it.
import numpy as np

from mcts4py.QValueEstimator import *

# Example usage: build an estimator for a 10-dimensional state and a
# 5-dimensional action, with 50 hidden units.
state_dim = 10
action_dim = 5
hidden_dim = 50
q_estimator = QValueEstimator(state_dim, action_dim, hidden_dim)

# Optional: random re-initialization of the networks (kept disabled).
# def initialize_weights(m):
#     if isinstance(m, nn.Linear):
#         nn.init.uniform_(m.weight, -0.5, 0.5)
#         nn.init.constant_(m.bias, 0)

# q_estimator.state_value_net.apply(initialize_weights)
# q_estimator.q_net.apply(initialize_weights)

# Predict the value of one random state vector of length state_dim.
state = np.random.randn(state_dim)
state_value = q_estimator.get_state_value(state)
print("Predicted state value:", state_value)


Predicted state value: 0.09520085965602287


In [3]:
# Predict state-action value
# Relies on `state`, `action_dim` and `q_estimator` from the earlier cell.
# The sampled action is an integer vector of length action_dim with entries in
# [0, action_dim).
# NOTE(review): confirm this is the action encoding get_q_value expects.
action = np.random.randint(0, action_dim, size=action_dim)
state_action_value = q_estimator.get_q_value(state, action)
print("Predicted state-action value:", state_action_value)

Predicted state-action value: 0.6086512036331215


In [4]:


# Update state-action value
# The target is a length-1 array drawn from a standard normal distribution.
target_state_action_value = np.random.randn(1)
# NOTE(review): the meaning of the trailing literal 70 is not visible from this
# notebook (update_q_value lives in mcts4py) - confirm and give it a name.
q_estimator.update_q_value(state, action, target_state_action_value, 70)


In [5]:
# Draw action_dim standard-normal samples; the bare name on the last line
# displays the resulting array as the cell output.
action_values = np.random.randn(action_dim)
action_values

array([ 0.19907922,  0.57904742, -0.27053891, -0.44541629,  0.51236063])

In [6]:
# Two candidate actions, each a list of five integers.
# NOTE(review): the second candidate contains the value 5 while action_dim is 5;
# confirm whether entries are meant to stay within [0, action_dim).
possible_actions = [[0, 1, 2, 3, 4], [0, 1, 5, 3, 5]]
# Displays whatever get_max_q_value returns for `state` over the candidates.
q_estimator.get_max_q_value(state, possible_actions)

0.7942445121749403

In [7]:
# Display the second candidate action.
possible_actions[1]

[0, 1, 5, 3, 5]

In [8]:
# NOTE(review): the recorded output of this cell is
# "TypeError: 'int' object is not callable". The cause is not visible from the
# notebook; check get_softmax_prob_per_action in mcts4py, and whether it expects
# the full list of candidate actions rather than a single one.
q_estimator.get_softmax_prob_per_action(state, possible_actions[1])

TypeError: 'int' object is not callable

In [None]:
# NOTE(review): in the recorded output every probability is 0.5 (so the values
# do not sum to 1) and the keys differ from the action_values displayed earlier;
# the output looks stale - re-run and confirm the normalisation.
q_estimator.get_softmax_prob_multinom(state, action_values)

{-0.2538941468193494: 0.5,
 -0.10414975209228486: 0.5,
 -1.5548246230013554: 0.5,
 1.4599978532288413: 0.5,
 1.269592037777111: 0.5}

In [None]:
# Feed the mapping returned by get_softmax_prob_multinom straight into
# draw_from_multinomial and display the result.
# NOTE(review): presumably this samples one key of that mapping - confirm in mcts4py.
q_estimator.draw_from_multinomial(q_estimator.get_softmax_prob_multinom(state, action_values))

-1.5548246230013554

In [None]:
import numpy as np

# Example function: y = (x - 3)^2
def function_to_minimize(x):
    """Quadratic test objective: the squared distance of ``x`` from 3."""
    distance_from_minimum = x - 3
    return distance_from_minimum ** 2

# Gradient approximation using finite differences
def approximate_gradient(f, x, epsilon=1e-5):
    """Estimate the derivative of ``f`` at ``x`` with a central finite difference.

    Args:
        f: Callable of one argument returning a number.
        x: Point at which the derivative is estimated.
        epsilon: Half-width of the difference stencil. The default is a fixed
            constant: default values are evaluated once, when the ``def`` runs,
            so referring to a notebook variable that is only assigned later
            raises ``NameError`` on a fresh kernel.

    Returns:
        ``(f(x + epsilon) - f(x - epsilon)) / (2 * epsilon)``.
    """
    return (f(x + epsilon) - f(x - epsilon)) / (2 * epsilon)

def gradient_descent(starting_point, learning_rate, iterations, tolerance=1e-6, decay_rate=0.9, patience=10):
    """Minimise ``function_to_minimize`` by gradient descent with a decaying step.

    The slope is estimated with a forward difference whose probe distance is
    the current learning rate. Progress is printed on every iteration.

    Args:
        starting_point: Initial value of ``x``.
        learning_rate: Initial step size; also used as the finite-difference step.
        iterations: Maximum number of updates.
        tolerance: Stop once the loss changes by less than this between updates.
        decay_rate: Factor applied to the learning rate after every update that
            does not trigger the early stop.
        patience: Accepted for interface compatibility; currently unused, since
            the decay is applied unconditionally.

    Returns:
        The value of ``x`` after the last update.
    """
    x = starting_point
    last_loss = function_to_minimize(x)

    for i in range(iterations):
        # Forward-difference slope, probing one learning-rate step ahead.
        rise = function_to_minimize(x + learning_rate) - function_to_minimize(x)
        slope = rise / learning_rate
        x = x - learning_rate * slope

        current_loss = function_to_minimize(x)
        print(f"Iteration {i+1}: x = {x:.6f}, f(x) = {current_loss:.6f}")

        # Early stop: the loss barely moved since the previous update.
        loss_change = abs(current_loss - last_loss)
        if loss_change < tolerance:
            print(f"Converged after {i+1} iterations.")
            break

        # The step size shrinks after every update, regardless of improvement.
        learning_rate *= decay_rate
        print(f"Decayed learning rate to {learning_rate:.6f}")

        last_loss = current_loss

    return x

# Parameters
# Settings for the demonstration run.
starting_point = 0.0   # where the search starts
learning_rate = 0.1    # initial step size
iterations = 2000      # upper bound on the number of updates
tolerance = 1e-6       # loss-change threshold for early stopping
decay_rate = 0.99      # multiplier applied to the learning rate
patience = 10          # forwarded to gradient_descent

# Execute the search and report where it ended up.
optimal_x = gradient_descent(
    starting_point=starting_point,
    learning_rate=learning_rate,
    iterations=iterations,
    tolerance=tolerance,
    decay_rate=decay_rate,
    patience=patience,
)
print(f"Optimal x: {optimal_x:.6f}")


Iteration 1: x = 0.590000, f(x) = 5.808100
Decayed learning rate to 0.099000
Iteration 2: x = 1.057379, f(x) = 3.773776
Decayed learning rate to 0.098010
Iteration 3: x = 1.428566, f(x) = 2.469406
Decayed learning rate to 0.097030
Iteration 4: x = 1.724103, f(x) = 1.627913
Decayed learning rate to 0.096060
Iteration 5: x = 1.960000, f(x) = 1.081600
Decayed learning rate to 0.095099
Iteration 6: x = 2.148762, f(x) = 0.724606
Decayed learning rate to 0.094148
Iteration 7: x = 2.300183, f(x) = 0.489744
Decayed learning rate to 0.093207
Iteration 8: x = 2.421951, f(x) = 0.334141
Decayed learning rate to 0.092274
Iteration 9: x = 2.520114, f(x) = 0.230290
Decayed learning rate to 0.091352
Iteration 10: x = 2.599446, f(x) = 0.160444
Decayed learning rate to 0.090438
Iteration 11: x = 2.663718, f(x) = 0.113086
Decayed learning rate to 0.089534
Iteration 12: x = 2.715919, f(x) = 0.080702
Decayed learning rate to 0.088638
Iteration 13: x = 2.758423, f(x) = 0.058359
Decayed learning rate to 0.08