In [24]:
import numpy as np

states = np.arange(1, 11)
controls = np.array([-1, 1])
costs = np.array([1, 2, 3, 4, 5, 4, 2, 0, 1, 2])

# Revised value iteration function
def find_optimal_policy(states, controls, costs, decay, threshold=1e-4):
    values = np.zeros_like(states)  
    policy = np.zeros_like(states, dtype=int)  
    delta = np.inf
    
    while delta > threshold:
        delta = 0
        for i, state in enumerate(states):
            old_value = values[i]
            temp_values = np.zeros_like(controls)
            for j, control in enumerate(controls):
                next_state = np.clip(state + control, 1, 10)  
                next_index = int(next_state) - 1  
                cost = costs[state-1] + decay * 0.5 * (values[next_index] + values[i])
                temp_values[j] = cost
            
            values[i] = np.min(temp_values)  
            policy[i] = controls[np.argmin(temp_values)]  
            delta = np.max(np.abs(old_value - values[i]))
    
    return values, policy

# Function to simulate the optimal policy
def simulate_policy(policy, starting_state):
    current_state = starting_state
    states_visited = [current_state]
    while True:
        action = policy[current_state - 1]
        next_state = np.clip(current_state + action, 1, 10)
        states_visited.append(next_state)
        if next_state in [10, 1]:
            break
        current_state = next_state
    return states_visited

# Function to calculate the total cost of a policy
def calculate_total_cost(policy, costs, decay, starting_state):
    states_visited = simulate_policy(policy, starting_state)
    total_cost = np.sum([costs[state - 1] * (decay ** i) for i, state in enumerate(states_visited)])
    return total_cost

# Example usage with different discount factors
decay_factors = [0.99, 0.97, 0.9, 0.7, 0.5, 0.1]  
results = {}

for decay_factor in decay_factors:
    values, policy = find_optimal_policy(states, controls, costs, decay_factor)
    total_cost = calculate_total_cost(policy, costs, decay_factor, 1)
    results[decay_factor] = {'Value Function': values, 'Optimal Policy': policy, 'Total Cost': total_cost}

# Print results in a formatted way
print("Results:")
for decay_factor, data in results.items():
    print(f"Discount Factor: {decay_factor}")
    print(f"Optimal Value Function: {data['Value Function']}")
    print(f"Optimal Policy: {data['Optimal Policy']}")
    print(f"Total Cost: {data['Total Cost']}")
    print()


Results:
Discount Factor: 0.99
Optimal Value Function: [ 1  3  6 10 12  7  2  0  1  3]
Optimal Policy: [-1 -1 -1 -1  1  1  1 -1 -1 -1]
Total Cost: 1.99

Discount Factor: 0.97
Optimal Value Function: [ 1  3  6 10 12  7  2  0  1  3]
Optimal Policy: [-1 -1 -1 -1  1  1  1 -1 -1 -1]
Total Cost: 1.97

Discount Factor: 0.9
Optimal Value Function: [ 1  3  6 10 11  7  2  0  1  3]
Optimal Policy: [-1 -1 -1 -1  1  1  1 -1 -1 -1]
Total Cost: 1.9

Discount Factor: 0.7
Optimal Value Function: [1 3 5 8 9 6 2 0 1 3]
Optimal Policy: [-1 -1 -1 -1  1  1  1 -1 -1 -1]
Total Cost: 1.7

Discount Factor: 0.5
Optimal Value Function: [1 2 4 6 7 5 2 0 1 2]
Optimal Policy: [-1 -1 -1 -1 -1  1  1 -1 -1 -1]
Total Cost: 1.5

Discount Factor: 0.1
Optimal Value Function: [1 2 3 4 5 4 2 0 1 2]
Optimal Policy: [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
Total Cost: 1.1

