In [1]:
import random

In [16]:
import random

def simulate_path(initial_position, action_sequence, total_steps=10,
                  num_positions=10, move_prob=0.5):
    """
    Simulate a random walk over 1-indexed positions driven by a per-position policy.

    At every step the action assigned to the current position is looked up.
    With probability ``move_prob`` that action is applied; otherwise the
    system stays where it is. A move that would leave the range
    ``[1, num_positions]`` is ignored, so the position is clamped at both ends.

    Parameters:
    - initial_position: The starting position (1-indexed). It is expected to
      lie in ``[1, num_positions]``; this is not checked.
    - action_sequence: An indexable of actions, one per position; entry
      ``p - 1`` is the action for position ``p``. ``1`` moves forward,
      ``-1`` moves backward, and any other value leaves the position unchanged.
      It should hold at least ``num_positions`` entries.
    - total_steps: The total number of steps to simulate.
    - num_positions: The highest valid position. Defaults to 10, the bound
      that used to be hard-coded.
    - move_prob: Probability that the selected action is actually applied on
      a given step. Defaults to 0.5, the value that used to be hard-coded.

    Returns:
    - A list of ``total_steps + 1`` positions: the initial position followed
      by the position after each step.
    """
    position = initial_position
    path_history = [position]

    for _ in range(total_steps):
        current_action = action_sequence[position - 1]
        # Exactly one random draw per step, regardless of the action, so a
        # seeded run consumes the generator in a predictable way.
        if random.random() < move_prob:
            if current_action == 1 and position < num_positions:
                position += 1
            elif current_action == -1 and position > 1:
                position -= 1

        path_history.append(position)

    return path_history

# Demonstration policy: positions 1-4 step backward, 5-8 step forward,
# and 9-10 step backward again.
example_actions = [-1] * 4 + [1] * 4 + [-1] * 2

# Run one 10-step simulation from every starting position, 1 through 10,
# keyed by the starting position.
resulting_paths = {}
for origin in range(1, 11):
    resulting_paths[origin] = simulate_path(
        initial_position=origin,
        action_sequence=example_actions,
        total_steps=10,
    )

# Report each simulated path.
for start_pos, path in resulting_paths.items():
    print(f"Starting position: {start_pos}, Path: {path}")


Starting position: 1, Path: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Starting position: 2, Path: [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Starting position: 3, Path: [3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Starting position: 4, Path: [4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1]
Starting position: 5, Path: [5, 6, 6, 6, 6, 7, 8, 9, 8, 9, 9]
Starting position: 6, Path: [6, 7, 7, 7, 7, 8, 8, 9, 9, 8, 9]
Starting position: 7, Path: [7, 8, 8, 9, 8, 8, 8, 9, 8, 9, 8]
Starting position: 8, Path: [8, 8, 8, 9, 8, 8, 9, 9, 9, 8, 9]
Starting position: 9, Path: [9, 9, 8, 8, 9, 9, 9, 9, 9, 9, 8]
Starting position: 10, Path: [10, 10, 10, 10, 9, 8, 9, 8, 9, 8, 9]


In [17]:
import numpy as np

def train_q_learning(num_states=10, num_actions=2, max_episodes=1000, lr=0.1, discount=0.9, explore_prob=0.1,
                     rewards=None, terminate_prob=0.001):
    """
    Train tabular Q-learning on a one-dimensional chain of states.

    Each episode starts in a uniformly random state. At every step an
    epsilon-greedy action is chosen, the agent moves one state left or right
    (clamped to the ends of the chain), and the Q-value is updated from the
    reward of the state it lands in. An episode ends at random: after every
    step it stops with probability ``terminate_prob``.

    Parameters:
    - num_states: Total number of states in the chain.
    - num_actions: Total number of actions available. Only action index 1
      moves to the next-higher state; every other index moves to the
      next-lower state.
    - max_episodes: Number of episodes to train the agent.
    - lr: Learning rate for updating Q-values.
    - discount: Discount factor for future rewards.
    - explore_prob: Probability of taking a random action (exploration).
    - rewards: Optional 1-D sequence with the reward for entering each state;
      it must have at least ``num_states`` entries. Defaults to the built-in
      10-state profile ``-[1, 2, 3, 4, 5, 4, 2, 0, 1, 2]``.
    - terminate_prob: Probability of ending the episode after each step; must
      lie in ``(0, 1]``. Defaults to 0.001.

    Returns:
    - Q_table: Array of shape ``(num_states, num_actions)`` with the learned
      Q-values for each state-action pair.

    Raises:
    - ValueError: If ``rewards`` is not one-dimensional or has fewer than
      ``num_states`` entries, or if ``terminate_prob`` is outside ``(0, 1]``.
    """
    if rewards is None:
        # Built-in reward profile (negated costs) for the default 10 states.
        rewards = -np.array([1, 2, 3, 4, 5, 4, 2, 0, 1, 2])
    reward_scheme = np.asarray(rewards)

    # Fail fast: a short reward vector would otherwise surface as an
    # IndexError only once the agent happens to reach an uncovered state.
    if reward_scheme.ndim != 1 or reward_scheme.shape[0] < num_states:
        raise ValueError(
            f"rewards must be a 1-D sequence with at least {num_states} entries, "
            f"got shape {reward_scheme.shape}"
        )
    # A probability of zero (or less) would make every episode run forever.
    if not 0 < terminate_prob <= 1:
        raise ValueError(f"terminate_prob must be in (0, 1], got {terminate_prob}")

    # Initialize the Q-table with zeros
    Q_table = np.zeros((num_states, num_actions))
    last_state = num_states - 1

    for _ in range(max_episodes):
        # Start from a random initial state for each episode
        current_state = np.random.randint(0, num_states)

        while True:
            # Epsilon-greedy action selection
            if np.random.rand() < explore_prob:
                chosen_action = np.random.choice(num_actions)  # Random action (exploration)
            else:
                chosen_action = np.argmax(Q_table[current_state])  # Best action (exploitation)

            # Action 1 steps right; anything else steps left
            transition = 1 if chosen_action == 1 else -1

            # Clamp to the chain: stepping off either end leaves the agent in place
            next_state = min(max(current_state + transition, 0), last_state)

            # The reward is determined by the state being entered
            reward = reward_scheme[next_state]

            # Standard Q-learning update towards reward + discounted best next value
            best_future_q = np.max(Q_table[next_state])
            Q_table[current_state, chosen_action] += lr * (
                reward + discount * best_future_q - Q_table[current_state, chosen_action]
            )

            # Move to the next state
            current_state = next_state

            # Episodes have no terminal state; end them at random instead
            if np.random.rand() < terminate_prob:
                break

    return Q_table

# Train with the default hyperparameters. No random seed is set, so the
# learned values differ from run to run.
learned_Q_values = train_q_learning()

# Print the learned table: one row per state, one column per action
# (column 0 = step to the lower state, column 1 = step to the higher state).
print(learned_Q_values)


[[-10.         -11.        ]
 [-10.         -12.9       ]
 [-11.         -15.48441356]
 [-12.89999391 -13.15975991]
 [-12.26200088  -9.63684206]
 [-13.63093391  -6.26315789]
 [ -9.63684211  -4.73684211]
 [ -6.26315789  -5.26315789]
 [ -4.73684211  -6.73684211]
 [ -5.26315789  -6.73684211]]
