# Dataset Generation


In [1]:
import random
import numpy as np
import json

# Number of demonstrations to generate (passed to generate_random_dataset below).
DATASET_SIZE = 1
# Output path; save_to_file writes one JSON-encoded trajectory per line.
OUTPUT_FILE = "demonstrations_with_states.txt"

# --- Feature Definitions ---
# The order of these lists fixes the layout of the state vector built by
# StateTracker: one slot per (item, location) pair in ITEMS x LOCATIONS order,
# then one "cut" flag per entry of CUTTABLES, then a single stove flag.
# We track location (A,B,C,D) for these items:
ITEMS = ["pot", "pan", "plate", "tomato", "meat", "onion", "mushroom", "lettuce", "egg"]
# We track "is_cut" status for these items:
CUTTABLES = ["tomato", "onion", "mushroom", "lettuce"]
LOCATIONS = ["A", "B", "C", "D"]

class StateTracker:
    """Maintains the kitchen world as a flat integer feature vector.

    Vector layout (fixed by the module-level ITEMS, LOCATIONS and CUTTABLES):
      * one slot per "<item>_<location>" pair, items outermost;
      * one slot per "<item>_cut" flag;
      * a final "stove_on" slot.
    """

    def __init__(self):
        # Budget of state vectors that get_state_vector() still echoes to stdout.
        self.num = 20

        # Feature names listed in vector order; a name's position is its index.
        names = [f"{item}_{loc}" for item in ITEMS for loc in LOCATIONS]
        names.extend(f"{item}_cut" for item in CUTTABLES)
        names.append("stove_on")

        self.feature_map = {name: position for position, name in enumerate(names)}
        self.n_features = len(names)
        self.reset()

        print("Feature_map: ", self.feature_map)

    def reset(self):
        """Restore the initial world: every item at station A, nothing cut, stove off."""
        self.current_state = np.zeros(self.n_features, dtype=int)
        for item in ITEMS:
            self.set_feature(f"{item}_A", 1)

    def set_feature(self, key, value):
        """Write `value` into the slot named `key`; unknown names are ignored."""
        position = self.feature_map.get(key)
        if position is not None:
            self.current_state[position] = value

    def get_feature(self, key):
        """Return the value stored under `key`, or 0 when the name is unknown."""
        position = self.feature_map.get(key)
        if position is None:
            return 0
        return self.current_state[position]

    def get_state_vector(self):
        """Return a copy of the current state, printing it while the debug budget lasts."""
        if self.num != 0:
            print(self.current_state)
            self.num -= 1
        return self.current_state.copy()

    @staticmethod
    def _arguments(action_str):
        """Split the text between the first "(" and the first ")" on whitespace."""
        opening = action_str.find("(")
        closing = action_str.find(")")
        return action_str[opening + 1:closing].split()

    def apply_action(self, action_str):
        """Update the state vector according to one action string.

        Recognised forms: "move (item origin to dest)", "cut (item loc)",
        "turn_on (...)" and "turn_off (...)". Any other string leaves the
        state untouched.
        """
        if action_str.startswith("move"):
            # e.g. "pot A to C" -> ["pot", "A", "to", "C"]
            args = self._arguments(action_str)
            item, origin, dest = args[0], args[1], args[3]
            self.set_feature(f"{item}_{origin}", 0)
            self.set_feature(f"{item}_{dest}", 1)
            return

        if action_str.startswith("cut"):
            # Only the item matters; the location argument is not used.
            item = self._arguments(action_str)[0]
            self.set_feature(f"{item}_cut", 1)
            return

        if action_str.startswith("turn_on"):
            self.set_feature("stove_on", 1)
        elif action_str.startswith("turn_off"):
            self.set_feature("stove_on", 0)

class RecipeGenerator:
    """Produces (state, action) demonstration trajectories for scripted recipes."""

    def __init__(self):
        self.demos = []
        self.tracker = StateTracker()

    def _record_trajectory(self, actions):
        """Replay `actions` from a fresh state and store the resulting trajectory.

        Each step pairs the state observed *before* an action with that action;
        the state reached after the last action is paired with "stop".
        """
        tracker = self.tracker
        tracker.reset()

        steps = []
        for action in actions:
            # Snapshot first, then advance the world for the next step.
            steps.append({"state": tracker.get_state_vector().tolist(), "action": action})
            tracker.apply_action(action)

        steps.append({"state": tracker.get_state_vector().tolist(), "action": "stop"})
        self.demos.append(steps)

    # --- Recipe Definitions ---
    def generate_tomato_soup(self):
        return [
            "move (pot A to C)",
            "move (tomato A to B)",
            "cut (tomato B)",
            "move (tomato B to C)",
            "turn_on (stove C)",
            "move (plate A to D)",
            "move (pot C to D)",
        ]

    def generate_grilled_steak(self):
        return [
            "move (pan A to C)",
            "move (meat A to C)",
            "turn_on (stove C)",
            "move (plate A to B)",
            "turn_off (stove C)",
            "move (meat C to B)",
            "move (plate B to D)",
        ]

    def generate_mushroom_stew(self):
        return [
            "move (pot A to C)",
            "move (mushroom A to B)",
            "cut (mushroom B)",
            "move (mushroom B to C)",
            "move (onion A to B)",
            "cut (onion B)",
            "move (onion B to C)",
            "turn_on (stove C)",
            "move (plate A to D)",
            "move (pot C to D)",
        ]

    def generate_salad(self):
        return [
            "move (lettuce A to B)",
            "cut (lettuce B)",
            "move (onion A to B)",
            "cut (onion B)",
            "move (plate A to B)",
            "move (plate B to D)",
        ]

    def generate_burger(self):
        return [
            "move (pan A to C)",
            "move (meat A to C)",
            "turn_on (stove C)",
            "move (lettuce A to B)",
            "cut (lettuce B)",
            "turn_off (stove C)",
            "move (meat C to B)",
            "move (plate A to B)",
            "move (plate B to D)",
        ]

    def generate_boiled_eggs(self):
        return [
            "move (pot A to C)",
            "move (egg A to C)",
            "turn_on (stove C)",
            "turn_off (stove C)",
            "move (plate A to C)",
            "move (plate C to D)",
        ]

    def generate_tomato_onion_soup_1(self):
        # Tomato is prepared before the onion.
        return [
            "move (pot A to C)",
            "move (tomato A to B)",
            "cut (tomato B)",
            "move (tomato B to C)",
            "move (onion A to B)",
            "cut (onion B)",
            "move (onion B to C)",
            "turn_on (stove C)",
            "move (plate A to D)",
            "move (pot C to D)",
        ]

    def generate_tomato_onion_soup_2(self):
        # Same dish as variant 1, but the onion is prepared before the tomato.
        return [
            "move (pot A to C)",
            "move (onion A to B)",
            "cut (onion B)",
            "move (onion B to C)",
            "move (tomato A to B)",
            "cut (tomato B)",
            "move (tomato B to C)",
            "turn_on (stove C)",
            "move (plate A to D)",
            "move (pot C to D)",
        ]

    def generate_random_dataset(self, count):
        """Record `count` trajectories, each from a recipe drawn at random from the pool."""
        # Wider pool, currently disabled:
        # available_recipes = [self.generate_tomato_soup, self.generate_grilled_steak, self.generate_mushroom_stew, self.generate_salad, self.generate_burger, self.generate_boiled_eggs]
        available_recipes = [self.generate_tomato_onion_soup_1]

        print(f"Generating {count} state-aware demonstrations...")

        for _ in range(count):
            chosen_recipe = random.choice(available_recipes)
            self._record_trajectory(chosen_recipe())

    def save_to_file(self):
        """Write every recorded trajectory to OUTPUT_FILE, one JSON document per line."""
        # JSON Lines keeps the nested state lists unambiguous to parse back.
        with open(OUTPUT_FILE, "w") as f:
            f.writelines(json.dumps(demo) + "\n" for demo in self.demos)
        print(f"Done! Saved {len(self.demos)} trajectories to {OUTPUT_FILE}")
        print(f"State Vector Size: {self.tracker.n_features}")

if __name__ == "__main__":
    # Constructing the generator also builds its StateTracker, which prints
    # the feature map. Then record DATASET_SIZE trajectories and write them
    # to OUTPUT_FILE.
    gen = RecipeGenerator()
    gen.generate_random_dataset(DATASET_SIZE)
    gen.save_to_file()

Feature_map:  {'pot_A': 0, 'pot_B': 1, 'pot_C': 2, 'pot_D': 3, 'pan_A': 4, 'pan_B': 5, 'pan_C': 6, 'pan_D': 7, 'plate_A': 8, 'plate_B': 9, 'plate_C': 10, 'plate_D': 11, 'tomato_A': 12, 'tomato_B': 13, 'tomato_C': 14, 'tomato_D': 15, 'meat_A': 16, 'meat_B': 17, 'meat_C': 18, 'meat_D': 19, 'onion_A': 20, 'onion_B': 21, 'onion_C': 22, 'onion_D': 23, 'mushroom_A': 24, 'mushroom_B': 25, 'mushroom_C': 26, 'mushroom_D': 27, 'lettuce_A': 28, 'lettuce_B': 29, 'lettuce_C': 30, 'lettuce_D': 31, 'egg_A': 32, 'egg_B': 33, 'egg_C': 34, 'egg_D': 35, 'tomato_cut': 36, 'onion_cut': 37, 'mushroom_cut': 38, 'lettuce_cut': 39, 'stove_on': 40}
Generating 1 state-aware demonstrations...
[1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0
 0 0 0 0]
[0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0
 0 0 0 0]
[0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0
 0 0 0 0]
[0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 

# IRL

In [2]:
import numpy as np
import json

def load_demonstrations(filepath):
    """
    Load demonstrations from a JSON Lines file.

    Each non-blank line must hold one JSON array describing a trajectory;
    every element of that array is an object with a 'state' list and an
    'action' string. Blank lines are skipped, so an empty file yields
    ([], []) instead of a JSON decoding error.

    Args:
        filepath: path of the demonstrations file

    Returns:
        demonstrations: list of trajectories, each a list of (state, action)
            pairs where state is a tuple (hashable, usable as a dict key)
        unique_actions: sorted list of the distinct action strings seen

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON
        KeyError: if a step lacks the 'state' or 'action' key
    """
    demonstrations = []
    all_actions = set()

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank separators and trailing newlines.
                continue

            # Convert each state list to a tuple so it can be hashed later.
            trajectory = [(tuple(step['state']), step['action'])
                          for step in json.loads(line)]
            all_actions.update(action for _, action in trajectory)
            demonstrations.append(trajectory)

    # Sorted so that action indices derived from this list are reproducible.
    unique_actions = sorted(all_actions)

    return demonstrations, unique_actions


def create_state_action_mappings(demonstrations, unique_actions):
    """
    Build index lookups for every state and action in the demonstrations.

    States are numbered in sorted order of their tuples; actions are numbered
    by their position in `unique_actions`. All four mappings are also printed.

    Returns:
        state_to_idx: dict mapping state tuples to integer indices
        idx_to_state: dict mapping integer indices to state tuples
        action_to_idx: dict mapping action strings to integer indices
        idx_to_action: dict mapping integer indices to action strings
    """
    # Distinct states across all trajectories, in a deterministic order.
    ordered_states = sorted({state for trajectory in demonstrations
                             for state, _ in trajectory})

    state_to_idx = dict(zip(ordered_states, range(len(ordered_states))))
    idx_to_state = dict(enumerate(ordered_states))

    action_to_idx = dict(zip(unique_actions, range(len(unique_actions))))
    idx_to_action = {position: name for name, position in action_to_idx.items()}

    for label, mapping in (("state_to_idx:", state_to_idx),
                           ("idx_to_state:", idx_to_state),
                           ("action_to_idx:", action_to_idx),
                           ("idx_to_action:", idx_to_action)):
        print(label, mapping)

    return state_to_idx, idx_to_state, action_to_idx, idx_to_action


# def create_feature_matrix(idx_to_state, state_dim):
#     """
#     Create feature matrix from states.
#     Uses the state vector directly as features.
    
#     Returns:
#         feature_matrix: 2D array of shape (n_states, n_features)
#     """
#     n_states = len(idx_to_state)
    
#     # Use state vector as features directly
#     feature_matrix = np.zeros((n_states, state_dim))
    
#     for idx in range(n_states):
#         state = idx_to_state[idx]
#         feature_matrix[idx] = np.array(state)
    
#     print("feature_matrix:", feature_matrix)
#     np.savetxt("feature_matrix.txt", feature_matrix, fmt="%.3f")
#     ## The feature matrix is just all the state vectors stacked up.
#     return feature_matrix

def create_feature_matrix(idx_to_state):
    """
    Create task-discriminative features for kitchen environment.

    The index layout mirrors the feature map built by the dataset generator
    (StateTracker), i.e. items in the order pot, pan, plate, tomato, meat,
    onion, mushroom, lettuce, egg and cut flags in the order tomato, onion,
    mushroom, lettuce.

    State vector layout (41 dims):
    - Indices 0-35: Object locations (9 objects x 4 locations A-D)
      - pot: 0-3, pan: 4-7, plate: 8-11
      - tomato: 12-15, meat: 16-19, onion: 20-23
      - mushroom: 24-27, lettuce: 28-31, egg: 32-35
    - Indices 36-39: Cut status (tomato, onion, mushroom, lettuce)
    - Index 40: Stove on/off

    Args:
        idx_to_state: dict mapping indices 0..n_states-1 to state tuples

    Returns:
        2D array of shape (n_states, 18); row i holds the features of state i
    """
    # Keep these in sync with ITEMS / CUTTABLES / LOCATIONS of the generator.
    item_order = ("pot", "pan", "plate", "tomato", "meat",
                  "onion", "mushroom", "lettuce", "egg")
    cuttable_order = ("tomato", "onion", "mushroom", "lettuce")
    location_order = ("A", "B", "C", "D")

    n_items = len(item_order)
    n_locations = len(location_order)
    cut_start = n_items * n_locations          # 36
    stove_idx = cut_start + len(cuttable_order)  # 40

    def at(item, loc):
        """Index of the '<item>_<loc>' slot."""
        return item_order.index(item) * n_locations + location_order.index(loc)

    def cut(item):
        """Index of the '<item>_cut' slot."""
        return cut_start + cuttable_order.index(item)

    # Resolve every slot once, by name, instead of hard-coding numbers.
    pot_c, pan_c = at("pot", "C"), at("pan", "C")
    egg_c, tomato_c, meat_c = at("egg", "C"), at("tomato", "C"), at("meat", "C")
    lettuce_b, onion_b, tomato_b = at("lettuce", "B"), at("onion", "B"), at("tomato", "B")
    plate_d = at("plate", "D")
    tomato_cut_idx, lettuce_cut_idx, onion_cut_idx = cut("tomato"), cut("lettuce"), cut("onion")

    n_states = len(idx_to_state)
    features = []

    for idx in range(n_states):
        state = np.array(idx_to_state[idx])

        # Features 0-3: number of items at each of the locations A, B, C, D
        location_grid = state[:cut_start].reshape(n_items, n_locations)
        feat = list((location_grid == 1).sum(axis=0))

        feat.append(np.sum(state[cut_start:stove_idx]))  # Feature 4: total cut items
        feat.append(state[stove_idx])                    # Feature 5: stove on

        # Feature 6: Pot at stove C
        pot_at_C = state[pot_c]
        feat.append(pot_at_C)

        # Feature 7: Pan at stove C
        pan_at_C = state[pan_c]
        feat.append(pan_at_C)

        # Feature 8: Egg at stove C
        egg_at_C = state[egg_c]
        feat.append(egg_at_C)

        # Feature 9: Tomato cut, at C, and the pot is at C too
        tomato_at_C = state[tomato_c]
        feat.append(state[tomato_cut_idx] * tomato_at_C * pot_at_C)

        # Feature 10: Meat at C together with the pan
        meat_at_C = state[meat_c]
        feat.append(meat_at_C * pan_at_C)

        # Feature 11: Lettuce cut
        feat.append(state[lettuce_cut_idx])

        # Feature 12: Onion cut
        feat.append(state[onion_cut_idx])

        # Feature 13: Lettuce/onion currently at cutting board B
        feat.append(state[lettuce_b] + state[onion_b])

        # Feature 14: Tomato at cutting board B
        feat.append(state[tomato_b])

        # Feature 15: Stove on with a pot or pan at C
        feat.append(state[stove_idx] * (pot_at_C + pan_at_C))

        # Feature 16: Plate at serving location D
        feat.append(state[plate_d])

        # Feature 17: Stove currently off while a container and an ingredient
        # are at C. The state has no history, so this cannot tell "finished
        # cooking" apart from "not started yet".
        feat.append((1 - state[stove_idx]) * (pot_at_C + pan_at_C)
                    * (egg_at_C + tomato_at_C + meat_at_C))

        features.append(feat)

    return np.array(features)
# def create_feature_matrix(idx_to_state):
#     """
#     Create more meaningful features instead of raw state vectors.
#     """
#     n_states = len(idx_to_state)
    
#     # Design better features, e.g.:
#     # - Number of items at each location
#     # - Number of items cut
#     # - Whether stove is on
#     # - Items at correct goal locations
#     features = []

#     for idx in range(n_states):
#         state = np.array(idx_to_state[idx])
#         state_features = []
        
#         # Original 7 features
#         for loc in range(4):
#             items_at_loc = 0
#             for item_type in range(9):
#                 if state[item_type*4 + loc] == 1:
#                     items_at_loc += 1
#             state_features.append(items_at_loc)
        
#         state_features.append(np.sum(state[36:40]))  # num_cut
#         state_features.append(state[40])              # stove_on
        
#         # Feature 8: Cut items at stove (ready to cook)
#         cut_at_stove = 0
#         if state[10] == 1:  # pot at C
#             for cut_idx in range(36, 40):
#                 cut_at_stove += state[cut_idx]
#         state_features.append(cut_at_stove)
        
#         # Feature 9: Cookable container at stove
#         cookable_at_C = state[2] + state[6]  # pot_C + pan_C
#         state_features.append(cookable_at_C)
        
#         # Feature 10: Stove active WITH cookable
#         productive_cooking = state[40] * cookable_at_C
#         state_features.append(productive_cooking)
        
#         # Feature 11: Plate ready for serving
#         plate_at_D = state[11]  # plate_D
#         state_features.append(plate_at_D)
        
#         # Feature 12: Items at cutting board AND not yet cut
#         items_at_B = sum([state[1], state[5], state[9]])  # pot, pan, plate at B
#         uncut = 4 - np.sum(state[36:40])
#         cutting_needed = items_at_B * (uncut > 0)
#         state_features.append(cutting_needed)
        
#         features.append(state_features)
    
#     return np.array(features)  # Now (n_states, 13) features


def max_entropy_inverse_rl_kitchen(demonstrations, state_to_idx, action_to_idx, feature_matrix, 
                                   temperature=2.0, gamma=0.9, n_iterations=100, learning_rate=0.05):
    """
    Maximum Entropy Inverse RL adapted for kitchen task demonstrations.

    A linear reward r(s) = feature_matrix[s] @ w is fitted so that the
    discounted feature counts of a soft-max policy match those of the
    demonstrations. Only (state, action) pairs and transitions that occur in
    the demonstrations are considered; each iteration runs soft value
    iteration, derives a soft-max policy and estimates its feature counts
    from sampled rollouts (uses np.random, so results are stochastic unless
    every state has a single observed action).

    Args:
        demonstrations: non-empty list of trajectories of (state, action) pairs
        state_to_idx: dict mapping state tuples to row indices of feature_matrix
        action_to_idx: dict mapping action strings to integer indices
        feature_matrix: array of shape (n_states, n_features)
        temperature: soft-max temperature for values and policy
        gamma: discount factor
        n_iterations: maximum number of weight updates
        learning_rate: base step size

    Returns:
        reward_weights: the weights with the smallest observed gradient norm
        recovered_rewards: feature_matrix @ reward_weights, one reward per state

    Raises:
        ValueError: if `demonstrations` is empty
    """
    if len(demonstrations) == 0:
        raise ValueError("At least one demonstration is required for IRL")

    n_states = len(state_to_idx)
    n_features = feature_matrix.shape[1]
    
    # Initialize reward weights
    reward_weights = np.zeros(n_features)
    
    # Compute empirical feature expectations from expert demonstrations
    empirical_feature_expectations = np.zeros(n_features)
    for trajectory in demonstrations:
        # Discounted state visitation for this trajectory
        discounted_visitation = np.zeros(n_states)
        for t, (state, _) in enumerate(trajectory):
            discounted_visitation[state_to_idx[state]] += (gamma ** t)
        empirical_feature_expectations += discounted_visitation @ feature_matrix
    empirical_feature_expectations /= len(demonstrations)
        
    # Build transition model from demonstrations
    # For each (s, a) pair, track which s' it leads to
    transition_model = {}  # (s_idx, a_idx) -> s_next_idx
    state_action_pairs = set()
    
    for trajectory in demonstrations:
        for i, (state, action) in enumerate(trajectory):
            pair = (state_to_idx[state], action_to_idx[action])
            state_action_pairs.add(pair)
            if i < len(trajectory) - 1:
                transition_model[pair] = state_to_idx[trajectory[i + 1][0]]

    # Observed actions per state, ascending by action index, computed once
    # instead of scanning every action index inside the loops below.
    actions_by_state = {}
    for s_idx, a_idx in sorted(state_action_pairs):
        actions_by_state.setdefault(s_idx, []).append(a_idx)

    # Rollouts must be able to run as long as the longest demonstration;
    # otherwise the tail of the expert feature counts can never be matched.
    # 20 is kept as the lower bound used previously.
    rollout_horizon = max(20, max(len(trajectory) for trajectory in demonstrations))
    
    print(f"\nFound {len(state_action_pairs)} unique (state, action) pairs")
    print(f"Empirical feature expectation norm: {np.linalg.norm(empirical_feature_expectations):.4f}")
    
    # Gradient ascent with momentum and an adaptive learning rate
    best_diff = float('inf')
    patience = 0
    best_weights = reward_weights.copy()
    momentum = np.zeros(n_features)
    momentum_beta = 0.9
    lr_scale = 1.0  # halved on every reset; persists across iterations
    
    for iteration in range(n_iterations):
        # Adaptive learning rate - decay while no improvement is seen
        current_lr = learning_rate * lr_scale * (0.95 ** (patience // 5))
        # Compute rewards for each state
        rewards = feature_matrix @ reward_weights
        
        # Soft value iteration restricted to the observed transitions
        q_values = {}  # (s_idx, a_idx) -> Q-value
        values = rewards.copy()
        
        for vi_iter in range(30):
            new_values = rewards.copy()
            
            for pair in state_action_pairs:
                if pair in transition_model:
                    q_values[pair] = rewards[pair[0]] + gamma * values[transition_model[pair]]
                else:
                    q_values[pair] = rewards[pair[0]]  # Terminal action
            
            # Soft-max backup (log-sum-exp, shifted by the max for stability)
            for s_idx, actions in actions_by_state.items():
                available_qs = np.array([q_values[(s_idx, a_idx)] for a_idx in actions])
                max_q = np.max(available_qs)
                normalized_qs = (available_qs - max_q) / temperature
                new_values[s_idx] = max_q + temperature * np.log(np.sum(np.exp(normalized_qs)))
            
            if np.max(np.abs(new_values - values)) < 1e-6:
                break
            values = new_values
        
        # Soft policy from Q-values: s_idx -> (action indices, probabilities)
        policy_by_state = {}
        for s_idx, actions in actions_by_state.items():
            state_qs = np.array([q_values[(s_idx, a_idx)] for a_idx in actions])
            exp_qs = np.exp((state_qs - np.max(state_qs)) / temperature)
            probs = exp_qs / np.sum(exp_qs)
            policy_by_state[s_idx] = (actions, probs / np.sum(probs))
        
        # Monte Carlo estimate of the policy's discounted feature counts
        expected_feature_counts = np.zeros(n_features)
        n_samples = 100  # Number of trajectory samples
        
        for _ in range(n_samples):
            # Sample a starting state from demonstrations
            start_traj = demonstrations[np.random.randint(len(demonstrations))]
            s_idx = state_to_idx[start_traj[0][0]]
            
            traj_features = np.zeros(n_features)
            for t in range(rollout_horizon):
                # Add current state features
                traj_features += (gamma ** t) * feature_matrix[s_idx]
                
                if s_idx not in policy_by_state:
                    break
                actions, probs = policy_by_state[s_idx]
                a_idx = np.random.choice(actions, p=probs)
                
                # Follow the observed transition; terminal actions have none
                s_next = transition_model.get((s_idx, a_idx))
                if s_next is None:
                    break
                s_idx = s_next
            expected_feature_counts += traj_features
        expected_feature_counts /= n_samples
        
        # Compute gradient
        gradient = empirical_feature_expectations - expected_feature_counts
        grad_norm = np.linalg.norm(gradient)

        # Track the best solution BEFORE stepping: grad_norm was measured at
        # the current weights, so those are the weights to remember.
        if grad_norm < best_diff:
            best_diff = grad_norm
            best_weights = reward_weights.copy()
            patience = 0
        else:
            patience += 1

        # Momentum update
        momentum = momentum_beta * momentum + (1 - momentum_beta) * gradient
        reward_weights += current_lr * momentum

        # Reset to best if there has been no improvement for too long
        if patience > 15:
            reward_weights = best_weights.copy()
            momentum = np.zeros(n_features)
            patience = 0
            lr_scale *= 0.5
            current_lr = learning_rate * lr_scale
            print(f"  Reset to best solution, reducing lr to {current_lr:.4f}")
        
        if (iteration + 1) % 10 == 0:
            print(f"Iteration {iteration + 1}: Gradient norm = {grad_norm:.6f}, "
                  f"Reward range = [{np.min(rewards):.2f}, {np.max(rewards):.2f}], "
                  f"Best = {best_diff:.6f}, LR = {current_lr:.4f}")
        # Early stopping
        if grad_norm < 0.1:
            print(f"Converged at iteration {iteration + 1}")
            break
    # Use best weights found
    reward_weights = best_weights
    recovered_rewards = feature_matrix @ reward_weights
    print(f"\nFinal best gradient norm: {best_diff:.6f}")
    return reward_weights, recovered_rewards


def predict_action(current_state, reward_weights, feature_matrix, state_to_idx, 
                   action_to_idx, idx_to_action, transition_model, 
                   temperature=2.0, gamma=0.9):
    """
    Pick the most likely next action for a state under the learned reward.

    Only actions with a recorded transition out of `current_state` are
    candidates. Each is scored with Q(s,a) = r(s) + gamma * r(s'), where
    V(s') is approximated by r(s'), and the scores are turned into a
    soft-max distribution.

    Args:
        current_state: tuple representing the current state
        reward_weights: learned reward weights from IRL
        feature_matrix: feature matrix for all states
        state_to_idx: mapping from state to index
        action_to_idx: mapping from action string to index
        idx_to_action: mapping from action index to string
        transition_model: dictionary (s_idx, a_idx) -> s_next_idx
        temperature: softmax temperature for policy
        gamma: discount factor

    Returns:
        predicted_action: the highest-probability action string, or None when
            the state is unknown or has no recorded outgoing transition
        action_probs: dict of {action_string: probability} over the candidates
            (empty when predicted_action is None)
    """
    if current_state not in state_to_idx:
        print(f"Warning: Unknown state! Not seen in demonstrations.")
        return None, {}

    here = state_to_idx[current_state]
    state_rewards = feature_matrix @ reward_weights

    # Candidate actions, in ascending index order so ties resolve to the lowest index.
    candidates = [a_idx for a_idx in range(len(action_to_idx))
                  if (here, a_idx) in transition_model]
    if not candidates:
        print(f"Warning: No valid actions from this state!")
        return None, {}

    scores = np.array([
        state_rewards[here] + gamma * state_rewards[transition_model[(here, a_idx)]]
        for a_idx in candidates
    ])

    # Soft-max over the scores, shifted by the maximum for numerical stability.
    unnormalised = np.exp((scores - np.max(scores)) / temperature)
    probabilities = unnormalised / np.sum(unnormalised)

    action_probs = {}
    for a_idx, prob in zip(candidates, probabilities):
        action_probs[idx_to_action[a_idx]] = prob

    winner = candidates[np.argmax(probabilities)]
    return idx_to_action[winner], action_probs


def evaluate_policy(demonstrations, reward_weights, feature_matrix, 
                   state_to_idx, action_to_idx, idx_to_action, 
                   transition_model, temperature=2.0, gamma=0.9):
    """
    Measure how often predict_action agrees with the expert demonstrations.

    The last step of every trajectory is skipped. Steps for which
    predict_action returns None are not counted at all.

    Returns:
        accuracy: fraction of counted steps predicted correctly (0 if none were counted)
        action_results: dict mapping each expert action to
            {"correct": count, "total": count}
    """
    action_results = {}
    n_scored = 0
    n_hits = 0

    for trajectory in demonstrations:
        # Drop the final step: it has no successor in the trajectory.
        for current_state, expert_action in trajectory[:-1]:
            predicted_action, _ = predict_action(
                current_state, reward_weights, feature_matrix,
                state_to_idx, action_to_idx, idx_to_action,
                transition_model, temperature, gamma)

            if predicted_action is None:
                continue

            n_scored += 1
            tally = action_results.setdefault(expert_action, {"correct": 0, "total": 0})
            tally["total"] += 1
            if predicted_action == expert_action:
                n_hits += 1
                tally["correct"] += 1

    accuracy = n_hits / n_scored if n_scored > 0 else 0
    return accuracy, action_results


def test_predictions(demonstrations, reward_weights, feature_matrix,
                    state_to_idx, idx_to_state, action_to_idx, idx_to_action,
                    transition_model, n_examples=5, temperature=2.0, gamma=0.9):
    """
    Print predicted vs. expert action for the middle step of up to
    `n_examples` trajectories (trajectories with two or fewer steps are skipped).
    """
    print("\n=== Example Predictions ===")
    shown = 0

    for number, trajectory in enumerate(demonstrations, start=1):
        if shown >= n_examples:
            break
        if len(trajectory) <= 2:
            continue

        # Deterministically inspect the step halfway through the trajectory.
        middle = len(trajectory) // 2
        current_state, expert_action = trajectory[middle]
        print(f"\nTrajectory {number}, Step {middle + 1}:")
        print(f"  Expert action: {expert_action}")

        predicted_action, action_probs = predict_action(
            current_state, reward_weights, feature_matrix,
            state_to_idx, action_to_idx, idx_to_action,
            transition_model, temperature, gamma)
        agrees = predicted_action == expert_action
        print(f"  Predicted action: {predicted_action}")
        print(f"  Match: {'✓' if agrees else '✗'}")

        # Three most probable actions, flagging the expert's choice.
        ranked = sorted(action_probs.items(), key=lambda x: x[1], reverse=True)
        print(f"  Top 3 action probabilities:")
        for action, prob in ranked[:3]:
            marker = "←" if action == expert_action else " "
            print(f"    {action}: {prob:.4f} {marker}")

        shown += 1


# Main execution
if __name__ == "__main__":
    # End-to-end IRL run: load demonstrations, index states/actions, build the
    # transition model and features, fit reward weights, print summary stats.
    # The names bound here are reused by the cells that follow.

    # Load demonstrations
    demonstrations, unique_actions = load_demonstrations('demonstrations_with_states.txt')
    
    print(f"Loaded {len(demonstrations)} demonstrations")
    print(f"Number of unique actions: {len(unique_actions)}")
    print(f"Actions: {unique_actions[:5]}...")  # Show first 5
    
    # Create mappings
    state_to_idx, idx_to_state, action_to_idx, idx_to_action = \
        create_state_action_mappings(demonstrations, unique_actions)
    
    print(f"Number of unique states: {len(state_to_idx)}")

    
    # Build transition model for predictions
    # (same construction as inside max_entropy_inverse_rl_kitchen, which keeps
    # its own copy private; predict_action needs one passed in explicitly)
    transition_model = {}
    for trajectory in demonstrations:
        for i in range(len(trajectory)):
            state, action = trajectory[i]
            s_idx = state_to_idx[state]
            a_idx = action_to_idx[action]
            
            # The last step of a trajectory has no successor, so it adds no entry.
            if i < len(trajectory) - 1:
                next_state, _ = trajectory[i + 1]
                s_next_idx = state_to_idx[next_state]
                transition_model[(s_idx, a_idx)] = s_next_idx
    
    # Create feature matrix
    feature_matrix = create_feature_matrix(idx_to_state)
    
    print(f"Feature matrix shape: {feature_matrix.shape}")
    
    # Run IRL
    # NOTE(review): temperature/gamma here (1.0 / 0.95) differ from the
    # predict_action defaults (2.0 / 0.9) -- confirm which pair is intended
    # when predicting with the learned weights.
    print("\nRunning Maximum Entropy IRL...")
    reward_weights, recovered_rewards = max_entropy_inverse_rl_kitchen(
        demonstrations=demonstrations,
        state_to_idx=state_to_idx,
        action_to_idx=action_to_idx,
        feature_matrix=feature_matrix,
        temperature=1.0,
        gamma=0.95,
        n_iterations=5,
        learning_rate=0.01
    )
    
    # Summary statistics of the learned weights and the per-state rewards
    print("\n=== Results ===")
    print(f"Learned reward weights shape: {reward_weights.shape}")
    print(f"Reward weight statistics:")
    print(f"  Mean: {np.mean(reward_weights):.4f}")
    print(f"  Std: {np.std(reward_weights):.4f}")
    print(f"  Min: {np.min(reward_weights):.4f}")
    print(f"  Max: {np.max(reward_weights):.4f}")
    
    print(f"\nRecovered rewards statistics:")
    print(f"  Mean: {np.mean(recovered_rewards):.4f}")
    print(f"  Std: {np.std(recovered_rewards):.4f}")
    print(f"  Min: {np.min(recovered_rewards):.4f}")
    print(f"  Max: {np.max(recovered_rewards):.4f}")

Loaded 1 demonstrations
Number of unique actions: 11
Actions: ['cut (onion B)', 'cut (tomato B)', 'move (onion A to B)', 'move (onion B to C)', 'move (plate A to D)']...
state_to_idx: {(0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1): 0, (0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1): 1, (0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0): 2, (0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1): 3, (0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0): 4, (0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0): 5, (0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0

In [3]:
# Inspect the action -> index mapping created by create_state_action_mappings.
print(action_to_idx)

{'cut (onion B)': 0, 'cut (tomato B)': 1, 'move (onion A to B)': 2, 'move (onion B to C)': 3, 'move (plate A to D)': 4, 'move (pot A to C)': 5, 'move (pot C to D)': 6, 'move (tomato A to B)': 7, 'move (tomato B to C)': 8, 'stop': 9, 'turn_on (stove C)': 10}


In [4]:
# Example 1 (disabled): predict for the 5th step of the trajectory at index 2
# state = demonstrations[2][4][0]
# action, probs = predict_action(
#     state, reward_weights, feature_matrix,
#     state_to_idx, action_to_idx, idx_to_action,
#     transition_model
# )
# print(f"Predicted: {action}")
# print(f"Probabilities: {probs}", "\n\n")

# Example 2: Compare expert vs. predicted action for the first six steps of
# the first trajectory. predict_action runs with its default temperature and
# gamma here, since none are passed.
for i in range (1):
    for j in range(0,6):
        state = demonstrations[i][j][0]
        action, probs = predict_action(
            state, reward_weights, feature_matrix,
            state_to_idx, action_to_idx, idx_to_action,
            transition_model
        )
        print(f"Actual: {demonstrations[i][j][1]} \n Predicted: {action} \n")
    print("\n\n\n")

# Example 3: Create a custom state (41 binary values)
# This is the initial state: everything at location A, nothing cut, stove off
custom_state = (1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 
                1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 0,0,0,0, 0)
action, probs = predict_action(
    custom_state, reward_weights, feature_matrix,
    state_to_idx, action_to_idx, idx_to_action,
    transition_model
)
# print(action)

Actual: move (pot A to C) 
 Predicted: move (pot A to C) 

Actual: move (tomato A to B) 
 Predicted: move (tomato A to B) 

Actual: cut (tomato B) 
 Predicted: cut (tomato B) 

Actual: move (tomato B to C) 
 Predicted: move (tomato B to C) 

Actual: move (onion A to B) 
 Predicted: move (onion A to B) 

Actual: cut (onion B) 
 Predicted: cut (onion B) 





