
# K-Means Initialization Experiment

Run the Babes-Vroman EM MM-IRL algorithm with K-Means initialization on the deterministic Canonical PuddleWorld.


In [None]:
# Capture the notebook's current working directory using the IPython `%pwd`
# magic (this line is IPython syntax and will not run under plain Python).
notebook_path = %pwd

In [None]:
import sys
# Append the notebook's working directory to the module search path so that
# the imports in the following cells can resolve modules located there.
sys.path.append(notebook_path)

In [None]:

import pprint
import gym
import copy
import pickle
import warnings

import numpy as np
import pandas as pd
import itertools as it
import matplotlib.pyplot as plt

from datetime import datetime

from sklearn.cluster import KMeans

from puddle_world.envs import *
from explicit_env.soln import value_iteration, q_from_v, OptimalPolicy, policy_evaluation
from unimodal_irl.utils import get_rollouts, empirical_feature_expectations

from multimodal_irl import bv_em_maxent

from unimodal_irl import sw_maxent_irl
from unimodal_irl.utils import pad_terminal_mdp


In [None]:

# Experimental parameters

# -- What is being run -------------------------------------------------------
ENVIRONMENT = "CanonicalPuddleWorld"
ALGORITHM = "BV-MaxEnt"
INITIALISATION = 'kmeans'
TRANSITIONS = 'deterministic'

# -- Sweep configuration -----------------------------------------------------
NUM_GT_CLUSTERS = 2                  # number of ground-truth behaviour modes
NUM_CLUSTERS_SWEEP = [2, 3, 1]       # numbers of clusters to learn
NUM_ROLLOUTS_SWEEP = [10, 50, 100]   # total demonstrations per experiment
NUM_REPLICATES = 100                 # repetitions per sweep setting

# Wind value handed to the environment constructor: zero for the
# deterministic setting, 0.2 for any other transition type.
wind = 0.0 if TRANSITIONS == 'deterministic' else 0.2


In [None]:

# One environment per ground-truth mode, both sharing the configured wind.
env_wet = CanonicalPuddleWorldEnv(wind=wind, mode='wet')
env_dry = CanonicalPuddleWorldEnv(wind=wind, mode='dry')

# Independent deep copy of the wet environment; this is the environment
# object handed to the feature-expectation and IRL calls further down.
env_ = copy.deepcopy(env_wet)

# For each mode: solve with value iteration, derive Q-values, wrap them in a
# non-stochastic optimal policy, then evaluate that policy on its own
# environment.
_q_wet = q_from_v(value_iteration(env_wet), env_wet)
pi_wet = OptimalPolicy(_q_wet, stochastic=False)
pi_wet_v = policy_evaluation(env_wet, pi_wet)

_q_dry = q_from_v(value_iteration(env_dry), env_dry)
pi_dry = OptimalPolicy(_q_dry, stochastic=False)
pi_dry_v = policy_evaluation(env_dry, pi_dry)

# Load the previously pickled demonstration rollouts for each mode.
# NOTE(review): pickle.load can execute arbitrary code -- only point these
# paths at files that are trusted.
filename_wet = f"pw-{TRANSITIONS}-wet.pkl"
with open(filename_wet, "rb") as fh:
    all_rollouts_wet = pickle.load(fh)

filename_dry = f"pw-{TRANSITIONS}-dry.pkl"
with open(filename_dry, "rb") as fh:
    all_rollouts_dry = pickle.load(fh)

# Pre-allocate the results table: one row per (rollout count, cluster count,
# replicate) combination, filled in row by row by the experiment loop.
_result_columns = [
    # Independent Variables
    "Environment",
    "Transition Type",
    "Num GT Clusters",
    "Num Learned Clusters",
    "Num Rollouts",
    "Algorithm",
    "Initialisation",

    "Replicate",

    # Dependent Variables
    "Iterations",
    "Responsibility Matrix",
    "Reward Weights",
    "Runtime (s)",
]
_num_experiments = len(NUM_ROLLOUTS_SWEEP) * len(NUM_CLUSTERS_SWEEP) * NUM_REPLICATES
df = pd.DataFrame(index=range(_num_experiments), columns=_result_columns)

# These two columns store nested Python lists, so force the object dtype.
for _list_column in ("Responsibility Matrix", "Reward Weights"):
    df[_list_column] = df[_list_column].astype('object')


# Results are checkpointed to this CSV after every single experiment.
filename = f"{ENVIRONMENT}-{TRANSITIONS}-{INITIALISATION}.csv"

# Visit every (rollout count, cluster count, replicate) combination in the
# same order as three nested loops would: replicate varies fastest.
_exp_num = 0
for num_rollouts, num_clusters, replicate in it.product(
    NUM_ROLLOUTS_SWEEP, NUM_CLUSTERS_SWEEP, range(NUM_REPLICATES)
):
    print(f"Exp {_exp_num}/{len(df)}, N={num_rollouts}, K={num_clusters} Replicate {replicate}/{NUM_REPLICATES}")

    # Each replicate gets its own window into the pre-generated rollouts;
    # the same window is taken from the wet and the dry demonstrations.
    window = slice(
        replicate * num_rollouts // 2,
        (replicate + 1) * num_rollouts // 2,
    )
    rollouts = list(it.chain(all_rollouts_wet[window], all_rollouts_dry[window]))

    # One feature-expectation entry per individual rollout
    rollout_features = np.array([
        empirical_feature_expectations(env_, [rollout])[0]
        for rollout in rollouts
    ])

    # Run experiment (the timer covers clustering, per-cluster IRL and EM)
    t_start = datetime.now()

    # Initialize mode weights with K-Means clustering
    km = KMeans(n_clusters=num_clusters, n_init=5000)
    hard_initial_clusters = km.fit_predict(rollout_features)
    print("Initial clusters:", hard_initial_clusters)

    # One-hot encode the hard labels: row i has a 1.0 in the column of
    # rollout i's cluster and 0.0 everywhere else.
    soft_initial_clusters = np.eye(num_clusters)[hard_initial_clusters]

    # Compute initial reward weights from the up-front clustering: one
    # MaxEnt IRL solve per cluster, weighting each path by its membership.
    env_padded, rollouts_padded = pad_terminal_mdp(env_, rollouts=rollouts)
    initial_reward_weights = [
        sw_maxent_irl(
            rollouts_padded,
            env_padded,
            rs=True,
            rbound=env_.reward_range,
            with_dummy_state=True,
            grad_twopoint=True,
            path_weights=soft_initial_clusters[:, cluster_idx],
        )[0][:-1]  # first return value, minus its last entry
                   # (presumably the padded dummy state -- TODO confirm)
        for cluster_idx in range(num_clusters)
    ]

    # Refine the clustering and reward weights with EM
    num_iterations, responsibility_matrix, _, reward_weights = bv_em_maxent(
        env_,
        rollouts,
        num_clusters,
        initial_reward_weights=initial_reward_weights
    )
    dt = (datetime.now() - t_start).total_seconds()

    # Store this experiment's row; order must match the DataFrame columns.
    result_row = [
        ENVIRONMENT,
        TRANSITIONS,
        NUM_GT_CLUSTERS,
        num_clusters,
        num_rollouts,
        ALGORITHM,
        INITIALISATION,

        replicate,

        num_iterations,
        responsibility_matrix.tolist(),
        reward_weights.tolist(),
        dt
    ]
    df.iloc[_exp_num] = result_row
    _exp_num += 1

    # Checkpoint after every experiment so partial results survive a crash
    df.to_csv(filename)
    
