In [None]:
#export PYTHONPATH="$(pwd):$PYTHONPATH"

In [5]:
from cleanil.utils import (
    load_yaml, 
    set_seed, 
    get_device, 
    get_logger, 
    write_json,
    LoggerConfig,
)
from cleanil.data import normalize, train_test_split  # Remove load_d4rl_expert_trajs
from cleanil.il import iqlearn
from cleanil.rl.actor import make_tanh_normal_actor
from cleanil.rl.critic import DoubleQNetwork
from torchrl.data import LazyTensorStorage, ReplayBuffer

# Add your custom data loader
from datasets import load_dataset
import torch

def load_custom_expert_trajs(dataset_name, num_expert_trajs, obs_dim, act_dim):
    """Your custom data loading function"""
    # Load from HuggingFace datasets
    dataset = load_dataset(dataset_name, split="train")
    
    # Convert to the required format (implement based on your dataset structure)
    # This is a template - adjust based on your actual data format
    trajectories = []
    
    for i, episode in enumerate(dataset):
        if i >= num_expert_trajs:
            break
            
        # Extract data - adjust these key names based on your dataset
        observations = torch.tensor(episode['observations'], dtype=torch.float32)
        actions = torch.tensor(episode['actions'], dtype=torch.float32)
        
        # Ensure correct shapes
        if len(observations.shape) == 1:
            observations = observations.unsqueeze(-1)
        if len(actions.shape) == 1:
            actions = actions.unsqueeze(-1)
            
        next_observations = torch.cat([observations[1:], observations[-1:]], dim=0)
        episode_length = len(observations)
        
        from tensordict import TensorDict
        episode_data = TensorDict({
            'observation': observations,
            'action': actions,
            'reward': torch.zeros(episode_length, 1),
            'terminated': torch.cat([torch.zeros(episode_length-1), torch.ones(1)]).bool(),
            'truncated': torch.zeros(episode_length).bool(), 
            'done': torch.cat([torch.zeros(episode_length-1), torch.ones(1)]).bool(),
            'next': TensorDict({
                'observation': next_observations,
                'reward': torch.zeros(episode_length, 1),
                'terminated': torch.cat([torch.zeros(episode_length-1), torch.ones(1)]).bool(),
                'truncated': torch.zeros(episode_length).bool(),
                'done': torch.cat([torch.zeros(episode_length-1), torch.ones(1)]).bool(),
            }, batch_size=[episode_length])
        }, batch_size=[episode_length])
        
        trajectories.append(episode_data)
    
    if trajectories:
        return torch.cat(trajectories, dim=0)
    else:
        raise ValueError("No trajectories loaded")

def main(**kwargs):
    config = load_yaml(**kwargs)
    algo_config = iqlearn.IQLearnConfig(**config["algo"])
    write_json(config, f"{algo_config.save_path}/config.json")

    set_seed(config["seed"])
    device = get_device(config["device"])
    logger = get_logger(config, LoggerConfig(**config["logger"]))
    
    print("device", device)

    # CHANGE 1: Define your data dimensions manually (no environment needed)
    obs_dim = 4  # CartPole observation dimension
    act_dim = 1  # CartPole action dimension (discrete -> continuous conversion needed)
    action_bounds = (-1.0, 1.0)  # Set appropriate bounds

    # CHANGE 2: Load your custom data instead of D4RL
    expert_data = load_custom_expert_trajs(
        "NathanGavenski/CartPole-v1",  # Your dataset name
        algo_config.num_expert_trajs,
        obs_dim,
        act_dim
    )
    
    expert_data = expert_data.to(device)
    expert_data, eval_data = train_test_split(expert_data, algo_config.train_ratio)

    # Normalize data
    obs_mean = expert_data["observation"].mean(0)
    obs_std = expert_data["observation"].std(0)
    expert_data["observation"] = normalize(expert_data["observation"], obs_mean, obs_std**2)
    expert_data["next"]["observation"] = normalize(expert_data["next"]["observation"], obs_mean, obs_std**2)
    eval_data["observation"] = normalize(eval_data["observation"], obs_mean, obs_std**2)
    eval_data["next"]["observation"] = normalize(eval_data["next"]["observation"], obs_mean, obs_std**2)
    
    # CHANGE 3: Create agent without environment specs
    actor = make_tanh_normal_actor(
        obs_dim, 
        act_dim, 
        algo_config.hidden_dims, 
        algo_config.activation,
        torch.tensor([action_bounds[0]] * act_dim),  # Manual action bounds
        torch.tensor([action_bounds[1]] * act_dim),
    )
    critic = DoubleQNetwork(
        obs_dim, 
        act_dim, 
        algo_config.hidden_dims, 
        algo_config.activation,
    )
    actor.to(device)
    critic.to(device)

    # Make buffers
    expert_buffer = ReplayBuffer(
        storage=LazyTensorStorage(
            len(expert_data), 
            device=device,
        )
    )
    expert_buffer.extend(expert_data)

    eval_buffer = ReplayBuffer(
        storage=LazyTensorStorage(
            len(eval_data), 
            device=device,
        )
    )
    eval_buffer.extend(eval_data)

    # CHANGE 4: Create trainer without eval_env
    trainer = iqlearn.OfflineTrainer(  # Use modified trainer
        algo_config,
        actor,
        critic,
        expert_buffer,
        eval_buffer,
        obs_mean,
        obs_std,
        act_dim,  # Pass action dimension instead of env
        logger,
        device,
    )
    trainer.train()


In [6]:
main(config_path ="configs/il/iqlearn/offline_cartpole.yaml")


usage: ipykernel_launcher.py [-h] [--config_path CONFIG_PATH]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/ashish_panchal/.local/share/jupyter/runtime/kernel-82f05f1c-f808-40eb-84a3-add4e52ac70d.json


SystemExit: 2

In [None]:
!python scripts/il/train_iqlearn_offline_hf.py --config configs/il/iqlearn/offline_cartpole.yaml

In [None]:
!python scripts/il/train_iqlearn_offline_hf_discrete.py --config configs/il/iqlearn/offline_cartpole_discrete.yaml


In [7]:
!python scripts/il/train_iqlearn_offline_hf_discrete.py --config  configs/il/iqlearn/offline_lunarlander_discrete.yaml

Traceback (most recent call last):
  File "/home/ashish_panchal/Ashish_exp_set_1/IQ_learn/cleanil_copy/scripts/il/train_iqlearn_offline_hf_discrete.py", line 1, in <module>
    from cleanil.utils import (
ModuleNotFoundError: No module named 'cleanil'


In [None]:
!python scripts/il/train_iqlearn_offline_hf_discrete.py --config configs/il/iqlearn/offline_rust_discrete.yaml

In [10]:
import torch
from datasets import load_dataset
import pandas as pd
import numpy as np
from tensordict import TensorDict

def hf_to_tensordict_discrete(dataset_name, split="train"):
    ds = load_dataset(dataset_name, split=split)
    
    obs = torch.tensor(ds["obs"], dtype=torch.float32)
    # Keep actions as discrete integers but add dimension for compatibility
    actions = torch.tensor(ds["actions"], dtype=torch.long)#.unsqueeze(-1)
    rewards = torch.tensor(ds["rewards"], dtype=torch.float32).unsqueeze(-1)
    starts = torch.tensor(ds["episode_starts"], dtype=torch.bool)


    
    actions = torch.tensor(ds["actions"], dtype=torch.float32)   # [T, act_dim]

    
    
    rewards = torch.tensor(ds["rewards"], dtype=torch.float32).unsqueeze(-1)  # [T,1]
    starts = torch.tensor(ds["episode_starts"], dtype=torch.bool) # [T]


    
    
    T = obs.shape[0]
    terminated = starts.roll(-1, dims=0)
    terminated[-1] = True
    truncated = torch.zeros_like(terminated)
    done = terminated | truncated
    
    next_obs = obs.roll(-1, dims=0)
    next_obs[-1] = obs[-1]
    
    td = TensorDict({
        "observation": obs,
        "action": actions,
        "reward": rewards,
        "terminated": terminated,
        "truncated": truncated,
        "done": done,
        "next": TensorDict({
            "observation": next_obs,
            "reward": rewards,
            "terminated": terminated,
            "truncated": truncated,
            "done": done,
        }, batch_size=[T]),
    }, batch_size=[T])
    
    return td

In [13]:
383994/1000

383.994

In [11]:
hf_to_tensordict_discrete("NathanGavenski/LunarLander-v2")

Generating train split: 100%|█| 383994/383994 [00:00<00:00, 513962.09 examples/s


TensorDict(
    fields={
        action: Tensor(shape=torch.Size([383994]), device=cpu, dtype=torch.float32, is_shared=False),
        done: Tensor(shape=torch.Size([383994]), device=cpu, dtype=torch.bool, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([383994]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([383994, 8]), device=cpu, dtype=torch.float32, is_shared=False),
                reward: Tensor(shape=torch.Size([383994, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                terminated: Tensor(shape=torch.Size([383994]), device=cpu, dtype=torch.bool, is_shared=False),
                truncated: Tensor(shape=torch.Size([383994]), device=cpu, dtype=torch.bool, is_shared=False)},
            batch_size=torch.Size([383994]),
            device=None,
            is_shared=False),
        observation: Tensor(shape=torch.Size([383994, 8]), device=cpu, 