<a href="https://colab.research.google.com/github/amithrajiv/Hackathon/blob/main/rlml_lab_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#2303A51901

M. Amith Rajiv

Batch: 09

Implementing Behavioral Cloning for a simple task using expert demonstrations.

In [None]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gymnasium as gym

# Seed Python's `random`, NumPy and PyTorch with one shared value so that
# repeated runs of this script start from the same RNG state.
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

def collect_expert(env_name, n_episodes=200, max_steps=500):
    """Roll out a scripted expert and record its (observation, action) pairs.

    The expert is a fixed rule on observation component 2 (named ``angle``
    here; for CartPole this is presumably the pole angle): it picks action 1
    when that component is positive and action 0 otherwise.

    Args:
        env_name: Environment id passed to ``gym.make``.
        n_episodes: Number of episodes to roll out.
        max_steps: Upper bound on the number of steps recorded per episode.

    Returns:
        A tuple ``(obs, acts)`` where ``obs`` is a float32 array with one row
        per recorded step and ``acts`` is an int64 array holding the action
        taken at each of those steps. Both are empty (zero rows) when no step
        was recorded, e.g. ``n_episodes=0`` or ``max_steps=0``.
    """
    env = gym.make(env_name)
    obs_list, act_list = [], []
    try:
        # Remembered so an empty result still has the right trailing shape.
        obs_shape = tuple(env.observation_space.shape or ())
        for _ in range(n_episodes):
            # Each episode gets its own reset seed drawn from Python's RNG.
            o, info = env.reset(seed=random.randint(0, 10**6))
            terminated = False
            truncated = False
            steps = 0
            while not (terminated or truncated) and steps < max_steps:
                angle = o[2]
                a = 1 if angle > 0 else 0
                # Record the observation the decision was based on, not the
                # one returned by the step that follows.
                obs_list.append(np.array(o, dtype=np.float32))
                act_list.append(int(a))
                o, r, terminated, truncated, info = env.step(a)
                steps += 1
    finally:
        # Release the environment even if a rollout raised.
        env.close()
    if not obs_list:
        # np.vstack rejects an empty sequence; return well-typed empties.
        empty_obs = np.empty((0, *obs_shape), dtype=np.float32)
        return empty_obs, np.empty(0, dtype=np.int64)
    obs = np.vstack(obs_list)
    acts = np.array(act_list, dtype=np.int64)
    return obs, acts

class BCDataset(Dataset):
    """Dataset of (observation, action) pairs for behavioral cloning.

    Observations are stored as a float32 tensor in ``self.x`` and actions as
    an int64 tensor in ``self.y``, which are the dtypes a float32 network and
    ``nn.CrossEntropyLoss`` class-index targets require. Inputs may be NumPy
    arrays or anything else ``torch.as_tensor`` accepts; arrays that already
    have the target dtype are wrapped without copying.

    Raises:
        ValueError: If either input is a scalar, or if the number of
            observations differs from the number of actions.
    """

    def __init__(self, observations, actions):
        self.x = torch.as_tensor(observations, dtype=torch.float32)
        self.y = torch.as_tensor(actions, dtype=torch.int64)
        if self.x.dim() == 0 or self.y.dim() == 0:
            raise ValueError("observations and actions must be sequences, not scalars")
        # Fail here rather than with an IndexError deep inside a DataLoader.
        if self.x.shape[0] != self.y.shape[0]:
            raise ValueError(
                f"observations and actions differ in length: "
                f"{self.x.shape[0]} != {self.y.shape[0]}"
            )

    def __len__(self):
        """Return the number of stored (observation, action) pairs."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return the ``(observation, action)`` pair at position ``idx``."""
        return self.x[idx], self.y[idx]

def make_model(obs_dim, n_actions, hidden_sizes=(128,128)):
    """Build a fully connected network mapping observations to action logits.

    Args:
        obs_dim: Number of input features.
        n_actions: Number of output logits.
        hidden_sizes: Width of each hidden layer, in order. Every hidden
            ``nn.Linear`` is followed by ``nn.ReLU``; an empty sequence gives
            a single linear layer.

    Returns:
        An ``nn.Sequential`` ending in a linear layer with no activation.
    """
    widths = [obs_dim, *hidden_sizes]
    modules = []
    # Consecutive width pairs give each hidden layer's (in, out) sizes.
    for fan_in, fan_out in zip(widths, widths[1:]):
        modules += [nn.Linear(fan_in, fan_out), nn.ReLU()]
    modules.append(nn.Linear(widths[-1], n_actions))
    return nn.Sequential(*modules)

def train_bc(model, dataloader, epochs=10, lr=1e-3, device='cpu'):
    """Fit ``model`` to expert actions with cross-entropy (behavioral cloning).

    Args:
        model: Network producing one logit per action.
        dataloader: Iterable yielding ``(observations, actions)`` batches.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate for the Adam optimizer.
        device: Device the model and every batch are moved to.

    Returns:
        The same ``model`` object, trained in place. The sample-weighted mean
        loss of each epoch is printed (0.0 when no samples were seen).
    """
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    for epoch_no in range(1, epochs + 1):
        model.train()
        loss_sum, n_seen = 0.0, 0
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_size = inputs.size(0)
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), targets)
            batch_loss.backward()
            optimizer.step()
            # Weight by batch size so a short final batch does not skew the mean.
            loss_sum += batch_loss.item() * batch_size
            n_seen += batch_size
        mean_loss = loss_sum / n_seen if n_seen > 0 else 0.0
        print(f"Epoch {epoch_no}/{epochs}  loss={mean_loss:.4f}")
    return model

def evaluate_policy(env_name, model, n_episodes=20, max_steps=500, device='cpu', render=False):
    """Run the greedy policy defined by ``model`` and report episode returns.

    At every step the action with the largest logit is taken. The model is
    switched to eval mode for the rollouts and put back in its previous
    train/eval mode afterwards.

    Args:
        env_name: Environment id passed to ``gym.make``.
        model: Network mapping a batch of observations to action logits.
        n_episodes: Number of evaluation episodes.
        max_steps: Upper bound on the number of steps per episode.
        device: Device the model and each observation are moved to.
        render: When true, the environment is created with
            ``render_mode='human'``.

    Returns:
        List with the undiscounted return of each episode. The mean and
        standard deviation are printed; both are NaN when there are no
        episodes.
    """
    render_mode = 'human' if render else None
    env = gym.make(env_name, render_mode=render_mode)
    was_training = model.training
    returns = []
    try:
        model.to(device)
        # Layers such as dropout or batch norm must run in inference mode.
        model.eval()
        for _ in range(n_episodes):
            o, info = env.reset(seed=random.randint(0, 10**6))
            terminated = False
            truncated = False
            total_r = 0.0
            steps = 0
            while not (terminated or truncated) and steps < max_steps:
                # Add a batch dimension: the model expects (batch, obs_dim).
                xb = torch.from_numpy(np.array(o, dtype=np.float32)).unsqueeze(0).to(device)
                with torch.no_grad():
                    logits = model(xb)
                    a = int(torch.argmax(logits, dim=1).item())
                o, r, terminated, truncated, info = env.step(a)
                total_r += float(r)
                steps += 1
            returns.append(total_r)
    finally:
        # Release the environment and restore the caller's train/eval mode
        # even if a rollout raised.
        env.close()
        model.train(was_training)
    if returns:
        mean_return = float(np.mean(returns))
        std_return = float(np.std(returns))
    else:
        # np.mean/np.std of an empty list warn; report NaN directly instead.
        mean_return = std_return = float('nan')
    print(f"Eval over {n_episodes} episodes: mean_return={mean_return:.2f} std={std_return:.2f}")
    return returns

# End-to-end run: collect scripted-expert demonstrations, fit a policy to
# them with behavioral cloning, evaluate the policy, then save its weights.
if __name__ == "__main__":
    ENV = "CartPole-v1"
    # Demonstrations from the rule-based expert in collect_expert.
    obs, acts = collect_expert(ENV, n_episodes=300)
    dataset = BCDataset(obs, acts)
    loader = DataLoader(dataset, batch_size=64, shuffle=True)
    # The action count is inferred from the largest action index present in
    # the demonstrations, so every action must occur at least once in `acts`.
    model = make_model(obs_dim=obs.shape[1], n_actions=int(np.max(acts))+1)
    model = train_bc(model, loader, epochs=20, lr=1e-3, device='cpu')
    # Greedy rollouts of the cloned policy; prints mean and std of returns.
    evaluate_policy(ENV, model, n_episodes=30)
    # Only the parameters (state_dict) are saved, not the module itself.
    torch.save(model.state_dict(), "bc_cartpole_gymnasium.pth")
    print("Model saved to bc_cartpole_gymnasium.pth")


Epoch 1/20  loss=0.3156


  return datetime.utcnow().replace(tzinfo=utc)


Epoch 2/20  loss=0.0957
Epoch 3/20  loss=0.0659
Epoch 4/20  loss=0.0519
Epoch 5/20  loss=0.0410
Epoch 6/20  loss=0.0358
Epoch 7/20  loss=0.0311
Epoch 8/20  loss=0.0291
Epoch 9/20  loss=0.0242
Epoch 10/20  loss=0.0240
Epoch 11/20  loss=0.0217
Epoch 12/20  loss=0.0190
Epoch 13/20  loss=0.0227
Epoch 14/20  loss=0.0231
Epoch 15/20  loss=0.0166
Epoch 16/20  loss=0.0174
Epoch 17/20  loss=0.0159
Epoch 18/20  loss=0.0145
Epoch 19/20  loss=0.0130
Epoch 20/20  loss=0.0161
Eval over 30 episodes: mean_return=41.77 std=7.43
Model saved to bc_cartpole_gymnasium.pth
