In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import gym
import torch
import torch.nn as nn
import torch.functional as F
from matplotlib import pyplot as plt
import pickle 
import imageio 

In [2]:
data_path = "expert_data/Pendulum-v1_10_-130.pkl"


In [4]:
with open(data_path, "rb") as f:
    data_good = pickle.load(f)
print('expert data loaded')
# print(data_good)
data_good=data_good[:20]

good_obs=[]
good_acts=[] 
for traj in data_good: 
    s,a,r=traj   
    good_obs.append(s)
    good_acts.append(a) 

states=np.vstack(good_obs)
actions=np.vstack(good_acts)
print(states)
print(actions)
print('X:',states.shape,' y:', actions.shape)

expert data loaded
[[-0.9602126   0.27927014  0.732989  ]
 [-0.97569436  0.21913576  1.2421062 ]
 [-0.9908181   0.135202    1.7062256 ]
 ...
 [ 0.9974078   0.07195614  0.02709827]
 [ 0.99750435  0.07060467 -0.0270983 ]
 [ 0.9974078   0.07195614  0.02709814]]
[[ 1.9977639 ]
 [ 1.9984503 ]
 [ 1.9985516 ]
 ...
 [-0.72109115]
 [ 0.00828624]
 [-0.7210891 ]]
X: (2000, 3)  y: (2000, 1)


In [4]:
data_loader = torch.utils.data.DataLoader( list(zip(states, actions)), batch_size=64, shuffle=True)

batch=next(iter(data_loader))
states,actions = batch
states.shape,actions.shape

(torch.Size([64, 3]), torch.Size([64, 1]))

In [5]:
action_dim=actions.shape[1]
state_dim=states.shape[1]
print(state_dim, action_dim)

3 1


In [6]:
class MLP(nn.Module):
    def __init__(self, input_dim, size=32):
        super(MLP, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim,size),
            nn.ReLU(), 
            nn.Linear(size,size),
            nn.ReLU() 
        )
    def forward(self,x):
        x = self.net(x)
        return x
    
class RegNet(MLP):
    def __init__(self, input_dim , size, action_dim):
        super(RegNet, self).__init__(input_dim, size)
        self.decoder = nn.Linear(size, action_dim)
    def forward(self,x):
        x = self.net(x)
        x = self.decoder(x)
        return x

In [7]:
model = RegNet(state_dim, 32, action_dim)
criterion = nn.MSELoss()
learning_rate = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [8]:
loss_list = []
test_loss = []
batch_size = 256
n_epoch = 1_000
 
for itr in range(0, n_epoch+1):
    total_loss = 0
    b=0
    for batch_states, batch_actions in data_loader: 
        y_pred = model(batch_states)
        loss   = criterion(y_pred, batch_actions) 
        total_loss += loss.item() 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        b += 1 
        
        loss_list.append(loss.item())
        
    if itr % (n_epoch//20)==0:
        print(f'Epoch {itr} Loss: {total_loss/b:.3f}')

Epoch 0 Loss: 0.347
Epoch 50 Loss: 0.075
Epoch 100 Loss: 0.056
Epoch 150 Loss: 0.046
Epoch 200 Loss: 0.037
Epoch 250 Loss: 0.020
Epoch 300 Loss: 0.019
Epoch 350 Loss: 0.019
Epoch 400 Loss: 0.017
Epoch 450 Loss: 0.014
Epoch 500 Loss: 0.013
Epoch 550 Loss: 0.015
Epoch 600 Loss: 0.020
Epoch 650 Loss: 0.013
Epoch 700 Loss: 0.012
Epoch 750 Loss: 0.011
Epoch 800 Loss: 0.011
Epoch 850 Loss: 0.007
Epoch 900 Loss: 0.009
Epoch 950 Loss: 0.037
Epoch 1000 Loss: 0.010


In [9]:
env_name='Pendulum-v1'

In [15]:
def play_an_episode(env_name, model, video_path=None, max_steps=1000):
    video_writer=None 
    if video_path is not None:
        print(f'Saving video to {video_path}')
        # video_writer = imageio.get_writer(video_path, fps=20)
        env=gym.make(env_name, render_mode='human')
    else:
        env=gym.make(env_name)

    obs,_=env.reset()
    rewards=0
    step=0
    for _ in range(max_steps):
        step+=1
        state=torch.tensor(obs, dtype=torch.float)
        action=model(state).detach().numpy() 
        obs, reward, done, trunc,_ = env.step(action)

        if video_path is not None:
            image=env.render()
            video_writer.append_data(image)

        rewards+=reward
        if done or trunc:
            break

    if video_path is not None:
        video_writer.close()
    return {'reward':rewards, 'step':step-1}

In [16]:
scores=[]
n_trajectory=20
for i in range(n_trajectory):
    stats=play_an_episode(env_name, model)
    rewards=stats['reward']
    print(f'episode #{i} reward: {rewards:0.2f}')
    scores.append(rewards)

print(f'\n score: {np.mean(scores):0.2f} +- {np.std(scores):0.2f}')

  if not isinstance(terminated, (bool, np.bool8)):


episode #0 reward: -120.84
episode #1 reward: -117.70
episode #2 reward: -120.70
episode #3 reward: -1.12
episode #4 reward: -246.26
episode #5 reward: -229.88
episode #6 reward: -245.76
episode #7 reward: -122.74
episode #8 reward: -119.19
episode #9 reward: -115.52
episode #10 reward: -3.33
episode #11 reward: -236.93
episode #12 reward: -244.58
episode #13 reward: -122.25
episode #14 reward: -351.74
episode #15 reward: -238.50
episode #16 reward: -115.40
episode #17 reward: -1.84
episode #18 reward: -124.20
episode #19 reward: -115.74

 score: -149.71 +- 90.81


In [17]:
stats=play_an_episode(env_name, model, video_path='bc_pendulum.mp4')
stats 

Saving video to bc_pendulum.mp4


AttributeError: 'NoneType' object has no attribute 'append_data'

: 

In [13]:
from IPython.display import Video
Video('bc_pendulum.mp4' )

In [14]:
from IPython.display import HTML
from base64 import b64encode

video_path = 'bc_pendulum.mp4'

mp4 = open(video_path, "rb").read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"""
<video width=400 controls>
      <source src="{data_url}" type="video/mp4">
</video>
""")
