In [1]:
import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
import gym
import numpy as np

# Compute device shared by the networks below: the first CUDA GPU when one is
# available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

class Memory:
    """Rollout buffer made of five plain Python lists.

    The lists (``actions``, ``states``, ``logprobs``, ``rewards`` and
    ``is_terminals``) are filled by the code that collects experience and are
    emptied with :meth:`clear_memory`.
    """

    # Names of the list attributes, in creation order.
    _FIELDS = ("actions", "states", "logprobs", "rewards", "is_terminals")

    def __init__(self):
        for field in self._FIELDS:
            setattr(self, field, [])

    def clear_memory(self):
        """Empty every list in place (existing references stay valid)."""
        for field in self._FIELDS:
            getattr(self, field).clear()

class ActorCritic(nn.Module):
    """Actor and critic MLPs for a Gaussian policy with fixed variance.

    The actor maps a state to an action mean; its last layer is ``Tanh`` so
    every mean component lies in [-1, 1]. The critic maps a state to a single
    value. Actions are drawn from a multivariate normal whose covariance is
    diagonal with ``action_std ** 2`` on every action dimension; the variance
    is a constant, not a learned parameter.

    Args:
        state_dim: size of the last dimension of the state tensors.
        action_dim: number of action components.
        action_std: constant standard deviation of the action distribution.
    """

    def __init__(self, state_dim, action_dim, action_std):
        super().__init__()
        # action mean range -1 to 1
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 32),
            nn.Tanh(),
            nn.Linear(32, action_dim),
            nn.Tanh(),
        )
        # critic
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 32),
            nn.Tanh(),
            nn.Linear(32, 1),
        )
        # Registered as a NON-persistent buffer: it moves together with the
        # module on .to()/.cuda() (so it can never sit on a different device
        # than the networks), yet it is left out of state_dict(), so
        # checkpoints keep containing only the actor/critic weights.
        # float() guarantees a floating-point tensor even for an int std.
        std = float(action_std)
        self.register_buffer(
            "action_var",
            torch.full((action_dim,), std * std),
            persistent=False,
        )

    def forward(self):
        """Not used; call :meth:`act` or :meth:`evaluate` instead."""
        raise NotImplementedError

    def act(self, state, memory):
        """Sample an action for ``state`` and record the step in ``memory``.

        Args:
            state: state tensor fed to the actor (``PPO.select_action`` in
                this file passes shape ``(1, state_dim)``).
            memory: object with ``states``, ``actions`` and ``logprobs``
                lists; the state, the sampled action and its log-probability
                are appended to them.

        Returns:
            The sampled action as a tensor detached from autograd.
        """
        # Rollout only: nothing here is back-propagated through, so skip
        # building (and keeping alive via `memory`) an autograd graph per step.
        with torch.no_grad():
            action_mean = self.actor(state)
            cov_mat = torch.diag(self.action_var)

            dist = MultivariateNormal(action_mean, cov_mat)
            action = dist.sample()
            action_logprob = dist.log_prob(action)

        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(action_logprob)

        return action.detach()

    def evaluate(self, state, action):
        """Score ``action`` under the current policy and value ``state``.

        Args:
            state: batch of states fed to the actor and the critic.
            action: batch of actions, one per state.

        Returns:
            Tuple ``(action_logprobs, state_values, dist_entropy)``: the
            log-probability of each action, the critic output with its
            trailing size-1 dimension removed, and the entropy of the action
            distribution.
        """
        action_mean = self.actor(state)

        # One diagonal covariance matrix per batch element.
        action_var = self.action_var.expand_as(action_mean)
        cov_mat = torch.diag_embed(action_var)

        dist = MultivariateNormal(action_mean, cov_mat)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_value = self.critic(state)

        # squeeze(-1) removes only the value dimension; a bare squeeze() would
        # also drop the batch dimension when the batch holds a single sample.
        return action_logprobs, state_value.squeeze(-1), dist_entropy

class PPO:
    """PPO trainer with the clipped surrogate objective.

    Holds two ``ActorCritic`` networks: ``policy`` is optimised in
    :meth:`update`, ``policy_old`` is the frozen copy used by
    :meth:`select_action` to collect experience and is re-synchronised at the
    end of every update.

    Args:
        state_dim: state size passed to ``ActorCritic``.
        action_dim: action size passed to ``ActorCritic``.
        action_std: constant action standard deviation passed to ``ActorCritic``.
        lr: learning rate of the Adam optimiser.
        betas: ``betas`` argument of the Adam optimiser.
        gamma: discount factor used for the returns.
        K_epochs: number of optimisation steps performed per update.
        eps_clip: clipping range of the probability ratio.
    """

    def __init__(self, state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)

        self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def _policy_device(self):
        """Return the device the policy weights currently live on."""
        return next(self.policy.parameters()).device

    def _discounted_returns(self, rewards, is_terminals):
        """Return the discounted reward-to-go of every step, oldest first.

        The running sum is reset wherever ``is_terminals`` is true so that
        rewards never leak across an episode boundary.
        """
        returns = []
        running = 0
        for reward, is_terminal in zip(reversed(rewards), reversed(is_terminals)):
            if is_terminal:
                running = 0
            running = reward + (self.gamma * running)
            returns.append(running)
        # append + one reverse is O(n); insert(0, ...) per step was O(n^2).
        returns.reverse()
        return returns

    def select_action(self, state, memory):
        """Sample an action from ``policy_old`` for one environment state.

        Args:
            state: numpy array holding a single observation; it is reshaped
                to ``(1, -1)`` before being fed to the network.
            memory: rollout buffer that ``policy_old.act`` appends to.

        Returns:
            The action as a flat numpy array.
        """
        state = torch.FloatTensor(state.reshape(1, -1)).to(self._policy_device())
        # Acting never needs gradients; avoid recording autograd history.
        with torch.no_grad():
            action = self.policy_old.act(state, memory)
        return action.detach().cpu().numpy().flatten()

    def update(self, memory):
        """Run ``K_epochs`` optimisation steps on the experience in ``memory``.

        Args:
            memory: rollout buffer whose ``states``, ``actions``, ``logprobs``,
                ``rewards`` and ``is_terminals`` lists describe the same steps.

        NOTE: returns are plain Monte Carlo sums; a trajectory that is still
        running when the buffer ends is treated as if no reward followed.
        """
        dev = self._policy_device()

        # Monte Carlo estimate of rewards. The dtype is forced to float32:
        # rewards coming from numpy would otherwise give a float64 tensor that
        # does not match the float32 critic output inside the MSE loss.
        returns = self._discounted_returns(memory.rewards, memory.is_terminals)
        rewards = torch.tensor(returns, dtype=torch.float32).to(dev)

        # Normalizing the rewards. std() of a single element is NaN, so only
        # centre in that case instead of poisoning the loss with NaNs.
        rewards = rewards - rewards.mean()
        if rewards.numel() > 1:
            rewards = rewards / (rewards.std() + 1e-5)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(memory.states).to(dev), 1).detach()
        old_actions = torch.squeeze(torch.stack(memory.actions).to(dev), 1).detach()
        old_logprobs = torch.squeeze(torch.stack(memory.logprobs), 1).to(dev).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values:
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs)

            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            # clipped policy term + 0.5 * value loss - 0.01 * entropy bonus
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
        
# Training cell: builds the environment and the PPO agent, then trains until
# the solved threshold or `max_episodes` is reached. Kept at module level
# because the evaluation cell reuses `env_name`, `ppo` and `memory`.
############## Hyperparameters ##############
env_name = "BipedalWalker-v3"
render = False
solved_reward = 300         # stop training if avg_reward > solved_reward
log_interval = 20           # print avg reward in the interval
max_episodes = 10000        # max training episodes
max_timesteps = 1500        # max timesteps in one episode

update_timestep = 4000      # update policy every n timesteps
action_std = 0.5            # constant std for action distribution (Multivariate Normal)
K_epochs = 80               # update policy for K epochs
eps_clip = 0.2              # clip parameter for PPO
gamma = 0.99                # discount factor

lr = 0.0003                 # parameters for Adam optimizer
betas = (0.9, 0.999)

random_seed = None
saved_model = False
#############################################

# creating environment
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# `is not None` (rather than truthiness) so that a seed of 0 is honoured.
if random_seed is not None:
    print("Random Seed: {}".format(random_seed))
    torch.manual_seed(random_seed)
    env.seed(random_seed)
    np.random.seed(random_seed)

memory = Memory()
ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
print(lr,betas)

# logging variables
running_reward = 0
avg_length = 0
time_step = 0

if saved_model:
    # alternative checkpoint: 'PPO_continuous_BipedalWalker-v3-2000.pth'
    PATH = 'PPO_continuous_BipedalWalker-v2-fullytrained.pth'
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    ppo.policy.load_state_dict(torch.load(PATH, map_location=device))
    # select_action() samples from policy_old, so the restored weights must be
    # copied there too; otherwise rollouts use the random initial network.
    ppo.policy_old.load_state_dict(ppo.policy.state_dict())
    ppo.policy.eval()

# training loop
for i_episode in range(1, max_episodes+1):
    state = env.reset()
    for t in range(max_timesteps):
        time_step += 1
        # Running policy_old:
        action = ppo.select_action(state, memory)
        state, reward, done, _ = env.step(action)

        # Saving reward and is_terminals. The last step allowed by
        # max_timesteps is also flagged as an episode boundary: env.reset()
        # follows it, and without the flag the discounted returns of this
        # episode would absorb the rewards of the next, unrelated one.
        memory.rewards.append(reward)
        memory.is_terminals.append(done or t == max_timesteps - 1)

        # update if its time
        if time_step % update_timestep == 0:
            ppo.update(memory)
            memory.clear_memory()
            time_step = 0
        running_reward += reward
        if render:
            env.render()
        if done:
            break

    # t is the zero-based index of the last step, so the episode took t + 1 steps.
    avg_length += t + 1

    # stop training if avg_reward > solved_reward
    if running_reward > (log_interval*solved_reward):
        print("########## Solved! ##########")
        torch.save(ppo.policy.state_dict(), './PPO_continuous_solved_{}.pth'.format(env_name))
        break

    # save every 500 episodes
    if i_episode % 500 == 0:
        torch.save(ppo.policy.state_dict(), './PPO_continuous_{}.pth'.format(env_name))

    # logging
    if i_episode % log_interval == 0:
        avg_length = int(avg_length/log_interval)
        running_reward = int((running_reward/log_interval))

        print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(i_episode, avg_length, running_reward))

        running_reward = 0
        avg_length = 0

# Release the simulator (and any render window) once training is over.
env.close()



0.0003 (0.9, 0.999)
Episode 20 	 Avg length: 146 	 Avg reward: -109
Episode 40 	 Avg length: 79 	 Avg reward: -112
Episode 60 	 Avg length: 74 	 Avg reward: -111
Episode 80 	 Avg length: 154 	 Avg reward: -110
Episode 100 	 Avg length: 152 	 Avg reward: -108
Episode 120 	 Avg length: 144 	 Avg reward: -108
Episode 140 	 Avg length: 568 	 Avg reward: -94
Episode 160 	 Avg length: 425 	 Avg reward: -100
Episode 180 	 Avg length: 714 	 Avg reward: -92
Episode 200 	 Avg length: 1149 	 Avg reward: -76
Episode 220 	 Avg length: 1294 	 Avg reward: -66
Episode 240 	 Avg length: 1431 	 Avg reward: -59
Episode 260 	 Avg length: 1355 	 Avg reward: -60
Episode 280 	 Avg length: 1355 	 Avg reward: -58
Episode 300 	 Avg length: 1427 	 Avg reward: -55
Episode 320 	 Avg length: 1151 	 Avg reward: -63
Episode 340 	 Avg length: 1285 	 Avg reward: -54
Episode 360 	 Avg length: 722 	 Avg reward: -77
Episode 380 	 Avg length: 580 	 Avg reward: -83
Episode 400 	 Avg length: 1143 	 Avg reward: -54
Episode 42

Episode 3340 	 Avg length: 1388 	 Avg reward: 160
Episode 3360 	 Avg length: 1191 	 Avg reward: 116
Episode 3380 	 Avg length: 1258 	 Avg reward: 122
Episode 3400 	 Avg length: 1234 	 Avg reward: 124
Episode 3420 	 Avg length: 1372 	 Avg reward: 157
Episode 3440 	 Avg length: 1104 	 Avg reward: 94
Episode 3460 	 Avg length: 1260 	 Avg reward: 138
Episode 3480 	 Avg length: 1210 	 Avg reward: 124
Episode 3500 	 Avg length: 1228 	 Avg reward: 120
Episode 3520 	 Avg length: 1345 	 Avg reward: 162
Episode 3540 	 Avg length: 1373 	 Avg reward: 157
Episode 3560 	 Avg length: 1312 	 Avg reward: 160
Episode 3580 	 Avg length: 1353 	 Avg reward: 160
Episode 3600 	 Avg length: 1322 	 Avg reward: 153
Episode 3620 	 Avg length: 1243 	 Avg reward: 142
Episode 3640 	 Avg length: 1282 	 Avg reward: 153
Episode 3660 	 Avg length: 1285 	 Avg reward: 137
Episode 3680 	 Avg length: 1121 	 Avg reward: 111
Episode 3700 	 Avg length: 1360 	 Avg reward: 158
Episode 3720 	 Avg length: 1397 	 Avg reward: 183
E

Episode 6620 	 Avg length: 1310 	 Avg reward: 178
Episode 6640 	 Avg length: 1260 	 Avg reward: 170
Episode 6660 	 Avg length: 1166 	 Avg reward: 145
Episode 6680 	 Avg length: 1146 	 Avg reward: 150
Episode 6700 	 Avg length: 1138 	 Avg reward: 147
Episode 6720 	 Avg length: 1347 	 Avg reward: 206
Episode 6740 	 Avg length: 1262 	 Avg reward: 173
Episode 6760 	 Avg length: 1385 	 Avg reward: 196
Episode 6780 	 Avg length: 1350 	 Avg reward: 192
Episode 6800 	 Avg length: 1144 	 Avg reward: 140
Episode 6820 	 Avg length: 1189 	 Avg reward: 155
Episode 6840 	 Avg length: 1181 	 Avg reward: 163
Episode 6860 	 Avg length: 1230 	 Avg reward: 165
Episode 6880 	 Avg length: 1269 	 Avg reward: 174
Episode 6900 	 Avg length: 1251 	 Avg reward: 170
Episode 6920 	 Avg length: 1205 	 Avg reward: 163
Episode 6940 	 Avg length: 1175 	 Avg reward: 152
Episode 6960 	 Avg length: 1316 	 Avg reward: 185
Episode 6980 	 Avg length: 1348 	 Avg reward: 197
Episode 7000 	 Avg length: 1404 	 Avg reward: 213


Episode 9940 	 Avg length: 895 	 Avg reward: 110
Episode 9960 	 Avg length: 1050 	 Avg reward: 155
Episode 9980 	 Avg length: 1032 	 Avg reward: 150
Episode 10000 	 Avg length: 1062 	 Avg reward: 156


In [4]:
# Load a saved checkpoint and run one rendered episode for visualization,
# recording the visited states and the actions taken.
# TODO: try .eval() and .load() with Cassie if possible.
# TODO: save the model using Zhaoming's code (save the model exactly) and see what happens.
import numpy as np

env = gym.make(env_name)
state = env.reset()
running_reward = 0
done = False
PATH = 'PPO_continuous_BipedalWalker-v3-2000.pth'
# PATH = 'PPO_continuous_BipedalWalker-v2-fullytrained.pth'
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
ppo.policy.load_state_dict(torch.load(PATH, map_location=device))
# select_action() acts with policy_old; without this copy the episode below
# would be played by whatever weights policy_old held before, not the checkpoint.
ppo.policy_old.load_state_dict(ppo.policy.state_dict())
# Scratch buffer so the evaluation rollout does not leak into the training memory.
eval_memory = Memory()
state_action_list = []

try:
    while not done:
        action = ppo.select_action(state, eval_memory)
        eval_memory.clear_memory()
        env.render()
        state,reward,done,_ = env.step(action)
        running_reward += reward
        # NOTE(review): `state` here is the observation AFTER `action` was
        # applied, so each pair is (next state, action) — confirm this is the
        # pairing the consumer of the .npy file expects.
        state_action_list.append([state,action])

    print(running_reward)
    # States and actions have different lengths, so the pairs form a ragged
    # structure; dtype=object is required for NumPy to store it (recent NumPy
    # versions raise on implicit ragged arrays).
    np.save('state_action_bipedal.npy', np.array(state_action_list, dtype=object))
finally:
    # Always release the render window, even if the rollout fails midway.
    env.close()

254.06209483377606
