In [None]:
# Colab-only step: `files.upload()` opens the browser upload dialog and returns the
# uploaded files. Presumably this is where the expert CSV files that the next cell
# reads by relative path are brought into the runtime -- TODO confirm.
from google.colab import files
uploaded = files.upload()

###########         Prepare the expert data - run this cell first             ######################
#https://hrl.boyuai.com/chapter/3/%E6%A8%A1%E4%BB%BF%E5%AD%A6%E4%B9%A0

In [8]:
import numpy as np
import random

#My Code
def sample_expert_dataMy(file_obs="MountainCar-v0_expert_states.csv",
                         file_act="MountainCar-v0_expert_actions.csv",
                         delimiter="\t"):
    """Load expert demonstrations (states and actions) from delimited text files.

    Args:
        file_obs: Path of the text file holding one expert state per row.
        file_act: Path of the text file holding one expert action per row.
        delimiter: Column separator used by both files (tab by default).

    Returns:
        A tuple ``(expert_obs, expert_act)`` where ``expert_obs`` is a 2-D float
        array (one row per sample) and ``expert_act`` is an ``int64`` array with
        the same number of entries along axis 0.

    Raises:
        ValueError: If the two files do not contain the same number of samples.
    """
    # ndmin keeps the sample axis even when a file contains a single row; without it
    # a one-row states file collapses to 1-D and shape[0] would count features.
    expert_obs = np.loadtxt(file_obs, delimiter=delimiter, ndmin=2)
    expert_act = np.loadtxt(file_act, delimiter=delimiter, ndmin=1).astype('int64')
    if expert_obs.shape[0] != expert_act.shape[0]:
        raise ValueError(
            "expert states/actions length mismatch: {} vs {}".format(
                expert_obs.shape[0], expert_act.shape[0]))
    return expert_obs, expert_act

expert_s, expert_a = sample_expert_dataMy()

# Keep a random subset of the expert transitions for behavior cloning.
# The count is capped at the dataset size because random.sample raises ValueError
# when asked for more items than are available.
n_samples = min(500, expert_s.shape[0])  # number of expert samples to keep
# A dedicated seeded generator makes the subset reproducible: the global
# `random.seed(0)` is only called in a later cell, after this draw has happened.
random_index = random.Random(0).sample(range(expert_s.shape[0]), n_samples)
expert_s = expert_s[random_index]
expert_a = expert_a[random_index]

############  Behavior Cloning   ###################

In [None]:
import gym
import torch
import torch.nn.functional as F
import torch.nn as nn


class PolicyNet(torch.nn.Module):
    """Discrete-action policy network.

    The model lives in ``self.actor``: two Tanh-activated hidden layers of width
    ``hidden_dim`` followed by a linear layer of width ``action_dim`` and a softmax
    over the last dimension, so its output is a vector of action probabilities.
    """

    def __init__(self, state_dim, action_dim, hidden_dim):
        super().__init__()
        # Layers are created in input-to-output order and wrapped in a Sequential,
        # so parameters are registered under the indices 0, 2 and 4.
        stack = [
            nn.Linear(state_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1),
        ]
        self.actor = nn.Sequential(*stack)

class BehaviorClone:
    """Behavior-cloning agent: fits a ``PolicyNet`` to expert (state, action) pairs
    by maximum likelihood and samples actions from the learned distribution.

    Relies on the module-level ``device`` for tensor/model placement.
    """

    def __init__(self, state_dim, hidden_dim, action_dim, lr):
        """Build the policy network on ``device`` and an Adam optimizer with rate ``lr``."""
        self.policy = PolicyNet(state_dim, action_dim, hidden_dim).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

    def learn(self, states, actions):
        """Run one gradient step on a batch of expert data.

        Args:
            states: Batch of states, one row per sample (array-like).
            actions: Batch of discrete action indices, one per sample (array-like).

        Returns:
            The scalar negative log-likelihood loss of the batch (a Python float),
            so callers can track the training curve.
        """
        states = torch.as_tensor(states, dtype=torch.float).to(device)
        # gather() requires int64 indices; force the dtype instead of relying on
        # whatever integer/float type the caller's array happens to have.
        actions = torch.as_tensor(actions, dtype=torch.long).view(-1, 1).to(device)
        chosen_probs = self.policy.actor(states).gather(1, actions)
        # Clamp before the log: a softmax output that underflows to 0 would give
        # -inf and poison the gradients with NaN.
        log_probs = torch.log(torch.clamp(chosen_probs, min=1e-8))
        bc_loss = torch.mean(-log_probs)  # maximum-likelihood objective; positive value

        self.optimizer.zero_grad()
        bc_loss.backward()
        self.optimizer.step()
        return bc_loss.item()

    def take_action(self, state):
        """Sample one action index for a single (un-batched) ``state``."""
        # No autograd graph is needed for acting. as_tensor + unsqueeze also avoids
        # the slow list-of-ndarray conversion done by torch.tensor([state]).
        with torch.no_grad():
            state = torch.as_tensor(state, dtype=torch.float).to(device).unsqueeze(0)
            probs = self.policy.actor(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item()


def test_agent(agent, env, n_episode):
    return_list = []
    for episode in range(n_episode):
        episode_return = 0
        state = env.reset()
        done = False
        while not done:
            action = agent.take_action(state)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            episode_return += reward
        return_list.append(episode_return)
    return np.mean(return_list)


# ---- Behavior-cloning training run on MountainCar-v0 ----
env_name = 'MountainCar-v0'
env = gym.make(env_name)
# Seed every source of randomness used below (environment, torch, random, numpy).
env.seed(0)
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

hidden_dim = 128
lr = 1e-3
bc_agent = BehaviorClone(state_dim, hidden_dim, action_dim, lr)
n_iterations = 5000  # number of train/evaluate iterations (originally 1000)
batch_size = 64
test_returns = []
# Training-loss history. It must exist because a later cell plots it; previously
# its definition was commented out, so that cell failed with NameError.
bc_lossL = []

for i in range(n_iterations):
    sample_indices = np.random.randint(low=0, high=expert_s.shape[0], size=batch_size)
    loss = bc_agent.learn(expert_s[sample_indices], expert_a[sample_indices])
    # learn() may not return a loss value; record it only when it does.
    if loss is not None:
        bc_lossL.append(float(loss))
    current_return = test_agent(bc_agent, env, 5)  # mean return over 5 evaluation episodes
    print('i: ',i,' 5次平均return: ',current_return)
    test_returns.append(current_return)
    # Optional checkpointing of good policies. Left disabled; note that the BC-PPO
    # cell further down loads an actor checkpoint named in this 'BC_<i>.pth' style.
    #if current_return > -110:
    #  torch.save(bc_agent.policy.actor.state_dict(),'BC_'+str(i)+'.pth')

#torch.save(bc_agent.policy.state_dict(),'bc_1000.pth')

In [None]:
import matplotlib.pyplot as plt
# Learning curve: mean evaluation return recorded after each training iteration.
iteration_list = list(range(len(test_returns)))
returns_axes = plt.gca()
returns_axes.plot(iteration_list, test_returns)
returns_axes.set_xlabel('Iterations')
returns_axes.set_ylabel('Returns')
returns_axes.set_title('BC on {}'.format(env_name))
plt.show()

#torch.save(bc_agent.policy.state_dict(),'bcAgent.pth')#儲存模型參數

In [None]:
# Training-loss curve. `bc_lossL` is only filled when the training cell records
# losses; guard against it being undefined so a fresh "Run All" does not stop
# here with a NameError.
try:
    loss_history = list(bc_lossL)
except NameError:
    loss_history = []

if loss_history:
    iteration_list = list(range(len(loss_history)))
    plt.plot(iteration_list, loss_history)
    plt.xlabel('Iterations')
    plt.ylabel('bc_loss')
    plt.title('bc_loss on {}'.format(env_name))
    plt.show()
else:
    print('No behavior-cloning losses were recorded (bc_lossL is empty or undefined); nothing to plot.')

In [None]:
#Behavior Cloning on MountainCar - evaluation (test) cell
import gym
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np


class PolicyNet(torch.nn.Module):
    """Discrete-action policy network used for evaluation.

    ``self.actor`` is a two-hidden-layer Tanh MLP ending in a softmax over
    ``action_dim`` outputs; ``actT`` picks the most probable action for one state.
    """

    def __init__(self, state_dim, action_dim, hidden_dim):
        super(PolicyNet, self).__init__()
        self.actor = nn.Sequential(
                        nn.Linear(state_dim, hidden_dim),
                        nn.Tanh(),
                        nn.Linear(hidden_dim, hidden_dim),
                        nn.Tanh(),
                        nn.Linear(hidden_dim, action_dim),
                        nn.Softmax(dim=-1)
                    )

    def actT(self, stat):
        """Return the greedy (arg-max probability) action index for one state.

        Args:
            stat: A single un-batched state (NumPy array or other array-like).

        Returns:
            The index of the most probable action as a Python int.
        """
        # Match the network's own dtype/device: torch.from_numpy would keep a
        # float64 observation as double and fail against the float32 weights.
        reference = next(self.actor.parameters())
        stat = torch.as_tensor(stat, dtype=reference.dtype, device=reference.device).unsqueeze(0)
        with torch.no_grad():  # inference only, no autograd graph needed
            action = self.actor(stat)
        return action.argmax().item()

# ---- Evaluate a saved actor checkpoint greedily on MountainCar-v0 ----
env_name = 'MountainCar-v0'
env = gym.make(env_name)
num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.n
hidden_dim = 128

policy_net = PolicyNet(num_inputs, num_actions, hidden_dim)
#state_dict = torch.load('BC_775.pth')
# policy_net stays on the CPU, so map the checkpoint there explicitly: a file saved
# from a CUDA run cannot otherwise be deserialized on a CPU-only runtime.
state_dict = torch.load('/content/data/E569_PPOMC_agent.pth', map_location='cpu')
policy_net.actor.load_state_dict(state_dict)

test_epochs = 100
rewards = []
for i_episode in range(test_epochs):
    state = env.reset()
    current_ep_reward = 0
    for t in range(300):  # hard cap on steps per episode
        action = policy_net.actT(state)
        state, reward, done, _ = env.step(action)

        current_ep_reward += reward

        if done:
            print('i_episode: ',i_episode,' current_ep_reward: ',current_ep_reward)
            rewards.append(current_ep_reward)
            break
avg_return = np.array(rewards)
print('Average Return: ',np.mean(avg_return))

In [None]:
#2023-12-23 BC-PPO on MountainCar (PPO initialised from the behavior-cloned actor): much faster than pure PPO
import os
import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical
import random
import numpy as np
import gym

# set device to cpu or cuda
# Pick the compute device: the first CUDA GPU when one is available (after
# releasing cached GPU memory), otherwise the CPU. The choice is printed.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    print("Device set to : " + str(torch.cuda.get_device_name(device)))
else:
    device = torch.device('cpu')
    print("Device set to : cpu")

class RolloutBuffer:
    """Plain container for one batch of rollout data.

    Holds five parallel Python lists: ``actions``, ``states``, ``logprobs``,
    ``rewards`` and ``is_terminals``.
    """

    def __init__(self):
        self.actions, self.states = [], []
        self.logprobs, self.rewards = [], []
        self.is_terminals = []

    def clear(self):
        """Empty every list in place (the list objects themselves are kept)."""
        for stored in (self.actions, self.states, self.logprobs,
                       self.rewards, self.is_terminals):
            stored.clear()


class ActorCritic(nn.Module):
    """Actor and critic as two separate Tanh MLPs over the same state input.

    ``actor`` outputs a softmax distribution over ``action_dim`` discrete actions;
    ``critic`` outputs a single value per state.
    """

    def __init__(self, state_dim, action_dim, hidden_dim):
        super().__init__()

        # actor: state -> action probabilities
        actor_layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1),
        ]
        self.actor = nn.Sequential(*actor_layers)

        # critic: state -> scalar value
        critic_layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1),
        ]
        self.critic = nn.Sequential(*critic_layers)

    def act(self, state):
        """Sample an action for ``state``.

        Returns the sampled action and its log-probability, both detached.
        """
        policy = Categorical(self.actor(state))
        sampled = policy.sample()
        return sampled.detach(), policy.log_prob(sampled).detach()

    def evaluate(self, state, action):
        """Score given actions under the current policy.

        Returns a tuple ``(log-probabilities of action, critic values, entropy of
        the action distribution)``.
        """
        policy = Categorical(self.actor(state))
        chosen_logprobs = policy.log_prob(action)
        entropy = policy.entropy()
        values = self.critic(state)
        return chosen_logprobs, values, entropy

class PPO:
    """Clipped-objective PPO agent whose actor can start from a behavior-cloned checkpoint.

    Uses the module-level ``device``, ``ActorCritic`` and ``RolloutBuffer``.
    ``policy`` is the network being optimised; ``policy_old`` is the copy used to
    collect rollouts and is synchronised with ``policy`` at the end of ``update``.
    """

    def __init__(self, state_dim, action_dim, hidden_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip,
                 bc_checkpoint='/content/BC_697.pth'):
        """Create the networks, optimizer and rollout buffer.

        Args:
            state_dim, action_dim, hidden_dim: Sizes passed to ``ActorCritic``.
            lr_actor, lr_critic: Adam learning rates for the actor and the critic.
            gamma: Discount factor for the Monte Carlo returns.
            K_epochs: Number of optimisation passes per ``update`` call.
            eps_clip: Clipping range of the PPO probability ratio.
            bc_checkpoint: Path of an actor ``state_dict`` used to initialise the
                actor. Pass ``None`` to skip loading and train from a random
                initialisation.
        """
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.buffer = RolloutBuffer()
        self.policy = ActorCritic(state_dim, action_dim, hidden_dim).to(device)
        if bc_checkpoint is not None:
            # map_location lets a checkpoint saved on another device (e.g. CUDA)
            # be loaded on whichever device this run uses.
            state_dict = torch.load(bc_checkpoint, map_location=device)
            self.policy.actor.load_state_dict(state_dict)
        self.optimizer = torch.optim.Adam([
                        {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                        {'params': self.policy.critic.parameters(), 'lr': lr_critic}
                    ])

        self.policy_old = ActorCritic(state_dim, action_dim, hidden_dim).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()


    def select_action(self, state):
        """Sample an action from ``policy_old`` and record it in the buffer.

        The state tensor, the action and its log-probability are appended to the
        buffer; the caller appends the matching reward and terminal flag.

        Returns:
            The sampled action as a Python number.
        """
        with torch.no_grad():
            state = torch.FloatTensor(state).to(device)
            action, action_logprob = self.policy_old.act(state)

        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(action_logprob)

        return action.item()


    def update(self):
        """Run ``K_epochs`` PPO optimisation passes on the buffered rollout, then
        copy the new weights into ``policy_old`` and clear the buffer.

        Does nothing when the buffer holds no rewards.
        """
        if not self.buffer.rewards:
            return

        # Monte Carlo estimate of returns, accumulated backwards through the buffer;
        # the running sum restarts at every terminal step. Append + reverse keeps
        # this linear (insert(0, ...) is quadratic in the buffer length).
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        # Normalizing the returns (std of a single value is NaN, so skip it then)
        rewards = torch.tensor(returns, dtype=torch.float32).to(device)
        if rewards.numel() > 1:
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values under the current policy
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # match state_values tensor dimensions with rewards tensor
            state_values = torch.squeeze(state_values)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            # final loss: clipped PPO objective + 0.5 * value loss - 0.01 * entropy bonus
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        self.buffer.clear()


    def save(self, checkpoint_path):
        """Write the ``policy_old`` state dict (actor and critic) to ``checkpoint_path``."""
        torch.save(self.policy_old.state_dict(), checkpoint_path)


    def load(self, checkpoint_path):
        """Load a full actor-critic state dict into both ``policy_old`` and ``policy``."""
        # Read the file once and reuse it for both networks.
        state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
        self.policy_old.load_state_dict(state_dict)
        self.policy.load_state_dict(state_dict)

print("============================================================================================")

def seed_torch(seed):
    """Seed PyTorch's RNG and, when cuDNN is enabled, switch it to deterministic mode.

    Deterministic mode here means ``cudnn.benchmark = False`` and
    ``cudnn.deterministic = True``.
    """
    torch.manual_seed(seed)
    cudnn = torch.backends.cudnn
    if not cudnn.enabled:
        return
    cudnn.benchmark = False
    cudnn.deterministic = True

####### initialize environment hyperparameters ######
env_name = "MountainCar-v0"
seed = 1
np.random.seed(seed)
random.seed(seed)
seed_torch(seed)

################ PPO hyperparameters ################
K_epochs = 40               # update policy for K epochs
eps_clip = 0.2              # clip parameter for PPO
gamma = 0.99                # discount factor

lr_actor = 0.0003       # learning rate for actor network
lr_critic = 0.001       # learning rate for critic network
#####################################################

print("training environment name : " + env_name)

env = gym.make(env_name)
# Seed the environment as well (the behavior-cloning cell does the same);
# without this the rollouts differ between runs even though numpy, random and
# torch are seeded above.
env.seed(seed)

# state space dimension
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
hidden_dim = 128 #20231222
#hidden_dim = 64 #20231223 with pure PPO, 64 seemed faster than 128 (??)
################# training procedure ################

# initialize a PPO agent
ppo_agent = PPO(state_dim, action_dim, hidden_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip)

num_epochs = 50000      # number of training episodes
update_epoch = 8        # run a PPO update on every update_epoch-th episode
max_steps = 300         # hard cap on environment steps per episode
save_threshold = -90    # save the actor whenever an episode return exceeds this
maxR = -200             # best episode return seen so far
os.makedirs('./data/', exist_ok=True)
for i_episode in range(num_epochs):

    state = env.reset()
    current_ep_reward = 0

    for t in range(max_steps):

        # select action with policy
        action = ppo_agent.select_action(state)
        state, reward, done, _ = env.step(action)

        # saving reward and is_terminals
        ppo_agent.buffer.rewards.append(reward)
        ppo_agent.buffer.is_terminals.append(done)

        current_ep_reward += reward

        if done:
            # The buffer keeps accumulating across episodes and is only consumed
            # (and cleared) by update() on every update_epoch-th episode.
            if i_episode % update_epoch == 0:
                ppo_agent.update()
            break

    if current_ep_reward > maxR:
        maxR = current_ep_reward
    if i_episode % 10 == 0:
        print("i_episode: ", i_episode, " current_ep_reward: ",current_ep_reward," maxR: ",maxR)

    # Checkpoint only the actor weights of well-performing episodes.
    if current_ep_reward > save_threshold:
        fileN = './data/'+'E'+str(i_episode)+'_PPOMC_agent.pth'
        torch.save(ppo_agent.policy.actor.state_dict(),fileN)


#ppo_agent.policy.critic._modules['0'].weight.detach().numpy()