## Basic imports

In [1]:
#%matplotlib.pyplot inline

import pickle
import rospy
import baxter_interface
import os.path as path
import copy
from tqdm import tqdm_notebook as tqdmn
import matplotlib.pyplot as plt
import itertools
import numpy as np



### Initialize trajectory server

In [2]:
# Register this notebook as a ROS node named 'trajectory_player'; this runs
# before the limb interface below is created.
rospy.init_node('trajectory_player')
# Interface to the robot's right arm; every joint read and joint command in
# this notebook goes through this object.
limb = baxter_interface.Limb('right')

### Print joint information

In [3]:
# print(dir(limb))  # uncomment to list the attributes of the limb interface
import time

# Measure how long it takes to read and print the three joint-state
# dictionaries (angles, velocities, efforts).
start = time.time()
print(limb.joint_angles())
print(limb.joint_velocities())
print(limb.joint_efforts())
end = time.time()
print(end-start)  # elapsed wall-clock time in seconds

{'right_s0': -0.2744842539179597, 'right_s1': 1.0470027045962942, 'right_w0': -0.5528420198175317, 'right_w1': -0.07753854894385359, 'right_w2': -0.01024264306187117, 'right_e0': 0.007258533610022155, 'right_e1': 0.5036673439242714}
{'right_s0': -5.150252590296569e-06, 'right_s1': 4.2477373544554246e-08, 'right_w0': -0.003930651858118608, 'right_w1': 1.6500116544143132e-05, 'right_w2': -0.0008170283254832181, 'right_e0': 8.690006990924557e-05, 'right_e1': -3.110044784728971e-05}
{'right_s0': 0.0, 'right_s1': 0.0, 'right_w0': 0.0, 'right_w1': 0.0, 'right_w2': 0.0, 'right_e0': 0.0, 'right_e1': 0.0}
0.00160217285156


### Run path and record a different joint
- plot this data

# Setup for RL and PPO

### Imports
- from ppo_gym.py

In [4]:
import argparse
import gym
import os
import sys
import pickle
import time
import math

# Directory holding the PPO helper code, expressed relative to the current
# working directory.
ppo_dir = os.getcwd() + '/PPO'
print(ppo_dir)
# Append its absolute form to the import search path so that modules located
# under it can be imported by the following cells.
sys.path.append(os.path.abspath(ppo_dir))

/home/arclabdl2/ros_ws/src/baxter_experiments/scripts/PPO


In [5]:
from utils import *
from models.mlp_policy import Policy
from models.mlp_critic import Value
from torch.autograd import Variable

from core.ppo import ppo_step
from core.common import estimate_advantages
from core.agent import Agent

### Defaults

In [135]:
# PyTorch setup
use_gpu = 0  # falsy, so every `if use_gpu:` branch (CUDA transfers) in this notebook is skipped
is_disc_action = False  # selects DoubleTensor (rather than LongTensor) for ActionTensor below
Tensor = DoubleTensor
ActionTensor = LongTensor if is_disc_action else DoubleTensor
torch.set_default_tensor_type('torch.DoubleTensor')

# PPO hyper-parameters
gamma = 0.99  # forwarded to estimate_advantages (presumably the discount factor -- confirm in core.common)
tau = 0.95  # forwarded to estimate_advantages (presumably the GAE lambda -- confirm in core.common)
l2_reg = 1e-3  # forwarded to ppo_step
learning_rate = 3e-4  # Adam learning rate for both networks; also forwarded to ppo_step
clip_epsilon = 0.2  # forwarded to ppo_step
num_threads = 4  # NOTE(review): not referenced by the training-loop cell in this notebook
mini_batch_size = 2048  # NOTE(review): not referenced by the training-loop cell in this notebook
log_std = 0  # passed to the Policy constructor
max_iter_num = 500  # iteration count over which update_params scales its learning-rate multiplier down to 0

# Environment dimensions
# Observation built in the training loop: position, velocity and torque of
# joints right_s1 and right_e1 (6 values) followed by the 2-element goal vector.
state_dim = 8
action_dim = 1  # single action value; the training loop adds it to the goal to form the right_s1 setpoint


### Robot environment setup

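Needed:
- reward function: $e^{-(\text{setpoint\_position}[t] - \text{true\_position}[t])^2}$, with both positions taken at joint s1
- input: the current state (true_position, velocity, torque) at step $t$, concatenated with the goal for step $t+1$, for joints {s1, e1}
- output: delta_setpoint for step $t+1$ at joint s1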

In [142]:
def reward(goal, measured):
    """Score how closely a measured value matches its goal.

    Computes exp(-(goal - measured)**2): exactly 1.0 when the two are equal
    and decaying toward 0 as the squared error grows.
    """
    error = goal - measured
    return np.exp(-(error ** 2))

def generateTrajectory(numSteps = 1000, period = 1000):
    """Build a clipped cosine reference trajectory.

    Sample i equals cos(2*pi*i / period) * 0.5 + 0.55, clipped to [0, 3].

    Args:
        numSteps: number of samples to generate; values <= 0 give an empty list.
        period: number of samples per full cosine cycle. Defaults to 1000,
            the value that was previously hard-coded.

    Returns:
        A list of `numSteps` NumPy float values.
    """
    return [np.clip(np.cos(float(i)*(2*np.pi)/period)*0.5+0.55, 0, 3)
            for i in range(numSteps)]

def getState(limb):
    """Read position, velocity and effort for joints right_s1 and right_e1.

    Each of the limb's accessors is called exactly once, so the two joints of
    every returned array come from the same reading (previously each accessor
    was called once per joint).

    Args:
        limb: object exposing `joint_angles()`, `joint_velocities()` and
            `joint_efforts()`, each returning a mapping keyed by joint name.

    Returns:
        Tuple `(measured_pos, measured_vel, measured_torque)` of NumPy arrays,
        each ordered as [right_s1, right_e1].
    """
    joints = ('right_s1', 'right_e1')

    angles = limb.joint_angles()
    velocities = limb.joint_velocities()
    efforts = limb.joint_efforts()

    measured_pos = np.array([angles[name] for name in joints])
    measured_vel = np.array([velocities[name] for name in joints])
    measured_torque = np.array([efforts[name] for name in joints])
    return measured_pos, measured_vel, measured_torque

# Build the reference trajectory (a clipped cosine wave, see generateTrajectory).
positions = generateTrajectory()
setpoint_positions = []

# Start from the arm's current joint angles and override three joints; the
# training loop sends this dictionary to the limb to reset the arm before
# every episode and reads its 'right_s1' entry as the goal.
pos = limb.joint_angles()
pos['right_s0'] = 0 
pos['right_s1'] = np.pi/10    
pos['right_e0'] = 0


### Pytorch setup

In [143]:
# Filter object sized for state_dim-long observations.
# NOTE(review): presumably a running normaliser from the PPO utils -- confirm.
# The training-loop cell in this notebook does not apply it to observations.
running_state = ZFilter((state_dim,), clip=5)

# Policy network (two hidden layers of 3 units each) and value network.
policy_net = Policy(state_dim, action_dim, log_std=log_std, hidden_size=(3,3))
value_net = Value(state_dim)

# One Adam optimizer per network, both using the shared learning rate.
optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = 5  # passes over the shuffled batch in update_params
optim_batch_size = 64  # samples per ppo_step call in update_params

from collections import namedtuple

# Record type holding one field per quantity collected during a rollout.
Transition = namedtuple('Transition', 'state action mask next_state reward')


def batchData(*fields):
    """Pack the positional values into a Transition, in field order."""
    return Transition(*fields)

In [144]:
def update_params(batch, i_iter):
    """Run the mini-batch PPO update on one batch of collected transitions.

    Args:
        batch: object with `state`, `action`, `reward` and `mask` attributes,
            each accepted by `np.stack` (e.g. the `Transition` returned by
            `batchData`).
        i_iter: index of the current training iteration; the learning-rate
            multiplier passed to `ppo_step` falls linearly from 1 at
            iteration 0 to 0 at `max_iter_num`.

    Reads the notebook-level networks, optimizers and hyper-parameters, and
    updates the networks through `ppo_step`. Returns nothing.
    """
    states = torch.from_numpy(np.stack(batch.state))
    actions = torch.from_numpy(np.stack(batch.action))
    rewards = torch.from_numpy(np.stack(batch.reward))
    masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
    if use_gpu:
        states, actions, rewards, masks = states.cuda(), actions.cuda(), rewards.cuda(), masks.cuda()
    # Value estimates and log-probabilities under the pre-update networks.
    # `volatile=True` is the legacy Variable flag for inference-only forward passes.
    values = value_net(Variable(states, volatile=True)).data
    fixed_log_probs = policy_net.get_log_prob(Variable(states, volatile=True), Variable(actions)).data

    # get advantage estimation from the trajectories
    advantages, returns = estimate_advantages(rewards, masks, values, gamma, tau, use_gpu)

    lr_mult = max(1.0 - float(i_iter) / max_iter_num, 0)

    # perform mini-batch PPO update
    num_samples = states.shape[0]
    # Float division is required: with integer operands Python 2 truncates
    # before math.ceil runs, which silently dropped the last partial mini-batch.
    optim_iter_num = int(math.ceil(num_samples / float(optim_batch_size)))
    for _ in range(optim_epochs):
        # Reshuffle all per-sample tensors with the same permutation each epoch.
        perm = torch.randperm(num_samples)

        states, actions, returns, advantages, fixed_log_probs = \
            states[perm], actions[perm], returns[perm], advantages[perm], fixed_log_probs[perm]

        for i in range(optim_iter_num):
            ind = slice(i * optim_batch_size, min((i + 1) * optim_batch_size, num_samples))
            states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

            # NOTE(review): the literal 1 is presumably the number of value-network
            # optimisation steps per call -- confirm against core.ppo.ppo_step.
            ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1, states_b, actions_b, returns_b,
                     advantages_b, fixed_log_probs_b, lr_mult, learning_rate, clip_epsilon, l2_reg)

In [None]:
rewards_looped = []  # rewards of every step of every episode, in order

for itr in range(100):
    # Per-episode command dictionary. Commands are written to this copy so that
    # `pos` keeps describing the reset pose; writing into `pos` itself made the
    # reset pose and the goal read from it drift from one episode to the next.
    command = copy.copy(pos)

    #reset environment
    limb.set_joint_positions(pos)
    rospy.sleep(1)

    #GET BATCH
    obs_s = []
    act_s = []
    done_s = []
    reward_s = []
    obs_s_next = []
    # per-episode traces, kept for the plotting cells
    true_positions = []
    setpoint_positions = []
    goal_positions = []
    goal = pos['right_s1'] #currently static

    # `goals` has to exist before the first observation is assembled; it was
    # previously first assigned inside the step loop, which raised NameError
    # on a fresh kernel.
    goals = np.array([positions[0], goal])
    measured_pos, measured_vel, measured_torque = getState(limb)
    obs = np.hstack((measured_pos, measured_vel, measured_torque, goals))

    for i, itered_pos in enumerate(positions):
        #OBSERVATION
        # NOTE(review): `obs` still carries the goal vector of the previous
        # step; only `obs_new` below is built from this step's `goals`.
        goals = np.array([itered_pos, goal])

        #ACTION
        # NOTE(review): takes the first output of the policy network, presumably
        # the mean action without sampling -- confirm against mlp_policy.
        obs_th = Variable(torch.from_numpy(obs)).unsqueeze(0)
        action = policy_net(obs_th)[0].data[0].numpy() #decrease this guy's magnitude?

        command['right_e1'] = itered_pos #directly controlled joint, not compensated
        setpoint_position = goal + action
        command['right_s1'] = setpoint_position #compensated joint
        limb.set_joint_positions(command) #take action

        rospy.sleep(0.01)

        measured_pos, measured_vel, measured_torque = getState(limb)
        obs_new = np.hstack((measured_pos, measured_vel, measured_torque, goals))

        setpoint_positions.append(setpoint_position)
        true_positions.append(measured_pos[0])
        goal_positions.append(goal)
        obs_s.append(obs)
        obs_s_next.append(obs_new)
        act_s.append(action)
        reward_s.append(reward(goal, measured_pos[0]))

        # Only the final trajectory step is flagged as terminal.
        done_s.append(1 if i == len(positions) - 1 else 0)

        obs = obs_new

    #update algorithm
    mask = 1 - np.array(done_s)  # 0 on the terminal step, 1 elsewhere
    batch = batchData(np.array(obs_s), np.array(act_s), mask, np.array(obs_s_next), np.array(reward_s))
    update_params(batch, itr)

    rewards_looped.extend(reward_s)
    

In [None]:
# Reward of every step across all training episodes, in collection order.
plt.figure()
plt.plot(rewards_looped)
plt.xlabel('step (all episodes concatenated)')
plt.ylabel('reward')
plt.title('Per-step reward during training')
plt.show()


In [151]:
# Invert reward = exp(-error**2): error = sqrt(-ln(reward)), converted from
# radians to degrees, i.e. the joint error at which the reward has that value.
# Only the last expression of a cell is echoed, so the first value is computed
# but not displayed.
np.sqrt(-np.log(0.995))*180/np.pi
np.sqrt(-np.log(0.972))*180/np.pi

9.6555670624167362

In [None]:
# Traces of the most recent episode. `reward_s` is that episode's per-step
# reward list; the previously plotted name `rewards` is not defined at
# notebook level and raised NameError.
plt.figure()
plt.plot(true_positions)
plt.plot(setpoint_positions)
plt.plot(goal_positions)
plt.plot(reward_s)
plt.legend(['true', 'setpoint', 'goals', 'rewards'])
plt.show()


### Training

In [None]:
class Agent:
    """Collects rollout samples with a policy, optionally using worker processes.

    NOTE: defining this class shadows the `Agent` imported from `core.agent`
    earlier in the notebook.
    """

    def __init__(self, policy, mean_action=False, 
                 tensor_type=torch.DoubleTensor, env_list=None,
                 custom_reward=None, render=False, running_state=None,
                 num_threads=1):
        """Store the policy and the sampling configuration.

        Args:
            policy: policy network used to choose actions.
            mean_action: forwarded to `collect_samples` (the function) to
                select between the policy output and `select_action`.
            tensor_type: tensor constructor applied to states.
            env_list: environments, one per thread; index 0 is used by the
                calling process. Defaults to an empty list.
            custom_reward: optional callable `(state, action) -> reward`.
            render: whether the calling process renders its environment.
            running_state: optional state filter applied to observations.
            num_threads: total number of sampling processes, including the
                calling one.

        The last five parameters were previously never set although
        `collect_samples` reads them, which raised AttributeError.
        """
        self.policy = policy
        self.mean_action = mean_action
        self.tensor = tensor_type
        self.env_list = list(env_list) if env_list is not None else []
        self.custom_reward = custom_reward
        self.render = render
        self.running_state = running_state
        self.num_threads = num_threads

    def collect_samples(self, min_batch_size):
        """Gather samples, split evenly across `num_threads` processes.

        Args:
            min_batch_size: requested number of steps in total; each process
                collects floor(min_batch_size / num_threads) steps.

        Returns:
            Tuple `(batch, log)`: the sampled batch from the merged memory and
            a statistics dictionary extended with sampling time and action
            mean/min/max.

        Raises:
            ValueError: if `num_threads` is below 1 or fewer environments than
                threads were supplied.
        """
        # Imported here because the notebook's import cells do not provide it.
        import multiprocessing

        if self.num_threads < 1 or len(self.env_list) < self.num_threads:
            raise ValueError(
                'Agent needs num_threads >= 1 and at least one environment per '
                'thread (num_threads={}, environments={})'.format(
                    self.num_threads, len(self.env_list)))

        t_start = time.time()
        if use_gpu:
            # Sampling runs on the CPU copy of the policy.
            self.policy.cpu()
        thread_batch_size = int(math.floor(min_batch_size / self.num_threads))
        queue = multiprocessing.Queue()
        workers = []

        # Worker ids start at 1; id 0 is the calling process.
        for i in range(self.num_threads-1):
            worker_args = (i+1, queue, self.env_list[i + 1], self.policy, self.custom_reward, self.mean_action,
                           self.tensor, False, self.running_state, False, thread_batch_size)
            workers.append(multiprocessing.Process(target=collect_samples, args=worker_args))
        for worker in workers:
            worker.start()

        # The calling process samples too; only it may render and update the
        # running state filter.
        memory, log = collect_samples(0, None, self.env_list[0], self.policy, self.custom_reward, self.mean_action,
                                      self.tensor, self.render, self.running_state, True, thread_batch_size)

        # Results arrive in completion order; slot them by worker id.
        worker_logs = [None] * len(workers)
        worker_memories = [None] * len(workers)
        for _ in workers:
            pid, worker_memory, worker_log = queue.get()
            worker_memories[pid - 1] = worker_memory
            worker_logs[pid - 1] = worker_log
        for worker_memory in worker_memories:
            memory.append(worker_memory)
        batch = memory.sample()
        if self.num_threads > 1:
            log_list = [log] + worker_logs
            log = merge_log(log_list)
        if use_gpu:
            self.policy.cuda()
        t_end = time.time()
        log['sample_time'] = t_end - t_start
        log['action_mean'] = np.mean(np.vstack(batch.action), axis=0)
        log['action_min'] = np.min(np.vstack(batch.action), axis=0)
        log['action_max'] = np.max(np.vstack(batch.action), axis=0)
        return batch, log


In [None]:
def collect_samples(pid, queue, env, policy, custom_reward, mean_action,
                    tensor, render, running_state, update_rs, min_batch_size):
    """Roll out `policy` in `env` until at least `min_batch_size` steps are stored.

    Args:
        pid: worker id; returned alongside the results when `queue` is given.
        queue: if not None, results are put on it as `[pid, memory, log]`
            instead of being returned.
        env: environment exposing `reset()`, `step(action)` and `render()`.
        policy: policy network; called directly when `mean_action` is true,
            otherwise through `select_action`.
        custom_reward: optional callable `(state, action) -> reward` that
            replaces the stored reward.
        mean_action: selects how the action is obtained (see `policy`).
        tensor: tensor constructor applied to each state.
        render: whether to call `env.render()` after every step.
        running_state: optional filter applied to every observation.
        update_rs: forwarded to `running_state` as its `update` argument.
        min_batch_size: minimum number of steps to collect; if it is <= 0 no
            episode is run and the averages in the log are reported as 0.0.

    Returns:
        `(memory, log)` when `queue` is None, otherwise None.
    """
    # NOTE(review): draws `pid` random numbers; presumably so that workers
    # diverge in their random streams -- confirm.
    torch.randn(pid, )
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state, update=update_rs)
        reward_episode = 0

        # Episodes are capped at 10000 steps.
        for t in range(10000):
            state_var = Variable(tensor(state).unsqueeze(0), volatile=True)
            if mean_action:
                action = policy(state_var)[0].data[0].numpy()
            else:
                action = policy.select_action(state_var)[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(np.float64)
            next_state, step_reward, done, _ = env.step(action)
            # Episode statistics always use the environment's own reward.
            reward_episode += step_reward
            if running_state is not None:
                next_state = running_state(next_state, update=update_rs)

            if custom_reward is not None:
                # The stored reward is replaced by the custom one.
                step_reward = custom_reward(state, action)
                total_c_reward += step_reward
                min_c_reward = min(min_c_reward, step_reward)
                max_c_reward = max(max_c_reward, step_reward)

            mask = 0 if done else 1

            memory.push(state, action, mask, next_state, step_reward)

            if render:
                env.render()
            if done:
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    # Guarded: no episode runs when min_batch_size <= 0, which previously
    # raised ZeroDivisionError here.
    log['avg_reward'] = total_reward / num_episodes if num_episodes else 0.0
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps if num_steps else 0.0
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log

In [None]:
def main_loop():
    """Alternate sample collection and PPO updates for `args.max_iter_num` iterations.

    Prints timing and reward statistics every `args.log_interval` iterations
    and pickles `(policy_net, value_net, running_state)` every
    `args.save_model_interval` iterations when that interval is positive.

    NOTE(review): `args` and `agent` are not defined in the visible notebook
    cells; they must exist at notebook level before this is called.
    """
    for i_iter in range(args.max_iter_num):
        # generate multiple trajectories that reach the minimum batch_size
        batch, log = agent.collect_samples(args.min_batch_size)
        t0 = time.time()
        update_params(batch, i_iter)
        t1 = time.time()

        if i_iter % args.log_interval == 0:
            print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                i_iter, log['sample_time'], t1-t0, log['min_reward'], log['max_reward'], log['avg_reward']))

        if args.save_model_interval > 0 and (i_iter+1) % args.save_model_interval == 0:
            if use_gpu:
                # Move the networks to the CPU before pickling them.
                policy_net.cpu(), value_net.cpu()
            model_path = os.path.join(assets_dir(), 'learned_models/{}_ppo.p'.format(args.env_name))
            # The context manager closes the file; the handle was previously
            # opened inline and never closed.
            with open(model_path, 'wb') as model_file:
                pickle.dump((policy_net, value_net, running_state), model_file)
            if use_gpu:
                policy_net.cuda(), value_net.cuda()

        # clean up gpu memory
        if use_gpu:
            torch.cuda.empty_cache()