In [1]:
import gym
import os
import time
import torch
import numpy as np
from gym import wrappers
from PIL import Image
from itertools import count
from collections import namedtuple
import ptan

from PG.pg import PG
from PG.utils import mkdir
from PG.buffer import MeanBuffer

In [2]:
# ---- Environment / reproducibility ------------------------------------
env_name = 'CartPole-v1'
random_seed = 42

# ---- Optimisation -----------------------------------------------------
# The trainer computes lr_base / (1 + episode * lr_decay) after every episode.
lr_base = 1e-4
lr_decay = 1e-4
batch_size = 64             # transitions collected before each policy update

# ---- Returns ----------------------------------------------------------
gamma = 0.99                # discount for future rewards
bellman_steps = 10          # forwarded as steps_count to the experience source
baseline_steps = 50000      # size handed to the reward-baseline MeanBuffer
entropy_beta = 0.01         # forwarded to policy.update() as the entropy weight

# ---- Run length / logging ---------------------------------------------
max_episodes = 100000       # max num of episodes
max_timesteps = 3000        # max timesteps in one episode
log_interval = 50           # print avg reward after this many episodes


In [3]:
# Layer-by-layer description of the policy network.
# The two `None` sizes are placeholders: PG_Trainer substitutes the
# observation size (first layer input) and the number of actions
# (last layer output) once the environment is known.
config = [
    dict(dim=[None, 128], dropout=False, activation='relu'),
    dict(dim=[128, None], dropout=False, activation=False),
]

In [4]:
class PG_Trainer():
    """Training / evaluation harness around a `PG` policy on a gym environment.

    The constructor creates the environment, completes the network `config`
    with the environment's observation size and action count, and builds the
    policy, a ptan `ExperienceSourceFirstLast` and a `MeanBuffer` used as a
    reward baseline. `train()` runs the optimisation loop; `test()` rolls out
    episodes and can dump rendered frames to disk.
    """

    def __init__(self, env_name, config, random_seed=42, lr_base=0.001, lr_decay=0.00005,
                 gamma=0.99, batch_size=32,
                 max_episodes=100000, max_timesteps=3000,
                 log_interval=5, threshold=None, lr_minimum=1e-10,
                 entropy_beta=0.01, bellman_steps=10, baseline_steps=50000):
        """Create the environment, policy, experience source and baseline.

        Args:
            env_name: id passed to `gym.make`.
            config: list of layer dicts; the first layer's `dim[0]` and the
                last layer's `dim[1]` are filled in here. The caller's list
                is not modified (a deep copy is used).
            random_seed: seed for the env, torch and numpy. A falsy value
                (None or 0) skips seeding.
            lr_base: initial learning rate handed to `policy.set_optimizers`.
            lr_decay: per-episode decay factor in
                `lr_base / (1 + episode * lr_decay)`.
            gamma: discount, passed to `PG` and to the experience source.
            batch_size: transitions gathered before each `policy.update`.
            max_episodes: training stops after this many finished episodes
                (None disables the limit).
            max_timesteps: stored only; nothing in this class reads it.
            log_interval: episodes between checkpoint + progress line.
            threshold: 100-episode mean reward that counts as solved;
                defaults to `env.spec.reward_threshold`.
            lr_minimum: lower bound for the decayed learning rate.
            entropy_beta: passed through to `policy.update`.
            bellman_steps: passed to `PG` and as `steps_count` to the
                experience source.
            baseline_steps: size argument for the `MeanBuffer` baseline.
        """
        # Function-scope import: `copy` is not among the notebook's imports.
        import copy

        self.algorithm_name = 'pg'
        self.env_name = env_name
        self.env = gym.make(env_name)

        # Seed *before* the policy is constructed. Previously seeding happened
        # at the end of __init__, i.e. after PG had already built its network,
        # so the seed could not influence anything done during construction.
        self.random_seed = random_seed
        if self.random_seed:
            print("Random Seed: {}".format(self.random_seed))
            self.env.seed(self.random_seed)
            torch.manual_seed(self.random_seed)
            np.random.seed(self.random_seed)

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n
        self.should_record = False
        if threshold is not None:
            self.threshold = threshold
        else:
            # May be None when the environment spec declares no threshold;
            # train() treats that as "never solved".
            self.threshold = self.env.spec.reward_threshold

        # Work on a private copy so the caller's config (and any other
        # trainer built from the same list) keeps its None placeholders.
        self.config = copy.deepcopy(config)
        self.config[0]['dim'][0] = self.state_dim
        self.config[-1]['dim'][1] = self.action_dim

        self.lr_base = lr_base
        self.lr_decay = lr_decay
        self.lr_minimum = lr_minimum
        self.gamma = gamma
        self.batch_size = batch_size

        self.entropy_beta = entropy_beta
        self.bellman_steps = bellman_steps
        self.baseline_steps = baseline_steps

        self.max_episodes = max_episodes
        self.max_timesteps = max_timesteps
        self.log_interval = log_interval

        self.logdir = mkdir('.', 'log')

        prdir = mkdir('.', 'preTrained')
        self.directory = mkdir(prdir, self.algorithm_name)
        self.filename = "{}_{}_{}".format(self.algorithm_name, self.env_name, self.random_seed)

        self.policy = PG(self.env, self.config, self.gamma, self.bellman_steps)

        # The experience source interacts with the environment and returns (s,a,r,s') transitions
        self.exp_source = ptan.experience.ExperienceSourceFirstLast(self.env, self.policy.ptan_agent,
                                                                    gamma=self.gamma,
                                                                    steps_count=self.bellman_steps)

        self.baseline_buffer = MeanBuffer(self.baseline_steps)

        self.reward_history = []
        self.make_plots = False

    def train(self):
        """Run the policy-gradient training loop.

        Transitions are pulled one at a time from the experience source.
        Each reward is pushed into the baseline buffer and the transition is
        queued with scale `reward - baseline`; once `batch_size` transitions
        are queued, `policy.update` is called and the queue is emptied.

        Per finished episode a `episode,reward` line is appended to
        `<logdir>/train_<algorithm>_<seed>.txt`. Every `log_interval`
        episodes the policy is checkpointed and a progress line printed.

        The loop ends when the mean reward of the last 100 episodes reaches
        `self.threshold` (after more than 100 episodes), or when
        `self.max_episodes` episodes have finished. The log file is closed on
        every exit path, including exceptions such as KeyboardInterrupt.
        """
        start_time = time.time()
        print("action_space={}".format(self.env.action_space))
        print("obs_space={}".format(self.env.observation_space))
        print("threshold={} \n".format(self.threshold))

        # loading models
        self.policy.load(self.directory, self.filename)

        log_path = os.path.join(self.logdir,
                                "train_{}_{}.txt".format(self.algorithm_name, self.random_seed))
        progress_fmt = "Ep:{:5d}  Rew:{:8.2f}  Avg Rew:{:8.2f}  LR:{:8.8f}  Loss: {:8.6f}"

        # `with` guarantees the log is flushed and closed even if the run is
        # interrupted from the keyboard.
        with open(log_path, "w+") as log_f:
            print("\nTraining started ... ")
            avg_loss = 0
            total_rewards = []
            episode = 0

            batch_states, batch_actions, batch_scales = [], [], []
            learning_rate = self.lr_base
            self.policy.set_optimizers(lr=learning_rate)

            # each iteration runs one action in the environment and returns a (s,a,r,s') transition
            for exp in self.exp_source:
                self.baseline_buffer.add(exp.reward)
                baseline = self.baseline_buffer.mean()

                batch_states.append(exp.state)
                batch_actions.append(int(exp.action))
                batch_scales.append(exp.reward - baseline)

                # handle when an episode is completed
                episode_rewards = self.exp_source.pop_total_rewards()
                if episode_rewards:
                    episode += 1
                    reward = episode_rewards[0]
                    total_rewards.append(reward)
                    avg_reward = float(np.mean(total_rewards[-100:]))

                    if len(self.policy.loss_list) > 0:
                        avg_loss = np.mean(self.policy.loss_list[-100:])

                    # Print avg reward every log interval:
                    if episode % self.log_interval == 0:
                        self.policy.save(self.directory, self.filename)
                        print(progress_fmt.format(
                            episode, reward, avg_reward, learning_rate, avg_loss))

                    # logging updates:
                    log_f.write('{},{}\n'.format(episode, reward))
                    log_f.flush()

                    # NOTE(review): this decayed value is only reported in the
                    # progress line; it is never handed back to the policy
                    # (set_optimizers is called once, above). Whether calling
                    # set_optimizers again would reset optimizer state cannot
                    # be told from here - confirm before wiring it in.
                    learning_rate = max(self.lr_base / (1.0 + episode * self.lr_decay), self.lr_minimum)

                    # if avg reward > threshold then save and stop training.
                    # A missing threshold (None) means "never solved" instead
                    # of raising on the comparison.
                    solved = (self.threshold is not None
                              and avg_reward >= self.threshold
                              and episode > 100)
                    if solved:
                        print(progress_fmt.format(
                            episode, reward, avg_reward, learning_rate, avg_loss))
                        print("########## Solved! ###########")
                        name = self.filename + '_solved'
                        self.policy.save(self.directory, name)
                        break

                    # Enforce the episode budget, which used to be ignored.
                    if self.max_episodes is not None and episode >= self.max_episodes:
                        print("Reached max_episodes={}, stopping.".format(self.max_episodes))
                        self.policy.save(self.directory, self.filename)
                        break

                if len(batch_states) < self.batch_size:
                    continue

                self.policy.update(batch_states, batch_actions, batch_scales,
                                   self.batch_size, self.entropy_beta)

                batch_states.clear()
                batch_actions.clear()
                batch_scales.clear()

        self.env.close()
        training_time = time.time() - start_time
        print("Training time: {:6.2f} sec".format(training_time))

    def test(self, episodes=3, render=True, save_gif=True):
        """Roll out `episodes` episodes with the current policy.

        Args:
            episodes: number of episodes to run.
            render: accepted for interface compatibility; currently unused.
            save_gif: when true, every step's `rgb_array` frame is written as
                `<t>.jpg` into a per-episode directory created via
                `mkdir(mkdir(mkdir('.', 'gif'), algorithm_name), str(episode))`.

        Prints the total reward of each episode. Evaluation rewards are not
        fed into the training baseline buffer, and the environment is closed
        once, after the last episode.
        """
        gifdir = mkdir('.', 'gif')
        algdir = mkdir(gifdir, self.algorithm_name)

        for episode in range(1, episodes + 1):
            ep_reward = 0.0
            epdir = mkdir(algdir, str(episode))

            # A fresh iteration over the experience source is started for
            # every episode; `t` is the frame index within the episode.
            for t, exp in enumerate(self.exp_source):
                if save_gif:
                    img = self.env.render(mode='rgb_array')
                    img = Image.fromarray(img)
                    img.save('{}/{}.jpg'.format(epdir, t))

                # handle when an episode is completed
                episode_rewards = self.exp_source.pop_total_rewards()
                if episode_rewards:
                    ep_reward = episode_rewards[0]
                    break

            print('Test episode: {}\tReward: {:4.2f}'.format(episode, ep_reward))

        self.env.close()
                   
            

In [5]:
# Build the trainer from the hyper-parameters defined in the cells above,
# then start the training loop.
agent = PG_Trainer(
    env_name,
    config,
    random_seed=random_seed,
    lr_base=lr_base,
    lr_decay=lr_decay,
    gamma=gamma,
    batch_size=batch_size,
    max_episodes=max_episodes,
    max_timesteps=max_timesteps,
    log_interval=log_interval,
    entropy_beta=entropy_beta,
    bellman_steps=bellman_steps,
    baseline_steps=baseline_steps,
)
agent.train()

NETWORK: Sequential(
  (0): Linear(in_features=4, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=2, bias=True)
) Device: cpu
Random Seed: 42
action_space=Discrete(2)
obs_space=Box(4,)
threshold=475.0 

DIR=./preTrained/pg NAME=pg_CartPole-v1_42
No models to load

Training started ... 
Ep:   50  Rew:   11.00  Avg Rew:   21.56  LR:0.00009951  Loss: -0.014341
Ep:  100  Rew:   13.00  Avg Rew:   20.72  LR:0.00009902  Loss: -0.028912
Ep:  150  Rew:    9.00  Avg Rew:   19.79  LR:0.00009853  Loss: -0.045848
Ep:  200  Rew:   10.00  Avg Rew:   19.30  LR:0.00009805  Loss: -0.052478
Ep:  250  Rew:   29.00  Avg Rew:   19.71  LR:0.00009757  Loss: -0.033148
Ep:  300  Rew:   26.00  Avg Rew:   22.11  LR:0.00009710  Loss: 0.013484
Ep:  350  Rew:    9.00  Avg Rew:   21.86  LR:0.00009663  Loss: 0.004037
Ep:  400  Rew:   30.00  Avg Rew:   21.01  LR:0.00009616  Loss: 0.011181
Ep:  450  Rew:   17.00  Avg Rew:   21.15  LR:0.00009570  Loss: 0.019755
Ep:  500  Rew:   13.0

Ep: 5500  Rew:  198.00  Avg Rew:  314.96  LR:0.00006452  Loss: -0.010073
Ep: 5550  Rew:  311.00  Avg Rew:  308.27  LR:0.00006431  Loss: -0.010473
Ep: 5600  Rew:  321.00  Avg Rew:  335.25  LR:0.00006411  Loss: -0.005752
Ep: 5650  Rew:  467.00  Avg Rew:  309.43  LR:0.00006390  Loss: -0.026763
Ep: 5700  Rew:  200.00  Avg Rew:  261.88  LR:0.00006370  Loss: -0.026570
Ep: 5750  Rew:  290.00  Avg Rew:  253.05  LR:0.00006350  Loss: -0.017915
Ep: 5800  Rew:  263.00  Avg Rew:  268.68  LR:0.00006330  Loss: -0.007003
Ep: 5850  Rew:  342.00  Avg Rew:  288.41  LR:0.00006310  Loss: 0.014755
Ep: 5900  Rew:  303.00  Avg Rew:  277.39  LR:0.00006290  Loss: -0.010873
Ep: 5950  Rew:  181.00  Avg Rew:  263.45  LR:0.00006270  Loss: -0.029372
Ep: 6000  Rew:  344.00  Avg Rew:  254.31  LR:0.00006250  Loss: -0.023504
Ep: 6050  Rew:  123.00  Avg Rew:  260.29  LR:0.00006231  Loss: -0.005409
Ep: 6100  Rew:  153.00  Avg Rew:  277.12  LR:0.00006212  Loss: 0.002269
Ep: 6150  Rew:  200.00  Avg Rew:  271.49  LR:0.000061

Ep:11150  Rew:  354.00  Avg Rew:  344.77  LR:0.00004728  Loss: -0.014857
Ep:11200  Rew:  500.00  Avg Rew:  320.44  LR:0.00004717  Loss: -0.025516
Ep:11250  Rew:  289.00  Avg Rew:  303.05  LR:0.00004706  Loss: -0.021524
Ep:11300  Rew:  500.00  Avg Rew:  324.69  LR:0.00004695  Loss: 0.011951
Ep:11350  Rew:  286.00  Avg Rew:  343.53  LR:0.00004684  Loss: -0.015788
Ep:11400  Rew:  281.00  Avg Rew:  344.32  LR:0.00004673  Loss: -0.011324
Ep:11450  Rew:  500.00  Avg Rew:  349.31  LR:0.00004662  Loss: -0.011057
Ep:11500  Rew:  306.00  Avg Rew:  373.29  LR:0.00004651  Loss: 0.004462
Ep:11550  Rew:  486.00  Avg Rew:  392.19  LR:0.00004641  Loss: -0.001847
Ep:11600  Rew:  357.00  Avg Rew:  329.92  LR:0.00004630  Loss: -0.036404
Ep:11650  Rew:  255.00  Avg Rew:  296.01  LR:0.00004619  Loss: -0.011150
Ep:11700  Rew:  250.00  Avg Rew:  301.10  LR:0.00004609  Loss: -0.021540
Ep:11750  Rew:  473.00  Avg Rew:  280.71  LR:0.00004598  Loss: -0.025249
Ep:11800  Rew:  239.00  Avg Rew:  270.01  LR:0.000045

KeyboardInterrupt: 

In [None]:
# Evaluate the trained agent; the arguments spell out test()'s defaults.
agent.test(episodes=3, render=True, save_gif=True)