In [1]:
import math
import time
import random
from collections import deque
from collections import namedtuple
from dataclasses import dataclass

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F  #activation function https://pytorch.org/docs/stable/nn.functional.html

from torch.distributions.categorical import Categorical

from torch.profiler import profile, record_function, ProfilerActivity

import gym
import rl_gym

import matplotlib.pyplot as plt

from tqdm import tqdm
import gc


In [None]:
# Select the torch device used by every model/tensor in this notebook.
# The CPU is forced here; the commented line picks CUDA automatically when present.
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

print('Using device:', device)
# Extra diagnostics are only meaningful (and only printed) on a CUDA device
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_gb = round(torch.cuda.memory_allocated(0)/1024**3,1)
    print('Allocated:', allocated_gb, 'GB')
    reserved_gb = round(torch.cuda.memory_reserved(0)/1024**3,1)
    print('Cached:   ', reserved_gb, 'GB')
    


In [None]:
#--------------------------------
class SimpleMLP(nn.Module):
    """Fully connected network with two equally sized ReLU hidden layers.

    Maps vectors of ``num_observations`` features to ``num_actions`` raw outputs;
    no activation is applied after the last layer.
    """

    def __init__(self, num_observations, num_actions, num_neurons):
        super().__init__()

        # input -> hidden -> hidden -> output, all dense layers
        self.layer1 = nn.Linear(in_features=num_observations, out_features=num_neurons)
        self.layer2 = nn.Linear(in_features=num_neurons, out_features=num_neurons)
        self.layer3 = nn.Linear(in_features=num_neurons, out_features=num_actions)

    def forward(self, x):
        """Run ``x`` through the network.

        Called with either a single observation (to pick the next action) or a
        batch of observations (during optimization); the leading dimensions of
        ``x`` are preserved and the last one becomes ``num_actions``.
        """
        hidden = x
        for dense in (self.layer1, self.layer2):
            hidden = F.relu(dense(hidden))
        return self.layer3(hidden)

#--------------------------------
#--------------------------------

def reward_to_go(rewards):
    """Return the undiscounted reward-to-go for every step of one episode.

    Element ``i`` of the result is ``rewards[i] + rewards[i+1] + ... + rewards[-1]``.
    The result is a numpy array with the shape and dtype numpy infers from ``rewards``.
    """
    rtgs = np.zeros_like(rewards)
    last = len(rewards) - 1
    if last < 0:
        # empty episode: nothing to accumulate
        return rtgs

    # the final step has no future rewards; every earlier step adds its own
    # reward to the value already computed for the following step
    rtgs[last] = rewards[last] + 0
    for step in range(last - 1, -1, -1):
        rtgs[step] = rewards[step] + rtgs[step + 1]
    return rtgs
        
    
#--------------------------------
#--------------------------------

@dataclass
class RLVPGParams:
    """Hyper-parameters consumed by RLVPG."""

    # width of each hidden layer of the SimpleMLP policy network
    num_neurons: int = 100
    # step cap applied to every training episode
    max_episode_length: int = 600
    # number of policy updates (one per epoch)
    num_epochs: int = 1_000
    # episodes collected with the current policy before each update
    num_episodes_per_epoch: int = 10
    # learning rate handed to the AdamW optimizer
    learning_rate: float = 1e-3
    
class RLVPG():
    """Vanilla policy-gradient agent (reward-to-go weighting) for environments
    with a 1-D observation vector and a discrete action space.

    The policy is a SimpleMLP producing one logit per action. Environments are
    driven through the classic gym API used throughout this file: ``reset()``
    returns the observation and ``step()`` returns ``(obs, reward, done, info)``.
    """

    def __init__(self, env: gym.Env, params: RLVPGParams, device):
        """Build the policy network sized from ``env``.

        Args:
            env: only ``observation_space.shape[0]`` and ``action_space.n`` are read here.
            params: hyper-parameters; ``num_neurons`` fixes the width of the network.
            device: torch device holding the model and every tensor created by this class.
        """
        self.params = params
        self.num_observations = env.observation_space.shape[0]
        self.num_actions = env.action_space.n
        self.device = device

        self.model = SimpleMLP(num_observations=self.num_observations,
                               num_actions=self.num_actions,
                               num_neurons=self.params.num_neurons).to(self.device)

    def _policy_logits(self, obs):
        """Return the action logits for one observation or for a batch of observations."""
        return self.model(torch.tensor(obs, dtype=torch.float32, device=self.device))

    def _run_episode(self, env, max_steps):
        """Roll out one episode by sampling actions from the current policy.

        The episode stops when the environment reports ``done`` or after
        ``max_steps`` steps, whichever comes first (at least one step is taken).

        Returns:
            (observations, actions, rewards): three lists with one entry per step.
        """
        observations, actions, rewards = [], [], []
        obs = env.reset()
        done = False
        while not done:
            # sampling needs no gradient; the update recomputes the logits in one batch
            with torch.no_grad():
                action = Categorical(logits=self._policy_logits(obs)).sample().item()

            next_obs, reward, done, _ = env.step(action)

            observations.append(obs)
            actions.append(action)
            rewards.append(reward)

            obs = next_obs
            if len(rewards) >= max_steps:
                done = True
        return observations, actions, rewards

    def train(self, env, params):
        """Train the policy with one gradient step per epoch.

        Each epoch collects ``params.num_episodes_per_epoch`` episodes with the
        current policy and minimises ``-mean(log_prob(action) * reward_to_go)``.

        Args:
            env: environment to interact with.
            params: replaces ``self.params``. The network itself is not rebuilt,
                so a different ``num_neurons`` has no effect here.

        Returns:
            list with the total (undiscounted) reward of every training episode,
            in the order the episodes were played.
        """
        self.params = params
        print("start training with params:")
        print(self.params)

        # https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
        optimizer = optim.AdamW(self.model.parameters(), lr=self.params.learning_rate, amsgrad=True)

        scores = []        # for logging only
        score_total = 0.0  # running sum of `scores`, keeps the progress-bar mean O(1) per epoch

        #main training loop
        pbar = tqdm(range(self.params.num_epochs))
        for _ in pbar:

            batch_obs = []
            batch_actions = []
            batch_weights = []

            #run the policy for several episodes to collect a batch of trajectories
            for _ in range(self.params.num_episodes_per_epoch):
                observations, actions, rewards = self._run_episode(env, self.params.max_episode_length)

                batch_obs.extend(observations)
                batch_actions.extend(actions)
                #cumulative rewards to end of episode for each time step
                batch_weights.extend(reward_to_go(rewards))

                episode_score = sum(rewards)
                scores.append(episode_score)
                score_total += episode_score

            #adjust the policy model using the last batch of trajectories
            obs_batch = np.array(batch_obs)
            action_batch = torch.tensor(batch_actions, dtype=torch.long, device=self.device)  #action indices are integers
            weight_batch = torch.tensor(np.array(batch_weights), dtype=torch.float32, device=self.device)

            optimizer.zero_grad()
            logp = Categorical(logits=self._policy_logits(obs_batch)).log_prob(action_batch)
            loss = -(logp * weight_batch).mean()
            loss.backward()
            optimizer.step()

            #mean score over every episode played so far
            mean_score = score_total / len(scores) if scores else float('nan')
            pbar.set_description(f'score:{mean_score:.0f}')

        return scores

    def best_action(self, obs):
        """Return the index of the action with the greatest logit for ``obs`` (greedy policy)."""
        with torch.no_grad():
            return torch.argmax(self._policy_logits(obs), dim=-1).item()

    def play(self, env, max_steps=None):
        """Render one greedy episode, then close ``env``.

        Args:
            env: environment to render and step.
            max_steps: optional step cap; ``None`` (default) plays until the
                environment reports ``done``.

        The environment is closed even if the episode is interrupted or fails.
        """
        try:
            obs = env.reset()
            done = False
            steps = 0
            while not done:
                env.render()
                obs, _, done, _ = env.step(self.best_action(obs))
                steps += 1
                if max_steps is not None and steps >= max_steps:
                    done = True
        finally:
            env.close()

    def evaluate(self, env, num_episodes=100, episode_length=600):
        """Play ``num_episodes`` greedy episodes and summarise their total rewards.

        Each episode stops when the environment reports ``done`` or after
        ``episode_length`` steps, whichever comes first.

        Returns:
            (mean_reward, std_reward, scores) where ``scores`` lists the total
            reward of each episode.
        """
        scores = []
        for _ in range(num_episodes):
            obs = env.reset()
            score = 0
            done = False
            episode_t = 0
            while not done:
                obs, reward, done, _ = env.step(self.best_action(obs))
                score += reward
                episode_t += 1
                if episode_t >= episode_length:  #force episode termination
                    done = True
            scores.append(score)
        mean_reward = np.mean(scores)
        std_reward = np.std(scores)
        return mean_reward, std_reward, scores


            

In [None]:
# Create the PuckWorld environment and a fresh (untrained) VPG agent, then time a
# greedy evaluation of that agent as a baseline.
# The TimeLimit wrapper below is commented out, so episode length is bounded by the
# explicit step limits passed to the agent (or by the environment reporting done).
#env = gym.wrappers.TimeLimit(gym.make("rl_gym/PuckWorld-v0", fps=60), max_episode_steps=600)
env = gym.make("rl_gym/PuckWorld-v0", fps=60)

print(device)
params = RLVPGParams()
vpg = RLVPG(env, params, device)

# 100 greedy episodes of at most ~600 steps each, before any training
start_time = time.time()
mean_reward, std_reward, scores = vpg.evaluate(env, num_episodes=100, episode_length=600)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
print("--- %.1f seconds ---" % (time.time() - start_time))
#print(scores)

In [None]:
# Train the VPG agent created in the previous cell, plot the training scores and
# evaluate the trained policy greedily.
# 10_000 epochs x 50 episodes per epoch: this is the long-running cell for the VPG agent.
params = RLVPGParams(num_epochs=10_000, num_episodes_per_epoch=50)

print('train')
start_time = time.time()
scores = vpg.train(env, params)
print("--- %.1f minutes ---" % ((time.time() - start_time)/60))

# Raw per-episode training scores overlaid with a 20-episode moving average.
# mode='valid' drops the partial windows, so the smoothed curve is window-1 points
# shorter than the raw one and is drawn starting at x=0.
plt.plot(scores)
window=20
y = np.array(scores)
plt.plot(np.convolve(y, np.ones(window)/window, mode='valid'))
plt.show()

# NOTE: `scores` is rebound below from the training history to the evaluation scores.
print('evaluate')
start_time = time.time()
mean_reward, std_reward, scores = vpg.evaluate(env, num_episodes=100, episode_length=600)
print("--- %s seconds ---" % (time.time() - start_time))
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



In [None]:
# Learning-rate sweep for a DQN agent: four scenarios with learning_rate from 1e-5 to 1e-2.
# The last scenario also uses epsilon_half_life=400 where the others use 100.
# NOTE(review): RLDQNParams, RLDQN and TrainingRecord are neither defined nor imported in
# the visible part of this notebook - presumably left over from a DQN notebook. On a fresh
# kernel this cell would raise NameError; confirm before relying on it.
scenarios = [RLDQNParams(num_neurons=128, train_period=1, gamma=0.90, memory_batch=250, epsilon_half_life=100, max_time_steps=400*1000, learning_rate=0.00001),
             RLDQNParams(num_neurons=128, train_period=1, gamma=0.90, memory_batch=250, epsilon_half_life=100, max_time_steps=400*1000, learning_rate=0.0001),
             RLDQNParams(num_neurons=128, train_period=1, gamma=0.90, memory_batch=250, epsilon_half_life=100, max_time_steps=400*1000, learning_rate=0.001),
             RLDQNParams(num_neurons=128, train_period=1, gamma=0.90, memory_batch=250, epsilon_half_life=400, max_time_steps=400*1000, learning_rate=0.01),
            ]
# one dict per scenario, consumed by the reporting cell that follows
results = []

# NOTE: the loop rebinds the notebook-level names `params` and `dqn`; later cells see the last scenario.
for params in scenarios:
    
    dqn = RLDQN(env, params, device)
    
    print('=======================')
    print('\t\ttrain')
    start_time = time.time()
    dqn.train(env, params)
    training_time = time.time() - start_time
    print("--- %s minutes ---" % (training_time/60))
    
    # zip(*records) transposes the per-entry records into one tuple per field
    # (assumes dqn.training_record is a sequence of TrainingRecord-like tuples - TODO confirm)
    log = TrainingRecord(*zip(*dqn.training_record))
    #y = np.array(log.loss_mean)
    #y = np.array(log.loss_std)
    y = np.array(log.score)
    #y = np.array(log.epsilon)
    window=50
    # raw scores plus their moving average; the x range shifts the smoothed
    # curve by window/2 so it is centred on the raw one
    y_runningmean = np.convolve(y, np.ones(window)/window, mode='valid')
    plt.plot(y)
    plt.plot(range(int(window/2),len(y)-int(window/2)+1),y_runningmean)
    plt.show()
       

    print('\t\tevaluate')
    start_time = time.time()
    mean_reward, std_reward, scores = dqn.evaluate(env, num_episodes=100, episode_length=600)
    evaluating_time = time.time() - start_time
    print("--- %s seconds ---" % (evaluating_time))
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
    
    # training_time and evaluating_time are stored in seconds
    r = {"params": params, 
         "log":TrainingRecord(*zip(*dqn.training_record)),
         "mean_reward": mean_reward,
         "std_reward": std_reward,
         "training_time": training_time,
         "evaluating_time": evaluating_time   
    }
    results.append(r)

    print('=======================')
    


In [None]:
# Report the sweep stored in `results`, labelling each scenario by its learning rate.
# Depends on `results` as left in the kernel by the most recently run sweep cell.
window=50

# training time is printed in minutes, evaluation time in seconds
print("Timing results")
for r in results:
    label = '%f'%(r["params"].learning_rate)
    print(label, r["training_time"]/60, r["evaluating_time"])

# mean/std evaluation reward per scenario, plus the moving average of the training scores
print("\nPerformance results")
for r in results:
    label = '%f'%(r["params"].learning_rate)
    print(label, r["mean_reward"], r["std_reward"])

    y = np.array(r["log"].score)
#    y = np.array(r["log"].loss_mean)
#    y = np.array(r["log"].loss_std)
#    plt.plot(y, label=label)
    plt.plot(np.convolve(y, np.ones(window)/window, mode='valid'), label=label)
plt.legend()
plt.show()

In [None]:
# epsilon_half_life sweep for a DQN agent (50 / 100 / 400), other settings identical.
# NOTE(review): RLDQNParams, RLDQN and TrainingRecord are neither defined nor imported in
# the visible part of this notebook - presumably left over from a DQN notebook; confirm.
scenarios = [RLDQNParams(num_neurons=128, train_period=1, gamma=0.90, memory_batch=250, epsilon_half_life=50),
             RLDQNParams(num_neurons=128, train_period=1, gamma=0.90, memory_batch=250, epsilon_half_life=100),
             RLDQNParams(num_neurons=128, train_period=1, gamma=0.90, memory_batch=250, epsilon_half_life=400),
            ]
#max_time_steps: int = 1000*600
# one dict per scenario, consumed by the reporting cell that follows
results = []

# NOTE: the loop rebinds the notebook-level names `params` and `dqn`; later cells see the last scenario.
for params in scenarios:
    
    dqn = RLDQN(env, params, device)
    
    print('=======================')
    print('\t\ttrain')
    start_time = time.time()
    dqn.train(env, params)
    training_time = time.time() - start_time
    print("--- %s minutes ---" % (training_time/60))
    
    # zip(*records) transposes the per-entry records into one tuple per field
    # (assumes dqn.training_record is a sequence of TrainingRecord-like tuples - TODO confirm)
    log = TrainingRecord(*zip(*dqn.training_record))
    #y = np.array(log.loss_mean)
    #y = np.array(log.loss_std)
    y = np.array(log.score)
    #y = np.array(log.epsilon)
    window=50
    # raw scores plus their moving average; the x range shifts the smoothed
    # curve by window/2 so it is centred on the raw one
    y_runningmean = np.convolve(y, np.ones(window)/window, mode='valid')
    plt.plot(y)
    plt.plot(range(int(window/2),len(y)-int(window/2)+1),y_runningmean)
    plt.show()
       

    print('\t\tevaluate')
    start_time = time.time()
    mean_reward, std_reward, scores = dqn.evaluate(env, num_episodes=100, episode_length=600)
    evaluating_time = time.time() - start_time
    print("--- %s seconds ---" % (evaluating_time))
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
    
    # training_time and evaluating_time are stored in seconds
    r = {"params": params, 
         "log":TrainingRecord(*zip(*dqn.training_record)),
         "mean_reward": mean_reward,
         "std_reward": std_reward,
         "training_time": training_time,
         "evaluating_time": evaluating_time   
    }
    results.append(r)

    print('=======================')
    


In [None]:
# Report the sweep stored in `results`, labelling each scenario by its epsilon_half_life.
# Depends on `results` as left in the kernel by the most recently run sweep cell.
window=50

# training time is printed in minutes, evaluation time in seconds
print("Timing results")
for r in results:
    label = '%d'%(r["params"].epsilon_half_life)
    print(label, r["training_time"]/60, r["evaluating_time"])

# mean/std evaluation reward per scenario, plus the moving average of the training scores
print("\nPerformance results")
for r in results:
    label = '%d'%(r["params"].epsilon_half_life)
    print(label, r["mean_reward"], r["std_reward"])

    y = np.array(r["log"].score)
#    y = np.array(r["log"].loss_mean)
#    y = np.array(r["log"].loss_std)
#    plt.plot(y, label=label)
    plt.plot(np.convolve(y, np.ones(window)/window, mode='valid'), label=label)
plt.legend()
plt.show()

In [None]:
# memory_batch sweep for a DQN agent (25 / 250 / 1000), other settings identical.
# NOTE(review): RLDQNParams, RLDQN and TrainingRecord are neither defined nor imported in
# the visible part of this notebook - presumably left over from a DQN notebook; confirm.
scenarios = [RLDQNParams(num_neurons=128, train_period=1, gamma=0.90, memory_batch=25),
             RLDQNParams(num_neurons=128, train_period=1, gamma=0.90, memory_batch=250),
             RLDQNParams(num_neurons=128, train_period=1, gamma=0.90, memory_batch=1_000),
            ]

# one dict per scenario, consumed by the reporting cells that follow
results = []

# NOTE: the loop rebinds the notebook-level names `params` and `dqn`; later cells see the last scenario.
for params in scenarios:

    dqn = RLDQN(env, params, device)

    print('=======================')
    print('\t\ttrain')
    start_time = time.time()
    dqn.train(env, params)
    training_time = time.time() - start_time
    # report the training duration here: start_time is reset before the evaluation below
    print("--- %s minutes ---" % (training_time/60))

    # zip(*records) transposes the per-entry records into one tuple per field
    # (assumes dqn.training_record is a sequence of TrainingRecord-like tuples - TODO confirm)
    log = TrainingRecord(*zip(*dqn.training_record))
    #y = np.array(log.loss_mean)
    #y = np.array(log.loss_std)
    y = np.array(log.score)
    #y = np.array(log.epsilon)
    window=50
    # raw scores plus their moving average; the x range shifts the smoothed
    # curve by window/2 so it is centred on the raw one
    y_runningmean = np.convolve(y, np.ones(window)/window, mode='valid')
    plt.plot(y)
    plt.plot(range(int(window/2),len(y)-int(window/2)+1),y_runningmean)
    plt.show()


    print('\t\tevaluate')
    start_time = time.time()
    mean_reward, std_reward, scores = dqn.evaluate(env, num_episodes=100, episode_length=600)
    evaluating_time = time.time() - start_time
    print("--- %s seconds ---" % (evaluating_time))
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

    # training_time and evaluating_time are stored in seconds
    r = {"params": params,
         "log": log,
         "mean_reward": mean_reward,
         "std_reward": std_reward,
         "training_time": training_time,
         "evaluating_time": evaluating_time
    }
    results.append(r)

    print('=======================')
    


In [None]:
# Report the sweep stored in `results`, labelling each scenario as
# num_neurons-gamma-memory_batch.
# Depends on `results` as left in the kernel by the most recently run sweep cell.
window=50

# both times are printed exactly as stored (seconds)
print("Timing results")
for r in results:
    label = '%d-%.2f-%d'%(r["params"].num_neurons, r["params"].gamma, r["params"].memory_batch)
    print(label, r["training_time"], r["evaluating_time"])

# mean/std evaluation reward per scenario, plus the moving average of the training scores
print("\nPerformance results")
for r in results:
    label = '%d-%.2f-%d'%(r["params"].num_neurons, r["params"].gamma, r["params"].memory_batch)
    print(label, r["mean_reward"], r["std_reward"])

    y = np.array(r["log"].score)
#    y = np.array(r["log"].loss_mean)
#    y = np.array(r["log"].loss_std)
#    plt.plot(y, label=label)
    plt.plot(np.convolve(y, np.ones(window)/window, mode='valid'), label=label)
plt.legend()
plt.show()

In [None]:
# Report the sweep stored in `results`, labelling each scenario as num_neurons-gamma.
# NOTE(review): when the notebook is run top to bottom, `results` still holds the
# memory_batch sweep, whose scenarios all share num_neurons=128 and gamma=0.90, so every
# label printed/plotted here is the same - presumably a leftover from a gamma sweep; confirm.
window=50

# both times are printed exactly as stored (seconds)
print("Timing results")
for r in results:
    label = '%d-%f'%(r["params"].num_neurons, r["params"].gamma)
    print(label, r["training_time"], r["evaluating_time"])

# mean/std evaluation reward per scenario, plus the moving average of the training scores
print("\nPerformance results")
for r in results:
    label = '%d-%f'%(r["params"].num_neurons, r["params"].gamma)
    print(label, r["mean_reward"], r["std_reward"])

    y = np.array(r["log"].score)
#    y = np.array(r["log"].loss_mean)
#    y = np.array(r["log"].loss_std)
#    plt.plot(y, label=label)
    plt.plot(np.convolve(y, np.ones(window)/window, mode='valid'), label=label)
plt.legend()
plt.show()

In [None]:
# gamma (discount) sweep for a DQN agent (0.99 / 0.95 / 0.9 / 0.8), other settings identical.
# NOTE(review): RLDQNParams, RLDQN and TrainingRecord are neither defined nor imported in
# the visible part of this notebook - presumably left over from a DQN notebook; confirm.
scenarios = [RLDQNParams(num_neurons=128, train_period=1, gamma=0.99),
             RLDQNParams(num_neurons=128, train_period=1, gamma=0.95),
             RLDQNParams(num_neurons=128, train_period=1, gamma=0.9),
             RLDQNParams(num_neurons=128, train_period=1, gamma=0.8)]

# one dict per scenario, consumed by the reporting cells that follow
results = []

# NOTE: the loop rebinds the notebook-level names `params` and `dqn`; later cells see the last scenario.
for params in scenarios:

    dqn = RLDQN(env, params, device)

    print('=======================')
    print('\t\ttrain')
    start_time = time.time()
    dqn.train(env, params)
    training_time = time.time() - start_time
    # report the training duration here: start_time is reset before the evaluation below
    print("--- %s minutes ---" % (training_time/60))

    # zip(*records) transposes the per-entry records into one tuple per field
    # (assumes dqn.training_record is a sequence of TrainingRecord-like tuples - TODO confirm)
    log = TrainingRecord(*zip(*dqn.training_record))
    #y = np.array(log.loss_mean)
    #y = np.array(log.loss_std)
    y = np.array(log.score)
    #y = np.array(log.epsilon)
    window=50
    # raw scores plus their moving average; the x range shifts the smoothed
    # curve by window/2 so it is centred on the raw one
    y_runningmean = np.convolve(y, np.ones(window)/window, mode='valid')
    plt.plot(y)
    plt.plot(range(int(window/2),len(y)-int(window/2)+1),y_runningmean)
    plt.show()


    print('\t\tevaluate')
    start_time = time.time()
    mean_reward, std_reward, scores = dqn.evaluate(env, num_episodes=100, episode_length=600)
    evaluating_time = time.time() - start_time
    print("--- %s seconds ---" % (evaluating_time))
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

    # training_time and evaluating_time are stored in seconds
    r = {"params": params,
         "log": log,
         "mean_reward": mean_reward,
         "std_reward": std_reward,
         "training_time": training_time,
         "evaluating_time": evaluating_time
    }
    results.append(r)

    print('=======================')
    


In [None]:
# Report the sweep stored in `results`, labelling each scenario as num_neurons-gamma.
# Depends on `results` as left in the kernel by the most recently run sweep cell.
window=50

# both times are printed exactly as stored (seconds)
print("Timing results")
for r in results:
    label = '%d-%f'%(r["params"].num_neurons, r["params"].gamma)
    print(label, r["training_time"], r["evaluating_time"])

# NOTE(review): results[1:] leaves the first scenario out of the performance print-out and
# plot, while the timing loop above covers every scenario - presumably deliberate; confirm.
print("\nPerformance results")
for r in results[1:]:
    label = '%d-%f'%(r["params"].num_neurons, r["params"].gamma)
    print(label, r["mean_reward"], r["std_reward"])

    y = np.array(r["log"].score)
#    y = np.array(r["log"].loss_mean)
#    plt.plot(y, label=label)
    plt.plot(np.convolve(y, np.ones(window)/window, mode='valid'), label=label)
plt.legend()
plt.show()

In [None]:
# Same report as the score version of this cell, but the plotted series is the moving
# average of the logged loss_mean instead of the score. Labels are num_neurons-gamma.
# Depends on `results` as left in the kernel by the most recently run sweep cell.
window=50

# both times are printed exactly as stored (seconds)
print("Timing results")
for r in results:
    label = '%d-%f'%(r["params"].num_neurons, r["params"].gamma)
    print(label, r["training_time"], r["evaluating_time"])

print("\nPerformance results")
for r in results:
    label = '%d-%f'%(r["params"].num_neurons, r["params"].gamma)
    print(label, r["mean_reward"], r["std_reward"])

#    y = np.array(r["log"].score)
    y = np.array(r["log"].loss_mean)
#    plt.plot(y, label=label)
    plt.plot(np.convolve(y, np.ones(window)/window, mode='valid'), label=label)
plt.legend()
plt.show()

In [None]:


# Network-width sweep for a DQN agent (num_neurons 32 / 512 / 1024 / 2048).
# NOTE(review): RLDQNParams, RLDQN and TrainingRecord are neither defined nor imported in
# the visible part of this notebook - presumably left over from a DQN notebook; confirm.
scenarios = [RLDQNParams(num_neurons=32, train_period=1),
             RLDQNParams(num_neurons=512, train_period=1),
             RLDQNParams(num_neurons=1024, train_period=1),
             RLDQNParams(num_neurons=2048, train_period=1)
            ]

# one dict per scenario, consumed by the reporting cells that follow
results = []

# NOTE: the loop rebinds the notebook-level names `params` and `dqn`; later cells see the last scenario.
for params in scenarios:

    dqn = RLDQN(env, params, device)

    print('=======================')
    print('\t\ttrain')
    start_time = time.time()
    dqn.train(env, params)
    training_time = time.time() - start_time
    # report the training duration here: start_time is reset before the evaluation below
    print("--- %s minutes ---" % (training_time/60))

    # zip(*records) transposes the per-entry records into one tuple per field
    # (assumes dqn.training_record is a sequence of TrainingRecord-like tuples - TODO confirm)
    log = TrainingRecord(*zip(*dqn.training_record))
    #y = np.array(log.loss_mean)
    #y = np.array(log.loss_std)
    y = np.array(log.score)
    #y = np.array(log.epsilon)
    window=50
    # raw scores plus their moving average; the x range shifts the smoothed
    # curve by window/2 so it is centred on the raw one
    y_runningmean = np.convolve(y, np.ones(window)/window, mode='valid')
    plt.plot(y)
    plt.plot(range(int(window/2),len(y)-int(window/2)+1),y_runningmean)
    plt.show()


    print('\t\tevaluate')
    start_time = time.time()
    mean_reward, std_reward, scores = dqn.evaluate(env, num_episodes=100, episode_length=600)
    evaluating_time = time.time() - start_time
    print("--- %s seconds ---" % (evaluating_time))
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

    # training_time and evaluating_time are stored in seconds
    r = {"params": params,
         "log": log,
         "mean_reward": mean_reward,
         "std_reward": std_reward,
         "training_time": training_time,
         "evaluating_time": evaluating_time
    }
    results.append(r)

    print('=======================')
    






In [None]:
# Report the sweep stored in `results`, labelling each scenario as num_neurons-train_period
# and plotting the moving average of the logged loss_mean.
# Depends on `results` as left in the kernel by the most recently run sweep cell.
window=50

# both times are printed exactly as stored (seconds)
print("Timing results")
for r in results:
    label = '%d-%d'%(r["params"].num_neurons, r["params"].train_period)
    print(label, r["training_time"], r["evaluating_time"])

print("\nPerformance results")
for r in results:
    label = '%d-%d'%(r["params"].num_neurons, r["params"].train_period)
    print(label, r["mean_reward"], r["std_reward"])

#    y = np.array(r["log"].score)
    y = np.array(r["log"].loss_mean)
#    plt.plot(y, label=label)
    plt.plot(np.convolve(y, np.ones(window)/window, mode='valid'), label=label)
plt.legend()
plt.show()

In [None]:
# Hand-recorded GPU vs CPU comparison. Each row starts with a size (32 / 512 / 1024 / 2048),
# which matches the num_neurons values of the width sweep.
# NOTE(review): the second column of the *_timing tables is presumably the training time
# in seconds; the second/third columns of the *_score tables are presumably mean and std
# of the evaluation reward - confirm against the recorded runs.
gpu_timing = np.array([[32,3033],[512,3058],[1024,3059],[2048,4523]])
cpu_timing = np.array([[32,1532],[512,4814],[1024,12001],[2048,41592]])

# each curve is plotted against the first column of its own table
plt.plot(gpu_timing[:,0],gpu_timing[:,1], label="gpu")
plt.plot(cpu_timing[:,0],cpu_timing[:,1], label="cpu")
plt.legend()
plt.show()

gpu_score = np.array([[32,-179,69],[512,-159,138],[1024,-136,51],[2048,-138,75]])
cpu_score = np.array([[32,-196,105],[512,-137,59],[1024,-138,68],[2048,-162,103]])

# only the second column is plotted; the third column is not drawn
plt.plot(gpu_score[:,0],gpu_score[:,1], label="gpu")
plt.plot(cpu_score[:,0],cpu_score[:,1], label="cpu")
plt.legend()
plt.show()

In [None]:
# Report the sweep stored in `results`, labelling each scenario as num_neurons-train_period
# and plotting the moving average of the logged loss_mean.
# NOTE(review): this cell repeats an earlier loss_mean report cell verbatim - presumably
# kept to re-plot after a different run; confirm whether both are needed.
window=50

# both times are printed exactly as stored (seconds)
print("Timing results")
for r in results:
    label = '%d-%d'%(r["params"].num_neurons, r["params"].train_period)
    print(label, r["training_time"], r["evaluating_time"])

print("\nPerformance results")
for r in results:
    label = '%d-%d'%(r["params"].num_neurons, r["params"].train_period)
    print(label, r["mean_reward"], r["std_reward"])

#    y = np.array(r["log"].score)
    y = np.array(r["log"].loss_mean)
#    plt.plot(y, label=label)
    plt.plot(np.convolve(y, np.ones(window)/window, mode='valid'), label=label)
plt.legend()
plt.show()

In [None]:
# Report the sweep stored in `results`, labelling each scenario as num_neurons-train_period
# and plotting the moving average of the logged training score.
# Depends on `results` as left in the kernel by the most recently run sweep cell.
window=50

# both times are printed exactly as stored (seconds)
print("Timing results")
for r in results:
    label = '%d-%d'%(r["params"].num_neurons, r["params"].train_period)
    print(label, r["training_time"], r["evaluating_time"])

print("\nPerformance results")
for r in results:
    label = '%d-%d'%(r["params"].num_neurons, r["params"].train_period)
    print(label, r["mean_reward"], r["std_reward"])

    y = np.array(r["log"].score)
#    y = np.array(r["log"].loss_mean)
#    plt.plot(y, label=label)
    plt.plot(np.convolve(y, np.ones(window)/window, mode='valid'), label=label)
plt.legend()
plt.show()

In [None]:
# Compact report of the sweep stored in `results`: one line per scenario with
# num_neurons, mean reward and std, plus the moving average of the training score
# labelled by num_neurons alone.
# Depends on `results` as left in the kernel by the most recently run sweep cell.
window=50
for r in results:
    
    label = r["params"].num_neurons
    
    print(r["params"].num_neurons, r["mean_reward"], r["std_reward"])

    y = np.array(r["log"].score)
#    y = np.array(r["log"].loss_mean)
#    plt.plot(y, label=label)
    plt.plot(np.convolve(y, np.ones(window)/window, mode='valid'), label=label)
plt.legend()
plt.show()

In [None]:

# Compare the wall-clock time of 10 evaluation episodes for two DQN agents:
# `dqn` (whatever the last sweep loop left in the kernel) and `dqn_c`, a newly
# constructed agent placed on the CPU. No train() call is made on dqn_c in this cell,
# so only the timing is comparable, not the rewards.
# NOTE(review): the "GPU" label only holds if `device` was a CUDA device when `dqn` was
# built; the device cell of this notebook sets it to CPU. RLDQN is not defined in the
# visible part of this notebook - confirm where it comes from.
dqn_c = RLDQN(env, params, torch.device('cpu'))

print("evaluating on GPU")
start_time = time.time()
dqn.evaluate(env,num_episodes=10, episode_length=600)
print("--- %s seconds ---" % (time.time() - start_time))

print("evaluating on CPU")
start_time = time.time()
dqn_c.evaluate(env,num_episodes=10, episode_length=600)
print("--- %s seconds ---" % (time.time() - start_time))





In [None]:
# Profile one evaluation episode of `dqn` with the PyTorch profiler, requesting both
# CUDA and CPU activities and recording operator input shapes. The whole evaluation is
# wrapped in a record_function region named "eval_gpu".
# `dqn` is whatever the last sweep loop left in the kernel.
with profile(activities=[ProfilerActivity.CUDA,ProfilerActivity.CPU], 
             #profile_memory=True,
             record_shapes=True) as prof:
    with record_function("eval_gpu"):
        dqn.evaluate(env,num_episodes=1, episode_length=600)

# top 10 profiler entries ordered by total CPU time
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))        

In [None]:
# Profile one evaluation episode of `dqn_c` (the agent built on the CPU) with the PyTorch
# profiler, requesting both CUDA and CPU activities and recording operator input shapes.
# The whole evaluation is wrapped in a record_function region named "eval_cpu".
# Note that `prof` is rebound here, replacing the profile captured for `dqn`.
with profile(activities=[ProfilerActivity.CUDA,ProfilerActivity.CPU], 
             #profile_memory=True,
             record_shapes=True) as prof:
    with record_function("eval_cpu"):
        dqn_c.evaluate(env,num_episodes=1, episode_length=600)
        
# top 10 profiler entries ordered by total CPU time
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

In [None]:
# Render the VPG agent acting greedily; play() closes `env` once the episode ends.
# NOTE(review): `env` is created without the TimeLimit wrapper, so this call only returns
# if the environment itself reports done - confirm PuckWorld does, or interrupt manually.
vpg.play(env)