In [3]:
import math
import time
import random
from collections import deque
from collections import namedtuple
from dataclasses import dataclass

import numpy as np

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F  #activation function https://pytorch.org/docs/stable/nn.functional.html
from torch.profiler import profile, record_function, ProfilerActivity

import gym
import rl_gym

import matplotlib.pyplot as plt

from rldqn import RLDQN, RLDQNParams, TrainingRecord

In [4]:
# Seed every global RNG this notebook imports so that a Restart & Run All
# starts from the same random state each time.
# NOTE(review): whether the environment / RLDQN draw exclusively from these
# generators is not visible from this notebook — confirm, and seed the
# environment explicitly if it keeps its own RNG.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# The device is pinned to the CPU. To try a GPU instead, swap in:
#   torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print('Using device:', device)
if device.type == 'cuda':
    # Only reached when the device above is switched to CUDA.
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

env = gym.make("rl_gym/PuckWorld-v0", fps=60)

# Hyper-parameters handed to both the RLDQN constructor and dqn.train().
params = RLDQNParams(
    num_neurons=128,
    train_period=1,
    gamma=0.90,
    memory_batch=250,
    epsilon_half_life=100,
    max_time_steps=400*1000,
    learning_rate=0.001,
)

dqn = RLDQN(env, params, device=device)

Using device: cpu


In [7]:
# Baseline: score the agent before dqn.train() has been called, and time the run.
print("Evaluate untrained model", "------------------------", sep="\n")
start_time = time.time()
mean_reward, std_reward, scores = dqn.evaluate(
    env,
    num_episodes=100,
    episode_length=600,
)
elapsed_seconds = time.time() - start_time
# "%0.f" renders the elapsed time rounded to whole seconds.
print("--- %0.f seconds ---" % elapsed_seconds)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Evaluate untrained model
------------------------
--- 10 seconds ---
mean_reward:-738.25 +/- 370.90


In [None]:
# Train the agent, then plot one logged metric together with its running mean.
print("Train model")
print("-----------")
start_time = time.time()
dqn.train(env, params)
print("--- %1.f minutes ---" % ((time.time() - start_time)/60))

# zip(*records) transposes the sequence of records into one sequence per field,
# so each attribute of `log` holds the full history of that field.
log = TrainingRecord(*zip(*dqn.training_record))

# Name of the record field to plot; alternatives: "loss_mean", "loss_std", "epsilon".
metric = "score"
y = np.array(getattr(log, metric))

window = 50
fig, ax = plt.subplots()
ax.plot(y, label=metric)

# mode='valid' only yields len(y) - window + 1 smoothed points when the history
# is at least one window long; for shorter histories the smoothed curve is
# skipped instead of failing on mismatched x/y lengths.
if len(y) >= window:
    y_runningmean = np.convolve(y, np.ones(window)/window, mode='valid')
    # Build x from the smoothed array's own length so the two always match,
    # for odd as well as even window sizes. The window // 2 offset roughly
    # centres each mean on the samples it averages.
    offset = window // 2
    x_runningmean = np.arange(offset, offset + len(y_runningmean))
    ax.plot(x_runningmean, y_runningmean, label=f"running mean (window={window})")

ax.set_xlabel("training record index")
ax.set_ylabel(metric)
ax.set_title(f"Training history: {metric}")
ax.legend()
plt.show()

Train model
-----------
start training with params:
RLDQNParams(num_neurons=128, max_episode_length=600, max_time_steps=400000, train_period=1, learning_rate=0.001, memory_size=50000, memory_batch=250, gamma=0.9, epsilon=0.5, epsilon_min=0.05, epsilon_half_life=100, target_update_rate=0.05, log_recent_episodes=100)
Training(t=600, episode=0, episode_t=601, epsilon=0.49653426409720025, score=-386.40708897983137, loss_mean=0.05291391371290313, loss_std=0.11332770635399256)
Training(t=60700, episode=100, episode_t=601, epsilon=0.24766868066200204, score=-78.72883765215568, loss_mean=0.02992271338554623, loss_std=0.021861901717934253)
Training(t=120800, episode=200, episode_t=601, epsilon=0.12353583592541169, score=-85.89994158767541, loss_mean=0.032828322114611196, loss_std=0.023872244919941655)


In [None]:
# Score the agent again with the same evaluation settings as the untrained
# baseline cell, so the two results can be compared directly.
print("Evaluate trained model")
print("------------------------")
start_time = time.time()
mean_reward, std_reward, scores = dqn.evaluate(env, num_episodes=100, episode_length=600)
# Report whole seconds, matching the untrained evaluation cell, rather than
# dumping the raw float with all its decimals.
print("--- %.0f seconds ---" % (time.time() - start_time))
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Print a banner, then hand the environment to the agent's play() method.
print("Play model", "----------", sep="\n")
dqn.play(env)