In [1]:
import torch


print(f"Cuda is available: {torch.cuda.is_available()} \n\
Cuda Device Count: {torch.cuda.device_count()} \n\
Current Cuda Device: {torch.cuda.current_device()} \
[{torch.cuda.get_device_name(0)}]")


Cuda is available: True 
Cuda Device Count: 1 
Current Cuda Device: 0 [NVIDIA GeForce MX250]


In [1]:
import tensorflow as tf

try:
    %tensorflow_version 2.x
except Exception:
    pass

# A hack to force the runtime to restart, needed to include the above dependencies.
# import os
# os._exit(0)

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [2]:
import numpy as np

import gym
from gym.spaces import Discrete, Box

import ray
import ray.rllib.agents.dqn as dqn
from ray.tune.logger import pretty_print

In [3]:
T = 20
price_max = 500
price_step = 10
q_0 = 5000
k = 20
unit_cost = 100
a_q = 300
b_q = 100
price_grid = np.arange(price_step, price_max, price_step)

## Environment simulator
def plus(x):
    return 0 if x < 0 else x

def minus(x):
    return 0 if x > 0 else -x

def shock(x):
    return np.sqrt(x)

# Demand at time step t for current price p_t and previous price p_t_1
def q_t(p_t, p_t_1, q_0, k, a, b):
    return plus(q_0 - k*p_t - a*shock(plus(p_t - p_t_1)) + b*shock(minus(p_t - p_t_1)))

# Profit at time step t
def profit_t(p_t, p_t_1, q_0, k, a, b, unit_cost):
    return q_t(p_t, p_t_1, q_0, k, a, b)*(p_t - unit_cost) 

## Partial bindings for readability
def profit_t_response(p_t, p_t_1):
    return profit_t(p_t, p_t_1, q_0, k, a_q, b_q, unit_cost)

def env_intial_state():
    return np.repeat(0, 2*T)


def env_step(t, state, action):
    next_state = np.repeat(0, len(state))
    next_state[0] = price_grid[action]
    next_state[1:T] = state[0:T-1]
    next_state[T+t] = 1
    reward = profit_t_response(next_state[0], next_state[1])
    return next_state, reward


In [4]:
class HiLoPricingEnv(gym.Env):
    def __init__(self, config):
        self.reset()
        self.action_space = Discrete(len(price_grid))
        self.observation_space = Box(0, 10000, shape=(2*T, ), dtype=np.float32)

    def reset(self):
        self.state = env_intial_state()
        self.t = 0
        return self.state

    def step(self, action):
        next_state, reward = env_step(self.t, self.state, action)
        self.t += 1
        self.state = next_state
        return next_state, reward, self.t == T - 1, {}

In [6]:
def train_dqn():
    config = dqn.DEFAULT_CONFIG.copy()
    config["log_level"] = "WARN"
    config["lr"] = 0.002
    config["gamma"] = 0.80
    config["train_batch_size"] = 64
    config["buffer_size"] = 10000
    config["timesteps_per_iteration"] = 5000
    config["hiddens"] = [128, 128, 128]
    config["exploration_config"] = {
        "final_epsilon": 0.01
    }
    # config["num_gpus"] = 1
    # config["framework"] = "tfe"

    trainer = dqn.DQNTrainer(config=config, env=HiLoPricingEnv)
    for i in range(50):
        result = trainer.train()
        print(pretty_print(result))

In [7]:
ray.shutdown()
ray.init(num_gpus=1)

train_dqn()

2021-06-16 13:49:48,981	INFO services.py:1272 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2021-06-16 13:49:50,490	INFO trainer.py:696 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


RuntimeError: tf.placeholder() is not compatible with eager execution.