In [1]:
import os
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment, tf_py_environment, utils, wrappers, suite_gym
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

2025-01-03 20:28:37.422573: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-03 20:28:37.517746: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-03 20:28:37.789458: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-03 20:28:37.789522: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-03 20:28:37.809488: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [3]:
class CardGameEnv(py_environment.PyEnvironment):
    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action'
        )
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=0, name='observation'
        )
        self._state = 0
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec
    
    def observation_spec(self):
        return self._observation_spec
    
    def _reset(self):
        self._state = 0
        self._episode_ended = False
        return ts.restart(np.array([self._state], dtype=np.int32))
    
    def _step(self, action):
        if self._episode_ended:
            return self.reset()
        
        if action == 1:
            self._episode_ended = True
        elif action == 0:
            new_card = np.random.randint(1, 11)
            self._state += new_card
        else:
            raise(ValueError('`action` should be 0 or 1'))
        
        if self._episode_ended or self._state >= 21:
            reward = self._state - 21 if self._state <= 21 else -21
            return ts.termination(np.array([self._state], dtype=np.int32), reward)
        else:
            return ts.transition(
                np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0
            )


## CardGameEnv - A Blackjack-Inspired Environment

This custom environment extends TF-Agents’ PyEnvironment to simulate a simple Blackjack-style game. An action of 0 draws a random card (value 1–10) and updates the internal state. An action of 1 ends the round, triggering a terminal state. Rewards depend on how close the final state is to 21 without going over; if the total exceeds 21, the reward defaults to –21.

In [4]:
env = CardGameEnv()
utils.validate_py_environment(env, episodes=5)

In [6]:
get_new_card_action = np.array(0, dtype=np.int32)
end_round_action = np.array(1, dtype=np.int32)

env = CardGameEnv()
time_step = env.reset()
print(time_step)
cumulative_reward = time_step.reward

for _ in range(3):
    time_step = env.step(get_new_card_action)
    print(time_step)
    cumulative_reward += time_step.reward

time_step = env.step(end_round_action)
cumulative_reward += time_step.reward

print('Final Reward = ', cumulative_reward)

TimeStep(
{'step_type': array(0, dtype=int32),
 'reward': array(0., dtype=float32),
 'discount': array(1., dtype=float32),
 'observation': array([0], dtype=int32)})
TimeStep(
{'step_type': array(1, dtype=int32),
 'reward': array(0., dtype=float32),
 'discount': array(1., dtype=float32),
 'observation': array([10], dtype=int32)})
TimeStep(
{'step_type': array(1, dtype=int32),
 'reward': array(0., dtype=float32),
 'discount': array(1., dtype=float32),
 'observation': array([18], dtype=int32)})
TimeStep(
{'step_type': array(2, dtype=int32),
 'reward': array(-21., dtype=float32),
 'discount': array(0., dtype=float32),
 'observation': array([23], dtype=int32)})
Final Reward =  -42.0
