In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

import time

from src.Game import Game

In [None]:
# Environment sanity check: report the installed pandas version and the
# number of GPU devices TensorFlow can see.
print(pd.__version__)

print(
    "Num GPUs Available: ",
    len(tf.config.list_physical_devices(device_type='GPU')),
)

In [None]:
# Reset the global state Keras has accumulated in this kernel before any
# model is built, so re-running the notebook starts from a clean Keras session.
tf.keras.backend.clear_session()

In [None]:
# Network used for bidding: a 42-feature input vector (24 + 1 + 2 + 15),
# two 64-unit ReLU hidden layers and a linear 10-unit output layer.
bid_model = tf.keras.models.Sequential(name="bid_model")
bid_model.add(tf.keras.layers.Input(shape=(24 + 1 + 2 + 15,), name="input"))
bid_model.add(tf.keras.layers.Dense(64, activation='relu', name="hidden1"))
bid_model.add(tf.keras.layers.Dense(64, activation='relu', name="hidden2"))
bid_model.add(tf.keras.layers.Dense(10, name="output"))

# Trained by regression (mean squared error) with the Adam optimizer.
bid_model.compile(loss='mse', optimizer='adam')

bid_model.summary()

In [None]:
# Network used for card play: a 722-feature input vector, two 64-unit ReLU
# hidden layers and a linear 24-unit output layer.
play_model = tf.keras.models.Sequential(name="play_model")
play_model.add(
    tf.keras.layers.Input(
        shape=(4 + 24 + (6 * 4 * 24) + 16 + 20 + 14 + 30 + 30 + 6 + 2,),  # 722
        name="input",
    )
)
play_model.add(tf.keras.layers.Dense(64, activation='relu', name="hidden1"))
play_model.add(tf.keras.layers.Dense(64, activation='relu', name="hidden2"))
play_model.add(tf.keras.layers.Dense(24, name="output"))

# Trained by regression (mean squared error) with the Adam optimizer.
play_model.compile(loss='mse', optimizer='adam')

play_model.summary()

In [None]:
class BatchLogger(tf.keras.callbacks.Callback):
    """Keras callback that records the metrics reported after every training batch.

    Attributes:
        batch_logs: list with one dict per batch seen, holding the keys
            'Iteration', 'Loss' and 'Accuracy'.
        total_batches_seen: running count of batches across every ``fit`` call
            this instance has been attached to.
        log_df: ``pandas.DataFrame`` built from ``batch_logs``; refreshed at the
            end of each epoch and ``None`` until the first epoch finishes.
    """

    def __init__(self):
        super().__init__()
        self.batch_logs = []
        self.total_batches_seen = 0
        self.log_df = None

    def on_train_batch_end(self, batch, logs=None):
        """Append the metrics of the batch that just finished to ``batch_logs``.

        Metrics that Keras did not report are stored as NaN instead of raising.
        The models in this notebook are compiled without an accuracy metric, so
        ``logs`` carries no 'accuracy' entry for them.
        """
        logs = logs or {}
        missing = float('nan')
        self.total_batches_seen += 1
        self.batch_logs.append({
            'Iteration': self.total_batches_seen,
            'Loss': logs.get('loss', missing),
            'Accuracy': logs.get('accuracy', missing),
        })

    def on_epoch_end(self, epoch, logs=None):
        """Rebuild ``log_df`` from every batch record collected so far."""
        self.log_df = pd.DataFrame(self.batch_logs)


logger = BatchLogger()

In [None]:
# Training configuration.

# Number of games to play, and fit() epochs run on each sampled batch.
episodes = 1  # 2_000
epochs = 1

# Number of transitions sampled from the replay buffer per batch.
batch_size = 16  #  8192  # 32

# Discount factor applied to the best next-state value in the training target.
gamma = 1  # 0.95

# Epsilon schedule: starting value, per-step decay factor and lower bound.
# presumably for epsilon-greedy exploration; verify against how src.Game uses it
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.1

In [None]:
def _flatten_observation(observation, step):
    """Build a (batch, features) NumPy matrix from one time step of a dict observation.

    Every value of ``observation`` is sliced as ``value[:, step]``, reshaped to
    ``(batch, -1)`` and the pieces are concatenated along the feature axis in
    the dict's iteration order. Each output row therefore holds the features of
    exactly one sampled transition. Flattening whole tensors and concatenating
    them along axis 0 would instead interleave features of different samples
    as soon as the batch holds more than one element.
    """
    per_feature = []
    for tensor in observation.values():
        step_slice = tensor[:, step]
        per_feature.append(tf.reshape(step_slice, [step_slice.shape[0], -1]))
    return tf.concat(per_feature, axis=1).numpy()


# Integer value of `step_type` that marks the last step of an episode.
# NOTE(review): assumes the replay buffer stores TF-Agents trajectories, whose
# StepType codes are FIRST=0, MID=1, LAST=2 — confirm against src.Game.
STEP_TYPE_LAST = 2

game = Game(bid_model, play_model)

# Training loop
start_time = time.perf_counter()  # monotonic clock, suited to measuring durations
for e in range(episodes):
    if e % 100 == 0:
        # NOTE(review): epsilon is only reported here; nothing in this cell
        # decays it or passes it to the game.
        print(f"Episode {e} | epsilon {epsilon}")

    game.reset()
    game.play_game(max_rounds=4)

    bid_player = game.players[0]

    # Sample batches of two consecutive steps (current state, next state) from
    # the bid replay buffer.
    # NOTE(review): if this is a TF-Agents replay buffer, as_dataset() keeps
    # sampling indefinitely by default, in which case the loop below never
    # finishes on its own — confirm, and bound it (e.g. with Dataset.take).
    dataset = bid_player.bid_replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=batch_size,
        num_steps=2
    ).prefetch(3)

    for batch in dataset:
        # The first element of each dataset item carries the sampled experience.
        experience = batch[0]

        current_states = _flatten_observation(experience.observation, 0)
        next_states = _flatten_observation(experience.observation, 1)

        actions = experience.action[:, 0].numpy()
        rewards = experience.reward[:, 0].numpy()
        # A transition is terminal only when its second step is LAST. Testing
        # the raw step type for truthiness would also flag every other
        # non-zero step type as terminal.
        dones = experience.step_type[:, 1].numpy() == STEP_TYPE_LAST

        # Start from the model's current predictions so that only the entry of
        # the action actually taken contributes to the loss.
        bid_targets = bid_player.bid_model.predict(current_states, verbose=0)
        next_q_values = bid_player.bid_model.predict(next_states, verbose=0)

        # Target: the reward alone for terminal transitions, otherwise the
        # reward plus the discounted best predicted value of the next state.
        bootstrapped = rewards + gamma * next_q_values.max(axis=1)
        bid_targets[np.arange(len(actions)), actions] = np.where(dones, rewards, bootstrapped)

        bid_player.bid_model.fit(current_states, bid_targets, epochs=epochs, verbose=0, callbacks=[logger])

end_time = time.perf_counter()
execution_time = end_time - start_time  # elapsed wall-clock seconds for the whole loop
print(f"The execution time was: {execution_time} seconds")