In [10]:
import db
import numpy as np
from tensorflow import keras
from serializer import GameState1DSerializer
import sys
import datetime as dt
import numpy as np
import json
import random as rd
import agent_wrapper
import randomAgents

# Dense integer id -> "<kind>,<q>,<r>" action string. Ids are handed out in
# nested-loop order: every "attack" entry first, then every "move" entry, with
# q as the slower-moving coordinate and r as the faster one (both -14..14).
action_map = {
    index: f"{kind},{q_coord},{r_coord}"
    for index, (kind, q_coord, r_coord) in enumerate(
        (kind, q_coord, r_coord)
        for kind in ("attack", "move")
        for q_coord in range(-14, 15)
        for r_coord in range(-14, 15)
    )
}

# Total number of ids handed out.
counter = len(action_map)

# Reverse lookup: action string -> integer id.
action_map_inverse = dict(zip(action_map.values(), action_map.keys()))

#print(action_map, action_map_inverse)

def prepare_training_data(from_timestamp=0, player_name="JutricKafica"):
    """Load stored experiences newer than ``from_timestamp`` and split them up.

    Args:
        from_timestamp: Exclusive lower bound for the experience ``time``
            field. A ``datetime`` is converted to integer epoch seconds first.
        player_name: Name of the player whose score difference is used as the
            reward. Defaults to the name that used to be hard-coded here.

    Returns:
        A ``(serialized, rewards, actions)`` tuple of equally long lists, one
        entry per experience: the serializer output for the experience, the
        player's score in ``sp`` minus the score in ``s``, and the stored
        ``a`` value.

    Raises:
        ValueError: If a state has no player called ``player_name``.
    """
    if isinstance(from_timestamp, dt.datetime):
        from_timestamp = int(from_timestamp.timestamp())

    # The experiences are walked three times below. Materialise them so a
    # one-shot iterable (for example a database cursor) is not exhausted
    # after the first pass, which would leave the other lists empty.
    replays = list(db.get_all_experiences({"time": {"$gt": from_timestamp}}))

    def _get_score_from_state(state):
        """Return ``player_name``'s score from a raw or wrapped game state."""
        # Best effort: the state may be a wrapper holding the real state as a
        # JSON string under "gameState". If it is not, use it as it is.
        try:
            state = json.loads(state["gameState"])
        except (KeyError, TypeError, ValueError):
            pass

        for key in ("player1", "player2", "player3", "player4"):
            player = state[key]
            if player["name"] == player_name:
                return player["score"]

        # Previously this fell through and returned None, which only failed
        # later as an opaque TypeError in the reward subtraction.
        raise ValueError(f"player {player_name!r} not found in game state")

    rewards = [
        _get_score_from_state(replay["sp"]) - _get_score_from_state(replay["s"])
        for replay in replays
    ]

    serializer = GameState1DSerializer()
    serialized = [serializer.serialize_single(replay) for replay in replays]

    actions = [replay["a"] for replay in replays]

    return serialized, rewards, actions




def create_targets(training_data, rewards, actions):
    """Build aligned (state, action, next_state, reward) training sequences.

    Each model input is three consecutive entries of ``training_data``
    concatenated into one flat vector, so the first usable step is index 2.
    The input built at step ``i`` is paired with ``actions[i]`` and
    ``rewards[i]``, and the input built at step ``i + 1`` is its next state.

    Args:
        training_data: Sequence of per-step feature vectors.
        rewards: Sequence of per-step rewards, same length as
            ``training_data``.
        actions: Sequence of per-step actions, same length as
            ``training_data``.

    Returns:
        A ``(states, actions, next_states, rewards)`` tuple of equally long
        sequences. All four are empty when there are fewer than four steps.
    """
    # np.hstack takes ONE sequence of arrays; passing the three frames as
    # separate positional arguments raises TypeError. Frames are stacked
    # oldest-first, the same order the driver loop uses when it stacks its
    # observation window, so training and inference inputs share one layout.
    model_inputs = [
        np.hstack([training_data[i - 2], training_data[i - 1], training_data[i]])
        for i in range(2, len(training_data))
    ]

    # Dropping the last input leaves every state with a successor.
    states = model_inputs[:-1]
    next_states = model_inputs[1:]

    return states, actions[2:-1], next_states, rewards[2:-1]
    
    

class DQNAgent:
    """Deep Q-learning agent with an online network and a target network.

    The online network (``model``) is trained on stored experiences; the
    target network (``target_model``) supplies the bootstrap value and is
    synchronised with the online network at the end of every ``update``.
    """

    def __init__(self, state_size, action_size):
        """Build both networks and initialise the hyper-parameters.

        Args:
            state_size: Length of the flat state vector fed to the network.
            action_size: Number of discrete actions (network outputs).
        """
        print(type(state_size), type(action_size))

        self.state_size = state_size
        self.action_size = action_size

        self.gamma = 0.99  # discount applied to the bootstrapped value
        self.epsilon = 1  # probability of taking a random action
        self.epsilon_min = 0.1  # epsilon is not decayed once at or below this
        self.epsilon_decay = 0.995  # multiplier applied once per update()
        self.update_rate = 300  # read by the driver loop: steps between updates

        self.model = self._build_model(state_size, action_size)
        self.target_model = self._build_model(state_size, action_size)
        self.target_model.set_weights(self.model.get_weights())
        self.model.summary()

        # Start one hour in the past so the first update() also picks up
        # experiences stored during the last hour.
        self.last_updated = int(
            (dt.datetime.now() - dt.timedelta(hours=1)).timestamp()
        )

    def _build_model(self, state_size, action_size):
        """Return a compiled two-hidden-layer MLP with one output per action."""
        model = keras.Sequential()
        model.add(keras.layers.Input(shape=(state_size,)))
        model.add(keras.layers.Dense(128, activation='relu'))
        model.add(keras.layers.Dense(64, activation='relu'))
        model.add(keras.layers.Dense(action_size, activation='linear'))

        model.compile(optimizer='adam', loss='mean_squared_error')

        return model

    @staticmethod
    def _as_batch(state):
        """Return ``state`` as a ``(1, n_features)`` array for Keras calls."""
        return np.asarray(state).reshape(1, -1)

    def act(self, state, state_json):
        """Choose an action string of the form ``"<kind>,<q>,<r>"``.

        With probability ``epsilon`` a random action is taken from
        ``randomAgents.pick_rand_action``. Otherwise the action with the
        highest predicted Q-value among those accepted by
        ``randomAgents.is_valid_action`` is returned.

        Args:
            state: Flat state vector (or a single-row batch) for the network.
            state_json: Game state passed through to the ``randomAgents``
                helpers.

        Returns:
            The chosen action as a comma-separated string.
        """
        if np.random.rand() <= self.epsilon:
            picked_random_valid = randomAgents.pick_rand_action(state_json)
            return f"{picked_random_valid[0]},{picked_random_valid[1]},{picked_random_valid[2]}"

        # Work on a float copy so masking never touches Keras' own buffer.
        q_values = np.array(
            self.model.predict(self._as_batch(state), verbose=0)[0], dtype=float
        )

        # action_map is keyed by the action INDEX, not by the Q-value.
        for idx in range(len(q_values)):
            kind, q, r = action_map[idx].split(",")
            if not randomAgents.is_valid_action(kind, q, r, state_json):
                # -inf rather than 0: a zero could still win the argmax when
                # every valid action has a negative Q-value.
                q_values[idx] = -np.inf

        return action_map[int(np.argmax(q_values))]

    def update(self):
        """Train on experiences stored since the last update.

        Each transition is fitted individually towards
        ``reward + gamma * max(target_model(next_state))``. Afterwards
        ``epsilon`` is decayed, the target network is synchronised and
        ``last_updated`` is moved to the current time.
        """
        states, actions, next_states, rewards = create_targets(
            *prepare_training_data(self.last_updated)
        )

        data = [states, actions, next_states, rewards]

        print(data)

        # Shuffle whole transitions. Shuffling ``data`` itself would permute
        # the four columns, so states could end up being read as actions.
        transitions = list(zip(states, actions, next_states, rewards))
        rd.shuffle(transitions)

        for state, action, next_state, reward in transitions:
            state = self._as_batch(state)
            next_state = self._as_batch(next_state)

            target = reward + self.gamma * np.amax(
                self.target_model.predict(next_state, verbose=0)
            )

            # Only the taken action's output is moved towards the target; the
            # other outputs keep their current predictions.
            # NOTE(review): assumes stored actions are action_map strings
            # such as "move,1,-2" - confirm against the experience writer.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action_map_inverse[action]] = target

            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        self.target_model.set_weights(self.model.get_weights())

        self.last_updated = int(dt.datetime.now().timestamp())

    

In [11]:
def get_state_space():
    """Return the length of one serialized replay-buffer document."""
    sample = db.replay_buffer_collection.find_one()
    return len(GameState1DSerializer().serialize_single(sample))

# The network input is three stacked observations, hence 3x the state size.
agent = DQNAgent(3 * get_state_space(), len(action_map))

agent.update()

# Set to True to run the live acting/training loop below.
train = False
timestep = 0

# Use a context manager so the file handle is closed deterministically.
with open("../initial_state.json", 'r') as initial_state_file:
    initial_obs = json.load(initial_state_file)

# Window of the three most recent observations, oldest first.
obs_window = 3*[initial_obs]

# One serializer is enough; it does not need to be rebuilt on every step.
serializer = GameState1DSerializer()

while train:

    if timestep != 0 and timestep % agent.update_rate == 0:
        agent.update()

    try:
        state = np.hstack([serializer.serialize_single(x) for x in obs_window])
    except Exception:
        # Dump the offending window for debugging, then stop. Carrying on
        # would act on an undefined or stale ``state``.
        from pprint import pprint
        pprint(obs_window)
        raise

    # act() needs the game state as well as the feature vector.
    # NOTE(review): the newest observation is passed because it is also what
    # agent_wrapper receives below - confirm it is what randomAgents expects.
    action = agent.act(state, obs_window[-1])

    print(action)

    mode, x, y = action.split(",")

    if mode == "attack":
        info, success = agent_wrapper.attack("DQN", obs_window[-1], x, y)
    else:
        info, success = agent_wrapper.move("DQN", obs_window[-1], x, y)

    # Slide the window only when the move was accepted.
    if success:
        obs_window.append(info)
        del obs_window[0]

    timestep += 1

    


<class 'int'> <class 'int'>
Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_30 (Dense)             (None, 128)               240128    
_________________________________________________________________
dense_31 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_32 (Dense)             (None, 1682)              109330    
Total params: 357,714
Trainable params: 357,714
Non-trainable params: 0
_________________________________________________________________
[[], [], [], []]
