In [1]:
import numpy as np
import textworld
import nltk
from nltk.tokenize import word_tokenize
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
from nltk.tree import Tree
import random
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [40]:
class DQNAgent:
    """Deep Q-learning agent with an experience-replay buffer.

    The agent keeps a bounded memory of ``(state, action, reward, next_state,
    done)`` transitions, chooses actions epsilon-greedily from a small dense
    network, and trains that network on random minibatches drawn from memory.

    Parameters
    ----------
    state_size : int
        Length of the state vector fed to the network.
    action_size : int
        Number of discrete actions (width of the network output).
    gamma : float
        Discount rate applied to the best next-state Q-value.
    epsilon : float
        Initial exploration rate (probability of a random action).
    epsilon_min : float
        Once ``epsilon`` is no longer above this value, decay stops.
    epsilon_decay : float
        Factor ``epsilon`` is multiplied by after every ``replay`` call.
    learning_rate : float
        Learning rate handed to the Adam optimizer.
    memory_size : int
        Maximum number of transitions kept; the oldest are dropped first.
    """

    def __init__(self, state_size, action_size, gamma=0.95, epsilon=1.0,
                 epsilon_min=0.001, epsilon_decay=0.9, learning_rate=0.5,
                 memory_size=2000):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma    # discount rate
        self.epsilon = epsilon  # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        # NOTE(review): the default of 0.5 is kept from the original code, but
        # it is very large for Adam (the original comment recorded 0.001 as the
        # reference value) -- confirm before relying on the default.
        self.learning_rate = learning_rate
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network: state vector in, one Q-value per action out."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size))
        # Q-values are unbounded regression targets (reward + discounted
        # return, possibly negative), so the output must be linear and the
        # loss a regression loss. A sigmoid output with categorical
        # cross-entropy can neither represent nor fit such targets.
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index: random with probability ``epsilon``, else greedy.

        ``state`` is passed to ``model.predict`` unchanged and the first row of
        the prediction is used, so it should be a batch holding one state.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        """Train on a random minibatch from memory and decay ``epsilon``.

        At most ``batch_size`` transitions are sampled; if fewer are stored,
        all of them are used. Each stored ``state`` / ``next_state`` is read as
        ``state[0]``, i.e. it is expected to be a batch of one row.

        Returns the loss reported by ``model.fit`` for the single epoch.
        Raises ``ValueError`` if the memory is empty.
        """
        if not self.memory:
            raise ValueError("cannot replay: memory is empty")
        # Clamp so that a request larger than the memory does not make
        # random.sample raise.
        sample_size = min(batch_size, len(self.memory))
        minibatch = random.sample(self.memory, sample_size)

        states, actions, rewards, next_states, dones = zip(*minibatch)
        states = np.array([s[0] for s in states])
        next_states = np.array([s[0] for s in next_states])
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=float)
        dones = np.asarray(dones, dtype=bool)

        # Two predictions for the whole batch instead of two per transition.
        best_next_q = np.amax(self.model.predict(next_states), axis=1)
        # Terminal transitions have no future return to bootstrap from.
        targets = np.where(dones, rewards, rewards + self.gamma * best_next_q)

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss.
        targets_f = self.model.predict(states)
        targets_f[np.arange(sample_size), actions] = targets

        history = self.model.fit(states, targets_f, epochs=1, verbose=0)
        # Keeping track of loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss

    def load(self, name):
        """Load network weights from ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to ``name``."""
        self.model.save_weights(name)

In [41]:
def get_room(desc):
    """Extract the lower-cased room name from a room description.

    Only the first line of ``desc`` is inspected. If it contains a header of
    the form ``-= Room Name =-``, the text between the two markers is
    returned, lower-cased and stripped. Otherwise, or when ``desc`` is empty
    or ``None``, an empty string is returned.
    """
    if not desc:
        return ''

    header = desc.lower().split('\n')[0].strip()

    start = header.find('-=')
    if start == -1:
        return ''
    end = header.find('=-', start + 2)
    if end == -1:
        return ''

    # Slice between the markers rather than taking the second space-separated
    # token: this keeps multi-word names whole and does not fail when the
    # header has no spaces around the name.
    return header[start + 2:end].strip()

In [42]:
def convertStateTextToStateValue(state):
    """One-hot encode the current room as a list of five 0/1 ints.

    ``state`` may be a full room description (the room is then extracted with
    ``get_room``) or an already-extracted room name. Accepting the bare name
    matters because ``get_room`` returns ``''`` for text without a
    ``-= ... =-`` header, which would otherwise encode every room as all
    zeros.

    The result has one entry per known room, in the order of the list below;
    an unknown room yields all zeros.
    """
    # The 5 rooms; the list position is the index of the hot entry.
    room = ['scullery', 'attic', 'bedchamber', 'pantry', 'vault']

    current_room = state.strip().lower()
    if current_room not in room:
        # Not a bare room name, so treat it as a description and parse it.
        current_room = get_room(state)

    return [1 if current_room == r else 0 for r in room]

def convertActionIndexToActionValue(action_index):
    """Map an action index to the text command at that position in the table."""
    # 28 actions; the position in this list is the action index.
    commands = [
        'take formless keycard',
        'go east',
        'insert passkey into locker',
        'take passkey from locker',
        'go west',
        'insert formless keycard into locker',
        'take formless keycard from locker',
        'go south',
        'insert cloak into toolbox',
        'take key from toolbox',
        'insert keycard into toolbox',
        'go north',
        'insert lampshade into locker',
        'take lampshade from locker',
        'insert formless keycard into toolbox',
        'insert lampshade into toolbox',
        'take formless keycard from toolbox',
        'insert key into toolbox',
        'take cloak from toolbox',
        'take keycard from toolbox',
        'insert passkey into toolbox',
        'insert cloak into locker',
        'insert keycard into locker',
        'take cloak from locker',
        'take passkey from toolbox',
        'take lampshade from toolbox',
        'take keycard from locker',
        'insert key into locker',
    ]
    selected = commands[action_index]
    return selected

In [43]:
# Train the DQN agent on one generated TextWorld game and report the average
# number of moves and score over all episodes.
state_size = 5
action_size = 28
agent = DQNAgent(state_size,action_size)
done = False
batch_size = 32

step_total = 1000
step_count = 0

avg_moves, avg_scores = [], []
EPISODES = 10


def _encode_state(game_state):
    """Return the one-hot room vector for ``game_state`` as a (1, state_size) array."""
    # convertStateTextToStateValue extracts the room from the text itself, so
    # it is handed the raw description rather than a pre-extracted room name.
    one_hot = convertStateTextToStateValue(game_state.description)
    return np.reshape(one_hot, [1, state_size])


env = textworld.start("gen_games/tw-game-vjs3cos0-house-GP-OgOJFl9Jtba5I1Rb.ulx")
try:
    tw = textworld.agents.NaiveAgent()  # Or your own `textworld.Agent` subclass.

    env.activate_state_tracking()
    env.compute_intermediate_reward()
    env.enable_extra_info("description")
    env.enable_extra_info("inventory")

    for e in range(EPISODES):
        tw.reset(env)  # Tell the agent a new episode is starting.
        game_state = env.reset()  # Start new episode.

        reward = 0
        done = False
        state = _encode_state(game_state)

        for step in range(step_total):
            action = agent.act(state)
            command = convertActionIndexToActionValue(action)
            game_state, reward, done = env.step(command)

            # Learn from the per-step intermediate reward rather than the
            # reward returned by env.step.
            im_reward = game_state.intermediate_reward

            next_state = _encode_state(game_state)
            agent.remember(state, action, im_reward, next_state, done)
            # The next step starts from the state just observed.
            state = next_state

            if done:
                print('======COMPLETED======')
                print("episode: {}/{}, score: {}, e: {:.2}".format(e, EPISODES, step, agent.epsilon))
                break

            if len(agent.memory) > batch_size:
                loss = agent.replay(batch_size)

        # See https://textworld-docs.maluuba.com/textworld.html#textworld.core.GameState
        avg_moves.append(game_state.nb_moves)
        avg_scores.append(game_state.score)
finally:
    # Release the game environment even if training is interrupted or fails.
    env.close()

print("avg. steps: {:5.1f}; avg. score: {:4.1f} / 1.".format(sum(avg_moves)/EPISODES, sum(avg_scores)/EPISODES))


avg. steps: 1000.0; avg. score:  0.0 / 1.
