# 1. Import Libraries

In [22]:
import pickle
import import_ipynb
from utils import remove_empty_slots
from utils import reward_function
import random, copy

## 2. Goal of the agent : Inform a match for the key "ticket"

In [29]:
# Slot the user ultimately wants filled; the agent answers it via a match.
user_sim_default = 'ticket'

# Outcome codes for an episode.
FAIL, NO_OUTCOME, SUCCESS = -1, 0, 1

# Inform slots the simulator tries to include in its very first action.
user_sim_required_inform_key = ['moviename']

# Intents the user simulator can emit.
usersim_intents = ['inform', 'request', 'thanks', 'reject', 'done']

# Slots requested by the rule-based policy.
rule_requests = [
    'moviename',
    'starttime',
    'city',
    'date',
    'theater',
    'numberofpeople',
]
# Goal keys that are skipped when a match is compared against the goal.
no_query_keys = ['numberofpeople', user_sim_default]

## 3. Dialogue config constants used by Agent

#### a. The annotation schema contains 11 intents (e.g., inform, request, confirm_question, confirm_answer, etc.), and 29 slots (e.g., moviename, starttime, theater, numberofpeople).
#### b. Most of the slots are informational slots, which can be used to filter the search. Others are request slots with which users can request information from the agent
#### c. For every user turn, all the known slots, unknown slots and user requests are extracted. The aggregate of all this data is stored in a user-goal database for the simulator to utilize. When triggering a dialogue, the user simulator randomly samples one user goal from this database.

In [24]:
# Slots the agent is able to inform the user about.
agent_inform_slots = [
    'moviename', 'theater', 'starttime', 'date',
    'genre', 'state', 'city', 'zip', 'critic_rating', 'mpaa_rating',
    'distanceconstraints', 'video_format', 'theater_chain', 'price',
    'actor', 'description', 'other', 'numberofkids',
]
# Slots the agent may request: the inform slots plus 'numberofpeople',
# which sits right after 'date'.
agent_request_slots = agent_inform_slots[:4] + ['numberofpeople'] + agent_inform_slots[4:]

# The action space starts with the two slot-less actions ...
agent_actions = [
    {'intent': slotless_intent, 'inform_slots': {}, 'request_slots': {}}
    for slotless_intent in ('done', 'match_found')
]

# ... followed by one inform action per inform slot (value filled in later) ...
for slot in agent_inform_slots:
    agent_action_dict = dict(intent='inform', inform_slots={slot: 'PLACEHOLDER'}, request_slots={})
    agent_actions.append(agent_action_dict)

# ... and one request action per request slot.
for slot in agent_request_slots:
    agent_action_dict = dict(intent='request', inform_slots={}, request_slots={slot: 'UNK'})
    agent_actions.append(agent_action_dict)

# Rule-based policy request list.
# These two names are also assigned, with the same values, earlier in this file.
rule_requests = [
    'moviename',
    'starttime',
    'city',
    'date',
    'theater',
    'numberofpeople',
]
# Goal keys that are skipped when a match is compared against the goal.
no_query_keys = ['numberofpeople', user_sim_default]

## 4. Example of User Goal for movie ticket booking task.

#### e.g. the user wants to buy 2 tickets of Tennet at 9:00 pm tomorrow at imax regal destiny, Syracuse

In [25]:
# Example user goal, shown only for illustration (the expression is not
# assigned to any name). 'request_slots' holds what the user wants to obtain
# ('UNK' marks a value that is not known yet); 'inform_slots' holds the
# constraints the user can provide.
{
    "request_slots": {
    "ticket": "UNK"
    },
    "inform_slots": {
        "city": "syracuse",
        "numberofpeople": "2",
        "theater": "imax regal destiny",
        "starttime": "9:00 pm",
        "date": "tomorrow",
        "moviename": "Tennet"
        }
}

{'request_slots': {'ticket': 'UNK'},
 'inform_slots': {'city': 'syracuse',
  'numberofpeople': '2',
  'theater': 'imax regal destiny',
  'starttime': '9:00 pm',
  'date': 'tomorrow',
  'moviename': 'Tennet'}}

## 5. Configuration variables for RL

In [26]:
# --- Pickled data files ---------------------------------------------------
db_file_path = 'data/movie_db.pkl'
movie_dict_path = 'data/movie_dict.pkl'
user_goals_path = 'data/movie_user_goals.pkl'

# --- Run settings -----------------------------------------------------------
# NOTE(review): this flag is the string 'true', not a boolean -- confirm how
# consumers compare it before changing its type.
USE_USERSIM = 'true'
MAX_ROUND = 20
TRAIN_EPISODE_COUNT = 5
TRAIN_FREQ = 100
THRESHOLD_SUCCESS_RATE = 0.3
initial_warm_memory = 1000

# --- Error model settings ---------------------------------------------------
slot_error_mode = 0
slot_error_prob = 0.05
intent_error_prob = 0.0

### Function to remove items containing empty strings

In [14]:
def remove_empty_values(_dict):
    """Strip, in place, every field whose value is the empty string.

    Args:
        _dict (dict): Mapping of record id -> record dict. Each record is
            modified in place; the records themselves are never removed,
            even when they end up empty.

    Returns:
        None
    """
    for record_id in list(_dict.keys()):
        record = _dict[record_id]
        # Collect first, then delete, so the record is not resized mid-scan.
        blank_fields = [field for field in list(record.keys()) if record[field] == '']
        for field in blank_fields:
            record.pop(field)

In [15]:
# Load the movie database. The context manager closes the file handle,
# which the bare open() call used previously never did.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open(db_file_path, 'rb') as db_file:
    database = pickle.load(db_file, encoding='latin1')

# clean database
remove_empty_values(database)

### Data records in knowledge base

In [16]:
# Load movie dict. The context manager closes the file handle, which the
# bare open() call used previously never did.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open(movie_dict_path, 'rb') as movie_dict_file:
    db_dict = pickle.load(movie_dict_file, encoding='latin1')

In [17]:
# Load goal file. The context manager closes the file handle, which the
# bare open() call used previously never did.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open(user_goals_path, 'rb') as user_goals_file:
    user_goals = pickle.load(user_goals_file, encoding='latin1')

### 2. User simulator based on frame-level semantics

### Frame-level semantics : A dialog act form (e.g request(moviename; genre = action; date = this weekend))

In [36]:
class Simulator:
    """Rule-based user simulator that answers agent actions with semantic frames.

    The simulator keeps one sampled goal plus a ``state`` dict with these keys:
    ``history_slots`` (everything informed so far, by either side),
    ``inform_slots`` / ``request_slots`` (content of the current user action),
    ``rest_slots`` (goal slots not dealt with yet) and ``intent``.
    """

    def __init__(self, goal_list, database):
        """
        Store the goals and database and read the simulator settings.

        Parameters:
            goal_list (list): User goals; each is a dict with 'inform_slots' and 'request_slots'
            database (dict): Records indexed by integer id (looked up in done_response)
        """
        self.goal_list = goal_list
        self.max_round = MAX_ROUND
        self.default_key = user_sim_default
        self.init_informs = user_sim_required_inform_key
        self.no_query = no_query_keys
        self.database = database

    def reset(self):
        """
        Sample a new goal, empty the state and return the initial user action.

        Note that the sampled goal dict itself (an element of goal_list) is modified:
        the default request key is written into its 'request_slots'.
        """

        self.goal = random.choice(self.goal_list)
        # default value for goal request
        self.goal['request_slots'][self.default_key] = 'UNK'
        self.state = {'history_slots' : {}, 'inform_slots' : {}, 'request_slots' : {},'rest_slots' : {} ,'intent' : ''}
        self.state['rest_slots'].update(self.goal['inform_slots'])
        self.state['rest_slots'].update(self.goal['request_slots'])
        # constraint_check starts as FAIL and only becomes SUCCESS once the agent
        # presents a match that satisfies the goal (see identified_response_match).
        self.constraint_check = FAIL
        return self.initial_action()

    def initial_action(self):
        """
        Build and return the first user action of an episode.

        The action has intent 'request', carries the required initial inform slots that
        exist in the goal (or one random goal inform when none of them do) and a single
        request slot.

        Returns:
            dict: {'intent', 'request_slots', 'inform_slots'} (slot dicts are deep copies)
        """
        self.state['intent'] = 'request'
        if self.goal['inform_slots']:
            for inform_key in self.init_informs:
                if inform_key in self.goal['inform_slots']:
                    self.state['inform_slots'][inform_key] = self.goal['inform_slots'][inform_key]
                    self.state['rest_slots'].pop(inform_key)
                    self.state['history_slots'][inform_key] = self.goal['inform_slots'][inform_key]

            # None of the required informs was available: fall back to a random goal inform
            if not self.state['inform_slots']:
                key, value = random.choice(list(self.goal['inform_slots'].items()))
                self.state['inform_slots'][key] = value
                self.state['rest_slots'].pop(key)
                self.state['history_slots'][key] = value

        # Temporarily take the default key out so it is only requested when the goal
        # has no other request slot; it is put back right after the choice.
        self.goal['request_slots'].pop(self.default_key)
        if self.goal['request_slots']:
            req_key = random.choice(list(self.goal['request_slots'].keys()))
        else:
            req_key = self.default_key
        self.goal['request_slots'][self.default_key] = 'UNK'
        self.state['request_slots'][req_key] = 'UNK'

        user_response = {'intent' : self.state['intent'], 'request_slots' : copy.deepcopy(self.state['request_slots']),
                        'inform_slots' : copy.deepcopy(self.state['inform_slots']) }
        return user_response

    def step(self, agent_action):
        """
        Return the user simulator's answer to an agent action, following the simulator rules.

        Parameters:
            agent_action (dict): Must contain 'intent', 'inform_slots', 'request_slots' and 'round'

        Returns:
            tuple: (user_response dict, reward, done flag, True only if the episode succeeded)
        """
        for value in agent_action['inform_slots'].values():
            assert value != 'UNK'
            assert value != 'PLACEHOLDER'
        for value in agent_action['request_slots'].values():
            assert value != 'PLACEHOLDER'

        self.state['inform_slots'].clear()
        self.state['intent'] = ''

        done = False
        success = NO_OUTCOME
        # Running out of rounds ends the episode as a failure
        if agent_action['round'] == self.max_round:
            done = True
            success = FAIL
            self.state['intent'] = 'done'
            self.state['request_slots'].clear()
        else:
            agent_intent = agent_action['intent']
            if agent_intent == 'request':
                self.request_response(agent_action)
            elif agent_intent == 'inform':
                self.inform_response(agent_action)
            elif agent_intent == 'match_found':
                self.identified_response_match(agent_action)
            elif agent_intent == 'done':
                success = self.done_response()
                self.state['intent'] = 'done'
                self.state['request_slots'].clear()
                done = True

        # Sanity checks on the resulting state
        if self.state['intent'] == 'request':
            assert self.state['request_slots']
        if self.state['intent'] == 'inform':
            assert self.state['inform_slots']
            assert not self.state['request_slots']
        assert 'UNK' not in self.state['inform_slots'].values()
        assert 'PLACEHOLDER' not in self.state['request_slots'].values()
        # history_slots and rest_slots must stay disjoint ...
        for key in self.state['rest_slots']:
            assert key not in self.state['history_slots']
        for key in self.state['history_slots']:
            assert key not in self.state['rest_slots']
        # ... and every goal slot must be in exactly one of them
        for inf_key in self.goal['inform_slots']:
            assert self.state['history_slots'].get(inf_key, False) or self.state['rest_slots'].get(inf_key, False)
        for req_key in self.goal['request_slots']:
            assert self.state['history_slots'].get(req_key, False) or self.state['rest_slots'].get(req_key,
                                                                                                   False), req_key
        for key in self.state['rest_slots']:
            assert self.goal['inform_slots'].get(key, False) or self.goal['request_slots'].get(key, False)
        assert self.state['intent'] != ''

        user_response = {'intent':self.state['intent'], 'request_slots' : copy.deepcopy(self.state['request_slots']),
                        'inform_slots' : copy.deepcopy(self.state['inform_slots']) }
        reward = reward_function(success, self.max_round)
        print('Round : {} , Reward : {}'.format(agent_action['round'],reward))

        # Compare by value: an identity test against an int literal is unreliable
        return user_response, reward, done, success == SUCCESS

    def request_response(self, agent_action):
        """
        Update the state in response to an agent action with intent 'request'.

        Only the first key of agent_action['request_slots'] is considered.
        """

        agent_request_key = list(agent_action['request_slots'].keys())[0]
        # First Case: if agent requests for something that is in the user sims goal inform slots, then inform it
        if agent_request_key in self.goal['inform_slots']:
            self.state['intent'] = 'inform'
            self.state['inform_slots'][agent_request_key] = self.goal['inform_slots'][agent_request_key]
            self.state['request_slots'].clear()
            self.state['rest_slots'].pop(agent_request_key, None)
            self.state['history_slots'][agent_request_key] = self.goal['inform_slots'][agent_request_key]
        # Second Case: if the agent requests for something in user sims goal request slots and it has already been
        # informed, then inform it
        elif agent_request_key in self.goal['request_slots'] and agent_request_key in self.state['history_slots']:
            self.state['intent'] = 'inform'
            self.state['inform_slots'][agent_request_key] = self.state['history_slots'][agent_request_key]
            self.state['request_slots'].clear()
            assert agent_request_key not in self.state['rest_slots']
        # Third Case: if the agent requests for something in the user sims goal request slots and it HASN'T been
        # informed, then request it with a random inform
        elif agent_request_key in self.goal['request_slots'] and agent_request_key in self.state['rest_slots']:
            self.state['request_slots'].clear()
            self.state['intent'] = 'request'
            self.state['request_slots'][agent_request_key] = 'UNK'
            rest_informs = {}
            for key, value in list(self.state['rest_slots'].items()):
                if value != 'UNK':
                    rest_informs[key] = value
            if rest_informs:
                key_choice, value_choice = random.choice(list(rest_informs.items()))
                self.state['inform_slots'][key_choice] = value_choice
                self.state['rest_slots'].pop(key_choice)
                self.state['history_slots'][key_choice] = value_choice
        # Fourth and Final Case: otherwise the user sim does not care about the slot being requested, then inform
        # 'anything' as the value of the requested slot
        else:
            assert agent_request_key not in self.state['rest_slots']
            self.state['intent'] = 'inform'
            self.state['inform_slots'][agent_request_key] = 'anything'
            self.state['request_slots'].clear()
            self.state['history_slots'][agent_request_key] = 'anything'

    def inform_response(self, agent_action):
        """
        Update the state in response to an agent action with intent 'inform'.

        Only the first key of agent_action['inform_slots'] is considered; it must not be
        the default key.
        """

        agent_inform_key = list(agent_action['inform_slots'].keys())[0]
        agent_inform_value = agent_action['inform_slots'][agent_inform_key]

        assert agent_inform_key != self.default_key

        # Add all informs (by agent too) to hist slots
        self.state['history_slots'][agent_inform_key] = agent_inform_value
        # Remove from rest slots if in it
        self.state['rest_slots'].pop(agent_inform_key, None)
        # Remove from request slots if in it
        self.state['request_slots'].pop(agent_inform_key, None)

        # First Case: If agent informs something that is in goal informs and the value it informed doesnt match,
        # then inform the correct value
        if agent_inform_value != self.goal['inform_slots'].get(agent_inform_key, agent_inform_value):
            self.state['intent'] = 'inform'
            self.state['inform_slots'][agent_inform_key] = self.goal['inform_slots'][agent_inform_key]
            self.state['request_slots'].clear()
            self.state['history_slots'][agent_inform_key] = self.goal['inform_slots'][agent_inform_key]
        # Second Case: Otherwise pick a random action to take
        else:
            # - If anything in state requests then request it
            if self.state['request_slots']:
                self.state['intent'] = 'request'
            # - Else if something to say in rest slots, pick something
            elif self.state['rest_slots']:
                # Set the default key aside so it is only requested once nothing else is left
                def_in = self.state['rest_slots'].pop(self.default_key, False)
                if self.state['rest_slots']:
                    key, value = random.choice(list(self.state['rest_slots'].items()))
                    if value != 'UNK':
                        self.state['intent'] = 'inform'
                        self.state['inform_slots'][key] = value
                        self.state['rest_slots'].pop(key)
                        self.state['history_slots'][key] = value
                    else:
                        self.state['intent'] = 'request'
                        self.state['request_slots'][key] = 'UNK'
                else:
                    self.state['intent'] = 'request'
                    self.state['request_slots'][self.default_key] = 'UNK'
                # Put the default key back if it was set aside above
                if def_in == 'UNK':
                    self.state['rest_slots'][self.default_key] = 'UNK'
            # - Otherwise respond with 'nothing to say' intent
            else:
                self.state['intent'] = 'thanks'

    def identified_response_match(self, agent_action):
        """
        Update the state in response to an agent action with intent 'match_found'.

        The user answers 'thanks' when the presented match satisfies every goal inform
        (except the keys in no_query) and 'reject' otherwise; constraint_check records
        the verdict for done_response.
        """

        agent_informs = agent_action['inform_slots']

        self.state['intent'] = 'thanks'
        self.constraint_check = SUCCESS

        assert self.default_key in agent_informs
        self.state['rest_slots'].pop(self.default_key, None)
        self.state['history_slots'][self.default_key] = str(agent_informs[self.default_key])
        self.state['request_slots'].pop(self.default_key, None)

        if agent_informs[self.default_key] == 'no match available':
            self.constraint_check = FAIL

        # Check to see if all goal informs are in the agent informs, and that the values match
        for key, value in self.goal['inform_slots'].items():
            assert value is not None
            # For items that cannot be in the queries don't check to see if they are in the agent informs here
            if key in self.no_query:
                continue
            # Will return true if key not in agent informs OR if value does not match value of agent informs[key]
            if value != agent_informs.get(key, None):
                self.constraint_check = FAIL
                break

        if self.constraint_check == FAIL:
            self.state['intent'] = 'reject'
            self.state['request_slots'].clear()

    def done_response(self):
        """
        Decide the outcome of the episode when the agent's intent is 'done'.

        The agent succeeds only if constraint_check is not FAIL and the state's rest
        slots are empty.

        Returns:
            int: SUCCESS or FAIL
        """

        if self.constraint_check == FAIL:
            return FAIL

        if not self.state['rest_slots']:
            assert not self.state['request_slots']
        if self.state['rest_slots']:
            return FAIL

        assert self.state['history_slots'][self.default_key] != 'no match available'

        # The default key's history value is the id of the matched database record
        match = copy.deepcopy(self.database[int(self.state['history_slots'][self.default_key])])

        # Consistency check: the stored record must agree with the goal informs
        for key, value in self.goal['inform_slots'].items():
            assert value is not None
            if key in self.no_query:
                continue
            if value != match.get(key, None):
                assert True is False, 'match: {}\ngoal: {}'.format(match, self.goal)
                break
        return SUCCESS

In [37]:
# User simulator instance, sampling from the loaded goals against the cleaned database.
user = Simulator(goal_list=user_goals, database=database)

### 3. The purpose of the Error Model is to introduce errors into the action semantic frame of the user simulator and thereby improve the training process

In [38]:
class ErrorModelController:
    """Randomly corrupts user semantic frames: slot values, slots and intent."""

    def __init__(self, db_dict):
        """
        Parameters:
            db_dict (dict): Maps each slot name to the list of values it can take
        """

        self.intents = usersim_intents
        self.intent_error_prob = intent_error_prob
        self.slot_error_mode = slot_error_mode
        self.slot_error_prob = slot_error_prob
        self.movie_dict = db_dict

    def infuse_error(self, frame):
        """
        Corrupt a semantic frame (dict) in place.

        Each inform slot is hit with probability slot_error_prob. The kind of error
        depends on slot_error_mode: 0 replaces the value, 1 replaces the slot and its
        value, 2 deletes the slot, and any other mode picks one of the three at random.
        Afterwards the intent is replaced with probability intent_error_prob.
        """

        informs = frame['inform_slots']
        for slot_name in list(informs.keys()):
            assert slot_name in self.movie_dict
            if not random.random() < self.slot_error_prob:
                continue

            mode = self.slot_error_mode
            if mode == 0:
                corrupt = self.add_slot_value_error
            elif mode == 1:
                corrupt = self.add_slot_error
            elif mode == 2:
                corrupt = self.remove_slot
            else:
                # Mixed mode: split roughly in thirds between the three error kinds
                roll = random.random()
                if roll <= 0.33:
                    corrupt = self.add_slot_value_error
                elif roll <= 0.66:
                    corrupt = self.add_slot_error
                else:
                    corrupt = self.remove_slot
            corrupt(slot_name, informs)

        if random.random() < self.intent_error_prob:
            frame['intent'] = random.choice(self.intents)

    def add_slot_value_error(self, key, informs_dict):
        """
        Overwrite the value stored under key with a random value known for that slot.
        """
        candidates = self.movie_dict[key]
        informs_dict[key] = random.choice(candidates)

    def add_slot_error(self, key, informs_dict):
        """
        Drop the slot stored under key and insert a randomly chosen slot with a random value.
        """

        informs_dict.pop(key)
        replacement = random.choice(list(self.movie_dict.keys()))
        informs_dict[replacement] = random.choice(self.movie_dict[replacement])

    def remove_slot(self, key, informs_dict):
        """
        Delete the slot stored under key from the informs dictionary.
        """

        informs_dict.pop(key)

In [39]:
# Error model controller instance, drawing replacement values from the movie dict.
emc = ErrorModelController(db_dict=db_dict)

### 4. The Dialogue State Tracker takes user dialogue semantic frame and the conversation history which is used by agent's policy to build a state representation. It also tracks the state of the dialogue and creates state representation that will be used by the agent

In [3]:
class StateTracker:
    """Keeps the dialogue history and turns it into the agent's state vector."""

    def __init__(self, database, constants):
        """
        Parameters:
            database (dict): Passed to DBQuery to build the query helper
            constants (dict): Settings; constants['run']['MAX_ROUND'] is read here
        """

        self.db_helper = DBQuery(database)
        self.match_key = user_sim_default

        # Index lookups and sizes for intents and slots
        self.intents_dict = convert_list_to_dict(all_intents)
        self.num_intents = len(all_intents)
        self.slots_dict = convert_list_to_dict(all_slots)
        self.num_slots = len(all_slots)

        self.max_round_num = constants['run']['MAX_ROUND']
        # All-zero vector handed out once the episode is done
        self.none_state = np.zeros(self.get_state_size())
        self.reset()

    def get_state_size(self):
        """Return the length of the state vector built by get_state."""

        intent_part = 2 * self.num_intents
        slot_part = 7 * self.num_slots
        return intent_part + slot_part + 3 + self.max_round_num

    def reset(self):
        """Clear the round counter, the history and the current informs."""

        self.round_num = 0
        self.history = []
        self.current_informs = {}

    def print_history(self):
        """Print every action stored in the history, oldest first."""

        for past_action in self.history:
            print(past_action)

    def get_state(self, done=False):
        """
        Build the state representation (numpy array) that is fed to the agent's network.

        The vector concatenates, in this order: user intent one-hot, user inform bag,
        user request bag, last agent intent one-hot, last agent inform bag, last agent
        request bag, bag of currently filled slots, scaled round number, round one-hot,
        binary KB results and scaled KB counts.

        Parameters:
            done (bool): When True the stored all-zero vector is returned

        Returns:
            numpy.array: The state representation
        """

        # If done then fill state with zeros
        if done:
            return self.none_state

        def intent_one_hot(action):
            # One-hot vector over the known intents for the given action
            vec = np.zeros((self.num_intents,))
            vec[self.intents_dict[action['intent']]] = 1.0
            return vec

        def slot_bag(slot_names):
            # Bag-of-slots vector with a 1.0 for every named slot
            vec = np.zeros((self.num_slots,))
            for slot_name in slot_names:
                vec[self.slots_dict[slot_name]] = 1.0
            return vec

        latest_user = self.history[-1]
        print('Current form : ', self.current_informs)
        db_counts = self.db_helper.get_db_results_for_slots(self.current_informs)
        print('DB Results : ', db_counts)
        previous_agent = self.history[-2] if len(self.history) > 1 else None
        print('Latest agent action : ', previous_agent)

        # Current user action: intent, informs and requests
        user_act_rep = intent_one_hot(latest_user)
        print('One hot intents user action : ', user_act_rep)

        user_inform_slots_rep = slot_bag(latest_user['inform_slots'].keys())
        print('Bag of inform slots : ', user_inform_slots_rep)

        user_request_slots_rep = slot_bag(latest_user['request_slots'].keys())
        print('Bag of request slots : ', user_request_slots_rep)

        # Slots that are filled in so far
        current_slots_rep = slot_bag(self.current_informs)

        # Last agent action (all zeros when there is none)
        if previous_agent:
            agent_act_rep = intent_one_hot(previous_agent)
            agent_inform_slots_rep = slot_bag(previous_agent['inform_slots'].keys())
            agent_request_slots_rep = slot_bag(previous_agent['request_slots'].keys())
        else:
            agent_act_rep = np.zeros((self.num_intents,))
            agent_inform_slots_rep = np.zeros((self.num_slots,))
            agent_request_slots_rep = np.zeros((self.num_slots,))

        # Round number, once as a scaled value and once as a one-hot
        turn_rep = np.zeros((1,)) + self.round_num / 5.
        turn_onehot_rep = np.zeros((self.max_round_num,))
        turn_onehot_rep[self.round_num - 1] = 1.0

        # DB query results: scaled counts and binary flags. Every entry starts at the
        # value for 'matching_all_constraints'; known slots are then overwritten.
        overall = db_counts['matching_all_constraints']
        kb_count_rep = np.zeros((self.num_slots + 1,)) + overall / 100.
        kb_binary_rep = np.zeros((self.num_slots + 1,)) + np.sum(overall > 0.)
        for result_key, result_count in db_counts.items():
            if result_key in self.slots_dict:
                position = self.slots_dict[result_key]
                kb_count_rep[position] = result_count / 100.
                kb_binary_rep[position] = np.sum(result_count > 0.)

        pieces = [user_act_rep, user_inform_slots_rep, user_request_slots_rep, agent_act_rep,
                  agent_inform_slots_rep, agent_request_slots_rep, current_slots_rep, turn_rep,
                  turn_onehot_rep, kb_binary_rep, kb_count_rep]
        state_representation = np.hstack(pieces).flatten()
        print('STATE REPRESENT : ', state_representation)
        return state_representation

    def update_state_agent(self, agent_action):
        """
        Record an agent action in the history, augmenting it first.

        An 'inform' action gets its slot value filled in through the DB helper; a
        'match_found' action gets the informs of the first DB match (or
        'no match available'). 'round' and 'speaker' are added to the action dict,
        which is modified in place.
        """

        intent = agent_action['intent']
        if intent == 'inform':
            assert agent_action['inform_slots']
            filled = self.db_helper.fill_inform_slot(agent_action['inform_slots'], self.current_informs)
            agent_action['inform_slots'] = filled
            assert agent_action['inform_slots']
            key, value = list(filled.items())[0]  # Only one
            assert key != 'match_found'
            assert value != 'PLACEHOLDER', 'KEY: {}'.format(key)
            self.current_informs[key] = value
        # If intent is match_found then fill the action informs with the matches informs (if there is a match)
        elif intent == 'match_found':
            assert not agent_action['inform_slots'], 'Cannot inform and have intent of match found!'
            matches = self.db_helper.get_db_results(self.current_informs)
            if matches:
                # Arbitrarily pick the first value of the dict
                match_id, match_record = list(matches.items())[0]
                agent_action['inform_slots'] = copy.deepcopy(match_record)
                agent_action['inform_slots'][self.match_key] = str(match_id)
            else:
                agent_action['inform_slots'][self.match_key] = 'no match available'
            self.current_informs[self.match_key] = agent_action['inform_slots'][self.match_key]

        agent_action['round'] = self.round_num
        agent_action['speaker'] = 'Agent'
        self.history.append(agent_action)

    def update_state_user(self, user_action):
        """
        Record a user action in the history and advance the round counter.

        The user's informs are merged into current_informs; 'round' and 'speaker' are
        added to the action dict, which is modified in place.
        """

        self.current_informs.update(user_action['inform_slots'])
        user_action['round'] = self.round_num
        user_action['speaker'] = 'User'
        self.history.append(user_action)
        self.round_num += 1

In [10]:
# Dialogue state tracker instance shared by the agent and the user simulator loop.
state_tracker = StateTracker(database=database, constants=constants)

### 5. The goal of the DQN agent is to take a state, which represents the history of the current conversation from the dialogue state tracker (ST), and select the best possible action for the dialogue

In [4]:
class DQNAgent:
    """
    The DQN agent that interacts with the user.
    Holds a behavior network and a target network, a fixed-capacity replay memory used as a ring buffer, and a
    simple rule-based policy (request each slot in order, then 'match_found', then 'done') used during warmup.
    """

    def __init__(self, state_size, constants):
        """
        The constructor of DQNAgent.
        Saves the constants, builds the behavior and target networks, optionally loads their weights and resets
        the rule-based policy.
        Parameters:
            state_size (int): The state representation size or length of numpy array
            constants (dict): Loaded constants in dict; only the 'agent' section is read
        Raises:
            ValueError: If the configured max memory size is smaller than the batch size
        """

        self.C = constants['agent']
        self.memory = []
        self.memory_index = 0
        self.max_memory_size = self.C['max_mem_size']
        self.eps = self.C['epsilon_init']
        self.vanilla = self.C['vanilla']
        self.lr = self.C['learning_rate']
        self.gamma = self.C['gamma']
        self.batch_size = self.C['batch_size']
        self.hidden_size = self.C['dqn_hidden_size']

        self.load_weights_file_path = self.C['load_weights_file_path']
        self.save_weights_file_path = self.C['save_weights_file_path']

        if self.max_memory_size < self.batch_size:
            raise ValueError('Max memory size must be at least as great as batch size!')

        self.state_size = state_size
        self.possible_actions = agent_actions
        self.num_actions = len(self.possible_actions)

        self.rule_request_set = rule_requests

        self.beh_model = self._build_model()
        self.tar_model = self._build_model()

        self._load_weights()

        self.reset()

    def _build_model(self):
        """Builds and returns model/graph of neural network (one hidden ReLU layer, linear output per action)."""

        model = Sequential()
        model.add(Dense(self.hidden_size, input_dim=self.state_size, activation='relu'))
        model.add(Dense(self.num_actions, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.lr))
        return model

    def reset(self):
        """Resets the rule-based variables."""

        self.rule_current_slot_index = 0
        self.rule_phase = 'not done'

    def get_action(self, state, use_rule=False):
        """
        Returns the action of the agent given a state.
        With probability eps a uniformly random action is returned. Otherwise either the rule-based policy or the
        behavior network is used to respond.
        Parameters:
            state (numpy.array): The state representation of the conversation (only used by the network policy)
            use_rule (bool): Indicates whether or not to use the rule-based policy, which depends on if this was called
                             in warmup or training. Default: False
        Returns:
            int: The index of the action in the possible actions
            dict: The action/response itself
        """

        # Epsilon-greedy exploration applies to both the rule-based and the network policy
        if self.eps > random.random():
            index = random.randint(0, self.num_actions - 1)
            action = self._map_index_to_action(index)
            return index, action

        if use_rule:
            return self._rule_action()
        return self._dqn_action(state)

    def _rule_action(self):
        """
        Returns a rule-based policy action.
        Requests every slot of the rule request set in order, then emits 'match_found' once and 'done' afterwards.
        Returns:
            int: The index of the action in the possible actions
            dict: The action/response itself
        Raises:
            ValueError: If the produced response is not one of the possible actions
        """

        if self.rule_current_slot_index < len(self.rule_request_set):
            slot = self.rule_request_set[self.rule_current_slot_index]
            self.rule_current_slot_index += 1
            rule_response = {'intent': 'request', 'inform_slots': {}, 'request_slots': {slot: 'UNK'}}
        elif self.rule_phase == 'not done':
            rule_response = {'intent': 'match_found', 'inform_slots': {}, 'request_slots': {}}
            self.rule_phase = 'done'
        elif self.rule_phase == 'done':
            rule_response = {'intent': 'done', 'inform_slots': {}, 'request_slots': {}}
        else:
            raise Exception('Should not have reached this clause')

        index = self._map_action_to_index(rule_response)
        return index, rule_response

    def _map_action_to_index(self, response):
        """
        Maps an action to an index from possible actions.
        Parameters:
            response (dict)
        Returns:
            int: Index of the first possible action equal to the response
        Raises:
            ValueError: If the response is not one of the possible actions
        """

        try:
            return self.possible_actions.index(response)
        except ValueError:
            raise ValueError('Response: {} not found in possible actions'.format(response)) from None

    def _dqn_action(self, state):
        """
        Returns a behavior model output given a state.
        Parameters:
            state (numpy.array)
        Returns:
            int: The index of the action in the possible actions
            dict: The action/response itself
        """

        index = np.argmax(self._dqn_predict_one(state))
        action = self._map_index_to_action(index)
        return index, action

    def _map_index_to_action(self, index):
        """
        Maps an index to an action in possible actions.
        Parameters:
            index (int)
        Returns:
            dict: A deep copy of the action, so callers may mutate it freely
        Raises:
            ValueError: If the index does not address a possible action (negative indices included)
        """

        # Membership in range() compares by equality, so numpy integer indices (e.g. from argmax) are accepted
        if index not in range(len(self.possible_actions)):
            raise ValueError('Index: {} not in range of possible actions'.format(index))
        return copy.deepcopy(self.possible_actions[int(index)])

    def _dqn_predict_one(self, state, target=False):
        """
        Returns a model prediction given a state.
        Parameters:
            state (numpy.array)
            target (bool): Use the target model instead of the behavior model
        Returns:
            numpy.array: Flattened prediction for the single state
        """

        return self._dqn_predict(state.reshape(1, self.state_size), target=target).flatten()

    def _dqn_predict(self, states, target=False):
        """
        Returns a model prediction given an array of states.
        Parameters:
            states (numpy.array)
            target (bool): Use the target model instead of the behavior model
        Returns:
            numpy.array
        """

        model = self.tar_model if target else self.beh_model
        return model.predict(states)

    def add_experience(self, state, action, reward, next_state, done):
        """
        Adds an experience tuple made of the parameters to the memory.
        The memory is a ring buffer: once max_memory_size entries exist the oldest slot is overwritten.
        Parameters:
            state (numpy.array)
            action (int)
            reward (int)
            next_state (numpy.array)
            done (bool)
        """

        # Grow the list by one placeholder until capacity is reached, then write at the rolling index
        if len(self.memory) < self.max_memory_size:
            self.memory.append(None)
        self.memory[self.memory_index] = (state, action, reward, next_state, done)
        self.memory_index = (self.memory_index + 1) % self.max_memory_size

    def empty_memory(self):
        """Empties the memory and resets the memory index."""

        self.memory = []
        self.memory_index = 0

    def is_memory_full(self):
        """Returns true if the memory is full."""

        return len(self.memory) == self.max_memory_size

    def train(self):
        """
        Trains the agent by improving the behavior model given the memory tuples.
        Runs len(memory) // batch_size batches. Each batch is sampled from the memory, stacked in the format the
        neural network expects, and fitted against targets computed with the Bellman equation for Q-Learning
        (plain DQN when vanilla is set, otherwise Double DQN).
        """

        # Calc. num of batches to run
        num_batches = len(self.memory) // self.batch_size
        for _ in range(num_batches):
            batch = random.sample(self.memory, self.batch_size)

            states = np.array([sample[0] for sample in batch])
            next_states = np.array([sample[3] for sample in batch])

            assert states.shape == (self.batch_size, self.state_size), 'States Shape: {}'.format(states.shape)
            assert next_states.shape == states.shape

            beh_state_preds = self._dqn_predict(states)  # For leveling error
            if not self.vanilla:
                beh_next_states_preds = self._dqn_predict(next_states)  # For indexing for DDQN
            tar_next_state_preds = self._dqn_predict(next_states, target=True)  # For target value for DQN (& DDQN)

            inputs = np.zeros((self.batch_size, self.state_size))
            targets = np.zeros((self.batch_size, self.num_actions))

            for i, (s, a, r, s_, d) in enumerate(batch):
                # Start from the current prediction so only the taken action contributes to the loss
                t = beh_state_preds[i]
                if not self.vanilla:
                    # DDQN: behavior model picks the next action, target model evaluates it
                    t[a] = r + self.gamma * tar_next_state_preds[i][np.argmax(beh_next_states_preds[i])] * (not d)
                else:
                    t[a] = r + self.gamma * np.amax(tar_next_state_preds[i]) * (not d)

                inputs[i] = s
                targets[i] = t

            self.beh_model.fit(inputs, targets, epochs=1, verbose=0)

    def copy(self):
        """Copies the behavior model's weights into the target model's weights."""

        self.tar_model.set_weights(self.beh_model.get_weights())

    @staticmethod
    def _tagged_weights_path(path, tag):
        """
        Returns the weights path with '_<tag>' inserted before a trailing '.h5' extension.
        Only the extension at the very end of the path is rewritten, so a '.h5' occurring in a directory name is
        left alone. A path that does not end in '.h5' is returned unchanged.
        """

        return re.sub(r'\.h5$', '_{}.h5'.format(tag), path)

    def save_weights(self):
        """Saves the weights of both models in two h5 files ('*_beh.h5' and '*_tar.h5'); no-op without a path."""

        if not self.save_weights_file_path:
            return
        self.beh_model.save_weights(self._tagged_weights_path(self.save_weights_file_path, 'beh'))
        self.tar_model.save_weights(self._tagged_weights_path(self.save_weights_file_path, 'tar'))

    def _load_weights(self):
        """Loads the weights of both models from two h5 files ('*_beh.h5' and '*_tar.h5'); no-op without a path."""

        if not self.load_weights_file_path:
            return
        self.beh_model.load_weights(self._tagged_weights_path(self.load_weights_file_path, 'beh'))
        self.tar_model.load_weights(self._tagged_weights_path(self.load_weights_file_path, 'tar'))

In [12]:
# Module-level agent used by the run/warmup/train helpers below; its input size comes from the state tracker.
dqn_agent = DQNAgent(state_tracker.get_state_size(), constants)

## Steps for single round of training 


### a. The state, which is either the previous next state or an initial state, is sent to the agent to select an action
### b. The state tracker is updated with the action that the agent has selected
### c. The updated agent action is given to the User which outputs reward and success information
### d. Error Model introduces error in the user action
### e. The error embedded user action is sent to State tracker's update method to save this information in its history

In [14]:
def run_round(state, warmup=False):
    """
    Play one agent/user exchange and store it as an experience.
    Parameters:
        state (numpy.array): State the agent acts on
        warmup (bool): When True the agent is asked for its rule-based policy. Default: False
    Returns:
        numpy.array: The state after the user's reply
        reward: Reward reported by the user simulator for this round
        done: Whether the episode has finished
        success: Success indicator reported by the user simulator
    """
    # 1) Agent picks an action for the current state
    print('Get action...')
    action_index, action = dqn_agent.get_action(state, use_rule=warmup)
    print('Action : ',action)

    # 2) State tracker records (and augments) the agent action
    state_tracker.update_state_agent(action)
    print('UPDATED HISTORY')
    state_tracker.print_history()

    # 3) User takes action given agent action
    step_result = user.step(action)
    user_action, reward, done, success = step_result
    print('User action : {} , Reward : {} , Done : {} , Success : {}'.format(*step_result))

    # 4) Infuse error into semantic frame level of user action (skipped on the final turn)
    if not done:
        emc.infuse_error(user_action)

    # 5) Update state tracker with user action
    state_tracker.update_state_user(user_action)

    # 6) Get next state and add experience
    following_state = state_tracker.get_state(done)
    dqn_agent.add_experience(state, action_index, reward, following_state, done)

    return following_state, reward, done, success

### The agent uses its rule-based policy to make actions. The warmup process is used to fill the agent's memory with these actions. We terminate this process when the size of the memory reaches WARMUP_MEM or when the memory buffer is full.

In [15]:
def warmup_run():
    """
    Fill the agent's replay memory using its rule-based policy.
    Runs whole episodes with `run_round(..., warmup=True)` until at least `initial_warm_memory` rounds have been
    played or the agent's memory is full, whichever comes first. The stop conditions are only evaluated between
    episodes, so the final episode always runs to completion.
    """
    print('Warmup Started...')
    total_step = 0
    # Use `<` rather than `!=`: total_step advances inside the episode loop and can step past the target
    # mid-episode, in which case `!=` would never become false and warmup would run until memory is full.
    while total_step < initial_warm_memory and not dqn_agent.is_memory_full():
        # Reset episode
        episode_reset()
        done = False
        # Get initial state from state tracker
        state = state_tracker.get_state()
        print('State : ',state)
        while not done:
            next_state, _, done, _ = run_round(state, warmup=True)
            total_step += 1
            state = next_state

    print('...Warmup Ended')

## The episode reset function is called before every episode to reset all the objects and get the initial user action of the episode

In [16]:
def episode_reset():
    """
    Prepare a fresh episode/conversation for the warmup and training loops.
    Resets the state tracker, obtains the user's opening action, passes it through the error model, records it in
    the state tracker and finally resets the agent.
    """

    # Start from a clean tracker
    state_tracker.reset()

    # Opening user action, passed through the error model before being tracked
    opening_action = user.reset()
    emc.infuse_error(opening_action)
    state_tracker.update_state_user(opening_action)

    # Agent is reset last
    dqn_agent.reset()

## Steps to train the agent which selects the best action based on a policy 

#### 1. Get state : Returns the state representation as a numpy array which is fed into the agent's neural network. The state representation contains useful information for the agent about the current state of the conversation. It is processed by the agent before being fed into the neural network.

#### 2. Returns the action of the agent given a state. Gets the action of the agent given the current state. Either the rule-based policy or the neural networks are used to respond.

#### 3. Update state tracker : Updates the dialogue history with the agent's action and augments the agent's action. Takes an agent action and updates the history. Also augments the agent_action param with query information and any other necessary information. Get the agent’s action and send it to the ST update method for an agent action: The ST updates its own history of the current conversation in this method as well as updating the agent action with database query information

#### 4. The updated agent action is sent as input into the user’s step method: In step the user sim crafts its own rule-based response and also outputs reward and success information. Return the response of the user sim. to the agent by using rules that simulate a user. Given the agent action craft a response by using deterministic rules that simulate (to some extent) a user. Some parts of the rules are stochastic. Check if the agent has succeeded or lost or still going.

In [17]:
def train_run():
    """
    Run the training loop for TRAIN_EPISODE_COUNT episodes.
    Every episode is played round by round with `run_round`. After each block of TRAIN_FREQ episodes the success
    rate and average reward of that block are computed; the memory is flushed when the rate is at least the best
    so far and at least THRESHOLD_SUCCESS_RATE, a strictly better rate is printed and triggers a weight save, and
    then the target model is synced and the agent is trained.
    """

    print('Training Started...')
    episode = 0
    period_reward_total = 0
    period_success_total = 0
    success_rate_best = 0.0
    while episode < TRAIN_EPISODE_COUNT:
        episode_reset()
        episode += 1
        state = state_tracker.get_state()
        print('STATE')
        print(state)
        print('HISTORY')
        state_tracker.print_history()

        # Play the conversation to its end, accumulating the reward of every round
        done = False
        while not done:
            state, reward, done, success = run_round(state)
            period_reward_total += reward

        period_success_total += success

        # Nothing more to do until a full period of episodes has been played
        if episode % TRAIN_FREQ != 0:
            continue

        # Check success rate
        success_rate = period_success_total / TRAIN_FREQ
        avg_reward = period_reward_total / TRAIN_FREQ
        # Flush
        if success_rate >= max(success_rate_best, THRESHOLD_SUCCESS_RATE):
            dqn_agent.empty_memory()
        # Update current best success rate
        if success_rate > success_rate_best:
            print('Episode: {} NEW BEST SUCCESS RATE: {} Avg Reward: {}' .format(episode, success_rate, avg_reward))
            success_rate_best = success_rate
            dqn_agent.save_weights()
        period_success_total = 0
        period_reward_total = 0
        # Copy behavior weights into the target model, then train
        dqn_agent.copy()
        dqn_agent.train()
    print('...Training Ended')

In [18]:
# Run the training loop (prints state, history and per-round details for every episode).
train_run()

Training Started...
Current form :  {'moviename': 'kung fu panda 3'}
DB Results :  {'moviename': 35, 'matching_all_constraints': 35}
Latest agent action :  None
One hot intents user action :  [0. 1. 0. 0. 0. 0.]
Bag of inform slots :  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Bag of request slots :  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0.]
STATE REPRESENT :  [0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0

{'intent': 'inform', 'request_slots': {}, 'inform_slots': {'date': 'tomorrow'}, 'round': 5, 'speaker': 'User'}
{'intent': 'inform', 'inform_slots': {'critic_rating': 'no match available'}, 'request_slots': {}, 'round': 6, 'speaker': 'Agent'}
{'intent': 'inform', 'request_slots': {}, 'inform_slots': {'starttime': '8:45 pm'}, 'round': 6, 'speaker': 'User'}
{'intent': 'inform', 'inform_slots': {'critic_rating': 'no match available'}, 'request_slots': {}, 'round': 7, 'speaker': 'Agent'}
SUCCESS :  0
Round : 7 , Reward : -1
User action : {'intent': 'inform', 'request_slots': {}, 'inform_slots': {'numberofpeople': '2'}} , Reward : -1 , Done : False , Success : False
Current form :  {'moviename': 'big short', 'critic_rating': 'no match available', 'video_format': 'anything', 'theater': 'regal meridian 16', 'city': 'seattle', 'date': 'tomorrow', 'starttime': '8:45 pm', 'numberofpeople': '2'}
DB Results :  {'moviename': 3, 'critic_rating': 0, 'video_format': 991, 'theater': 131, 'city': 303, 'd

{'intent': 'request', 'inform_slots': {}, 'request_slots': {'distanceconstraints': 'UNK'}, 'round': 2, 'speaker': 'Agent'}
{'intent': 'inform', 'request_slots': {}, 'inform_slots': {'distanceconstraints': 'anything'}, 'round': 2, 'speaker': 'User'}
{'intent': 'request', 'inform_slots': {}, 'request_slots': {'distanceconstraints': 'UNK'}, 'round': 3, 'speaker': 'Agent'}
{'intent': 'inform', 'request_slots': {}, 'inform_slots': {'distanceconstraints': 'your area'}, 'round': 3, 'speaker': 'User'}
{'intent': 'inform', 'inform_slots': {'critic_rating': 'no match available'}, 'request_slots': {}, 'round': 4, 'speaker': 'Agent'}
{'intent': 'inform', 'request_slots': {}, 'inform_slots': {'date': 'tomorrow'}, 'round': 4, 'speaker': 'User'}
{'intent': 'inform', 'inform_slots': {'theater': 'no match available'}, 'request_slots': {}, 'round': 5, 'speaker': 'Agent'}
{'intent': 'inform', 'request_slots': {}, 'inform_slots': {'numberofpeople': '5'}, 'round': 5, 'speaker': 'User'}
{'intent': 'inform',

Action :  {'intent': 'inform', 'inform_slots': {'mpaa_rating': 'PLACEHOLDER'}, 'request_slots': {}}
UPDATED HISTORY
{'intent': 'request', 'request_slots': {'ticket': 'UNK'}, 'inform_slots': {'moviename': 'the witch'}, 'round': 0, 'speaker': 'User'}
{'intent': 'inform', 'inform_slots': {'date': 'tomorrow'}, 'request_slots': {}, 'round': 1, 'speaker': 'Agent'}
{'intent': 'request', 'request_slots': {'ticket': 'UNK'}, 'inform_slots': {}, 'round': 1, 'speaker': 'User'}
{'intent': 'request', 'inform_slots': {}, 'request_slots': {'distanceconstraints': 'UNK'}, 'round': 2, 'speaker': 'Agent'}
{'intent': 'inform', 'request_slots': {}, 'inform_slots': {'distanceconstraints': 'anything'}, 'round': 2, 'speaker': 'User'}
{'intent': 'inform', 'inform_slots': {'mpaa_rating': 'no match available'}, 'request_slots': {}, 'round': 3, 'speaker': 'Agent'}
SUCCESS :  0
Round : 3 , Reward : -1
User action : {'intent': 'inform', 'request_slots': {}, 'inform_slots': {'numberofpeople': '2'}} , Reward : -1 , Do

In [19]:
# Fill the agent's memory with rule-based experiences.
warmup_run()

Warmup Started...
Current form :  {'moviename': 'star wars'}
DB Results :  {'moviename': 40, 'matching_all_constraints': 40}
Latest agent action :  None
One hot intents user action :  [0. 1. 0. 0. 0. 0.]
Bag of inform slots :  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Bag of request slots :  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0.]
STATE REPRESENT :  [0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0

NameError: name 'FAIL' is not defined

In [43]:
# Run the training loop; each new best success rate is reported together with the average reward.
train_run()

Training Started...
Episode: 400 NEW BEST SUCCESS RATE: 0.03 Avg Reward: -36.03
Episode: 1000 NEW BEST SUCCESS RATE: 0.11 Avg Reward: -32.1
Episode: 1500 NEW BEST SUCCESS RATE: 0.31 Avg Reward: -19.09
Episode: 1600 NEW BEST SUCCESS RATE: 0.39 Avg Reward: -14.07
Episode: 2500 NEW BEST SUCCESS RATE: 0.41 Avg Reward: -10.16
Episode: 2600 NEW BEST SUCCESS RATE: 0.45 Avg Reward: -7.32
Episode: 2700 NEW BEST SUCCESS RATE: 0.51 Avg Reward: -3.41
Episode: 2800 NEW BEST SUCCESS RATE: 0.52 Avg Reward: -2.2
Episode: 3700 NEW BEST SUCCESS RATE: 0.55 Avg Reward: 0.27
Episode: 4400 NEW BEST SUCCESS RATE: 0.56 Avg Reward: 1.34
Episode: 4700 NEW BEST SUCCESS RATE: 0.62 Avg Reward: 6.99
Episode: 5100 NEW BEST SUCCESS RATE: 0.66 Avg Reward: 9.75
Episode: 5600 NEW BEST SUCCESS RATE: 0.75 Avg Reward: 15.19
Episode: 5900 NEW BEST SUCCESS RATE: 0.79 Avg Reward: 18.81
Episode: 6200 NEW BEST SUCCESS RATE: 0.81 Avg Reward: 19.09
Episode: 7200 NEW BEST SUCCESS RATE: 0.84 Avg Reward: 21.39
Episode: 8900 NEW BEST

## Evaluation 

### To evaluate the quality of the agent we are going to use simulation evaluation.

### 1. Simulation Evaluation : This involves usage of 3 metrics to measure the quality {success rate, average turns, average reward}
    - Success rate   : Fraction of dialogues that ended successfully
    - Average turns  : Average length of the dialogue
    - Average reward : Average reward received during the conversation
    
### We are going to opt for a policy that has a high success rate, high average reward and low average turns. In our project we have chosen success rate as our primary evaluation metric