## Setup

In [5]:
# use full window width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import os
import numpy as np
os.chdir('..')
import virl
from matplotlib import pyplot as plt

## Agent Implementation

In [10]:
class QTable:
    def __init__(self, initial, num_of_actions):
        self.q_table = {}
        self.initial = initial
        self.num_of_actions = num_of_actions
    
    def get_actions(self, state):
        state_tuple = tuple(state)
        return self.q_table.get(state_tuple, [self.initial]*self.num_of_actions)
    
    def get_action_value(self, state, action_index):
        state_tuple = tuple(state)
        return self.q_table.get(state_tuple, [self.initial]*self.num_of_actions)[action_index]
    
    def set_action_value(self, state, action_index, action_value):
        if not tuple(state) in self.q_table:
            self.q_table[tuple(state)] = [self.initial]*self.num_of_actions
        self.q_table[tuple(state)][action_index] = action_value 

class QLearningAgent:

    def __init__(self, env):
        self.num_of_actions = env.action_space.n
        self.env = env

        self.q_table = QTable(initial=0, num_of_actions=self.num_of_actions) 
        
        # hyper parameters
        self.discount = 0.99 # gamma
        self.learning_rate = 0.25 # step size, alpha
        self.episodes = 2000
        self.print_out_every_x_episodes = int(self.episodes/50)
        
        # hyper parameters for epsilon
        self.initial_epsilon = 1 # initial
        self.decrease_factor = (1/self.episodes)/1.25 # epsilon
        self.decrease_factor = 0.00075
        
        # hyper parameters for discretising state data
        self.highest = 600000000
        self.lowest = 0
        self.number_bins = 20
        
        print("Hyperparameter dump")
        print("----")
        print("Number Of Episodes = " + str(self.episodes))
        print("Print out every " + str(self.print_out_every_x_episodes) + " episodes")
        print("Learning Rate = " + str(self.learning_rate))
        print("Discount = " + str(self.discount))
        print("----")
        print("Initial Epsilon = " + str(self.initial_epsilon))
        print("Epsilon Decrease Factor = " + str(self.decrease_factor))
        print("----")
        print("Number of Bins to Discretise State = " + str(self.number_bins))
        print("----")
        
    def continous_to_discrete(self,continous_state):
        bins = np.linspace(self.lowest,self.highest,num=self.number_bins)
        discrete = np.digitize(continous_state,bins)
        return discrete
    
    def run_all_episodes(self):
        all_rewards = []
        all_q_table_exploits = []
        epislon = self.initial_epsilon # at the start only explore
        
        for episode in range(1, self.episodes + 1):
            rewards, exploited_q_table = self.run_episode(epislon)
            total_reward = np.sum(rewards)

            if episode % self.print_out_every_x_episodes == 0:
                print("Episode number: " + str(episode) + ". Total reward in episode: " + str(total_reward) + ". Episode executed with epsilon = " + str(epislon))
                print("Average total reward in last " + str(self.print_out_every_x_episodes) + " episodes: " + str(np.mean(all_rewards[-self.print_out_every_x_episodes:])))
                print("Average number of times we exploited q table in last " + str(self.print_out_every_x_episodes) + " episodes: " + str(np.mean(all_q_table_exploits[-self.print_out_every_x_episodes:])))
                print("-----")
            all_rewards.append(total_reward)
            all_q_table_exploits.append(exploited_q_table)
            epislon -= self.decrease_factor #hyperparameter
            
        return all_rewards
    
    def run_episode(self,epislon):
        rewards = []
        done = False
        
        state = self.env.reset()
        state = self.continous_to_discrete(state)
        
        exploited_q_table = 0
        
        while not done:
            random_number = np.random.random()
            if random_number < epislon:
                #explore
                action = np.random.choice(self.num_of_actions)
            else:
                #exploit
                action = self.get_action(state)
                exploited_q_table+=1
                
            new_state, reward, done, i = self.env.step(action=action) # Q-learning
            new_state = self.continous_to_discrete(new_state)
            
            #update q table
            self.update_q_table(state,new_state,action,reward)
            
            rewards.append(reward)
            state = new_state
        return (rewards, exploited_q_table)
    
    def update_q_table(self,state,new_state,action,reward):
        #target
        #max of a' given the 
        max_a_prime = np.max(self.q_table.get_actions(new_state))
        target = reward + (self.discount*max_a_prime)
        
        #compute difference
        action_value = self.q_table.get_action_value(state,action)
        difference = target - action_value
        
        #take a small step in the delta direction
        new_q = action_value + (self.learning_rate * difference)
        
        self.q_table.set_action_value(state,action,new_q)
        
    
    def get_action(self,state):
        #exploit the q table
        actions = self.q_table.get_actions(state)
        action = np.argmax(self.q_table.get_actions(state))
        return action

## Analysis

In [11]:
env = virl.Epidemic(stochastic=False, noisy=False)
agent = QLearningAgent(env)
rewards = agent.run_all_episodes()

Hyperparameter dump
----
Number Of Episodes = 2000
Print out every 40 episodes
Learning Rate = 0.25
Discount = 0.99
----
Initial Epsilon = 1
Epsilon Decrease Factor = 0.0004
----
Number of Bins to Discretise State = 20
----
Episode number: 40. Total reward in episode: -2.0787121598145073. Episode executed with epsilon = 0.9844000000000017
Average total reward in last 40 episodes: -1.6005948167025672
Average number of times we exploited q table in last 40 episodes: 0.4358974358974359
-----
Episode number: 80. Total reward in episode: -1.8484907226700567. Episode executed with epsilon = 0.9684000000000035
Average total reward in last 40 episodes: -1.6368228308587138
Average number of times we exploited q table in last 40 episodes: 1.25
-----
Episode number: 120. Total reward in episode: -1.625091768120587. Episode executed with epsilon = 0.9524000000000052
Average total reward in last 40 episodes: -1.714146345759815
Average number of times we exploited q table in last 40 episodes: 2.15
-

Episode number: 1280. Total reward in episode: -1.826881864986881. Episode executed with epsilon = 0.48840000000005473
Average total reward in last 40 episodes: -1.6288770970265027
Average number of times we exploited q table in last 40 episodes: 25.3
-----
Episode number: 1320. Total reward in episode: -1.7974512406754635. Episode executed with epsilon = 0.4724000000000543
Average total reward in last 40 episodes: -1.5803107731171218
Average number of times we exploited q table in last 40 episodes: 27.625
-----
Episode number: 1360. Total reward in episode: -1.8257774564501008. Episode executed with epsilon = 0.4564000000000538
Average total reward in last 40 episodes: -1.5734463367990428
Average number of times we exploited q table in last 40 episodes: 28.25
-----
Episode number: 1400. Total reward in episode: -1.8668698225426226. Episode executed with epsilon = 0.44040000000005336
Average total reward in last 40 episodes: -1.5657419735834583
Average number of times we exploited q ta

In [None]:
def plot(agent, rewards):
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))
    axes[1].plot(rewards);
    axes[1].set_xlabel('episode')
    axes[1].set_ylabel('total reward r(t)')
    
plot(agent, rewards)

## Evaluation

Eval here