# [KDD Cup|Humanities Track Tutorial Q-Learning](https://compete.hexagon-ml.com/tutorial/kdd-cuphumanities-track-tutorial/)

### KDD Cup|Humanities Track Tutorial Q-Learning
This Tutorial builds on the previous tutorial to demonstrate a baseline implementation of a standard Reinforcement Learning (RL) Algorithm

### State

$S \in \{1,2,3,4,5\}$

### Action
$A_S = [a_{ITN},a_{IRS}]$

where  $a_{ITN} \in [0,1]$ and $a_{IRS} \in [0,1]$

### Reward
$R_{\pi} \in (- \infty,\infty)$

![](image/rewards2.png)

In [2]:
import numpy as np
from collections import defaultdict
import random
# !pip3 install git+https://github.com/slremy/netsapi --user --upgrade
from netsapi.challenge import * 

### Learning a Value Function Based on ϵ-greedy action selection

This common resource was used as a reference for the implementation presented here: https://kofzor.github.io/Learning_Value_Functions/. Please refer to the blog and this Tutorial in tandem. The code below uses the first example from the blog with the Challenge Environment (as opposed to Gym).

In [6]:
env = ChallengeSeqDecEnvironment(experimentCount = 10500)

Q = defaultdict(lambda : 0.) # Q-function
n = defaultdict(lambda : 1.) # number of visits

def actionSpace(resolution):
    x,y = np.meshgrid(np.arange(0,1+resolution,resolution), np.arange(0,1+resolution,resolution))
    xy = np.concatenate((x.reshape(-1,1), y.reshape(-1,1)), axis=1)
    return xy.round(2).tolist()

#HyperParameters
epsilon = 0.1
gamma = 0.9
action_resolution = 0.1
episode_number = 1 #for submission this is fixed as 20


#Set-up
actions = actionSpace(action_resolution)
print("actions.shape=",len(actions))
actionspace = range(len(actions)-1)
greedy_action = lambda s : max(actionspace, key=lambda a : Q[(s,a)])
max_q = lambda sp : max([Q[(sp,a)] for a in actionspace])

#Training of Q Table
for _ in range(episode_number):
    env.reset()
    nextstate = env.state
    while True:
        state = nextstate
        print("state=",state)
        # Epsilon-Greedy
        if epsilon > random.random() :
            action = random.choice(actionspace)
            print('random_action',action)
        else :
            action = greedy_action(state)
        print("action=",action)    
        env_action = actions[action] #convert to ITN/IRS
        print('env_action', type(env_action))
        nextstate, reward, done, _ = env.evaluateAction(env_action)
        print('reward=',reward)
        print('done=',done)
        

        # Q-learning
        if done :
            Q[(state,action)] = Q[(state,action)] + \
                                1./n[(state,action)] * ( reward - Q[(state,action)] )
            break
        else :
            Q[(state,action)] = Q[(state,action)] + \
                                1./n[(state,action)] * ( reward + \
                                                        gamma * max_q(nextstate) - Q[(state,action)] )

#Greedy Policy Learnt from Q Table
best_policy = {state: list(actions[greedy_action(state-1)]) for state in range(1,6)}
best_reward = env.evaluatePolicy(best_policy)
print(best_policy, best_reward)

actions.shape= 121
state= 1
action= 0
env_action <class 'list'>
10500  Evaluations Remaining
reward= 2.9052422663040014
done= False
state= 2
action= 0
env_action <class 'list'>
10499  Evaluations Remaining
reward= -0.1596777840804906
done= False
state= 3
action= 0
env_action <class 'list'>
10498  Evaluations Remaining
reward= 0.07281273286044732
done= False
state= 4
action= 0
env_action <class 'list'>
10497  Evaluations Remaining
reward= -0.1395876236706961
done= False
state= 5
action= 0
env_action <class 'list'>
10496  Evaluations Remaining
reward= -0.052668676708988116
done= True
10495  Evaluations Remaining
{1: [0.0, 0.0], 2: [0.0, 0.0], 3: [0.1, 0.0], 4: [0.0, 0.0], 5: [0.1, 0.0]} 1.9885433006855462


### Creating a Valid Submission from Agent Code:

In [3]:
class Q_Agent():
    
    def __init__(self, environment):
        
        #Hyperparameters
        self.env = environment
        self.epsilon = 0.1
        self.gamma = 0.9
        self.action_resolution = 0.2
        self.Q = defaultdict(lambda : 0.) # Q-function
        self.n = defaultdict(lambda : 1.) # number of visits
        self.actions = actionSpace(self.action_resolution)
        self.actionspace = range(len(self.actions)-1)
        
    
    def actionSpace(self):
        x,y = np.meshgrid(np.arange(0,1+self.action_resolution,self.action_resolution),
                          np.arange(0,1+self.action_resolution,self.action_resolution))
        xy = np.concatenate((x.reshape(-1,1), y.reshape(-1,1)), axis=1)
        return xy.round(2).tolist()

    def train(self):
        
        Q = self.Q
        n = self.n
        actions = self.actions
        actionspace = self.actionspace

        greedy_action = lambda s : max(actionspace, key=lambda a : Q[(s,a)])
        max_q = lambda sp : max([Q[(sp,a)] for a in actionspace])

        
        for _ in range(200): #Do not change
            
            self.env.reset()
            nextstate = self.env.state
            
            while True:
                state = nextstate

                # Epsilon-Greedy Action Selection
                if epsilon > random.random() :
                    action = random.choice(actionspace)
                else :
                    action = greedy_action(state)

                env_action = actions[action]#convert to ITN/IRS
                print('env_action', env_action)
                nextstate, reward, done, _ = self.env.evaluateAction(env_action)
                print("nextstate=",nextstate)
                print("reward=",reward)

                # Q-learning
                if done :
                    Q[(state,action)] = Q[(state,action)] + 1./n[(state,action)] * ( reward - Q[(state,action)] )
                    break
                else :
                    Q[(state,action)] = Q[(state,action)] + 1./n[(state,action)] * ( reward + gamma * max_q(nextstate) - Q[(state,action)] )

        return Q


    def generate(self):
        best_policy = None
        best_reward = -float('Inf')
        
        Q_trained = self.train()
        greedy_eval = lambda s : max(actionspace, key=lambda a : Q_trained[(s,a)])
        
        best_policy = {state: list(actions[greedy_eval(state-1)]) for state in range(1,6)}
        best_reward = self.env.evaluatePolicy(best_policy)
        
        print(best_policy, best_reward)
        
        return best_policy, best_reward

### Run the EvaluateChallengeSubmission Method with your Agent Class

In [4]:
EvaluateChallengeSubmission(ChallengeSeqDecEnvironment, Q_Agent, "Q_submission.csv")

env_action [0.0, 0.0]
105  Evaluations Remaining
nextstate= 2
reward= 2.469036599869387
env_action [0.0, 0.0]
104  Evaluations Remaining
nextstate= 3
reward= 0.18492713129336869
env_action [0.0, 0.0]
103  Evaluations Remaining
nextstate= 4
reward= 0.11851761849252807
env_action [0.0, 0.0]
102  Evaluations Remaining
nextstate= 5
reward= 0.049422308420226546
env_action [0.0, 0.0]
101  Evaluations Remaining
nextstate= 6
reward= 0.1319613649711946
env_action [0.0, 0.0]
100  Evaluations Remaining
nextstate= 2
reward= 2.5733415915866735
env_action [0.0, 0.0]
99  Evaluations Remaining
nextstate= 3
reward= -0.2234729786123344
env_action [0.0, 0.0]
98  Evaluations Remaining
nextstate= 4
reward= 0.06848864193618898
env_action [0.0, 0.0]
97  Evaluations Remaining
nextstate= 5
reward= 0.042680290439070845
env_action [0.0, 0.0]
96  Evaluations Remaining
nextstate= 6
reward= 0.14730562992361618
env_action [0.0, 0.0]
95  Evaluations Remaining
nextstate= 2
reward= 2.4998379837878053
env_action [0.01, 

nextstate= 3
reward= 52.715558539605176
env_action [0.05, 0.0]
13  Evaluations Remaining
nextstate= 4
reward= -0.06410038351457814
env_action [0.92, 0.34]
12  Evaluations Remaining
nextstate= 5
reward= 29.21438135843079
env_action [0.09, 0.0]
11  Evaluations Remaining
nextstate= 6
reward= 0.05961615108611307
env_action [0.76, 0.73]
10  Evaluations Remaining
nextstate= 2
reward= 5.837346858028075
env_action [0.91, 0.51]
9  Evaluations Remaining
nextstate= 3
reward= -46.878764133268945
env_action [0.05, 0.0]
8  Evaluations Remaining
nextstate= 4
reward= 0.05593157675307037
env_action [0.92, 0.34]
7  Evaluations Remaining
nextstate= 5
reward= 30.60929909735881
env_action [0.09, 0.0]
6  Evaluations Remaining
nextstate= 6
reward= 0.2394097403080413
env_action [0.0, 0.0]
5  Evaluations Remaining
nextstate= 2
reward= 2.9509846098083132
env_action [0.25, 0.97]
4  Evaluations Remaining
nextstate= 3
reward= 50.51645538836125
env_action [0.05, 0.0]
3  Evaluations Remaining
nextstate= 4
reward= -0

ValueError: You have exceeded the permitted number of evaluations

In [90]:
class BanditRPM(object):
    def __init__(self,keys,init):
        self.ActionValue = {}
        for key in keys:
            self.ActionValue[key] = init

    def get_reward(self,action,text):
        if any(x in text for x in action):
            return 1
        else:
            return 0
    
    def choose_action(self):
        """
        Use Thompson sampling to choose action. Sample from each posterior and choose the max of the samples.
        """
        samples = {}
        for key in self.ActionValue:
            print("key=",key)
            print("key=",self.ActionValue[key][0])
            print("self.ActionValue[key][1]=",self.ActionValue[key][1])
            
            samples[key] = np.random.beta(self.ActionValue[key][0], self.ActionValue[key][1])
            print("samples[key]=",samples[key])
            max_value =  max(samples, key=samples.get)
            print("max_value=",max_value)
            return max_value

    def update(self,action,reward):
        """
        Update parameters of posteriors, which are Beta distributions
        """
        a, b = self.ActionValue[action]
        self.ActionValue[action] = (a+reward, b + 1 - reward)

In [91]:
bandit = BanditRPM([('hillary','clinton'),('donald','trump'),('bernie','sanders')],(1,1))

In [92]:
bandit.ActionValue

{('hillary', 'clinton'): (1, 1),
 ('donald', 'trump'): (1, 1),
 ('bernie', 'sanders'): (1, 1)}

In [93]:
action = bandit.choose_action()
bandit.update(action,1)

key= ('hillary', 'clinton')
key= 1
self.ActionValue[key][1]= 1
samples[key]= 0.44087618596399186
max_value= ('hillary', 'clinton')
