# [KDD Cup|Humanities Track Tutorial Q-Learning](https://compete.hexagon-ml.com/tutorial/kdd-cuphumanities-track-tutorial/)

### KDD Cup|Humanities Track Tutorial Q-Learning
This Tutorial builds on the previous tutorial to demonstrate a baseline implementation of a standard Reinforcement Learning (RL) Algorithm

### State

$S \in \{1,2,3,4,5\}$

### Action
$A_S = [a_{ITN},a_{IRS}]$

where  $a_{ITN} \in [0,1]$ and $a_{IRS} \in [0,1]$

### Reward
$R_{\pi} \in (- \infty,\infty)$

![](image/rewards2.png)

In [1]:
import numpy as np
from collections import defaultdict
import random
# !pip3 install git+https://github.com/slremy/netsapi --user --upgrade
from netsapi.challenge import * 

### Creating a Valid Submission from Agent Code:

In [None]:
class BanditRPM(object):
    def __init__(self,env):
        self.env = env
        self.action_resolution = 0.1
        self.actions = self.actionSpace()    
        
        self.ActionValue = {}
        self.init = (2,5)
        for key in self.actions:
            self.ActionValue[key] = self.init
        
#     def actionSpace(self):
#         x = np.arange(0,1+self.action_resolution,self.action_resolution)
#         y = 1-x
#         x = x.reshape(len(x),1)
#         y = y.reshape(len(y),1)
#         xy = np.concatenate((x, y), axis=1)
#         xy = xy.round(2)
#         xy = [tuple(row) for row in xy]
        
#         return xy
        
    def actionSpace(self):
        x,y = np.meshgrid(np.arange(0,1+self.action_resolution,self.action_resolution),
                          np.arange(0,1+self.action_resolution,self.action_resolution))
        xy = np.concatenate((x.reshape(-1,1), y.reshape(-1,1)), axis=1)
        xy = xy.round(2)
        xy = [tuple(row) for row in xy]
        return xy
    
    def choose_action(self):
        """
        Use Thompson sampling to choose action. Sample from each posterior and choose the max of the samples.
        """
        samples = {}
        for key in self.ActionValue:
            samples[key] = np.random.beta(self.ActionValue[key][0], self.ActionValue[key][1])
        max_value =  max(samples, key=samples.get)
        return max_value    

    def update(self,action,reward):
        """
        Update parameters of posteriors, which are Beta distributions
        """
        a, b = self.ActionValue[action]
        a = a+reward
        b = b + 1 - reward
        a = 0.001 if a <= 0 else a
        b = 0.001 if b <= 0 else b
        
        self.ActionValue[action] = (a, b)
        
    def train(self):
        for _ in range(20): #Do not change
            self.env.reset()
            while True:
                action =  self.choose_action()
                nextstate, reward, done, _ = self.env.evaluateAction(list(action))
                self.update(action,reward)
                if done:
                    break


    def generate(self):
        best_policy = None
        best_reward = -float('Inf')
        self.train()
        best_policy = {state: list(self.choose_action()) for state in range(1,6)}
        best_reward = self.env.evaluatePolicy(best_policy)
        
        print(best_policy, best_reward)
        
        return best_policy, best_reward                    

### Run the EvaluateChallengeSubmission Method with your Agent Class

In [None]:
EvaluateChallengeSubmission(ChallengeSeqDecEnvironment, BanditRPM, "BanditRPM_submission.csv")

In [137]:
class BanditRPM(object):
    def __init__(self,keys,init):
        self.ActionValue = {}
        for key in keys:
            self.ActionValue[key] = init
    
    def get_reward(self,action,text):
        print("action=",action)
        print("text=",text)
        if any(x in text for x in action):
            return 1
        else:
            return 0
    
    def choose_action(self):
        """
        Use Thompson sampling to choose action. Sample from each posterior and choose the max of the samples.
        """
        samples = {}
        for key in self.ActionValue:
            print("key=",key)
#             print("key=",self.ActionValue[key][0])
#             print("self.ActionValue[key][1]=",self.ActionValue[key][1])
            
            samples[key] = np.random.beta(self.ActionValue[key][0], self.ActionValue[key][1])
#             print("samples[key]=",samples[key])
        max_value =  max(samples, key=samples.get)
        print("max_value=",max_value)
        return max_value

    def update(self,action,reward):
        """
        Update parameters of posteriors, which are Beta distributions
        """
        print("action=",action)
        print("reward=",reward)
        a, b = self.ActionValue[action]
        a = a+reward
        b = b + 1 - reward
        a = 0.0001 if a <= 0 else a
        b = 0.0001 if b <= 0 else b
        print("a=",a)
        print("b=",b)
        self.ActionValue[action] = (a, b)

In [138]:
bandit = BanditRPM([('hillary','clinton'),('donald','trump'),('bernie','sanders')],(1,5))

In [111]:
bandit.ActionValue

{('hillary', 'clinton'): (1, 5),
 ('donald', 'trump'): (1, 5),
 ('bernie', 'sanders'): (1, 5)}

In [112]:
action = bandit.choose_action()
reward= bandit.get_reward(action)
bandit.update(action,reward)

max_value= ('hillary', 'clinton')


TypeError: get_reward() missing 1 required positional argument: 'text'