# [KDD Cup|Humanities Track Tutorial Q-Learning](https://compete.hexagon-ml.com/tutorial/kdd-cuphumanities-track-tutorial/)

### KDD Cup|Humanities Track Tutorial Q-Learning
This Tutorial builds on the previous tutorial to demonstrate a baseline implementation of a standard Reinforcement Learning (RL) Algorithm

### State

$S \in \{1,2,3,4,5\}$

### Action
$A_S = [a_{ITN},a_{IRS}]$

where  $a_{ITN} \in [0,1]$ and $a_{IRS} \in [0,1]$

### Reward
$R_{\pi} \in (- \infty,\infty)$

![](image/rewards2.png)

In [1]:
import numpy as np
from collections import defaultdict
import random

# !pip3 install git+https://github.com/slremy/netsapi --user --upgrade
from netsapi.challenge import * 

In [2]:
def actionSpace(resolution):
    x = np.arange(0,1+resolution,resolution)
    y = 1-x
    x = x.reshape(len(x),1)
    y = y.reshape(len(y),1)
    xy = np.concatenate((x, y), axis=1)
    return xy.round(2).tolist()
actionSpace(0.1)

[[0.0, 1.0],
 [0.1, 0.9],
 [0.2, 0.8],
 [0.3, 0.7],
 [0.4, 0.6],
 [0.5, 0.5],
 [0.6, 0.4],
 [0.7, 0.3],
 [0.8, 0.2],
 [0.9, 0.1],
 [1.0, 0.0]]

### Learning a Value Function Based on ϵ-greedy action selection

This common resource was used as a reference for the implementation presented here: https://kofzor.github.io/Learning_Value_Functions/. Please refer to the blog and this Tutorial in tandem. The code below uses the first example from the blog with the Challenge Environment (as opposed to Gym).

In [3]:
env = ChallengeSeqDecEnvironment(experimentCount = 105)

Q = defaultdict(lambda : 0.) # Q-function
n = defaultdict(lambda : 1.) # number of visits

def actionSpace(resolution):
    x = np.arange(0,1+resolution,resolution)
    y = 1-x
    x = x.reshape(len(x),1)
    y = y.reshape(len(y),1)
    xy = np.concatenate((x, y), axis=1)
    return xy.round(2).tolist()


#HyperParameters
epsilon = 0.1
gamma = 0.9
action_resolution = 0.1
episode_number = 1 #for submission this is fixed as 20


#Set-up
actions = actionSpace(action_resolution)
print(actions)
actionspace = range(len(actions)-1)
greedy_action = lambda s : max(actionspace, key=lambda a : Q[(s,a)])
max_q = lambda sp : max([Q[(sp,a)] for a in actionspace])

#Training of Q Table
for _ in range(episode_number):
    env.reset()
    nextstate = env.state
    while True:
        state = nextstate
        # Epsilon-Greedy
        if epsilon > random.random() :
            action = random.choice(actionspace)
            print('random_action',action)
        else :
            action = greedy_action(state)
        env_action = actions[action]#convert to ITN/IRS
        nextstate, reward, done, _ = env.evaluateAction(env_action)

        # Q-learning
        if done :
            Q[(state,action)] = Q[(state,action)] + \
                                1./n[(state,action)] * ( reward - Q[(state,action)] )
            break
        else :
            Q[(state,action)] = Q[(state,action)] + \
                                1./n[(state,action)] * ( reward + \
                                                        gamma * max_q(nextstate) - Q[(state,action)] )

#Greedy Policy Learnt from Q Table
best_policy = {state: list(actions[greedy_action(state-1)]) for state in range(1,6)}
best_reward = env.evaluatePolicy(best_policy)
print(best_policy, best_reward)

[[0.0, 1.0], [0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6], [0.5, 0.5], [0.6, 0.4], [0.7, 0.3], [0.8, 0.2], [0.9, 0.1], [1.0, 0.0]]
105  Evaluations Remaining
104  Evaluations Remaining
103  Evaluations Remaining
102  Evaluations Remaining
101  Evaluations Remaining
100  Evaluations Remaining
{1: [0.0, 1.0], 2: [0.0, 1.0], 3: [0.0, 1.0], 4: [0.0, 1.0], 5: [0.1, 0.9]} 108.07990517167795


### Creating a Valid Submission from Agent Code:

In [4]:
class Q_Agent():
    
    def __init__(self, environment):
        
        #Hyperparameters
        self.env = environment
        self.epsilon = 0.1
        self.gamma = 0.9
        self.action_resolution = 0.01
        self.Q = defaultdict(lambda : 0.) # Q-function
        self.n = defaultdict(lambda : 1.) # number of visits
        self.actions = actionSpace(self.action_resolution)
        self.actionspace = range(len(self.actions)-1)
        
    
    def actionSpace(self,resolution):
        x = np.arange(0,1+resolution,resolution)
        y = 1-x
        x = x.reshape(len(x),1)
        y = y.reshape(len(y),1)
        xy = np.concatenate((x, y), axis=1)
        return xy.round(2).tolist()

    def train(self):
        
        Q = self.Q
        n = self.n
        actions = self.actions
        print("actions=",actions)
        actionspace = self.actionspace

        greedy_action = lambda s : max(actionspace, key=lambda a : Q[(s,a)])
        max_q = lambda sp : max([Q[(sp,a)] for a in actionspace])

        
        for _ in range(20): #Do not change
            
            self.env.reset()
            nextstate = self.env.state
            
            while True:
                state = nextstate

                # Epsilon-Greedy Action Selection
                if epsilon > random.random() :
                    action = random.choice(actionspace)
                else :
                    action = greedy_action(state)

                env_action = actions[action]#convert to ITN/IRS
                print('env_action', env_action)
                nextstate, reward, done, _ = self.env.evaluateAction(env_action)
#                 print("nextstate=",nextstate)
#                 print("reward=",reward)

                # Q-learning
                if done :
                    Q[(state,action)] = Q[(state,action)] + 1./n[(state,action)] * ( reward - Q[(state,action)] )
                    break
                else :
                    Q[(state,action)] = Q[(state,action)] + 1./n[(state,action)] * ( reward + gamma * max_q(nextstate) - Q[(state,action)] )

        return Q


    def generate(self):
        best_policy = None
        best_reward = -float('Inf')
        
        Q_trained = self.train()
        greedy_eval = lambda s : max(actionspace, key=lambda a : Q_trained[(s,a)])
        
        best_policy = {state: list(actions[greedy_eval(state-1)]) for state in range(1,6)}
        best_reward = self.env.evaluatePolicy(best_policy)
        
        print(best_policy, best_reward)
    
        return best_policy, best_reward

### Run the EvaluateChallengeSubmission Method with your Agent Class

In [5]:
EvaluateChallengeSubmission(ChallengeSeqDecEnvironment, Q_Agent, "Q_submission.csv")

actions= [[0.0, 1.0], [0.01, 0.99], [0.02, 0.98], [0.03, 0.97], [0.04, 0.96], [0.05, 0.95], [0.06, 0.94], [0.07, 0.93], [0.08, 0.92], [0.09, 0.91], [0.1, 0.9], [0.11, 0.89], [0.12, 0.88], [0.13, 0.87], [0.14, 0.86], [0.15, 0.85], [0.16, 0.84], [0.17, 0.83], [0.18, 0.82], [0.19, 0.81], [0.2, 0.8], [0.21, 0.79], [0.22, 0.78], [0.23, 0.77], [0.24, 0.76], [0.25, 0.75], [0.26, 0.74], [0.27, 0.73], [0.28, 0.72], [0.29, 0.71], [0.3, 0.7], [0.31, 0.69], [0.32, 0.68], [0.33, 0.67], [0.34, 0.66], [0.35, 0.65], [0.36, 0.64], [0.37, 0.63], [0.38, 0.62], [0.39, 0.61], [0.4, 0.6], [0.41, 0.59], [0.42, 0.58], [0.43, 0.57], [0.44, 0.56], [0.45, 0.55], [0.46, 0.54], [0.47, 0.53], [0.48, 0.52], [0.49, 0.51], [0.5, 0.5], [0.51, 0.49], [0.52, 0.48], [0.53, 0.47], [0.54, 0.46], [0.55, 0.45], [0.56, 0.44], [0.57, 0.43], [0.58, 0.42], [0.59, 0.41], [0.6, 0.4], [0.61, 0.39], [0.62, 0.38], [0.63, 0.37], [0.64, 0.36], [0.65, 0.35], [0.66, 0.34], [0.67, 0.33], [0.68, 0.32], [0.69, 0.31], [0.7, 0.3], [0.71, 0.29]

env_action [0.0, 1.0]
97  Evaluations Remaining
env_action [0.85, 0.15]
96  Evaluations Remaining
env_action [0.73, 0.27]
95  Evaluations Remaining
env_action [0.0, 1.0]
94  Evaluations Remaining
env_action [0.63, 0.37]
93  Evaluations Remaining
env_action [0.0, 1.0]
92  Evaluations Remaining
env_action [0.85, 0.15]
91  Evaluations Remaining
env_action [0.73, 0.27]
90  Evaluations Remaining
env_action [0.75, 0.25]
89  Evaluations Remaining
env_action [0.63, 0.37]
88  Evaluations Remaining
env_action [0.0, 1.0]
87  Evaluations Remaining
env_action [0.85, 0.15]
86  Evaluations Remaining
env_action [0.42, 0.58]
85  Evaluations Remaining
env_action [0.0, 1.0]
84  Evaluations Remaining
env_action [0.63, 0.37]
83  Evaluations Remaining
env_action [0.0, 1.0]
82  Evaluations Remaining
env_action [0.85, 0.15]
81  Evaluations Remaining
env_action [0.73, 0.27]
80  Evaluations Remaining
env_action [0.0, 1.0]
79  Evaluations Remaining
env_action [0.86, 0.14]
78  Evaluations Remaining
env_action [0.

env_action [0.91, 0.09]
61  Evaluations Remaining
env_action [0.0, 1.0]
60  Evaluations Remaining
env_action [0.0, 1.0]
59  Evaluations Remaining
env_action [0.01, 0.99]
58  Evaluations Remaining
env_action [0.39, 0.61]
57  Evaluations Remaining
env_action [0.91, 0.09]
56  Evaluations Remaining
env_action [0.0, 1.0]
55  Evaluations Remaining
env_action [0.52, 0.48]
54  Evaluations Remaining
env_action [0.01, 0.99]
53  Evaluations Remaining
env_action [0.39, 0.61]
52  Evaluations Remaining
env_action [0.91, 0.09]
51  Evaluations Remaining
env_action [0.0, 1.0]
50  Evaluations Remaining
env_action [0.52, 0.48]
49  Evaluations Remaining
env_action [0.73, 0.27]
48  Evaluations Remaining
env_action [0.39, 0.61]
47  Evaluations Remaining
env_action [0.91, 0.09]
46  Evaluations Remaining
env_action [0.0, 1.0]
45  Evaluations Remaining
env_action [0.52, 0.48]
44  Evaluations Remaining
env_action [0.01, 0.99]
43  Evaluations Remaining
env_action [0.39, 0.61]
42  Evaluations Remaining
env_action

env_action [0.57, 0.43]
26  Evaluations Remaining
env_action [0.0, 1.0]
25  Evaluations Remaining
env_action [0.56, 0.44]
24  Evaluations Remaining
env_action [0.36, 0.64]
23  Evaluations Remaining
env_action [0.87, 0.13]
22  Evaluations Remaining
env_action [0.56, 0.44]
21  Evaluations Remaining
env_action [0.0, 1.0]
20  Evaluations Remaining
env_action [0.56, 0.44]
19  Evaluations Remaining
env_action [0.16, 0.84]
18  Evaluations Remaining
env_action [0.87, 0.13]
17  Evaluations Remaining
env_action [0.56, 0.44]
16  Evaluations Remaining
env_action [0.0, 1.0]
15  Evaluations Remaining
env_action [0.56, 0.44]
14  Evaluations Remaining
env_action [0.01, 0.99]
13  Evaluations Remaining
env_action [0.87, 0.13]
12  Evaluations Remaining
env_action [0.56, 0.44]
11  Evaluations Remaining
env_action [0.0, 1.0]
10  Evaluations Remaining
env_action [0.56, 0.44]
9  Evaluations Remaining
env_action [0.01, 0.99]
8  Evaluations Remaining
env_action [0.28, 0.72]
7  Evaluations Remaining
env_action 

env_action [0.0, 1.0]
104  Evaluations Remaining
env_action [0.0, 1.0]
103  Evaluations Remaining
env_action [0.71, 0.29]
102  Evaluations Remaining
env_action [0.0, 1.0]
101  Evaluations Remaining
env_action [0.0, 1.0]
100  Evaluations Remaining
env_action [0.0, 1.0]
99  Evaluations Remaining
env_action [0.01, 0.99]
98  Evaluations Remaining
env_action [0.71, 0.29]
97  Evaluations Remaining
env_action [0.27, 0.73]
96  Evaluations Remaining
env_action [0.0, 1.0]
95  Evaluations Remaining
env_action [0.0, 1.0]
94  Evaluations Remaining
env_action [0.01, 0.99]
93  Evaluations Remaining
env_action [0.71, 0.29]
92  Evaluations Remaining
env_action [0.0, 1.0]
91  Evaluations Remaining
env_action [0.0, 1.0]
90  Evaluations Remaining
env_action [0.0, 1.0]
89  Evaluations Remaining
env_action [0.01, 0.99]
88  Evaluations Remaining
env_action [0.71, 0.29]
87  Evaluations Remaining
env_action [0.0, 1.0]
86  Evaluations Remaining
env_action [0.0, 1.0]
85  Evaluations Remaining
env_action [0.46, 0

env_action [0.59, 0.41]
67  Evaluations Remaining
env_action [0.02, 0.98]
66  Evaluations Remaining
env_action [0.0, 1.0]
65  Evaluations Remaining
env_action [0.03, 0.97]
64  Evaluations Remaining
env_action [0.03, 0.97]
63  Evaluations Remaining
env_action [0.59, 0.41]
62  Evaluations Remaining
env_action [0.02, 0.98]
61  Evaluations Remaining
env_action [0.0, 1.0]
60  Evaluations Remaining
env_action [0.03, 0.97]
59  Evaluations Remaining
env_action [0.77, 0.23]
58  Evaluations Remaining
env_action [0.59, 0.41]
57  Evaluations Remaining
env_action [0.02, 0.98]
56  Evaluations Remaining
env_action [0.0, 1.0]
55  Evaluations Remaining
env_action [0.03, 0.97]
54  Evaluations Remaining
env_action [0.77, 0.23]
53  Evaluations Remaining
env_action [0.59, 0.41]
52  Evaluations Remaining
env_action [0.02, 0.98]
51  Evaluations Remaining
env_action [0.0, 1.0]
50  Evaluations Remaining
env_action [0.03, 0.97]
49  Evaluations Remaining
env_action [0.77, 0.23]
48  Evaluations Remaining
env_acti

env_action [0.0, 1.0]
31  Evaluations Remaining
env_action [0.0, 1.0]
30  Evaluations Remaining
env_action [0.97, 0.03]
29  Evaluations Remaining
env_action [0.02, 0.98]
28  Evaluations Remaining
env_action [0.73, 0.27]
27  Evaluations Remaining
env_action [0.0, 1.0]
26  Evaluations Remaining
env_action [0.0, 1.0]
25  Evaluations Remaining
env_action [0.97, 0.03]
24  Evaluations Remaining
env_action [0.86, 0.14]
23  Evaluations Remaining
env_action [0.73, 0.27]
22  Evaluations Remaining
env_action [0.0, 1.0]
21  Evaluations Remaining
env_action [0.0, 1.0]
20  Evaluations Remaining
env_action [0.97, 0.03]
19  Evaluations Remaining
env_action [0.02, 0.98]
18  Evaluations Remaining
env_action [0.73, 0.27]
17  Evaluations Remaining
env_action [0.0, 1.0]
16  Evaluations Remaining
env_action [0.0, 1.0]
15  Evaluations Remaining
env_action [0.97, 0.03]
14  Evaluations Remaining
env_action [0.86, 0.14]
13  Evaluations Remaining
env_action [0.73, 0.27]
12  Evaluations Remaining
env_action [0.0,

env_action [0.0, 1.0]
104  Evaluations Remaining
env_action [0.0, 1.0]
103  Evaluations Remaining
env_action [0.0, 1.0]
102  Evaluations Remaining
env_action [0.0, 1.0]
101  Evaluations Remaining
env_action [0.0, 1.0]
100  Evaluations Remaining
env_action [0.0, 1.0]
99  Evaluations Remaining
env_action [0.0, 1.0]
98  Evaluations Remaining
env_action [0.0, 1.0]
97  Evaluations Remaining
env_action [0.11, 0.89]
96  Evaluations Remaining
env_action [0.0, 1.0]
95  Evaluations Remaining
env_action [0.0, 1.0]
94  Evaluations Remaining
env_action [0.01, 0.99]
93  Evaluations Remaining
env_action [0.0, 1.0]
92  Evaluations Remaining
env_action [0.0, 1.0]
91  Evaluations Remaining
env_action [0.0, 1.0]
90  Evaluations Remaining
env_action [0.01, 0.99]
89  Evaluations Remaining
env_action [0.01, 0.99]
88  Evaluations Remaining
env_action [0.0, 1.0]
87  Evaluations Remaining
env_action [0.01, 0.99]
86  Evaluations Remaining
env_action [0.22, 0.78]
85  Evaluations Remaining
env_action [0.01, 0.99]

<netsapi.challenge.EvaluateChallengeSubmission at 0x117359b70>