In [1]:
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
# import hiive_mdptoolbox.example
# import hiive_mdptoolbox
import gym
import numpy as np
import sys
import os
from numpy.random import choice
import pandas as pd
import seaborn as sns
# Seed NumPy's global random generator so the stochastic cells below repeat across runs.
# NOTE(review): the name `id` shadows Python's built-in `id()` for the rest of the notebook.
id = 903657078
np.random.seed(id)

# Forest Management (20 States)

In [2]:
# Build the 20-state forest-management example; P (transitions) and R (rewards) are the
# globals every later cell reads until they are reassigned.
# NOTE(review): r1/r2/p are presumably the wait reward, cut reward and wildfire probability —
# confirm against the mdptoolbox `forest` documentation.
P, R = forest(S=20, r1=10, r2=6, p=0.1)

In [3]:
def running_mean(x, N):
    """Return the simple moving average of ``x`` over a window of ``N`` samples.

    A cumulative sum is used so that each window total is a single subtraction.

    Args:
        x: 1-D sequence or array of numbers.
        N: Window length; must be a positive integer.

    Returns:
        NumPy array whose element ``i`` is the mean of ``x[i:i + N]``. The
        result has ``len(x) - N + 1`` entries, or is empty when ``N > len(x)``.

    Raises:
        ValueError: If ``N`` is smaller than 1.
    """
    if N < 1:
        # A negative window would slice from the wrong end of the cumulative
        # sum and return meaningless numbers instead of failing.
        raise ValueError("N must be a positive integer, got {!r}".format(N))
    # The leading 0 makes cumsum[i + N] - cumsum[i] equal to sum(x[i:i + N]).
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / float(N)

In [4]:
def test_policy(P, R, policy, test_count=1000, gamma=0.99):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
            while True:
                # take step
                action = policy[state]
                # get next step using P
                probs = P[action][state]
                candidates = list(range(len(P[action][state])))
                next_state =  choice(candidates, 1, p=probs)[0]
                # get the reward
                reward = R[state][action] * disc_rate
                episode_reward += reward
                # when go back to 0 ended
                disc_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        total_reward += state_reward
    return total_reward / total_episode

In [5]:
def trainVI(P, R, discount=0.9, epsilon=(1e-9,)):
    """Run value iteration once per stopping threshold and tabulate the results.

    Args:
        P: Transition array, passed unchanged to ``ValueIteration``.
        R: Reward array, passed unchanged to ``ValueIteration``.
        discount: Discount factor handed to the solver as ``gamma``.
        epsilon: Iterable of stopping thresholds; the solver runs once for
            each entry, in order.

    Returns:
        DataFrame with one row per threshold and the columns ``Epsilon``,
        ``Policy``, ``Iteration``, ``Time``, ``Reward`` and ``Value Function``.
        The frame is empty (columns only) when ``epsilon`` is empty.
    """
    columns = ["Epsilon", "Policy", "Iteration",
               "Time", "Reward", "Value Function"]
    rows = []
    for eps in epsilon:
        vi = ValueIteration(P, R, gamma=discount, epsilon=eps, max_iter=int(1e15))
        vi.run()
        # NOTE(review): the policy is scored with test_policy's own default
        # gamma rather than `discount` — confirm that is intended.
        reward = test_policy(P, R, vi.policy)
        rows.append([float(eps), vi.policy, vi.iter, vi.time, reward, vi.V])
    # Assemble the frame once from the collected rows instead of growing it
    # one row at a time.
    return pd.DataFrame(rows, columns=columns)

In [6]:
# Sweep the value-iteration stopping threshold from loose to tight and show the results.
# NOTE(review): no `discount` is passed, so trainVI's default of 0.9 applies, while the
# policy-iteration and Q-learning cells use 0.99 — confirm the mismatch is intended.
vi_df = trainVI(P, R, epsilon=[1e-2, 1e-4, 1e-6, 1e-8, 1e-10])
vi_df

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.01,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44,0.001995,4.621514,"(4.429226083429906, 4.981660312099133, 4.98166..."
1,0.0001,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",66,0.0,4.606154,"(4.4706146525683454, 5.023100336527209, 5.0231..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",87,0.003989,4.77517,"(4.474643139169861, 5.027129333047953, 5.02712..."
3,1e-08,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",109,0.01562,4.93391,"(4.475089377376456, 5.027575565280265, 5.02757..."
4,1e-10,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",131,0.0,4.832684,"(4.475133321365347, 5.027619509211218, 5.02761..."


In [14]:
# Policy-iteration baseline: solve the MDP, then score the resulting policy by simulation.
pi = PolicyIteration(P, R, gamma=0.99, max_iter=1e6)
pi.run()
pi_pol, pi_iter, pi_time = pi.policy, pi.iter, pi.time
pi_reward = test_policy(P, R, pi_pol)
# Summary shown as the cell output: (iterations, solve time, simulated reward).
(pi_iter, pi_time, pi_reward)

(19, 0.007977008819580078, 4.585287640372227)

In [15]:
# Show the action policy iteration selected for each state.
pi_pol

(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

## Q-Learning

In [17]:
def trainQ(P, R, discount=0.99, alpha_dec=(0.99,), alpha_min=(0.001,),
            epsilon=(1.0,), epsilon_decay=(0.99,), n_iter=(10000,)):
    """Grid-search Q-learning hyperparameters and tabulate every run.

    One ``QLearning`` model is trained for each combination of the supplied
    values. Combinations are visited with ``n_iter`` varying slowest, then
    ``epsilon``, ``epsilon_decay``, ``alpha_dec`` and finally ``alpha_min``.
    After each run its 1-based index and simulated reward are printed.

    Args:
        P: Transition array, passed unchanged to ``QLearning``.
        R: Reward array, passed unchanged to ``QLearning``.
        discount: Discount factor passed positionally to ``QLearning``.
        alpha_dec: Iterable of ``alpha_decay`` values to try.
        alpha_min: Iterable of ``alpha_min`` values to try.
        epsilon: Iterable of initial ``epsilon`` values to try.
        epsilon_decay: Iterable of ``epsilon_decay`` values to try.
        n_iter: Iterable of ``n_iter`` (training iteration) values to try.

    Returns:
        DataFrame with one row per combination and the columns ``Iterations``,
        ``Alpha Decay``, ``Alpha Min``, ``Epsilon``, ``Epsilon Decay``,
        ``Reward``, ``Time``, ``Policy``, ``Value Function`` and
        ``Training Rewards``. The frame is empty (columns only) when any of
        the iterables is empty.
    """
    # Local import keeps this cell self-contained.
    from itertools import product

    columns = ["Iterations", "Alpha Decay", "Alpha Min",
               "Epsilon", "Epsilon Decay", "Reward",
               "Time", "Policy", "Value Function",
               "Training Rewards"]
    rows = []
    # product() varies its last argument fastest, which reproduces the
    # original nested-loop visiting order.
    grid = product(n_iter, epsilon, epsilon_decay, alpha_dec, alpha_min)
    for count, (i, eps, eps_dec, a_dec, a_min) in enumerate(grid, start=1):
        q = QLearning(P, R, discount, alpha_decay=a_dec,
                      alpha_min=a_min, epsilon=eps,
                      epsilon_decay=eps_dec, n_iter=i)
        q.run()
        reward = test_policy(P, R, q.policy)
        print("{}: {}".format(count, reward))
        # Per-step rewards recorded by the learner during training.
        rews = [s['Reward'] for s in q.run_stats]
        rows.append([i, a_dec, a_min, eps, eps_dec, reward,
                     q.time, q.policy, q.V, rews])
    # Assemble the frame once from the collected rows instead of growing it
    # one row at a time.
    return pd.DataFrame(rows, columns=columns)

In [18]:
# Q-learning hyperparameter grid: two values per setting, so trainQ performs 2**5 = 32 runs.
# NOTE(review): an initial epsilon of 10.0 is above 1 — if epsilon is used as an exploration
# probability, confirm this value is intended.
alpha_decs = [0.99, 0.999]
alpha_mins =[0.001, 0.0001]
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
iters = [10000, 100000]
q_df = trainQ(P, R, discount=0.99, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=iters)

1: 4.85947869861722
2: 4.95930414798535
3: 5.289835308988045
4: 4.951778889565525
5: 5.28753974196389
6: 5.083686712792791
7: 0.8
8: 4.977528698846463
9: 5.0406228995761495
10: 4.791964669911572
11: 5.1148234407063295
12: 4.800260206026284
13: 5.03218171134104
14: 1.05
15: 4.92633925741033
16: 0.85
17: 5.258542483329936
18: 0.85
19: 5.187188222680715
20: 4.642342528519129
21: 5.432450912511606
22: 5.244487776574169
23: 1.1
24: 0.9
25: 5.464707224145992
26: 5.240005806155211
27: 0.8
28: 0.55
29: 5.389474374238355
30: 5.374603779448078
31: 5.260087055369938
32: 5.074181383233422


In [19]:
# Check, row by row, whether any value-iteration policy equals the policy-iteration policy.
# NOTE(review): vi_df was produced with trainVI's default discount (0.9) while `pi` used
# gamma=0.99, so a mismatch may come from the discount rather than the algorithm — confirm.
# NOTE(review): this relies on pandas comparing each stored tuple against `pi_pol` as one
# value — verify on the pandas version in use.
vi_df.Policy == pi_pol

0    False
1    False
2    False
3    False
4    False
Name: Policy, dtype: bool

In [20]:
# Re-score the policy stored in row 18 of the Q-learning results. The simulation is random,
# so the value differs from the Reward recorded in the table.
# NOTE(review): the row index 18 is hard-coded — confirm it is the run meant to be inspected.
test_policy(P,R,q_df.Policy[18])

5.151627456307554

In [21]:
# Display the Q-learning results table (one row per hyperparameter combination).
q_df

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,10000,0.99,0.001,10.0,0.99,4.859479,0.829768,"(0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, ...","(3.184789509095638, 3.6727802278262467, 0.8302...","[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
1,10000,0.99,0.0001,10.0,0.99,4.959304,0.343905,"(0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","(1.425901840014029, 1.9145061289496745, 0.4846...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
2,10000,0.999,0.001,10.0,0.99,5.289835,0.345185,"(0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, ...","(17.012412268502036, 17.511714184231508, 9.972...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
3,10000,0.999,0.0001,10.0,0.99,4.951779,0.334516,"(0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...","(16.524594333145618, 17.025760014908787, 10.57...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, ..."
4,10000,0.99,0.001,10.0,0.999,5.28754,0.336119,"(0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, ...","(2.4473562061258654, 2.936113802034315, 0.9596...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
5,10000,0.99,0.0001,10.0,0.999,5.083687,0.344957,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, ...","(0.8381610689149007, 1.2744792841052426, 0.528...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
6,10000,0.999,0.001,10.0,0.999,0.8,0.333167,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","(13.624956604953876, 14.123461846457706, 9.704...","[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
7,10000,0.999,0.0001,10.0,0.999,4.977529,0.333415,"(0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, ...","(13.386261621873814, 13.886133320812043, 9.891...","[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, ..."
8,10000,0.99,0.001,1.0,0.99,5.040623,0.331758,"(0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, ...","(3.312796951640167, 3.8014961710440818, 0.9092...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
9,10000,0.99,0.0001,1.0,0.99,4.791965,0.33608,"(0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, ...","(1.4978451097596064, 1.9823078015992714, 0.564...","[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ..."


In [22]:
# Average the numeric result columns per training-iteration budget.
# The columns are selected and cast explicitly so the tuple/list columns (Policy,
# Value Function, Training Rewards) never reach mean(); recent pandas releases raise
# on non-numeric columns instead of silently dropping them.
q_df[["Alpha Decay", "Alpha Min", "Epsilon", "Epsilon Decay", "Reward", "Time"]].astype(float).groupby(q_df["Iterations"]).mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,0.9945,0.00055,5.5,0.9945,4.238459,0.367892
100000,0.9945,0.00055,5.5,0.9945,3.860504,2.817985


In [23]:
# Average the numeric result columns per epsilon-decay setting.
# The columns are selected and cast explicitly so the tuple/list columns (Policy,
# Value Function, Training Rewards) never reach mean(); recent pandas releases raise
# on non-numeric columns instead of silently dropping them.
q_df[["Alpha Decay", "Alpha Min", "Epsilon", "Reward", "Time"]].astype(float).groupby(q_df["Epsilon Decay"]).mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,4.237553,1.597775
0.999,0.9945,0.00055,5.5,3.86141,1.588102


# Forest Management (400 States)

In [33]:
# Rebuild the forest-management example with 400 states. This overwrites the 20-state P and R,
# so every cell from here on (and any earlier cell re-run later) uses the larger problem.
P, R = forest(S=400, r1=100, r2=50, p=0.1)

In [34]:
# Repeat the value-iteration threshold sweep on the 400-state problem (replaces the earlier vi_df).
# NOTE(review): no `discount` is passed, so trainVI's default of 0.9 applies, while the
# policy-iteration and Q-learning cells use 0.99 — confirm the mismatch is intended.
vi_df = trainVI(P, R, epsilon=[1e-2, 1e-4, 1e-6, 1e-8, 1e-10])
vi_df

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.01,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",55,0.009967,3.232216,"(4.460720290173723, 5.013211594807497, 5.01321..."
1,0.0001,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",76,0.015588,3.251499,"(4.473560831234312, 5.026046957818786, 5.02604..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",98,0.015622,3.272644,"(4.47498279201032, 5.027468979261533, 5.027468..."
3,1e-08,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",120,0.015593,3.147657,"(4.475122825121185, 5.027609012960728, 5.02760..."
4,1e-10,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",142,0.015598,3.228119,"(4.475136615199219, 5.027622803044467, 5.02762..."


In [35]:
# Policy-iteration baseline on the current P and R: solve, then score the policy by simulation.
pi = PolicyIteration(P, R, gamma=0.99, max_iter=1e6)
pi.run()
pi_pol, pi_iter, pi_time = pi.policy, pi.iter, pi.time
pi_reward = test_policy(P, R, pi_pol)
# Summary shown as the cell output: (iterations, solve time, simulated reward).
(pi_iter, pi_time, pi_reward)

(46, 0.3655359745025635, 3.166851115356492)

In [36]:
# Show the action policy iteration selected for each state.
# NOTE(review): this prints the whole tuple, which is long for the 400-state problem —
# consider displaying a slice or a summary instead.
pi_pol

(0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [37]:
# Check, row by row, whether any value-iteration policy equals the policy-iteration policy.
# NOTE(review): vi_df was produced with trainVI's default discount (0.9) while `pi` used
# gamma=0.99, so a mismatch may come from the discount rather than the algorithm — confirm.
# NOTE(review): this relies on pandas comparing each stored tuple against `pi_pol` as one
# value — verify on the pandas version in use.
vi_df.Policy == pi_pol

0    False
1    False
2    False
3    False
4    False
Name: Policy, dtype: bool

In [38]:
# Same Q-learning hyperparameter grid as before, now on the 400-state problem (32 runs).
# This replaces the earlier q_df.
# NOTE(review): an initial epsilon of 10.0 is above 1 — if epsilon is used as an exploration
# probability, confirm this value is intended.
alpha_decs = [0.99, 0.999]
alpha_mins =[0.001, 0.0001]
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
iters = [10000, 100000]
q_df = trainQ(P, R, discount=0.99, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=iters)

1: 2.329412465998682
2: 2.472899893099131
3: 2.3650972952278746
4: 2.329424924443961
5: 2.4618765214081497
6: 2.3348849472588884
7: 2.2707416920804704
8: 2.4156065155452735
9: 2.308491846985047
10: 2.3735151264933143
11: 2.25255523826646
12: 2.3646896710912837
13: 2.4309526553372436
14: 2.3705329569256675
15: 2.3139588215007922
16: 0.2175
17: 2.7118810023752977
18: 2.6067466764703835
19: 2.6071126281118744
20: 2.5800423451136156
21: 2.633841469805221
22: 2.652382974644151
23: 2.6615729475347663
24: 2.737904763680246
25: 2.578021022825698
26: 2.6756616691180226
27: 2.494463183843716
28: 2.6551591334680062
29: 2.57495033455721
30: 2.5826337488301925
31: 2.6238041080306096
32: 2.6370796124339924


In [39]:
# Re-score the policy stored in row 18 of the Q-learning results. The simulation is random,
# so the value differs from the Reward recorded in the table.
# NOTE(review): the row index 18 is hard-coded — confirm it is the run meant to be inspected.
test_policy(P,R,q_df.Policy[18])

2.5280532747656896

In [40]:
# Display the Q-learning results table (one row per hyperparameter combination).
q_df

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,10000,0.99,0.001,10.0,0.99,2.329412,0.398271,"(0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...","(3.05883374035337, 3.547839427128347, 1.001862...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,10000,0.99,0.0001,10.0,0.99,2.4729,0.403708,"(0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","(1.4235714949243279, 1.905956636458472, 0.1049...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
2,10000,0.999,0.001,10.0,0.99,2.365097,0.413354,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","(16.43011952447195, 16.92999642422934, 10.0295...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,10000,0.999,0.0001,10.0,0.99,2.329425,0.395144,"(0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","(16.323294400118513, 16.82346819608345, 9.1173...","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,10000,0.99,0.001,10.0,0.999,2.461877,0.415341,"(0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","(2.348202578376889, 2.836778106760064, 0.78301...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,10000,0.99,0.0001,10.0,0.999,2.334885,0.395,"(0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...","(0.8806607523388182, 1.3304334063384415, 0.632...","[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
6,10000,0.999,0.001,10.0,0.999,2.270742,0.411559,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(13.276156763304684, 13.774072077074802, 9.019...","[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,10000,0.999,0.0001,10.0,0.999,2.415607,0.395689,"(0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...","(13.497382739742697, 13.996154558020073, 9.908...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,10000,0.99,0.001,1.0,0.99,2.308492,0.416263,"(0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ...","(2.6831392589258845, 3.170899585182195, 0.1672...","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
9,10000,0.99,0.0001,1.0,0.99,2.373515,0.406917,"(0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(1.4009965700814573, 1.8866900037918848, 0.706...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [41]:
# Average the numeric result columns per training-iteration budget.
# The columns are selected and cast explicitly so the tuple/list columns (Policy,
# Value Function, Training Rewards) never reach mean(); recent pandas releases raise
# on non-numeric columns instead of silently dropping them.
q_df[["Alpha Decay", "Alpha Min", "Epsilon", "Epsilon Decay", "Reward", "Time"]].astype(float).groupby(q_df["Iterations"]).mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,0.9945,0.00055,5.5,0.9945,2.225759,0.404313
100000,0.9945,0.00055,5.5,0.9945,2.625829,3.412644


In [42]:
# Average the numeric result columns per epsilon-decay setting.
# The columns are selected and cast explicitly so the tuple/list columns (Policy,
# Value Function, Training Rewards) never reach mean(); recent pandas releases raise
# on non-numeric columns instead of silently dropping them.
q_df[["Alpha Decay", "Alpha Min", "Epsilon", "Reward", "Time"]].astype(float).groupby(q_df["Epsilon Decay"]).mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,2.481573,1.903205
0.999,0.9945,0.00055,5.5,2.370014,1.913752


In [43]:
# Save the current Q-learning results to q_df.csv in the working directory. q_df was
# reassigned by the 400-state run above, so the 20-state results are not what gets written.
q_df.to_csv('q_df.csv')