In [1]:
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
import gym
import numpy as np
from numpy.random import choice
import pandas as pd
import seaborn as sns
# Seed NumPy's global random state; numpy.random.choice (used by test_policy) draws from it.
# NOTE(review): the name `id` shadows Python's built-in id(); a name such as SEED would avoid that.
id = 903657078
np.random.seed(id)

# Forest Management (20 Stands)

In [2]:
# Build the forest example MDP with S=20: P holds the transition probabilities and R the rewards.
# NOTE(review): r1, r2 and p are passed straight to hiive's forest(); see its documentation for their meaning.
P, R = forest(S=20, r1=10, r2=6, p=0.1)

In [3]:
def running_mean(x, N):
    """Return the simple moving average of ``x`` over a sliding window of ``N`` samples.

    Args:
        x: 1-D sequence or array of numbers.
        N: Window length; must be a positive integer.

    Returns:
        numpy.ndarray of floats with ``len(x) - N + 1`` entries, where entry
        ``i`` is the mean of ``x[i:i + N]``. The result is empty when ``N``
        exceeds ``len(x)``.

    Raises:
        ValueError: If ``N`` is smaller than 1 (a negative window would
            otherwise produce a meaningless value instead of an error).
    """
    if N < 1:
        raise ValueError("N must be a positive integer, got {}".format(N))
    # Prefix sums with a leading zero: cumsum[k] == sum(x[:k]), so each window
    # sum is the difference of two prefix sums.
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / float(N)

In [4]:
def test_policy(P, R, policy, test_count=1000, gamma=0.99):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    # initiate in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
            while True:
                # take step
                action = policy[state]
                # get next step using P
                probs = P[action][state]
                candidates = list(range(len(P[action][state])))
                next_state =  choice(candidates, 1, p=probs)[0]
                # get the reward
                reward = R[state][action] * disc_rate
                episode_reward += reward
                # ended when back to 0
                disc_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        total_reward += state_reward
    return total_reward / total_episode

In [5]:
def trainVI(P, R, discount=0.9, epsilon=(1e-9,)):
    """Run value iteration once per convergence threshold and tabulate the results.

    Args:
        P: Transition probabilities handed to ``ValueIteration``.
        R: Rewards handed to ``ValueIteration``.
        discount: Discount factor passed to the solver as ``gamma``.
        epsilon: Iterable of stopping thresholds; one solver run is made for
            each value.

    Returns:
        pandas.DataFrame with one row per threshold (in iteration order) and
        the columns ``Epsilon``, ``Policy``, ``Iteration``, ``Time``,
        ``Reward`` and ``Value Function``.

    Note:
        ``Reward`` comes from ``test_policy`` called with its own default
        ``gamma``, not with ``discount``.
    """
    columns = ["Epsilon", "Policy", "Iteration",
               "Time", "Reward", "Value Function"]
    records = []
    for eps in epsilon:
        vi = ValueIteration(P, R, gamma=discount, epsilon=eps, max_iter=int(1e15))
        vi.run()
        reward = test_policy(P, R, vi.policy)
        records.append({
            "Epsilon": float(eps),
            "Policy": vi.policy,
            "Iteration": vi.iter,
            "Time": vi.time,
            "Reward": reward,
            "Value Function": vi.V,
        })
    # Build the frame once instead of enlarging it one row at a time.
    return pd.DataFrame(records, columns=columns)

In [6]:
# Sweep value iteration over five convergence thresholds and display the summary table.
# NOTE(review): `discount` is not passed, so trainVI uses its default of 0.9, while the
# policy-iteration and Q-learning cells in this notebook use 0.99 — confirm this is intended.
vi_df = trainVI(P, R, epsilon=[1e-2, 1e-4, 1e-6, 1e-8, 1e-10])
vi_df

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.01,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",44,0.001959,4.621514,"(4.429226083429906, 4.981660312099133, 4.98166..."
1,0.0001,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",66,0.002969,4.606154,"(4.4706146525683454, 5.023100336527209, 5.0231..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",87,0.002991,4.77517,"(4.474643139169861, 5.027129333047953, 5.02712..."
3,1e-08,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",109,0.003993,4.93391,"(4.475089377376456, 5.027575565280265, 5.02757..."
4,1e-10,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",131,0.004986,4.832684,"(4.475133321365347, 5.027619509211218, 5.02761..."


In [7]:
# Solve the MDP with policy iteration (gamma=0.99) and evaluate the resulting policy
# by simulation with test_policy.
# NOTE(review): max_iter is given as the float 1e6 — confirm the solver accepts a non-integer.
pi = PolicyIteration(P, R, gamma=0.99, max_iter=1e6)
pi.run()
pi_pol = pi.policy
pi_reward = test_policy(P, R, pi_pol)
pi_iter = pi.iter
pi_time = pi.time
# Cell output: (iterations, solver-reported time, simulated reward).
pi_iter, pi_time, pi_reward

(19, 0.008959770202636719, 4.690314567627048)

In [8]:
# Display the policy found by policy iteration (indexed by state, as test_policy reads it).
pi_pol

(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

## Q-Learning

In [9]:
def trainQ(P, R, discount=0.99, alpha_dec=(0.99,), alpha_min=(0.001,),
            epsilon=(1.0,), epsilon_decay=(0.99,), n_iter=(10000,)):
    """Train Q-learning on every hyper-parameter combination and tabulate the results.

    The grid is the Cartesian product of ``n_iter``, ``epsilon``,
    ``epsilon_decay``, ``alpha_dec`` and ``alpha_min``, visited in that nesting
    order (``alpha_min`` varies fastest). After each run the learned policy is
    evaluated with ``test_policy`` and a ``"<run number>: <reward>"`` line is
    printed.

    Args:
        P: Transition probabilities handed to ``QLearning``.
        R: Rewards handed to ``QLearning``.
        discount: Discount factor passed positionally to ``QLearning``.
        alpha_dec: Iterable of ``alpha_decay`` values.
        alpha_min: Iterable of ``alpha_min`` values.
        epsilon: Iterable of initial ``epsilon`` values.
        epsilon_decay: Iterable of ``epsilon_decay`` values.
        n_iter: Iterable of ``n_iter`` values.

    Returns:
        pandas.DataFrame with one row per combination and the columns
        ``Iterations``, ``Alpha Decay``, ``Alpha Min``, ``Epsilon``,
        ``Epsilon Decay``, ``Reward``, ``Time``, ``Policy``,
        ``Value Function`` and ``Training Rewards`` (the ``'Reward'`` entry of
        every element of the learner's ``run_stats``).
    """
    # Local import: the notebook's import cell does not provide itertools.
    from itertools import product

    columns = ["Iterations", "Alpha Decay", "Alpha Min",
               "Epsilon", "Epsilon Decay", "Reward",
               "Time", "Policy", "Value Function",
               "Training Rewards"]
    records = []

    # product() walks the grid in the same order as the equivalent nested loops.
    grid = product(n_iter, epsilon, epsilon_decay, alpha_dec, alpha_min)
    for count, (i, eps, eps_dec, a_dec, a_min) in enumerate(grid, start=1):
        q = QLearning(P, R, discount, alpha_decay=a_dec,
                      alpha_min=a_min, epsilon=eps,
                      epsilon_decay=eps_dec, n_iter=i)
        q.run()
        reward = test_policy(P, R, q.policy)
        print("{}: {}".format(count, reward))
        rews = [s['Reward'] for s in q.run_stats]
        records.append({
            "Iterations": i,
            "Alpha Decay": a_dec,
            "Alpha Min": a_min,
            "Epsilon": eps,
            "Epsilon Decay": eps_dec,
            "Reward": reward,
            "Time": q.time,
            "Policy": q.policy,
            "Value Function": q.V,
            "Training Rewards": rews,
        })
    # Build the frame once instead of enlarging it one row at a time.
    return pd.DataFrame(records, columns=columns)

In [10]:
# Hyper-parameter grid for Q-learning: trainQ trains every combination of the lists
# below (2 * 2 * 2 * 2 * 2 = 32 runs).
alpha_decs = [0.99, 0.999]
alpha_mins =[0.001, 0.0001]
# NOTE(review): an initial epsilon of 10.0 lies outside [0, 1]; if QLearning treats epsilon as an
# exploration probability this means pure exploration until it has decayed — confirm intended.
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
iters = [10000, 100000]
q_df = trainQ(P, R, discount=0.99, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=iters)

1: 4.963780237317811
2: 4.991795007509998
3: 5.015840848056057
4: 5.029208229083696
5: 0.95
6: 5.314415347864072
7: 5.221246677890615
8: 5.254902271828766
9: 5.271321948203251
10: 5.070904052712279
11: 0.65
12: 5.0316263819272775
13: 5.314375526831382
14: 0.85
15: 5.440745307372738
16: 5.259730136595956
17: 5.301608317747182
18: 5.435976678629387
19: 4.948067337392542
20: 4.860974950884494
21: 5.376923480642296
22: 1.1
23: 5.269575015267764
24: 0.9
25: 1.05
26: 5.113864234875365
27: 0.9
28: 5.053504132664148
29: 5.458674234927214
30: 5.212231036629547
31: 0.85
32: 5.246371027295942


In [11]:
# Check, row by row, whether each value-iteration policy equals the policy-iteration policy.
# NOTE(review): relies on pandas comparing every tuple in the object column against the
# tuple `pi_pol` as a whole — verify on the pandas version in use.
vi_df.Policy == pi_pol

0    False
1    False
2    False
3    False
4    False
Name: Policy, dtype: bool

In [12]:
# Re-evaluate the policy stored under row label 18 of q_df. test_policy re-samples
# transitions on every call, so the value need not match that row's Reward column.
# NOTE(review): 18 is a hard-coded row label — TODO confirm why this run was picked.
test_policy(P,R,q_df.Policy[18])

5.288364688331648

In [13]:
# Display the Q-learning results table (one row per hyper-parameter combination).
q_df

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,10000,0.99,0.001,10.0,0.99,4.96378,0.3371,"(0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...","(3.1321082741041004, 3.620524546353792, 0.9344...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
1,10000,0.99,0.0001,10.0,0.99,4.991795,0.333833,"(0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, ...","(1.103302396304254, 1.59103057580518, 0.433405...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,10000,0.999,0.001,10.0,0.99,5.015841,0.327823,"(0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, ...","(17.060929270394713, 17.562616907279853, 11.40...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
3,10000,0.999,0.0001,10.0,0.99,5.029208,0.336103,"(0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, ...","(16.2936890887697, 16.796962249522853, 10.5712...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
4,10000,0.99,0.001,10.0,0.999,0.95,0.32915,"(0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, ...","(2.473373160412038, 2.9649597064760678, 0.8172...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
5,10000,0.99,0.0001,10.0,0.999,5.314415,0.329151,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, ...","(0.855098646690035, 1.3003282428270482, 0.5674...","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
6,10000,0.999,0.001,10.0,0.999,5.221247,0.335106,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, ...","(13.898788862333323, 14.396575453776382, 10.17...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,10000,0.999,0.0001,10.0,0.999,5.254902,0.333108,"(0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, ...","(12.632697548915532, 13.131541700312402, 8.916...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
8,10000,0.99,0.001,1.0,0.99,5.271322,0.326099,"(0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, ...","(3.157726674825064, 3.646918807946182, 1.03143...","[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, ..."
9,10000,0.99,0.0001,1.0,0.99,5.070904,0.335041,"(0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, ...","(1.2671932964194959, 1.7506439406515846, 0.314...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."


# Forest Management (400 Stands)

In [14]:
# Rebuild the forest example MDP with S=400 and larger r1/r2 values. This rebinds P and R,
# so every cell below operates on the larger problem.
P, R = forest(S=400, r1=100, r2=50, p=0.1)

In [15]:
# Repeat the value-iteration threshold sweep on the 400-state MDP; this overwrites the
# vi_df produced for the 20-state problem.
# NOTE(review): `discount` is again left at trainVI's default of 0.9, whereas the
# policy-iteration and Q-learning cells use 0.99 — confirm this is intended.
vi_df = trainVI(P, R, epsilon=[1e-2, 1e-4, 1e-6, 1e-8, 1e-10])
vi_df

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.01,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",55,0.009367,3.114714,"(4.460720290173723, 5.013211594807497, 5.01321..."
1,0.0001,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",76,0.011968,3.17668,"(4.473560831234312, 5.026046957818786, 5.02604..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",98,0.016539,3.223264,"(4.47498279201032, 5.027468979261533, 5.027468..."
3,1e-08,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",120,0.019092,3.237396,"(4.475122825121185, 5.027609012960728, 5.02760..."
4,1e-10,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",142,0.023616,3.133958,"(4.475136615199219, 5.027622803044467, 5.02762..."


In [16]:
# Solve the 400-state MDP with policy iteration (gamma=0.99) and evaluate the resulting
# policy by simulation; the names from the 20-state section are rebound here.
# NOTE(review): max_iter is given as the float 1e6 — confirm the solver accepts a non-integer.
pi = PolicyIteration(P, R, gamma=0.99, max_iter=1e6)
pi.run()
pi_pol = pi.policy
pi_reward = test_policy(P, R, pi_pol)
pi_iter = pi.iter
pi_time = pi.time
# Cell output: (iterations, solver-reported time, simulated reward).
pi_iter, pi_time, pi_reward

(46, 0.36949801445007324, 3.2166265232634923)

In [17]:
# Display the policy found by policy iteration.
# NOTE(review): this prints one entry per state (a very long output for this MDP);
# a summary such as the count of each action would be easier to read.
pi_pol

(0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [18]:
# Check, row by row, whether each value-iteration policy equals the policy-iteration policy.
# NOTE(review): relies on pandas comparing every tuple in the object column against the
# tuple `pi_pol` as a whole — verify on the pandas version in use.
vi_df.Policy == pi_pol

0    False
1    False
2    False
3    False
4    False
Name: Policy, dtype: bool

In [19]:
# Same Q-learning hyper-parameter grid as for the 20-state problem, now run on the
# 400-state MDP (32 combinations); q_df and the grid variables are rebound.
alpha_decs = [0.99, 0.999]
alpha_mins =[0.001, 0.0001]
# NOTE(review): an initial epsilon of 10.0 lies outside [0, 1]; if QLearning treats epsilon as an
# exploration probability this means pure exploration until it has decayed — confirm intended.
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
iters = [10000, 100000]
q_df = trainQ(P, R, discount=0.99, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=iters)

1: 2.3732298136188716
2: 2.418332991137467
3: 2.263986525137082
4: 2.328984411497443
5: 0.22
6: 2.403161268021506
7: 2.357075757906654
8: 2.448436042642781
9: 2.3391663902844133
10: 2.298714739236609
11: 2.317533036825829
12: 0.2025
13: 2.45012331701929
14: 2.38375445718938
15: 2.4918040808158595
16: 0.2225
17: 2.7464699435643887
18: 2.6189503949160415
19: 2.5387749591361217
20: 2.633366442056226
21: 2.6090982826354976
22: 2.647071866583212
23: 2.756300463539056
24: 2.4884060416510874
25: 2.684812748550465
26: 2.670763657880487
27: 2.5429599331842234
28: 2.6041095385224673
29: 2.5873502226793694
30: 2.527111685618704
31: 2.5190671172621464
32: 2.5089861561510505


In [20]:
# Re-evaluate the policy stored under row label 18 of q_df. test_policy re-samples
# transitions on every call, so the value need not match that row's Reward column.
# NOTE(review): 18 is a hard-coded row label — TODO confirm why this run was picked.
test_policy(P,R,q_df.Policy[18])

2.6369908349132536

In [21]:
# Display the Q-learning results table for the 400-state MDP (one row per combination).
q_df

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,10000,0.99,0.001,10.0,0.99,2.37323,0.413879,"(0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(2.834231347169487, 3.32248512576726, 0.955349...","[1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
1,10000,0.99,0.0001,10.0,0.99,2.418333,0.403912,"(0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(1.5495575209847219, 2.033320863132644, 0.6393...","[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ..."
2,10000,0.999,0.001,10.0,0.99,2.263987,0.420714,"(0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","(16.907953348791143, 17.410540049802375, 9.970...","[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ..."
3,10000,0.999,0.0001,10.0,0.99,2.328984,0.4069,"(0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","(16.22665067998551, 16.727568036336354, 8.6701...","[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
4,10000,0.99,0.001,10.0,0.999,0.22,0.428959,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(2.289265969349061, 2.7755156265839425, 0.7706...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
5,10000,0.99,0.0001,10.0,0.999,2.403161,0.405916,"(0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","(0.7252003348620663, 1.1548614164256386, 0.640...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,10000,0.999,0.001,10.0,0.999,2.357076,0.416878,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","(13.389244570259184, 13.888528589467198, 8.969...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
7,10000,0.999,0.0001,10.0,0.999,2.448436,0.403052,"(0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","(12.053253684170484, 12.549650847392025, 8.606...","[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
8,10000,0.99,0.001,1.0,0.99,2.339166,0.415883,"(0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...","(2.932819165996761, 3.4193641302056026, 0.8765...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
9,10000,0.99,0.0001,1.0,0.99,2.298715,0.406909,"(0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","(1.261896239571919, 1.7405272419379623, 0.5885...","[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, ..."
