In [1]:
from grid import Grid
import numpy as np
from agent import Agent
import time
import pickle
from matplotlib import pyplot as plt

In [2]:
def create_test_grids(amount=1000, size=5, path='test_grids.pickle'):
    """Generate random test grids and pickle them to disk as one list.

    Args:
        amount: Number of grids to generate.
        size: Passed as the first positional argument to ``Grid`` for every
            generated grid.
        path: Destination file. Defaults to ``'test_grids.pickle'`` so existing
            calls keep writing to the same location. The file is opened in
            ``'wb'`` mode, so any existing content is overwritten.
    """
    grids = [Grid(size, random=True) for _ in range(amount)]
    with open(path, 'wb') as fil:
        pickle.dump(grids, fil)


In [3]:
# Uncomment to regenerate the test set (this overwrites the pickle file on disk):
#create_test_grids(1000)


In [4]:
def load_test_grids(path='test_grids.pickle'):
    """Load the pickled test grids from disk.

    Args:
        path: Pickle file to read. Defaults to ``'test_grids.pickle'``, the
            default output file of ``create_test_grids``.

    Returns:
        The unpickled object stored in ``path``.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserialising;
    # only point this at files you created yourself or otherwise trust.
    with open(path, 'rb') as fil:
        return pickle.load(fil)

In [5]:
grids = load_test_grids()

In [6]:
def test_MC_first_visit(testgrids, iterations=100, model_based=True, nn_init=False):
    reward_dist = {}
    rewards = []
    for testgrid in testgrids:
        a = Agent()
        if model_based:
            Q, policy = a.mc_first_visit_control(testgrid.copy(), iterations, nn_init=nn_init) # Q value key is (self.agent_pos,self.train.pos,list(self.other_agents.positions)[0])
            grids, action_values, reward = a.run_final_policy(testgrid.copy(), Q, nn_init=nn_init)
        else:
            reward = a.run_model_free_policy(testgrid.copy())
        if reward not in reward_dist:
            reward_dist[reward] = 1
        else:
            reward_dist[reward] += 1
        rewards.append(reward)
        
    return np.mean(rewards), reward_dist

In [None]:
# Sweep the MC iteration budget with nn_init=True and, for every budget,
# record the mean reward plus how many test grids fall into each reward bucket.
iterations = [0, 1, 3, 5, 10, 30, 100]  # can be extended with 300, 1000
scores = []

agent_deaths, other_deaths, switch_uses, pushes, nothing = [], [], [], [], []

for it in iterations:
    score, reward_dist = test_MC_first_visit(grids, it, nn_init=True)
    print(it, score, reward_dist)

    # Per-budget tallies. Only the exact values -0.5, -0.2 and 0 are counted
    # below the -2 threshold; other totals (e.g. -0.4, -0.7) fall in no bucket.
    n_agent = n_other = n_switch = n_push = n_nothing = 0
    for reward_value, count in reward_dist.items():
        if reward_value <= -5:
            n_agent += count
        elif reward_value <= -2:
            n_other += count
        elif reward_value == -0.5:
            n_push += count
        elif reward_value == -0.2:
            n_switch += count
        elif reward_value == 0:
            n_nothing += count

    agent_deaths.append(n_agent)
    other_deaths.append(n_other)
    switch_uses.append(n_switch)
    pushes.append(n_push)
    nothing.append(n_nothing)

    scores.append(score)
    

1 -0.3364 {0: 732, -0.5: 20, -2: 134, -2.2: 7, -0.2: 76, -2.4: 2, -0.4: 13, -0.6000000000000001: 6, -0.8999999999999999: 1, -5.6000000000000005: 1, -1.0: 7, -0.7: 1}
3 -0.2926 {0: 769, -0.5: 16, -2: 121, -5: 2, -0.2: 72, -0.4: 9, -0.6000000000000001: 5, -2.2: 1, -2.4000000000000004: 2, -1.2: 1, -0.7: 1, -2.7: 1}
5 -0.27 {0: 789, -2: 116, -5.2: 2, -0.2: 69, -0.5: 18, -0.4: 3, -2.2: 1, -0.7: 2}


In [None]:
# Mean test reward from the sweep above, plotted against the MC iteration budget.
# (The original cell issued the same plt.plot call twice; once is enough.)
plt.plot(iterations, scores)
plt.xlabel("MC-first visit iterations")
plt.ylabel("Average reward on test scenarios")
# NOTE: a log axis has no position for x = 0, so the 0-iteration entry of
# `iterations` cannot be shown at a meaningful place on this plot.
plt.xscale("log")

In [None]:
# Same iteration sweep as before, but with nn_init=False; only the mean reward
# per budget is kept.
mb_scores = []

for n_iterations in iterations:
    mean_reward, distribution = test_MC_first_visit(grids, n_iterations, nn_init=False)
    print(n_iterations, mean_reward, distribution)
    mb_scores.append(mean_reward)

In [None]:
# Overlay both sweeps on one log-scaled axis for comparison.
for series, series_label in (
    (mb_scores, "Original MC-first visit"),
    (scores, "MC-first visit with NN-initialization"),
):
    plt.plot(iterations, series, label=series_label)

plt.xscale("log")
plt.xlabel("MC-first visit iterations")
plt.ylabel("Average reward on test scenarios")
plt.legend()

In [None]:
# Model-based results, no NN initialization (nn_init=False)
# iterations, mean reward
# 1 -1.0611
# 3 -0.9010999999999999
# 10 -0.44580000000000003
# 30 -0.2856
# 100 -0.2311
# 300 -0.21
# 1000 -0.2019

# Model-based results, with NN initialization (nn_init=True)
# iterations, mean reward
# 1 -0.3517
# 3 -0.2993
# 5 -0.27679999999999993
# 10 -0.2627
# 30 -0.2398
# 100 -0.2105

In [None]:
# Model-free evaluation: with model_based=False only run_model_free_policy is
# called, so the iteration count and nn_init play no role here.
model_free_result = test_MC_first_visit(grids, model_based=False)
print(model_free_result)

In [None]:
# Model-based evaluation with zero MC iterations and nn_init=True.
nn_init_only_result = test_MC_first_visit(grids, 0, nn_init=True)
print(nn_init_only_result)