In [1]:
from grid import Grid
import numpy as np
from agent import Agent
import time
import pickle
from matplotlib import pyplot as plt

In [2]:
def create_test_grids(amount=1000, size=5, path='test_grids.pickle'):
    """Generate random grids and pickle them to disk as a single list.

    Parameters
    ----------
    amount : int
        Number of grids to create.
    size : int
        Passed as the first positional argument to ``Grid``.
    path : str
        Destination file. Defaults to ``'test_grids.pickle'``, the file the
        notebook's loader reads, so existing calls behave as before.

    Notes
    -----
    The file at ``path`` is overwritten without confirmation.
    """
    grids = [Grid(size, random=True) for _ in range(amount)]
    with open(path, 'wb') as fil:
        pickle.dump(grids, fil)


In [3]:
# Left disabled on purpose: running this regenerates (and overwrites) test_grids.pickle.
#create_test_grids(1000)


In [4]:
def load_test_grids(path='test_grids.pickle'):
    """Load and return the pickled list of test grids.

    Parameters
    ----------
    path : str
        Pickle file to read. Defaults to ``'test_grids.pickle'`` so existing
        zero-argument calls behave as before.

    Returns
    -------
    The object stored in the pickle (a list of grids when the file was
    produced by ``create_test_grids``).
    """
    # SECURITY: unpickling executes arbitrary code embedded in the file.
    # Only load pickle files that were generated locally / come from a trusted source.
    with open(path, 'rb') as fil:
        return pickle.load(fil)

In [5]:
# Load the pickled test grids once; the evaluation cells below pass this list
# to test_MC_first_visit.
grids = load_test_grids()

In [6]:
def test_MC_first_visit(testgrids, iterations=100, model_based=True, nn_init=False):
    reward_dist = {}
    rewards = []
    for testgrid in testgrids:
        a = Agent()
        if model_based:
            Q, policy = a.mc_first_visit_control(testgrid.copy(), iterations, nn_init=nn_init) # Q value key is (self.agent_pos,self.train.pos,list(self.other_agents.positions)[0])
            grids, action_values, reward = a.run_final_policy(testgrid.copy(), Q, nn_init=nn_init)
        else:
            reward = a.run_model_free_policy(testgrid.copy())
        if reward not in reward_dist:
            reward_dist[reward] = 1
        else:
            reward_dist[reward] += 1
        rewards.append(reward)
        
    return np.mean(rewards), reward_dist

In [7]:
# Evaluate MC first-visit control without NN initialisation for several
# iteration budgets, and bucket the per-grid rewards into outcome categories.
iterations = [0, 1, 3, 10, 30, 100]#, 300, 1000]
mb_scores = []

# Each list gets one entry per element of `iterations`: the number of test
# grids whose final reward fell into that category.
# NOTE(review): the category meanings are taken from the variable names; the
# reward values themselves are defined in the Grid/Agent code -- confirm there.
mb_agent_deaths = []
mb_other_deaths = []
mb_switch_uses = []
mb_pushes = []
mb_nothing = []
mb_other = []

for it in iterations:
    score, reward_dist = test_MC_first_visit(grids, it, nn_init=False)
    print(it, score, reward_dist)
    mb_agent_deaths.append(0)
    mb_other_deaths.append(0)
    mb_switch_uses.append(0)
    mb_pushes.append(0)
    mb_nothing.append(0)
    mb_other.append(0)
    for key, count in reward_dist.items():
        # Rewards are float sums, so strip rounding noise before comparing
        # against the exact category values.
        reward = round(key, 9)
        if reward <= -5:
            mb_agent_deaths[-1] += count
        elif reward <= -2:
            mb_other_deaths[-1] += count
        elif reward == -0.5:
            mb_pushes[-1] += count
        elif reward == -0.2:
            mb_switch_uses[-1] += count
        elif reward == 0:
            mb_nothing[-1] += count
        else:
            mb_other[-1] += count

    mb_scores.append(score)

0 -0.933 {0: 718, -5: 123, -2: 159}
1 -0.8985 {0: 718, -5: 114, -2: 156, -0.2: 4, -2.2: 1, -0.5: 5, -5.5: 2}
3 -0.7858999999999999 {0: 733, -5: 91, -2: 151, -5.2: 1, -0.5: 9, -0.2: 10, -7.5: 1, -5.5: 1, -2.2: 1, -1.0: 2}
10 -0.43439999999999995 {0: 787, -5.5: 1, -2: 138, -0.2: 34, -5: 24, -5.2: 3, -0.5: 11, -2.5: 2}
30 -0.27640000000000003 {0: 817, -0.2: 57, -2: 115, -5: 5, -0.5: 5, -7.5: 1}
100 -0.23160000000000003 {0: 814, -0.2: 73, -2: 107, -0.5: 6}


In [None]:
# Same evaluation as the previous experiment cell, but with nn_init=True.
# Relies on `iterations` and `grids` defined in earlier cells.
scores = []

# Each list gets one entry per element of `iterations`: the number of test
# grids whose final reward fell into that category.
# NOTE(review): the category meanings are taken from the variable names; the
# reward values themselves are defined in the Grid/Agent code -- confirm there.
agent_deaths = []
other_deaths = []
switch_uses = []
pushes = []
nothing = []
other = []

for it in iterations:
    score, reward_dist = test_MC_first_visit(grids, it, nn_init=True)
    print(it, score, reward_dist)
    agent_deaths.append(0)
    other_deaths.append(0)
    switch_uses.append(0)
    pushes.append(0)
    nothing.append(0)
    other.append(0)
    for key, count in reward_dist.items():
        # Rewards are float sums (keys such as -0.6000000000000001 occur), so
        # strip rounding noise before comparing against exact category values.
        reward = round(key, 9)
        if reward <= -5:
            agent_deaths[-1] += count
        elif reward <= -2:
            other_deaths[-1] += count
        elif reward == -0.5:
            pushes[-1] += count
        elif reward == -0.2:
            switch_uses[-1] += count
        elif reward == 0:
            nothing[-1] += count
        else:
            other[-1] += count

    scores.append(score)
    

0 -0.30629999999999996 {0: 778, -2: 127, -5: 2, -0.2: 59, -0.4: 12, -2.2: 8, -0.8: 2, -0.5: 8, -0.7: 1, -0.6000000000000001: 3}
1 -0.2653 {0: 804, -2: 118, -0.2: 58, -2.2: 4, -0.8: 2, -0.5: 7, -0.4: 5, -1.2: 1, -0.6000000000000001: 1}
3 -0.2562 {0: 806, -0.2: 62, -2: 113, -2.4000000000000004: 1, -0.4: 6, -2.4: 1, -2.2: 3, -0.5: 8}
10 -0.22690000000000002 {0: 820, -0.2: 66, -2: 101, -0.4: 1, -2.2: 3, -0.5: 8, -0.7: 1}
30 -0.20650000000000002 {0: 828, -0.2: 69, -2: 94, -0.5: 8, -0.7: 1}


In [None]:
# Outcome counts of the NN-initialised runs as a function of the iteration budget.
# Each curve is labelled so the figure can be read on its own.
plt.plot(iterations, agent_deaths, label="Agent dies")
plt.plot(iterations, other_deaths, label="Other person dies")
plt.plot(iterations, switch_uses, label="Switch used")
plt.plot(iterations, pushes, label="Agent pushes other")
plt.plot(iterations, nothing, label="Nothing")
plt.xlabel("MC-first visit iterations")
plt.ylabel("Number of test scenarios")
plt.legend()
# NOTE(review): `iterations` starts at 0, which has no position on a log axis,
# so the 0-iteration point will not be visible in this figure.
plt.xscale("log")

In [None]:
# Outcome breakdown for the NN-initialised run at the first entry of
# `iterations` (index 0).
labels = [
    "Agent dies",
    "Other person dies",
    "Switch used",
    "Agent pushes other",
    "Nothing",
    "Other",
]
events = [
    counts[0]
    for counts in (agent_deaths, other_deaths, switch_uses, pushes, nothing, other)
]
plt.pie(events, labels=labels)
#plt.bar([i for i in range(len(events))], events)

In [None]:
# Outcome breakdown for the run without NN initialisation at the last entry
# of `iterations` (index -1).
labels = [
    "Agent dies",
    "Other person dies",
    "Switch used",
    "Agent pushes other",
    "Nothing",
    "Other",
]
events = [
    counts[-1]
    for counts in (
        mb_agent_deaths,
        mb_other_deaths,
        mb_switch_uses,
        mb_pushes,
        mb_nothing,
        mb_other,
    )
]
plt.pie(events, labels=labels)

In [None]:
# Mean test reward versus iteration budget, one curve per variant.
score_curves = (
    (mb_scores, "Original MC-first visit"),
    (scores, "MC-first visit with NN-initialization"),
)
for curve, curve_label in score_curves:
    plt.plot(iterations, curve, label=curve_label)

plt.legend()
plt.xlabel("MC-first visit iterations")
plt.ylabel("Average reward on test scenarios")
plt.xscale("log")

In [None]:
# Agent-death counts at index 0 and index 5 of `iterations`, alternating
# between the plain run and the NN-initialised run.
events = [
    counts[idx]
    for idx in (0, 5)
    for counts in (mb_agent_deaths, agent_deaths)
]
bar_names = ['Do nothing', 'NN', 'MC 100 iter', 'MC-NN 100 iter']
plt.bar(list(range(len(events))), events, tick_label=bar_names)

plt.ylabel('Agent deaths/1000')

In [None]:
# Other-person-death counts at index 0 and index 5 of `iterations`,
# alternating between the plain run and the NN-initialised run.
events = [
    counts[idx]
    for idx in (0, 5)
    for counts in (mb_other_deaths, other_deaths)
]
bar_names = ['Do nothing', 'NN', 'MC 100 iter', 'MC-NN 100 iter']
plt.bar(list(range(len(events))), events, tick_label=bar_names)

plt.ylabel('Other deaths/1000')

In [None]:
# Switch-use counts at index 0 and index 5 of `iterations`, alternating
# between the plain run and the NN-initialised run.
events = [
    counts[idx]
    for idx in (0, 5)
    for counts in (mb_switch_uses, switch_uses)
]
bar_names = ['Do nothing', 'NN', 'MC 100 iter', 'MC-NN 100 iter']
plt.bar(list(range(len(events))), events, tick_label=bar_names)

plt.ylabel('Switch used and no one dies/1000')

In [None]:
# Push counts at index 0 and index 5 of `iterations`, alternating between
# the plain run and the NN-initialised run.
events = [
    counts[idx]
    for idx in (0, 5)
    for counts in (mb_pushes, pushes)
]
bar_names = ['Do nothing', 'NN', 'MC 100 iter', 'MC-NN 100 iter']
plt.bar(list(range(len(events))), events, tick_label=bar_names)

plt.ylabel('Push other used and no one dies/1000')

In [None]:
# "Other" outcome counts at index 0 and index 5 of `iterations`, alternating
# between the plain run and the NN-initialised run.
events = [
    counts[idx]
    for idx in (0, 5)
    for counts in (mb_other, other)
]
bar_names = ['Do nothing', 'NN', 'MC 100 iter', 'MC-NN 100 iter']
plt.bar(list(range(len(events))), events, tick_label=bar_names)

plt.ylabel('Other behavior/1000')

In [None]:
# Model-based results, no init (recorded from a separate run; the numbers
# differ from the cell outputs above)
# it, score
# 1 -1.0611
# 3 -0.9010999999999999
# 10 -0.44580000000000003
# 30 -0.2856
# 100 -0.2311
# 300 -0.21
# 1000 -0.2019

# Model-based results, with init
# it, score
# 1 -0.3517
# 3 -0.2993
# 5 -0.27679999999999993
# 10 -0.2627
# 30 -0.2398
# 100 -0.2105