In [None]:
def show_state(env, step=0, name="", info="", image=None):
    """Draw one game frame inline so the agent can be watched while it plays.

    Args:
        env: Environment asked for a frame (``render(mode="rgb_array")``)
            when no ``image`` is supplied.
        step: Step counter shown in the figure title.
        name: Label placed at the start of the figure title.
        info: Extra text appended after the step counter in the title.
        image: Frame to draw as-is. When ``None`` the frame comes from ``env``.
    """
    title_template = "{} | Step: {} {}"

    # Always reuse figure number 10 and wipe it, so each call redraws the
    # same figure instead of creating a new one.
    plt.figure(10)
    plt.clf()

    # Only query the environment when the caller did not hand over a frame.
    # The first element of the render result is used -- presumably render
    # returns a sequence whose first entry is the picture; TODO confirm.
    frame = env.render(mode="rgb_array")[0] if image is None else image

    plt.imshow(frame)
    plt.title(title_template.format(name, step, info))
    plt.axis('off')

    # Replace the previous cell output with the refreshed figure.
    display.clear_output(wait=True)
    display.display(plt.gcf())

In [None]:
import torch
from scipy import optimize
import torch.nn.functional as F
import math
import numpy as np
from functools import reduce
from collections import OrderedDict
import matplotlib.pyplot as plt
from IPython import display
from optimization.Optimizer import PyTorchObjective

In [None]:
import logging
# logging.basicConfig(level='DEBUG')

In [None]:
import gym

In [None]:
import gvgai

In [None]:
# import matplotlib.pyplot as plt


In [None]:
from generator.levels.base import Generator

In [None]:
import numpy as np

In [None]:
from agent.NNagent import NNagent

In [None]:
from generator.env_gen_wrapper import GridGame

In [None]:
from scipy.optimize import Bounds

In [None]:
# Build the agent under study: an NNagent wrapping a 'dzelda' GridGame that
# reads level file '3.txt' from ./levels, with images turned off.
_x = NNagent(GridGame(game='dzelda', 
                      play_length=1000, 
                      path='./levels',
                      lvl_name='3.txt',
                      mechanics=['1', '2', '3', '+', 'g', 'w'], # level symbols: '1'/'2'/'3' monsters, '+' key, 'g' goal (door), 'w' wall
                      images=False,
                  )
         )

In [None]:
# Show the text form of the level generator attached to the environment.
# print() already applies str() to its argument, so no explicit cast is needed.
print(_x.env.generator)

In [None]:
# Warm-start the agent from a previously saved network. The commented lines
# are alternative checkpoints that were tried earlier.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints that come from a trusted source.
# _x.nn = torch.load("./25_gen_weights_5_5.pt")
# _x.nn = torch.load("./1_monster_weights_5_5.pt")
_x.nn = torch.load("./dzelda_2_agent_450.pt")

In [None]:
# Display the network that was just loaded into the agent.
_x.nn

In [None]:
# Inspect the environment's `depth` attribute.
# NOTE(review): presumably the number of observation channels -- confirm in GridGame.
_x.env.depth

In [None]:
# Wrap the agent in a PyTorchObjective so an optimizer can drive it. The cells
# below use z.x0 (starting parameter vector), z.fun (objective function),
# z.unpack_parameters (vector -> state dict) and z.f (object that receives
# the state dict).
z = PyTorchObjective(_x)

In [None]:
# Objective value at the starting parameters: the baseline before optimization.
z.fun(z.x0)

In [None]:
# Shape of the starting parameter vector; its first dimension sets how many
# bounds are needed below.
z.x0.shape


In [None]:
# One (low, high) search interval per parameter. The [-5, 5] range is the one
# that worked in the experiment notes at the end of this notebook; [-2, 2]
# did not.
bounds = [(-5, 5)]*z.x0.shape[0]

In [None]:
from utils.diff_evo import differential_evolution

In [None]:
import time

In [None]:
# NOTE: a positive value shown as the achieved score is BAD!
# We are minimizing the objective, so lower values are better.

In [None]:
# Optimize the flat parameter vector with differential evolution, seeded with
# the current parameters (x0) and searched within `bounds`.
# NOTE(review): utils.diff_evo is a project-local module; the keyword meanings
# presumably mirror scipy.optimize.differential_evolution -- confirm.
# `start` is consumed by the timing cell that follows.
start = time.time()
ans = differential_evolution(z.fun, bounds, 
                             strategy='rand1bin',
                             popsize=99, 
                             maxiter=150,
                             polish=False, 
                             x0=z.x0)

In [None]:
# Despite its name, `end` holds the elapsed wall-clock seconds since `start`.
# Because this is its own cell, any idle time between the optimization
# finishing and this cell being run is included in the measurement.
end = time.time() - start

In [None]:
# Elapsed time in whole hours (floor division of seconds by 3600).
print(end//3600)

In [None]:
# Display the result object returned by differential_evolution.
ans

In [None]:
# Make the optimizer's solution the new starting point, so a further
# optimization run would continue from it.
z.x0 = ans.x

In [None]:
# Convert the solution vector back into a state dict and load it into z.f.
state_dict = z.unpack_parameters(ans.x)
z.f.load_state_dict(state_dict)

In [None]:
# Sanity check that the objective's model and the agent's network match.
# NOTE(review): if both are torch.nn.Module objects, `==` falls back to an
# identity comparison (same object), not a weight-by-weight comparison -- confirm.
z.f == _x.nn

In [None]:
# t = _x.env.reset()

In [None]:
def move(action):
    """Advance the notebook-level agent's environment by a single action.

    Args:
        action: Passed unchanged to ``_x.env.step``.

    Returns:
        tuple: ``(picture, second_value)`` where ``picture`` is the ``'pic'``
        entry of the fourth value returned by ``step`` and ``second_value``
        is the second returned value (presumably the reward, following the
        gym ``(obs, reward, done, info)`` convention -- TODO confirm).
    """
    # step() must return exactly four values; only the second and fourth are used.
    _first, second_value, _third, extras = _x.env.step(action)
    return extras['pic'], second_value

In [None]:
# im, r = move(5)

In [None]:
# plt.imshow(im)

In [None]:
# r

In [None]:
# Evaluate the agent once. When the environment reports pictures
# (_x.env.pics is truthy), show_state is passed as `fn` so frames can be
# drawn; otherwise fitness runs without a callback.
s = _x.fitness(fn=show_state) if _x.env.pics else _x.fitness()
# Clear the agent's `vis` attribute afterwards.
# NOTE(review): presumably this drops the visualisation hook set by fitness -- confirm in NNagent.
_x.vis=None
print(s)

In [None]:
# 2600 * 1000 * 100 # frames seen

In [None]:
# Persist the agent's network object (the whole object, not only a state
# dict). The filename follows the dzelda_<level>_agent_<generations>.pt
# pattern used by the other checkpoints in this notebook.
torch.save(_x.nn, "./dzelda_3_agent_600.pt")

In [None]:
# Close the environment held inside the GridGame wrapper once this agent is done.
_x.env.env.close()

In [None]:
# Curriculum training: optimize one agent per level file (1.txt .. 3.txt).
# Each agent is warm-started from the checkpoint written for the previous
# level (the first one from the base checkpoint) and saved after training.
#
# Relies on `bounds` from the earlier cell, so every network here must have
# the same number of parameters as the one `bounds` was built from.

GENERATIONS_PER_LEVEL = 150  # generations per level; also drives checkpoint names

t = []          # agents, one per level
objs = []       # PyTorchObjective wrappers, parallel to `t`
answers = []    # differential_evolution results, parallel to `t`
durations = []  # wall-clock seconds per optimization run, parallel to `answers`

try:
    for i in range(1, 4):
        t.append(NNagent(GridGame(game='dzelda',
                                  play_length=1000,
                                  path='./levels',
                                  lvl_name=f'{i}.txt',
                                  # level symbols: '1'/'2'/'3' monsters, '+' key, 'g' goal (door), 'w' wall
                                  mechanics=['1', '2', '3', '+', 'g', 'w'],
                                  images=False,
                                  )
                         )
                 )

        # Checkpoints are named dzelda_<level>_agent_<total generations>.pt,
        # so the previous level's file carries GENERATIONS_PER_LEVEL * i.
        if i == 1:
            weights_path = "./dzelda_base_agent_150.pt"
        else:
            weights_path = f"./dzelda_{i-1}_agent_{GENERATIONS_PER_LEVEL*(i)}.pt"
        # NOTE(review): torch.load unpickles the file and can execute arbitrary
        # code -- only load checkpoints from a trusted source.
        t[-1].nn = torch.load(weights_path)

        objs.append(PyTorchObjective(t[-1]))

        start = time.time()
        ans = differential_evolution(objs[-1].fun, bounds,
                                     strategy='rand1bin',
                                     popsize=99,
                                     maxiter=GENERATIONS_PER_LEVEL,
                                     polish=False,
                                     x0=objs[-1].x0)
        end = time.time() - start
        # Keep every run's duration instead of overwriting it each iteration.
        durations.append(end)

        answers.append(ans)
        # Push the best parameter vector back into the network before saving.
        state_dict = objs[-1].unpack_parameters(ans.x)
        objs[-1].f.load_state_dict(state_dict)

        torch.save(t[-1].nn, f"./dzelda_{i}_agent_{GENERATIONS_PER_LEVEL*(i+1)}.pt")
finally:
    # Close every environment that was opened, even if optimization raised or
    # the kernel was interrupted mid-run; otherwise they would stay open.
    for agent in t:
        agent.env.env.close()

In [None]:
# Inspect the optimization result for the second level (index 1).
answers[1]

```

First: Train an agent on an extremely simple level to give the agent a behavior starting point. 
    - In POET that was a flat terrain. In my case, it's a level that looks like this: 
    -   wwwwwwwwwwwww    
        w...........w    A --> agent
        w...........w    + --> key
        w.+....A....w    g --> goal
        w...........w
        w...........w    Task: Take the key to the goal
        w...........w
        w.g.........w
        wwwwwwwwwwwww
        
Second: Initialize agent-environment population with the first learned behavior
    
While True:
    
    Evaluate each agent in its paired environment
    
    Mutate environments (every m loops). 
    
        Mutation of an environment causes the agent neural network to be copied into the new environment
        This increases the population.
        
        - An example mutation could be
            - adding/removing an enemy (three types)
            - adding/moving a goal
            - adding/removing a key
            - moving an agent
            - An example new level could look like this: 
                - wwwwwwwwwwwww    
                  w....+..1...w    A --> agent
                  w...g.......w    + --> key
                  w...........w    g --> goal
                  w...........w
                  w...w.......w    Task: Take the key to the goal
                  w.......A...w
                  w.g.........w
                  wwwwwwwwwwwww

    (slowly) Run one step of optimization for each agent within its paired environment.
    
    Transfer agents between environments (every k loops)
        Intuition: Agent alpha might have learned behavior in its paired environment that is actually very good behavior in environment beta. 
        
        - test every agent in every environment. 
        - transfer into environment i the agent j that performed best in it.
    
    Return to top of the loop.
    
```

# Note. 

Moving the key from top right corner to mid left created a slightly simpler env.

----  

We were not able to learn a good policy when we kept the weight range at [-2, 2]. 

Next I am retrying the same starting point but with range [-5, 5]. --> Result: with this range and the simpler env, the extremely simple env was solved. 



----  
Then moving the key up one spot meant that the agent needs more training. So far it has failed to take the key and get to the goal after an additional 20 generations of training (but does get the key). I am giving it another 20 generations. 

After the agent learns the new environment (key moved up one spot), I am going to take those weights and put them back into the first env (key moved back down one spot).

----  
Note: There are times when the optimization fails outright after only a generation or two. I think this comes from the fact that the problem is very sparsely rewarded.   

----

Edited dzelda.txt: 
    - picking up key +1
    - killing monster +1
    - taking key to door +2


1)  
```
wwwwwwwwwwwww
w...........w
w...........w
w.+....A....w
w...........w
w...........w
w...........w
w.g.........w
wwwwwwwwwwwww
```
2)   
```
wwwwwwwwwwwww
w...........w
w...........w
w.+.........w
w......A....w
w...........w
w...........w
w.g.........w
wwwwwwwwwwwww
```

3)  
```
wwwwwwwwwwwww
w...........w
w...........w
w.+.........w
w......A....w
w...........w
w..1........w
w.g.........w
wwwwwwwwwwwww
```

4)  
```
wwwwwwwwwwwww
w...........w
w...........w
w.+.w.......w
w...w..A....w
w...........w
w..1........w
w.g.........w
wwwwwwwwwwwww
```

Model:
```  
   Net(
      (conv1): Conv2d(13, 8, kernel_size=(3, 3), stride=(1, 1))
      (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (conv2): Conv2d(8, 32, kernel_size=(3, 3), stride=(1, 1))
      (fc1): Linear(in_features=96, out_features=48, bias=True)
      (fc2): Linear(in_features=48, out_features=24, bias=True)
      (fc3): Linear(in_features=24, out_features=6, bias=True)
   )

```

# Differential Evolution:

## $\theta :=$ model_weights  
## Pick $\theta_a, \theta_b, \theta_c$   
## $Proposal_\theta = \theta_a + \alpha \cdot (\theta_b - \theta_c)$

## Pros:
Computationally efficient  
Self-adaptation and crossover due to $\theta_b - \theta_c$

## Problems:  

$\theta$ is a ~10000 dimensional vector. 

Curse of Dimensionality!   
    - As the dimension goes up, vectors become (nearly) equidistant  

Good weight configurations are sparse.

Rewards are sparse.
    
    

Interesting observation: whenever the agent completes the goal, it seems to do so with the minimal path. That's surprising to me because we're giving the agent 1000 time-steps and the fitness function does not (yet) take into account the number of steps the agent has used as a weighting on the score it achieves. 