### 1. Imports

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')

import os

from src.agent import PongAgent

import numpy as np

import pickle
import time

# import gym pong environment
import gymnasium as gym
from ale_py.roms import Pong
from utils.functions import get_ball_and_paddle_coordinates, plot_rewards, plot_rewards_dream, plot_planning, plot_policy, act2cat, cat2act, plot_dram, plot_spikes

from tqdm import trange

import samna
import dynapse.dynapse1utils as ut

%matplotlib inline

### 2. Connect to DYNAP-SE

In [2]:
# Connect to the DYNAP-SE board headlessly (gui=False), e.g. when the board is
# attached to a remote machine. Both ports are obtained from ut.free_port().
# Only the first element of the returned pair (the model) is kept.
model, _ = ut.open_dynapse1(
    gui=False,
    sender_port=ut.free_port(),
    receiver_port=ut.free_port(),
    select_device=True,
)

[0]:  Bus 1 Device 71 Dynapse1DevKit serial_number 00000020
[1]:  Bus 1 Device 76 Dynapse1DevKit serial_number 00000030
0Sender port: tcp://0.0.0.0:44179
Receiver port: tcp://0.0.0.0:48115
Opened device name: Dynapse1DevKit
SamnaNode ID: 1
PythonNode ID: 2
 Dynapse1Wrapper created! libcaer init...
Clearing chip 0... DONE.
Clearing chip 1... DONE.
Clearing chip 2... DONE.
Clearing chip 3... DONE.


### 3. Training
##### a) Create the network

In [6]:
# Dreaming switch: 0 (no dreaming) or 1 (dreaming)
if_dream = 1

# Build the Pong agent on top of the opened DYNAP-SE model
agent = PongAgent(model, if_dream)

monitored_neurons model:  [(3, 0, 1), (3, 0, 2), (3, 0, 3), (3, 0, 4), (3, 0, 5), (3, 0, 6), (3, 0, 7), (3, 0, 8), (3, 0, 9), (3, 0, 10), (3, 0, 11), (3, 0, 12), (3, 0, 13), (3, 0, 14), (3, 0, 15), (3, 0, 16), (3, 0, 17), (3, 0, 18), (3, 0, 19), (3, 0, 20), (3, 0, 21), (3, 0, 22), (3, 0, 23), (3, 0, 24), (3, 0, 25), (3, 0, 26), (3, 0, 27), (3, 0, 28), (3, 0, 29), (3, 0, 30), (3, 0, 31), (3, 0, 32), (3, 0, 33), (3, 0, 34), (3, 0, 35), (3, 0, 36), (3, 0, 37), (3, 0, 38), (3, 0, 39), (3, 0, 40), (3, 0, 41), (3, 0, 42), (3, 0, 43), (3, 0, 44), (3, 0, 45), (3, 0, 46), (3, 0, 47), (3, 0, 48), (3, 0, 49), (3, 0, 50), (3, 0, 51), (3, 0, 52), (3, 0, 53), (3, 0, 54), (3, 0, 55), (3, 0, 56), (3, 0, 57), (3, 0, 58), (3, 0, 59), (3, 0, 60), (3, 0, 61), (3, 0, 62), (3, 0, 63), (3, 0, 64), (3, 0, 65), (3, 0, 66), (3, 0, 67), (3, 0, 68), (3, 0, 69), (3, 0, 70), (3, 0, 71), (3, 0, 72), (3, 0, 73), (3, 0, 74), (3, 0, 75), (3, 0, 76), (3, 0, 77), (3, 0, 78), (3, 0, 79), (3, 0, 80), (3, 0, 81), (3, 0, 82)

##### b) Run the training loop

In [None]:
# Training cell: runs N_REPS independent training repetitions. Each repetition
# creates a fresh run folder, re-initialises the readout layers, plays N_ITER
# games (awake phase) and, when if_dream is set, follows every game with a
# "dreaming" rollout driven by the learned world model.
#
# Clean-up is guaranteed: the environment is closed after every repetition and
# the agent is stopped in a `finally` block, even if the run is interrupted.

# ---- Run configuration (loop-invariant, so defined once up front) ----
N_REPS = 10            # number of repetitions
N_ITER = 2000          # number of games
T_awake = 100          # single game time horizon (in frames)
start_learn = 1*50     # start training after start_learn games
t_skip = 17            # additional no-op frames skipped at the start of a game
time_dream = 50        # number of model steps in one dreaming rollout
log_every = 50         # period (in games) for statistics/plots; also the averaging window
plot_dream_every = 50  # period (in games) for the dreaming plot


def _make_run_folder(if_dream):
    """Create and return the first non-existing 'training_runs/<if_dream>_training_<i>' folder."""
    i = 0
    while True:
        folder = 'training_runs/' + str(if_dream) + '_training_' + str(i)
        if not os.path.exists(folder):
            break
        i += 1
    os.makedirs(folder)
    return folder


def _reset_and_skip(env, n_skip):
    """Reset `env` and play 1 + n_skip frames with action 0.

    Returns (ram_all, done) from the last step, where `done` is True when the
    environment reported either `terminated` or `truncated`.
    """
    env.reset()
    ram_all, _r, terminated, truncated, _info = env.step(0)
    for _ in range(n_skip):
        ram_all, _r, terminated, truncated, _info = env.step(0)
    return ram_all, (terminated or truncated)


env = None
agent_started = False
try:
    for rep in range(N_REPS):
        # Create a new training directory
        folder = _make_run_folder(if_dream)
        print(folder + " folder is created!")

        # Save dynapse parameters
        ut.save_parameters2txt_file(model.get_configuration(), filename="./"+folder+"/dynapse_parameters.txt")
        # Save network configuration
        with open("./"+folder+"/network_config.pkl", 'wb') as file:
            pickle.dump(agent.net_gen.network.post_neuron_dict, file)

        # Per-repetition statistics
        REWARDS = []
        REWARDS_MEAN = []
        REWARDS_STANDARD_MEAN = []
        ENTROPY = []
        ENTROPY_MEAN = []

        ERROR_RAM = []
        ERROR_R = []
        MEAN_ERROR_RAM = []
        MEAN_ERROR_R = []

        # init readout layers
        agent.init_policy_readout()
        if if_dream:
            agent.init_model_readout()

        # Create the Pong environment (closed again at the end of this repetition)
        env = gym.make('Pong-ramDeterministic-v0', difficulty = 0)

        agent.start()
        agent_started = True
        agent.state_model = 0

        # Start the training loop (usually 2000 games)
        for iteration in trange(N_ITER):
            # Rewards only contribute to the policy gradient after the first
            # start_learn games; before that they are multiplied by 0.
            if_learn = 1 if iteration > start_learn else 0

            agent.state_out_policy = 0
            agent.state_out_model = 0

            S_agent = []
            S_planner = []

            R = []
            R_PRED = []

            RAM = []
            RAM_PRED = []
            DRAM_PRED = []
            DRAM = []

            RTOT = 0

            ######### AWAKE PHASE ##########

            # skip the first few frames of the game (they're not relevant)
            ram_all, done = _reset_and_skip(env, t_skip)

            frame = 0
            entropy = 0

            OUT = []
            OUT_dream = []

            r_learn = 0

            ram = get_ball_and_paddle_coordinates(ram_all) # get starting state
            agent.sink_node.get_events() # discard spike buffer
            agent.latest_spike_rates = np.zeros((2*agent.num_hidden_neurons)) # clear latest spike rates

            # play T_awake frames in real environment (usually 100 frames)
            while not done and frame < T_awake:
                frame += 1
                ram_old = ram

                # agent step to get the next action
                action, out = agent.step_policy(ram)
                # NOTE(review): `[:]` copies a list but only creates a view of a
                # numpy array - confirm the type of input_policy/input_model if
                # these snapshots must be independent of later updates.
                S_agent.append(agent.input_policy[:])

                if if_dream:
                    # one-hot encoding of the chosen action (3 actions)
                    act_vec = np.array([0,0,0])
                    act_vec[action] = 1
                    # model step to predict next state and reward
                    ds_pred, r_pred = agent.step_model(act_vec, ram)
                    S_planner.append(agent.input_model[:])

                # perform action in real environment; the game is over when the
                # environment reports either termination or truncation
                ram_all, r, terminated, truncated, _info = env.step(cat2act(action))
                done = terminated or truncated
                ram = get_ball_and_paddle_coordinates(ram_all)

                # accumulate policy gradient
                agent.compute_gradient_policy(r*if_learn)

                entropy += agent.entropy

                RAM.append(ram)
                OUT.append(out)
                RTOT += r
                R += [r]

                if if_dream:
                    # compute state change; jumps larger than 30 are zeroed out
                    dram = ram - ram_old
                    dram[np.abs(dram) > 30] = 0.

                    # exponentially decaying trace of the reward (decay 0.5)
                    r_learn = r_learn*.5 + r

                    # update world model network
                    agent.compute_gradient_model(ds_pred, r_pred, dram, r_learn)
                    agent.update_model()

                    # predicted next state
                    agent.state_model = ram_old + ds_pred

                    RAM_PRED.append(agent.state_model)
                    R_PRED += [r_pred]
                    DRAM_PRED.append(ds_pred)
                    DRAM.append(dram)

            REWARDS.append(RTOT)
            ENTROPY.append(entropy)

            if if_dream:
                # store state and reward prediction errors
                ERROR_RAM.append(np.mean(np.abs(np.array(DRAM)-np.array(DRAM_PRED)), axis=0))
                ERROR_R.append(np.mean(np.abs(np.array(R)-np.array(R_PRED))))

            # update agent network (after every game)
            agent.update_policy()

            if iteration % log_every == 0 and iteration > 0:
                # Print number of spikes for setting the DYNAP-SE parameters before starting the training
                # print('total #spikes (policy)', np.sum(S_agent))
                # if if_dream:
                #     print('total #spikes (model)', np.sum(S_planner))
                REWARDS_MEAN.append(np.mean(REWARDS[-log_every:]))
                ENTROPY_MEAN.append(np.mean(ENTROPY[-log_every:]))

                # store the mean return over the last 50 games
                np.save(os.path.join(folder,"rewards_" + "if_dream_" + str(if_dream) + ".npy"), REWARDS_MEAN)
                if if_dream:
                    plot_rewards_dream(REWARDS, REWARDS_MEAN, S_agent, S_planner, OUT, RAM, RAM_PRED, R, R_PRED, ENTROPY_MEAN, filename = os.path.join(folder, 'rec_dream_' + str(iteration) + '.png') )

                    MEAN_ERROR_RAM.append(np.mean(np.array(ERROR_RAM)[-log_every:,:],axis=0))
                    MEAN_ERROR_R.append(np.mean(np.array(ERROR_R)[-log_every:]))

                    plot_dram(OUT, DRAM,DRAM_PRED,R,R_PRED,MEAN_ERROR_RAM,MEAN_ERROR_R, S_planner, filename= os.path.join(folder, 'awake_dram_fit_' + str(iteration) + '.png'))
                else:
                    plot_rewards(REWARDS, REWARDS_MEAN, S_agent, [[]], OUT, RAM, [], R, [], ENTROPY_MEAN, filename = os.path.join(folder, "rec_" + str(iteration) + ".png"))

            ######### DREAMING PHASE ##########

            # range(if_dream) runs once when if_dream == 1 and not at all when it is 0
            for dream_times in range(if_dream):
                RAM_PLAN = []
                REWS_PLAN = []
                S_agent = []
                S_planner = []

                agent.state_out_policy = 0
                agent.state_out_model = 0

                # skip the first few frames (this is not needed, it's just to make the plot look better)
                ram_all, done = _reset_and_skip(env, t_skip)
                ram = get_ball_and_paddle_coordinates(ram_all)

                agent.sink_node.get_events()

                for planning_steps in range(time_dream):
                    # agent step to get the next action
                    action, out = agent.step_policy(ram)

                    # model step to predict next state and reward
                    act_vec = np.array([0,0,0])
                    act_vec[action] = 1
                    ds_pred, r_pred = agent.step_model(act_vec, ram)

                    S_planner.append(agent.input_model[:])
                    S_agent.append(agent.input_policy[:])

                    # the next state comes from the model prediction, not from env
                    ram = ram + ds_pred

                    # accumulate policy gradient
                    agent.compute_gradient_policy(r_pred*if_learn)

                    RAM_PLAN.append(ram)
                    OUT_dream.append(out)
                    REWS_PLAN.append(r_pred)

                # update agent network
                agent.update_policy()

                if iteration % plot_dream_every == 0 and dream_times == 0:
                    plot_planning(OUT, OUT_dream, REWS_PLAN, R, RAM_PLAN, RAM, S_agent, S_planner, t_skip, filename = os.path.join(folder, 'dream_' + str(iteration) + '.png'))

        # After the training loop, store the data
        REWARDS_MEAN.append(np.mean(REWARDS[-log_every:]))
        np.save(os.path.join(folder,"rewards_" + "if_dream_" + str(if_dream) + ".npy"), REWARDS_MEAN)

        agent.save(os.path.join(folder,'model_J_out_final.py'))

        print('REWARDS_MEAN',REWARDS_MEAN)

        # Release the environment of this repetition before the next one is created
        env.close()
        env = None
finally:
    # Runs on normal completion as well as on errors / keyboard interrupts
    if env is not None:
        env.close()
    if agent_started:
        agent.stop()

In [None]:
# Stop the agent. This repeats the call that ends the training cell, so it can
# be run manually if that cell was interrupted before the agent was stopped.
agent.stop()

In [6]:
# Close the DYNAP-SE device handle (`model`) that was opened in section 2.
samna.device.close_device(model)