# Collecting performances using reward differences only

In [None]:
import numpy as np
import os

import model_based_agent as mba 
import worm_env as we 
import ensemble_mod_env as eme

from improc import *
import utils as ut
import tab_agents as tab
from datetime import datetime 

def reward_diff_method(
    collection_eps = 6,
    frac_on = 1/2,
    eval_on_list = (1/2,1/3,1/4),
    collection_ep_time = 600, # in seconds. Must be a multiple of worm_ep_len
    eval_ep_time = 120, # in seconds
    worm_ep_len = 300, # in seconds
    ):
    '''
    Alternate random data collection with evaluation of a reward-difference policy.

    For each of the collection_eps rounds:
    1. Collects data with light on frac_on of the time (actions are drawn with
        probabilities [1-frac_on, frac_on]) and adds it to the data handler.
    2. For every entry of eval_on_list, fits a model set with that entry as
        lp_frac (the amount of light penalty), pickles the first model, loads
        sign(reward_on - reward_off) into column 1 of the agent's Q table
        (column 0 is zeroed) and runs one eval episode.

    Parameters:
    collection_eps: number of collect-then-evaluate rounds.
    frac_on: probability of the second action during collection.
    eval_on_list: iterable of lp_frac values, one model/eval episode each.
    collection_ep_time: seconds of collection per round. Must be a multiple
        of worm_ep_len.
    eval_ep_time: passed as `steps` to the eval episode.
    worm_ep_len: episode length given to the worm environment, in seconds.

    Function output:
    Saves all trajectories for collection and eval episodes.
    collect{i}.pkl for former and mod{i}_{eval frac ind}.pkl is the model after ep i.
    eval{i}_{eval frac ind}.pkl for latter
    All file names are passed on as bare relative names. Returns None.

    Raises:
    ValueError if collection_ep_time is not a multiple of worm_ep_len. This is
        checked before any folder or hardware object is created.
    '''
    # Local import: pickle is not imported by name in the visible import block.
    import pickle

    # Validate up front; the check does not depend on the loop variable.
    if collection_ep_time%worm_ep_len != 0:
        raise ValueError('Collection_ep_time is not a multiple of worm_ep_len')
    n_collect_runs = int(collection_ep_time/worm_ep_len)

    folder = './Data/Reals'+datetime.now().strftime('%d-%m-%H-%M')
    # NOTE(review): fbase is never used below, so the files are not written
    # into `folder`. Presumably the names were meant to be prefixed with it;
    # confirm how the mba helpers resolve paths before changing.
    fbase = folder+'/realworm_'
    # os.path has no rmdir(); makedirs with exist_ok also creates ./Data when
    # missing and never deletes data from an earlier run in the same minute.
    os.makedirs(folder, exist_ok=True)

    # Initialize objects
    dh = mba.DataHandler()
    worm = we.ProcessedWorm(0,ep_len=worm_ep_len,act_spacing=1)
        # act_spacing here is only for eval episodes
    # NOTE(review): the doubled "_Agent" suffix looks like a typo; verify the
    # class name against tab_agents.
    ant = tab.Q_Alpha_Agent_Agent()
    runner = mba.WormRunner(ant,worm)

    for ce in range(collection_eps):
        # Collecting random data
        #############################
        fname = f'collect{ce}.pkl'
        mba.get_init_traj(fname, worm, n_collect_runs, rand_probs=[1-frac_on,frac_on])
        dh.add_dict_to_df([fname],reward_ahead=10,timestep_gap=1,prev_act_window=3,jump_limit=100)

        # Find RDiff matrix and collect eval episodes
        #############################
        for i,ev in enumerate(eval_on_list):
            # NOTE(review): ModelSet is referenced unqualified; presumably it
            # comes from the star import or from ensemble_mod_env - confirm.
            mset = ModelSet(1,frac=1,lp_frac=ev)
            mset.make_models(dh,{'lambda':.1,'iters':10})
            model = mset.models[0]
            # Save model
            mname = f'mod{ce}_{i}.pkl'
            with open(mname,'wb') as f:
                # pickle.dump needs the target file object as second argument.
                pickle.dump(model,f)

            # +1 where the model predicts more reward with light on, -1 where
            # it predicts less, 0 where equal.
            rdiff = np.sign(model['reward_on']-model['reward_off'])
            # assumes the Q table has 144 rows - TODO confirm against the agent
            runner.ant.Qtab[:,0] = np.zeros(144)
            runner.ant.Qtab[:,1] = rdiff.flatten()
            cam,task = init_instruments()
            ename = f'eval{ce}_{i}.pkl'
            runner.eval_ep(cam,task,ename,steps=eval_ep_time)
            
    