#Offline Reinforcement Learning

This program will take in *hand labelled* Epileptor data. The data will be in the form of a tuple:
    
    data = ( t,
             state[t]:  state vector at t,
             action[t]: action taken at t + 1 (decided based on state[t], or read from the action log),
             reward[t]: reward for this state-action pair (based on state[t+1] and action[t]) )

Where, for simplicity, 

    state vector = ( average of 10 ms ) 
    
    reward = (-100) for seizure
             (-10) - (-50) for stimulation
             


In [1]:
%pylab nbagg
from tvb.simulator.lab import *


Populating the interactive namespace from numpy and matplotlib
   INFO  log level set to INFO


In [86]:
from sklearn.ensemble import ExtraTreesRegressor

I have created a library with a few very useful functions for The Virtual Brain. This can be found in sim.py.

In [2]:
import sim

#Defining the action space
These functions translate each action into stimulation parameters and assign each action a penalty proportional to the power it delivers to the brain.

In [8]:
def set_stim_parameters(action, stim):
    """Configure the stimulus' temporal parameters for a discrete action.

    Writes the "amp", "T" and "tau" entries of ``stim.temporal.parameters``
    in place. An action outside 0-6 leaves ``stim`` untouched.

    Parameters
    ----------
    action : int
        Discrete action index, 0 (no stimulation) through 6.
    stim : object
        Stimulus whose ``temporal.parameters`` mapping is updated.
        Presumably a TVB stimulus with a pulse-train temporal equation,
        where "T" is the period and "tau" the pulse width -- confirm
        against sim.set_sim.
    """
    # action -> (amp, T, tau). The nominal power per action is
    # 0, 5, 5, 20, 20, 125, 125, matching the tiers used by stim_penalty.
    # Action 6 uses amp 5 (the amplitude get_freq_amp reports for it) and a
    # 50% duty cycle like every other stimulating action, so that it really
    # delivers the nominal power of 125.
    presets = {
        0: (0, 10, 0),
        1: (1, 5, 2.5),
        2: (1, 10, 5),
        3: (2, 5, 2.5),
        4: (2, 10, 5),
        5: (5, 5, 2.5),
        6: (5, 10, 5),
    }

    if action not in presets:
        return

    amp, period, width = presets[action]
    parameters = stim.temporal.parameters
    parameters["amp"] = amp
    parameters["T"] = period
    parameters["tau"] = width

In [9]:
def stim_penalty(action_taken):
    """Return the (non-positive) reward penalty for a stimulation action.

    The penalty is ``-power * power_coeff`` where the nominal power is
    5 for actions 1-2, 20 for actions 3-4 and 125 for anything else.
    Action 0 (no stimulation) costs exactly 0.
    """
    power_coeff = 0.2

    # No stimulation, no penalty.
    if action_taken == 0:
        return 0

    if action_taken in (1, 2):
        delivered_power = 5
    elif action_taken in (3, 4):
        delivered_power = 20
    else:
        delivered_power = 125

    return -delivered_power * power_coeff


#Make some data
Here is some code to create dummy training data. More data with
various modes of stimulation will be created later. We start by creating a simulation.

In [75]:
# Simulation configuration. Everything except dt and valid_action is handed
# straight to sim.set_sim; the exact meaning and units of those arguments are
# defined in sim.py (not visible here) -- presumably region indices and
# milliseconds, TODO confirm.
epileptic = [9]
almost_epileptic = []
stimulus_target = [9]
stimulus_amplitude = -20
stimulus_weight = 2 #0-2 for each region
onset = 0
period = 500
pulse_width = 0
sim_time = 100000
# Length of one simulated frame; the data loops run sim_time/dt frames.
dt = 100

# Actions the random action plan draws from and the Q-update maximises over.
valid_action = [0, 1,3,5]

# s is stepped with s.run(...); stimulus is reconfigured per action.
(s, stimulus) = sim.set_sim(epileptic, almost_epileptic, stimulus_target, stimulus_amplitude, 
              stimulus_weight, onset, period, pulse_width, sim_time)



Get rid of noisy initial conditions

In [76]:
# Advance the simulation by 3000 time units before any data is recorded;
# the return value of this run is not kept.
s.run(simulation_length=3000)
print("")




In [77]:
# One entry per simulated frame: (time, state, action), later extended
# with a reward as the fourth element.
dataset = []


Create a random action plan to drive the data generation

In [78]:
# Random behaviour policy: one action per frame. With probability 0.4 the
# action is drawn from valid_action (which itself contains 0); otherwise
# no stimulation (action 0) is applied.
action_plan = []
# Floor division keeps the frame count an integer on both Python 2 and 3.
for _ in range(sim_time // dt):
    if random.random() < 0.4:
        action_plan.append(random.choice(valid_action))
    else:
        action_plan.append(0)

Put data into tuples described above, sans reward.

In [79]:
# Step the simulation one frame (dt) at a time and record a
# (time, state, action) tuple per frame in `dataset`.
states = []
# Floor division keeps the frame count an integer on both Python 2 and 3.
for i in range(sim_time // dt):

    # Progress indicator: report the frame start time every 10 frames.
    if i % 10 == 0:
        print("\ntime is: " + str(i * dt))

    # Simulate one frame of length dt.
    (t, a), (x1, x2), (y1, y2) = s.run(simulation_length=dt)

    # Append this frame's samples to the full traces. The 2999.5 shift
    # presumably removes the 3000-unit burn-in so the recorded time axis
    # starts near zero -- TODO confirm.
    if i == 0:
        tt = copy(t - 2999.5)
        aa = copy(a)
    else:
        tt = np.append(tt, t - 2999.5, axis=0)
        aa = np.append(aa, a, axis=0)

    # Feature vector: per-region min, max and mean of the frame, followed by
    # the stimulus amplitude and (dt / T) * tau for the stimulus parameters
    # that were in effect while this frame was simulated.
    # NOTE(review): dt / T is left exactly as written; if both operands are
    # ints its value differs between Python 2 and Python 3.
    (imins, imaxs, irng, iavg) = extract_features(t, a)
    features = (imins, imaxs, iavg, stimulus.temporal.parameters["amp"],
                (dt/stimulus.temporal.parameters["T"])*stimulus.temporal.parameters["tau"])

    # Frame onset on the shifted time axis.
    time = t[0] - 2999.5

    # The state also carries history from the two previous states; for the
    # first two frames a fixed placeholder history is used instead.
    if i > 1:
        s_vector = state_vector(9, features, states[i-1], states[i-2])
    else:
        s_vector = state_vector(9, features, (1,1,1,0), (1,1,1,0))

    # Offline learning: follow the pre-generated action plan. The stimulus
    # is reconfigured after this frame was simulated, so the action takes
    # effect from the next frame onwards.
    action = action_plan[i]
    set_stim_parameters(action, stimulus)

    states.append(s_vector)
    dataset.append((time, s_vector, action))


time is: 0

time is: 1000

time is: 2000

time is: 3000

time is: 4000

time is: 5000

time is: 6000

time is: 7000

time is: 8000

time is: 9000

time is: 10000

time is: 11000

time is: 12000

time is: 13000

time is: 14000

time is: 15000

time is: 16000

time is: 17000

time is: 18000

time is: 19000

time is: 20000

time is: 21000

time is: 22000

time is: 23000

time is: 24000

time is: 25000

time is: 26000

time is: 27000

time is: 28000

time is: 29000

time is: 30000

time is: 31000

time is: 32000

time is: 33000

time is: 34000

time is: 35000

time is: 36000

time is: 37000

time is: 38000

time is: 39000

time is: 40000

time is: 41000

time is: 42000

time is: 43000

time is: 44000

time is: 45000

time is: 46000

time is: 47000

time is: 48000

time is: 49000

time is: 50000

time is: 51000

time is: 52000

time is: 53000

time is: 54000

time is: 55000

time is: 56000

time is: 57000

time is: 58000

time is: 59000

time is: 60000

time is: 61000

time is: 62000

time

In [80]:
# Set the default figure size. figsize() is called for its side effect only;
# its result must not be bound to `figure`, which would shadow pylab's
# figure() function for the rest of the session.
figsize(10, 8)

# Recorded trace of region 9 (first state variable, first mode) in black.
plot((tt[:]), aa[:,0,9,0], 'k')

(amp_plot, freq_plot) = get_freq_amp(action_plan)

# Overlay the first state component and the stimulation amplitude, scaled
# and shifted below the trace.
plot(100*(tt[:(len(action_plan))]), np.array(states)[:,0]/2. - 5, 'b')
plot(dt + 100*(tt[:(len(action_plan))]), np.array(amp_plot)/6. - 4, 'b')

title("2-state 2-action Q-learning Algorithm \n Simplified state (memoryless)")

show()

<IPython.core.display.Javascript object>

Now, we need to add rewards to the set based on our hand-labelling of the results.

    reward = -100 (seizure at state t+1?) + (penalty for action at t)

With real recordings, we would put the seizure start and end times here. The Epileptor model, however, doesn't require hand labelling, so we can simply use our seizure-detection indicator (the average of the LFP) inside the loop.

In [61]:

# Hand-labelled seizure intervals as (onset, offset) tuples.
# Put 0 if the seizure starts before the frame and a very big number if it
# ends after the frame.
# The bounds are offset by -dt/2: about every 20th stimulation the step
# comes out as dt = 99 instead of 100 (apparently a TVB quirk), and the
# offset makes sure that frame onsets are still caught within these bounds.
t_seizures = ( (-50,150), (2650,2850), (4750,5150),(7250,7350),(9150,10000000) )

In [81]:
# Attach a reward to every (time, state, action) frame in `dataset`:
#     reward = -100 if the frame is ictal, plus the stimulation penalty.
# Re-running the cell is safe because only the first three fields of each
# frame are kept before the reward is appended.
for i, point in enumerate(dataset):

    # Onset time of the following frame; the last frame has no successor,
    # so its own onset time is used. Only needed when labelling from
    # hand-marked intervals:
    #     ictal = any(start <= time < end for (start, end) in t_seizures)
    if i < len(dataset) - 1:
        time = dataset[i + 1][0]
    else:
        time = dataset[-1][0]

    action_taken = point[2]

    # Shortcut for simulated data: the first state component is 0 when the
    # averaged signal is below zero (see state_vector), which is taken to
    # mean seizure.
    # NOTE(review): this inspects the frame's own state, not state[t+1] as
    # the notebook text suggests -- confirm which is intended.
    ictal = 1 if point[1][0] == 0 else 0

    reward = -100 if ictal else 0
    reward += stim_penalty(action_taken)

    dataset[i] = point[:3] + (reward, )

Now, learn the dataset:

In [82]:
#q_table = {}

In [83]:
# One Q-learning pass over consecutive frame pairs of the labelled dataset;
# q_table is updated in place.
learn_dataset(q_table,dataset)

In [24]:
def learn_dataset(q_table, dataset):
    """Apply one Q-learning update per pair of consecutive frames.

    Each frame is paired with its successor, so the final frame only ever
    serves as a "next" frame. ``q_table`` is updated in place.
    """
    for frame, next_frame in zip(dataset, dataset[1:]):
        learn_from_frame(q_table, frame, next_frame)
        
        

In [23]:
def get_q(q_table, state, action):
    """Return the stored Q-value for (state, action), or 0 if unseen."""
    key = (state, action)
    return q_table.get(key, 0)
    
def update_q(q_table, frame, newval):
    """Store ``newval`` as the Q-value of the frame's (state, action) pair.

    ``frame`` is indexed as (time, state, action, ...); only the state and
    action fields are read.
    """
    state, action = frame[1], frame[2]
    q_table[state, action] = newval

def learn_from_frame(q_table, frame1, frame2):
    """Perform one Q-learning update for the transition frame1 -> frame2.

    Implements
        Q(s,a) <- Q(s,a) + alpha * [reward + gamma * V(s') - Q(s,a)]
    where (s, a, reward) come from ``frame1`` (fields 1, 2 and 3), s' is the
    state of ``frame2`` and V(s') is the best Q-value over ``valid_action``.
    ``alpha``, ``gamma`` and ``valid_action`` are read from module scope.
    """
    current_q = get_q(q_table, frame1[1], frame1[2])

    # V(s'): value of the best valid action in the successor state.
    next_value = max(get_q(q_table, frame2[1], candidate)
                     for candidate in valid_action)

    td_target = frame1[3] + gamma * next_value
    update_q(q_table, frame1, current_q + alpha * (td_target - current_q))
    

#Test Regressor

In [87]:
# Extra-trees regressor constructed with no arguments (library defaults).
etrees = ExtraTreesRegressor()

In [88]:
# Regression inputs are the state tuple extended with the action taken;
# targets are the rewards attached to each frame.
xtrain = [point[1] + (point[2],) for point in dataset]
ytrain = [point[3] for point in dataset]

In [90]:
# Dump every training sample as "<state + action> and <reward>".
# print() with a single parenthesised argument behaves the same on
# Python 2 and Python 3.
for sample, target in zip(xtrain, ytrain):
    print(str(sample) + " and " + str(target) + "\n")

(0, 1, 1, 0, 0) and -100

(0, 1, 1, 0, 0) and -100

(0, 0, 1, 0, 3) and -104.0

(0, 0, 1, 0, 0) and -100

(1, 0, 0, 0, 0) and 0

(1, 1, 0, 0, 0) and 0

(1, 1, 0, 0, 0) and 0

(1, 1, 1, 0, 0) and 0

(1, 1, 1, 0, 3) and -4.0

(0, 1, 1, 0, 0) and -100

(0, 0, 1, 0, 0) and -100

(0, 0, 1, 0, 0) and -100

(0, 0, 0, 0, 0) and -100

(0, 0, 0, 0, 0) and -100

(0, 0, 0, 0, 5) and -125.0

(1, 0, 0, 0, 1) and -1.0

(0, 1, 0, 0, 0) and -100

(1, 0, 0, 0, 0) and 0

(1, 1, 1, 0, 0) and 0

(1, 1, 0, 0, 0) and 0

(1, 1, 1, 0, 5) and -25.0

(1, 1, 1, 0, 0) and 0

(1, 1, 1, 0, 0) and 0

(1, 1, 1, 0, 3) and -4.0

(0, 1, 1, 0, 3) and -104.0

(0, 0, 1, 0, 0) and -100

(0, 0, 1, 0, 0) and -100

(0, 0, 0, 0, 0) and -100

(0, 0, 0, 0, 0) and -100

(0, 0, 0, 0, 0) and -100

(0, 0, 0, 0, 0) and -100

(0, 0, 0, 0, 0) and -100

(0, 0, 0, 0, 0) and -100

(0, 0, 0, 0, 0) and -100

(1, 0, 0, 0, 0) and 0

(1, 1, 0, 0, 5) and -25.0

(1, 1, 0, 0, 0) and 0

(1, 1, 1, 0, 0) and 0

(1, 1, 1, 0, 0) and 0

(1, 1, 1, 0, 0) a

In [91]:
# Fit the regressor to predict each frame's reward from (state..., action).
etrees.fit(xtrain, ytrain)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
          verbose=0, warm_start=False)

In [97]:
# Spot check: predicted reward for state (0,0,1,0) combined with action 5.
etrees.predict([(0,0,1,0,5)])

array([-125.])

In [92]:
# Display the learned table; keys are (state, action) pairs.
q_table

{((0, 0, 0, 0), 0): -195.75132783042696,
 ((0, 0, 0, 0), 1): -318.22903730265114,
 ((0, 0, 0, 0), 3): -324.3368314813066,
 ((0, 0, 0, 0), 5): -250.54482860949875,
 ((0, 0, 0, 0), 6): -105.19043190819988,
 ((0, 0, 1, 0), 0): -245.2678718794266,
 ((0, 0, 1, 0), 1): -259.5814649921335,
 ((0, 0, 1, 0), 3): -271.85634043782056,
 ((0, 0, 1, 0), 5): -159.5507209691458,
 ((0, 0, 1, 0), 6): -45.0,
 ((0, 1, 0, 0), 0): -165.59793323658127,
 ((0, 1, 0, 0), 1): -148.11231268006475,
 ((0, 1, 0, 0), 3): -88.1129267838798,
 ((0, 1, 0, 0), 5): -25.0,
 ((0, 1, 1, 0), 0): -220.5164273960171,
 ((0, 1, 1, 0), 1): -189.9852509513861,
 ((0, 1, 1, 0), 3): -173.41543355698332,
 ((0, 1, 1, 0), 5): -210.36172219341944,
 ((0, 1, 1, 0), 6): -25.0,
 ((1, 0, 0, 0), 0): -3.834862782508206,
 ((1, 0, 0, 0), 1): -9.197455323558422,
 ((1, 0, 0, 0), 3): -10.4616,
 ((1, 0, 0, 0), 5): -21.611008587377185,
 ((1, 0, 0, 0), 6): -10.113704663578865,
 ((1, 0, 1, 0), 0): -4.912960818750435,
 ((1, 0, 1, 0), 1): -29.959822432205602

#Set up the Q-learning algorithm

In [7]:
# Q-learning hyperparameters and storage, read as globals by learn_from_frame.
gamma = 0.9  # discount applied to V(s') in the update rule
alpha = 0.2  # learning rate of the update rule
epsilon = 0  # NOTE(review): not referenced in the visible code; presumably an exploration rate
q_table = {}  # maps (state, action) -> Q-value
actions = [0,6]  # NOTE(review): not referenced in the visible code, which uses valid_action

In [7]:
def state_vector_complicated(region, features):
    """Build a 4-component state from one region's frame features.

    ``features`` is indexed as (mins, maxs, avgs, amp, stim_time); the first
    three are indexed by ``[region][0]``.

    Returns a tuple ``(min_flag, max_flag, avg_flag, power)`` where
      * min_flag is 0 if the region minimum is below -1, else 1
      * max_flag is 0 if the region maximum is below 1, else 1
      * avg_flag is 0 if the region average is below 0, else 1
      * power is ``features[4] * features[3]**2`` (raw, not bucketed)
    """
    region_min = features[0][region][0]
    region_max = features[1][region][0]
    region_avg = features[2][region][0]

    min_flag = 0 if region_min < -1 else 1
    max_flag = 0 if region_max < 1 else 1
    avg_flag = 0 if region_avg < 0 else 1

    # time * amp^2
    power = features[4] * features[3]**2

    return (min_flag, max_flag, avg_flag, power)
    

Simplified state vector using only average - this will be easier to code for now as it 
gives a total of only 6 states.

In [5]:
def state_vector(region, features, oldstate, olderstate):
    """Build the simplified state from the region average plus history.

    Returns ``(avg_flag, oldstate[0], olderstate[1], 0)`` where avg_flag is
    0 if ``features[2][region][0]`` is below 0 and 1 otherwise. The last
    component is fixed at 0: stimulation power is computed but currently
    left out of the state.

    NOTE(review): since each state stores its predecessor's flag at index 1,
    ``olderstate[1]`` is the flag from three frames back when called with the
    two preceding states -- confirm that ``olderstate[0]`` was not intended.
    """
    avg_flag = 0 if features[2][region][0] < 0 else 1

    # time * amp^2 -- evaluated as before, but not part of the returned state.
    unused_power = features[4] * features[3]**2

    return (avg_flag, oldstate[0], olderstate[1], 0)


In [17]:
def get_freq_amp(action_log):
    """Translate a sequence of actions into amplitude and frequency series.

    Returns ``(amp, freq)`` -- note the order -- as two lists with one entry
    per recognised action (0-6). Unrecognised actions contribute nothing,
    so the lists can be shorter than ``action_log``.
    """
    # action -> (frequency, amplitude)
    lookup = {
        0: (0, 0),
        1: (500, 1),
        2: (250, 1),
        3: (500, 2),
        4: (250, 2),
        5: (500, 5),
        6: (250, 5),
    }

    recognised = [lookup[action] for action in action_log if action in lookup]
    freq = [frequency for frequency, _ in recognised]
    amp = [amplitude for _, amplitude in recognised]

    return (amp, freq)

In [6]:
def extract_features(raw_time, raw_data):
    """Summarise one frame of raw data along its first axis.

    Takes the minimum, maximum and mean over axis 0 of ``raw_data`` and keeps
    index 0 of each result. ``raw_time`` is accepted but not used.

    Returns ``(min, max, range, mean)`` with ``range = max - min``.
    """
    samples = raw_data[:]

    lowest = np.min(samples, axis=0)[0]
    highest = np.max(samples, axis=0)[0]
    mean = np.mean(samples, axis=0)[0]

    return (lowest, highest, highest - lowest, mean)