In [1]:
import gym
import tensorflow as tf
from tensorflow import keras
import random
import numpy as np
import datetime as dt
import math
import scipy as sp

In [2]:
def do(f, n):
    """Call ``f()`` ``n`` times and return the results as a NumPy array."""
    return np.array([f() for _ in range(n)])

def rrow(a):
    """Return one entry of ``a`` along its first axis, picked uniformly at random."""
    n_rows = np.shape(a)[0]
    return a[np.random.randint(0, n_rows)]

def rmult(r, n):
    """Draw ``n`` samples from an ``r``-dimensional normal with a random correlation matrix.

    Parameters:
        r: dimension of each sample (size of the random correlation matrix).
        n: number of samples to draw.

    Returns:
        A NumPy array holding the ``n`` draws, one per entry along the first
        axis. Every component has mean 100.0; the covariance is the random
        correlation matrix.

    Side effects:
        Re-seeds NumPy's global random generator with 514 *after* the
        eigenvalues have been drawn, so the eigenvalues depend on the caller's
        generator state while everything after the seed call is deterministic
        given those eigenvalues.
    """
    # Function-scope import: a bare ``import scipy as sp`` is not guaranteed
    # (on older SciPy releases) to have loaded the ``stats`` submodule.
    from scipy import stats

    eigs = np.random.uniform(0, 1, r)
    # random_correlation needs eigenvalues that sum to the dimension;
    # dividing by the mean rescales them so that their sum is exactly r.
    eigs = eigs / np.mean(eigs)
    np.random.seed(514)
    corr = stats.random_correlation.rvs(eigs)

    # The mean vector must have one entry per dimension, i.e. length r
    # (the previous code multiplied by an undefined name).
    mean = [100.0] * r
    return np.array(
        [stats.multivariate_normal.rvs(mean=mean, cov=corr) for _ in range(n)]
    )

def irange(a, b):
    """Return ``np.arange`` from ``a`` up to and including ``b``."""
    stop = b + 1
    return np.arange(a, stop)

In [30]:
# R-type variable ids: the integers 0..30 inclusive.
Rts = irange(0, 30)
# F-type variable ids.
Fts = [37, 52, 73, 84, 90, 109, 128, 140, 152, 155]
# Same ids with 30 prepended, so consecutive pairs bracket each gap.
Fts2 = np.concatenate(([30], Fts))
# For every F id: the ids lying strictly between the previous boundary and it.
Fvs = [irange(lo + 1, hi - 1) for lo, hi in zip(Fts2[:-1], Fts2[1:])]
# F id -> the ids in its gap, and F id -> its position within Fts.
Ft_v = dict(zip(Fts, Fvs))
Ft_i = {ft: pos for pos, ft in enumerate(Fts)}

# Action index -> variable id (R ids first, then F ids).
actions = np.concatenate((Rts, Fts))
# State slot -> variable id (F ids first, then all gap ids).
states = np.concatenate((Fts, np.concatenate(Fvs)))

# Variable id -> index of its first occurrence in `states`.
var_to_state = {var: np.where(states == var)[0][0] for var in states}

def action_to_choice(action):
    """Translate an action index into a ``(kind, value)`` pair.

    Looks up ``actions[action]``. If that id is not in ``Rts`` the result is
    ``("explore", position of the id in Fts)``; otherwise it is
    ``("exploit", the id itself)``.
    """
    target = actions[action]
    if target not in Rts:
        return "explore", Ft_i[target]
    return "exploit", target
    
class SAT_env:
    """Episodic environment built on rows of a data table.

    Each episode draws one row of ``da`` (via ``rrow``) as the hidden
    ``truth``. The observation is a vector with one slot per entry of the
    module-level ``states`` table; slots whose value has not been revealed
    hold ``self.unknown``.

    Attributes:
        da: the data table passed to the constructor.
        unknown: sentinel stored in unrevealed observation slots (-100.0).
        n: number of actions (``len(actions)``).
        s: observation length (``len(states)``).
        state: current observation (set by ``reset``).
        truth: row drawn for the current episode (set by ``reset``).
    """

    def __init__(self, da):
        self.da = da
        self.unknown = -100.0
        self.n = len(actions)
        self.s = len(states)

    def reset(self):
        """Start a new episode and return the all-unknown observation."""
        self.state = np.full(len(states), self.unknown)
        self.truth = rrow(self.da)
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        An "exploit" action ends the episode; its reward is the negated truth
        value at the chosen R id. An "explore" action reveals the truth values
        of the chosen F id and of every id in its gap (``Ft_v``); its reward
        is the negated truth value at the F id and the episode continues.
        """
        choice_type, choice_i = action_to_choice(action)

        if choice_type == "exploit":
            Rt = Rts[choice_i]
            return self.state, -self.truth[Rt], True

        Ft = Fts[choice_i]
        # Work on a fresh array: observations handed out earlier (by reset or
        # a previous step) must not change when new values are revealed,
        # otherwise callers that keep them see them silently overwritten.
        self.state = self.state.copy()
        for F in np.concatenate(([Ft], Ft_v[Ft])):
            self.state[var_to_state[F]] = self.truth[F]
        return self.state, -self.truth[Ft], False

    # NOTE(review): the four helpers below are not used by step(). Two of them
    # read self.num_r / self.num_f, which __init__ does not set, so calling
    # them raises AttributeError. They look like leftovers from an earlier
    # action layout -- confirm before relying on them.
    def solver_time(self, action):
        return self.truth[action]

    def choose_solver(self, action):
        return action < self.num_r

    def feature_value(self, action):
        return self.truth[action]

    def feature_time(self, action):
        return self.truth[action + self.num_f]

In [33]:
# Indices into `actions` whose variable id belongs to Rts.
iss = [idx for idx, var in enumerate(actions) if var in Rts]

In [29]:
# Multiplier applied to the Q-values of actions whose id is in Rts when dqn's
# choose_action ranks candidate actions; changed at runtime through setexcoef.
excoef = 1
#adapted with modifications from https://adventuresinmachinelearning.com/double-q-reinforcement-learning-in-tensorflow-2/
def dqn(env):
    """Train a Q-network on ``env`` and return a greedy policy function.

    Parameters:
        env: object exposing ``s`` (observation length), ``n`` (number of
            actions), ``unknown`` (sentinel for unrevealed slots), ``reset()``
            returning an observation, and ``step(action)`` returning
            ``(observation, reward, done)``.

    Returns:
        ``f(Ks, ks)``: builds an observation in which the slot of each variable
        id in ``Ks`` holds the matching value from ``ks`` (all other slots are
        ``env.unknown``), picks the greedy action of the trained network and
        returns ``action_to_choice(action)``.

    Side effects:
        Prints one line per episode and writes 'reward' / 'avg loss' scalars
        through a TensorFlow summary writer.
    """
    STORE_PATH = ''
    MAX_EPSILON = 1
    MIN_EPSILON = 0.01
    LAMBDA = 0.0005
    GAMMA = 0.95
    BATCH_SIZE = 32
    TAU = 0.08

    state_size = env.s
    num_actions = env.n

    def build_network():
        # Two hidden ReLU layers of 30 units, one linear output per action.
        return keras.Sequential([
            keras.layers.Dense(30, activation='relu', kernel_initializer=keras.initializers.he_normal()),
            keras.layers.Dense(30, activation='relu', kernel_initializer=keras.initializers.he_normal()),
            keras.layers.Dense(num_actions)
        ])

    primary_network = build_network()
    target_network = build_network()

    primary_network.compile(optimizer=keras.optimizers.Adam(), loss='mse')

    class Memory:
        """Bounded FIFO replay buffer of (state, action, reward, next_state)."""

        def __init__(self, max_memory):
            self._max_memory = max_memory
            self._samples = []

        def add_sample(self, sample):
            self._samples.append(sample)
            if len(self._samples) > self._max_memory:
                self._samples.pop(0)

        def sample(self, no_samples):
            # Never ask random.sample for more items than are stored.
            return random.sample(self._samples, min(no_samples, len(self._samples)))

        @property
        def num_samples(self):
            return len(self._samples)

    memory = Memory(500000)
    min_explore = 0

    # Per-action facts that do not change during training, computed once
    # instead of on every call to choose_action.
    is_exploit = np.isin(actions, Rts)
    exploit_idx = np.flatnonzero(is_exploit)
    # Observation slot of each explore action's variable (None for exploit).
    feature_slot = [None if ex else var_to_state[a] for a, ex in zip(actions, is_exploit)]

    def choose_action(state, primary_network, eps):
        """Epsilon-greedy choice restricted to currently admissible actions.

        Admissible actions are the exploit actions (once at least
        ``min_explore`` explore variables are known) and the explore actions
        whose variable is still unknown in ``state``.
        """
        allqs = np.array(primary_network(state.reshape(1, -1)))[0]

        known = sum(
            1 for ex, slot in zip(is_exploit, feature_slot)
            if not ex and state[slot] != env.unknown
        )

        choices_i = []
        qs = []
        for i, (ex, slot) in enumerate(zip(is_exploit, feature_slot)):
            if ex:
                if known >= min_explore:
                    choices_i.append(i)
                    # excoef is read from module scope on every call.
                    qs.append(excoef * allqs[i])
            elif state[slot] == env.unknown:
                choices_i.append(i)
                qs.append(allqs[i])

        if random.random() < eps:
            # Random move: half the time an admissible explore action (if any
            # remain), otherwise a uniformly random exploit action.
            exs = np.setdiff1d(choices_i, exploit_idx)
            if random.random() < 0.5 and len(exs) != 0:
                ret = np.random.choice(exs, 1)[0]
            else:
                ret = np.random.choice(exploit_idx, 1)[0]
        else:
            ret = choices_i[np.argmax(qs)]

        return int(ret)

    def train(primary_network, memory, target_network=None):
        """Run one gradient step on a sampled batch; return the loss (0 if skipped)."""
        if memory.num_samples < BATCH_SIZE * 3:
            return 0
        batch = memory.sample(BATCH_SIZE)
        batch_states = np.array([val[0] for val in batch])
        batch_actions = np.array([val[1] for val in batch])
        # Force a float array: with integer rewards an in-place add of the
        # discounted value would be truncated.
        updates = np.array([val[2] for val in batch], dtype=np.float64)
        # Terminal transitions are the ones stored with next_state None.
        # Testing "next state sums to zero" instead would misclassify a real
        # state that happens to sum to zero.
        non_terminal = np.array([val[3] is not None for val in batch])
        next_states = np.array([(np.zeros(state_size)
                                 if val[3] is None else val[3]) for val in batch])
        # predict Q(s,a) given the batch of states
        prim_qt = primary_network(batch_states)
        # predict Q(s',a') from the primary network
        prim_qtp1 = primary_network(next_states).numpy()
        # start from the current predictions; only the taken action's entry
        # is replaced by the TD target below
        target_q = prim_qt.numpy()
        batch_idxs = np.arange(len(batch))
        if target_network is None:
            updates[non_terminal] += GAMMA * np.amax(prim_qtp1[non_terminal, :], axis=1)
        else:
            # Double-Q: primary network picks the action, target network scores it.
            prim_action_tp1 = np.argmax(prim_qtp1, axis=1)
            q_from_target = target_network(next_states).numpy()
            updates[non_terminal] += GAMMA * q_from_target[
                batch_idxs[non_terminal], prim_action_tp1[non_terminal]]
        target_q[batch_idxs, batch_actions] = updates
        loss = primary_network.train_on_batch(batch_states, target_q)
        if target_network is not None:
            # update target network parameters slowly from primary network
            for t, e in zip(target_network.trainable_variables, primary_network.trainable_variables):
                t.assign(t * (1 - TAU) + e * TAU)
        return loss

    num_episodes = 1000
    eps = MAX_EPSILON
    # NOTE(review): with STORE_PATH empty this resolves to "/DoubleQ_<stamp>",
    # i.e. a directory at the filesystem root -- confirm that is intended.
    train_writer = tf.summary.create_file_writer(STORE_PATH + f"/DoubleQ_{dt.datetime.now().strftime('%d%m%Y%H%M')}")
    double_q = False
    steps = 0
    for i in range(num_episodes):
        # Keep a private copy: the environment may keep writing into the array
        # it returned, which would corrupt transitions already in memory.
        state = np.copy(env.reset())

        cnt = 1
        avg_loss = 0
        t_reward = 0
        while True:
            action = choose_action(state, primary_network, eps)

            next_state, reward, done = env.step(action)

            t_reward += reward
            # Terminal transitions are stored with next_state None; otherwise
            # snapshot the observation for the same reason as above.
            next_state = None if done else np.copy(next_state)
            # store in memory
            memory.add_sample((state, action, reward, next_state))

            loss = train(primary_network, memory, target_network if double_q else None)
            avg_loss += loss

            state = next_state

            # exponentially decay the eps value
            steps += 1
            eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * steps)

            if done:
                avg_loss /= cnt
                print(f"Episode: {i}, Reward: {t_reward}, avg loss: {avg_loss:.3f}, eps: {eps:.3f}")
                with train_writer.as_default():
                    tf.summary.scalar('reward', t_reward, step=i)
                    tf.summary.scalar('avg loss', avg_loss, step=i)
                break

            cnt += 1

    def Ksks_to_state(Ks, ks):
        """Build an observation with the given variable ids set to the given values."""
        state = np.full(env.s, env.unknown)
        for Ksv, ksv in zip(Ks, ks):
            state[var_to_state[int(Ksv)]] = ksv
        return state

    def f(Ks, ks):
        """Greedy policy: known variables in, ``action_to_choice`` result out."""
        k_state = Ksks_to_state(Ks, ks)
        action = choose_action(k_state, primary_network, 0)
        return action_to_choice(action)

    return f

In [11]:
def ddqn_get_model(data):
    """Wrap ``data`` in a ``SAT_env`` and return whatever ``dqn`` returns for it."""
    return dqn(SAT_env(data))

In [None]:
def setexcoef(v):
    """Rebind the module-level ``excoef`` to ``v``."""
    global excoef
    excoef = v

In [None]:
def getexcoef():
    """Return the current value of the module-level ``excoef``."""
    return excoef