In [1]:
import numpy as np
import copy
import datetime

In [2]:
def Sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)); numpy applies it element-wise to arrays."""
    return 1 / (1 + np.exp(-x))

def d_Sigmoid(x):
    """Derivative of the logistic sigmoid at x: Sigmoid(x) * (1 - Sigmoid(x))."""
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    sig = Sigmoid(x)
    return sig * (1 - sig)

def ReLU(x):
    """Rectified linear unit: element-wise maximum of 0 and x."""
    return np.maximum(0, x)
    
def d_ReLU(x):
    """ReLU derivative: 0 where x <= 0, 1 where x > 0 (computed on a copy of x).

    The result keeps the dtype of x; the input itself is not modified.
    """
    grad = x.copy()
    # Same two masked writes as before, in the same order: first zero the
    # non-positive entries, then set whatever is still positive to one.
    np.putmask(grad, grad <= 0, 0)
    np.putmask(grad, grad > 0, 1)
    return grad


def Linear(x):
    """Identity activation: hands the input back unchanged (same object)."""
    return x

def d_Linear(x):
    """Derivative of the identity activation: an array of ones with x's shape and dtype."""
    return np.ones(x.shape, x.dtype)

# Lookup table: activation name -> [function, derivative of that function].
activations_dict = dict(
    Sigmoid=[Sigmoid, d_Sigmoid],
    ReLU=[ReLU, d_ReLU],
    Linear=[Linear, d_Linear],
)


## input layer

In [3]:

class input_layer:
    """Pass-through first layer of a network; it only tracks how many inputs it has.

    It exposes the same structural-edit and forward methods as a regular
    layer, but holds no weights: most of those methods only adjust `size`
    or do nothing.
    """

    def __init__(s, size):
        s.size = size

    def add_neuron(s):
        """Grow the layer by one input."""
        s.size += 1

    def delete_neuron(s, neuron_number):
        """Shrink the layer by one input; `neuron_number` is not used here."""
        s.size -= 1

    def delete_new_prev_size(s):
        """No-op: this layer has no incoming weights to shrink."""
        return

    def add_new_prev_size(s):
        """No-op: this layer has no incoming weights to grow."""
        return

    def print_info(s):
        """Print the layer kind and its size."""
        print("IN LAYER\nsize: ", s.size)

    def print_pic(s):
        """Print a small ASCII sketch of the layer (at most two inputs are drawn)."""
        shown = min(2, s.size)

        # Three rows: incoming wires, arrow heads, then the nodes themselves.
        print("| |\t" * shown)
        print(" v \t" * shown)
        print(' @\t' * shown, end='')
        print("--", format(s.size, ' 5d'), "--\t", end='')

    def forward(s, x, to_print = False):
        """Remember the input in `values` and return it unchanged."""
        s.values = x
        return x

    def forward_nu(s, x):
        """Return the input unchanged without storing it."""
        return x

    def get_info(s):
        """Return the number of inputs."""
        return s.size

## layers

In [4]:
class layer:
    """Fully connected layer with a selectable activation and its own optimizer state.

    Attributes set by the constructor:
        w   -- weight matrix of shape (size, prev_size)
        b   -- bias column of shape (size, 1)
        lr  -- learning rate used by update_weights
        Vdw, Vdb, Sdw, Sdb, t -- moment accumulators and step counter used by
               the momentum / RMSProp / Adam updates
        epsilon -- small constant added to denominators in RMSProp / Adam

    Samples are laid out column-wise: forward() computes w @ x + b and
    update_weights() averages over x.shape[1] columns.
    """

    def __init__(s, lr = 0.1, prev_size = 2, my_size=2, activation_type = "Sigmoid", weights = None, bias = None):
        """Create the layer.

        weights / bias: optional initial values; they are copied and reshaped
        to (my_size, prev_size) and (my_size, 1). When omitted, values are
        drawn from np.random.random (weights first, then bias).
        Raises ValueError when activation_type is not a key of activations_dict.
        """
        s.lr = lr
        s.size = my_size
        s.prev_size = prev_size

        if weights is None:
            s.w = np.random.random((s.size, prev_size))
        else:
            s.w = np.array(weights).reshape((s.size, prev_size))

        if bias is None:
            s.b = np.random.random((s.size, 1))
        else:
            s.b = np.array(bias).reshape((s.size, 1))

        s.activation_type = activation_type
        funcs = activations_dict.get(activation_type)
        if funcs is None:
            raise ValueError(
                "unknown activation type {!r}, available are: {}".format(
                    activation_type, ", ".join(activations_dict)))
        s.activation_f = funcs[0]
        s.d_activation_f = funcs[1]

        s.epsilon = 1e-8
        s.optimizer_reset()

    def optimizer_reset(s):
        """Zero the moment accumulators and restart the Adam step counter at 1."""
        s.Vdw = np.zeros(shape=(s.size, s.prev_size))
        s.Vdb = np.zeros(shape=(s.size, 1))

        s.Sdw = np.zeros(shape=(s.size, s.prev_size))
        s.Sdb = np.zeros(shape=(s.size, 1))
        s.t = 1

    def activate(s, x):
        """Apply the layer's activation function."""
        return s.activation_f(x)

    def d_activate(s, x):
        """Apply the derivative of the layer's activation function."""
        return s.d_activation_f(x)

    def forward(s, x, to_print = False):
        """Compute a = activation(w @ x + b), caching x, z and a for backprop."""
        s.x = np.asarray(x)
        s.z = np.dot(s.w, s.x) + s.b

        if (to_print):
            print('wT * x + b', s.z)

        s.a = s.activate(s.z)

        if (to_print):
            print('s.a ',s.a)

        return s.a

    def backprop(s, da):
        """Store dz = da * activation'(z) and return the error for the previous layer (w.T @ dz)."""
        s.dz = da * s.d_activate(s.z)
        s.da_ = np.dot(s.w.T, s.dz)
        return s.da_

    def update_weights(s, optimizer = "SGD", beta1 = 0.9, beta2 = 0.999):
        """Apply one parameter update using the dz / x cached by backprop / forward.

        optimizer: one of "SGD", "SGDwM", "RMSProp", "Adam"; any other value
        prints a message and leaves the layer untouched.
        """
        if optimizer not in ("SGD", "SGDwM", "RMSProp", "Adam"):
            print("NO SUCH OPTIMIZER!")
            return

        # Gradients averaged over the m columns (samples) of the cached input.
        m = s.x.shape[1]
        s.dw = (1/m)*np.dot(s.dz, s.x.T)
        s.db = (1/m)*np.sum(s.dz, axis = 1, keepdims = True)

        if optimizer in ("SGDwM", "Adam"):
            # first moment: exponential average of the gradient
            s.Vdw = beta1 * s.Vdw + (1 - beta1)*s.dw
            s.Vdb = beta1 * s.Vdb + (1 - beta1)*s.db

        if optimizer in ("RMSProp", "Adam"):
            # second moment: exponential average of the squared gradient
            s.Sdw = beta2 * s.Sdw + (1-beta2) * np.square(s.dw)
            s.Sdb = beta2 * s.Sdb + (1-beta2) * np.square(s.db)

        if optimizer == "SGD":
            s.w = s.w - s.lr * s.dw
            s.b = s.b - s.lr * s.db

        elif optimizer == "SGDwM":
            s.w = s.w - s.lr * s.Vdw
            s.b = s.b - s.lr * s.Vdb

        elif optimizer == "RMSProp":
            # epsilon keeps the step finite when a gradient entry is exactly 0
            # (previously 0 / sqrt(0) produced NaN weights).
            s.w = s.w - s.lr * s.dw / (np.sqrt(s.Sdw) + s.epsilon)
            s.b = s.b - s.lr * s.db / (np.sqrt(s.Sdb) + s.epsilon)

        else:  # Adam
            # bias-corrected moments
            s.Vdw_ = s.Vdw / (1 - beta1**s.t)
            s.Vdb_ = s.Vdb / (1 - beta1**s.t)
            s.Sdw_ = s.Sdw / (1 - beta2**s.t)
            s.Sdb_ = s.Sdb / (1 - beta2**s.t)

            s.w = s.w - s.lr * s.Vdw_ / (np.sqrt(s.Sdw_) + s.epsilon)
            s.b = s.b - s.lr * s.Vdb_ / (np.sqrt(s.Sdb_)  + s.epsilon)
            s.t += 1

    def print_info(s):
        """Print sizes, weights and bias."""
        print("my size: ", s.size)
        print("prev size: ", s.prev_size)
        print("w: ", s.w.shape, s.w, "\n")
        print("b: ", s.b.shape, s.b, "\n")

    def print_pic(s):
        """Print a small ASCII sketch of the layer (at most two neurons are drawn)."""
        print_size = min(2, s.size)
        print("\n╻...\nv...")
        for i in range(print_size):
            print('O\t', end='')
        print ("--", format(s.size, ' 5d') , "--\t", end='')

    def correct_prev_size(s, new_prev_szie):
        """Grow or shrink the input dimension one column at a time until it equals new_prev_szie."""
        dif = new_prev_szie - s.prev_size
        if dif > 0: # new prev is greater
            for i in range(dif):
                s.add_new_prev_size()
        elif dif < 0:
            for i in range(-dif):
                s.delete_new_prev_size()
        s.prev_size = new_prev_szie

    # The structural edits below resize the optimizer accumulators together
    # with w / b; otherwise the next SGDwM / RMSProp / Adam update would fail
    # on mismatched shapes unless optimizer_reset() was called by hand.

    def delete_neuron(s, neuron_number):
        """Remove output neuron `neuron_number` (its row of w, b and optimizer state)."""
        s.w = np.delete(s.w, neuron_number, axis = 0)
        s.b = np.delete(s.b, neuron_number, axis = 0)
        s.Vdw = np.delete(s.Vdw, neuron_number, axis = 0)
        s.Sdw = np.delete(s.Sdw, neuron_number, axis = 0)
        s.Vdb = np.delete(s.Vdb, neuron_number, axis = 0)
        s.Sdb = np.delete(s.Sdb, neuron_number, axis = 0)
        s.size-=1

    def delete_new_prev_size(s):
        """Remove the first input column of w (and of the weight accumulators)."""
        s.w = np.delete(s.w, 0, axis = 1)
        s.Vdw = np.delete(s.Vdw, 0, axis = 1)
        s.Sdw = np.delete(s.Sdw, 0, axis = 1)
        s.prev_size -=1

    def add_neuron(s):
        """Append one output neuron whose weights and bias start at 0.1."""
        s.w = np.concatenate((s.w, np.full((1, s.prev_size), 0.1)), axis = 0)
        s.b = np.concatenate((s.b, np.full((1, 1), 0.1)))
        # the new neuron starts with empty optimizer history
        s.Vdw = np.concatenate((s.Vdw, np.zeros((1, s.prev_size))), axis = 0)
        s.Sdw = np.concatenate((s.Sdw, np.zeros((1, s.prev_size))), axis = 0)
        s.Vdb = np.concatenate((s.Vdb, np.zeros((1, 1))))
        s.Sdb = np.concatenate((s.Sdb, np.zeros((1, 1))))
        s.size+=1

    def add_new_prev_size(s):
        """Append one input column to w, initialised to 0.1."""
        s.w = np.concatenate((s.w, np.full((s.size, 1), 0.1)), axis = 1)
        s.Vdw = np.concatenate((s.Vdw, np.zeros((s.size, 1))), axis = 1)
        s.Sdw = np.concatenate((s.Sdw, np.zeros((s.size, 1))), axis = 1)
        s.prev_size += 1

    def get_info(s):
        """Return (prev_size, size, w, b, activation_type, lr)."""
        return s.prev_size, s.size, s.w, s.b, s.activation_type, s.lr


        
        


## plastic nn

In [5]:

class plastic_nn:
    """Feed-forward network whose layers and neurons can be added or removed after creation.

    `layers[0]` is treated as the input layer: back-propagation, weight
    updates and optimizer resets skip it, and save()/load() store only its
    size. Every later entry is expected to behave like `layer`.
    """

    def __init__(s, optimizer = "SGD", beta = 0.9):
        """Create an empty network.

        optimizer: one of 'SGD', 'SGDwM', 'RMSProp', 'Adam'.
        beta: forwarded to each layer's update_weights as beta1.
        Raises ValueError for an unknown optimizer (previously the constructor
        printed a message and returned a half-initialised object).
        """
        opt_list = ['SGD', 'SGDwM', 'RMSProp', 'Adam']
        if optimizer not in opt_list:
            raise ValueError(
                "no such optimizer: {!r}, available are: {}".format(
                    optimizer, ", ".join(opt_list)))

        s.layers = []
        s.n_of_layers = 0
        s.name = 'noname'
        s.optimizer = optimizer
        s.beta = beta

    def give_name(s, name):
        """Set the network name (written by save())."""
        s.name = name

    def set_num_of_layers(s, num):
        """Overwrite the stored layer count."""
        s.n_of_layers = num

    def deep_copy(s):
        """Return an independent deep copy of the whole network."""
        return copy.deepcopy(s)

    def forward(s, x, to_print = False):
        """Run x through every layer in order; the result is kept in last_result."""
        for lay in s.layers:
            x = lay.forward(x, to_print)
        s.last_result = x
        return s.last_result

    def forward_print(s, x, to_print = False):
        """Like forward(), but also prints the input and every layer's output."""
        # was print('in: ', data) -- `data` is not defined in this scope
        print('in: ', x)
        for cnt, lay in enumerate(s.layers):
            x = lay.forward(x, to_print)
            print(cnt, ' ', x)
        s.last_result = x
        return s.last_result

    def backprop(s, correct):
        """Back-propagate the error (last_result - correct) through all non-input layers."""
        s.backprop_error(s.last_result - correct)  # a - y

    def backprop_error(s, err):
        """Back-propagate an already computed output error through all non-input layers."""
        for lay in reversed(s.layers[1:]):
            err = lay.backprop(err)

    def update(s):
        """Apply one optimizer step to every non-input layer."""
        for lay in reversed(s.layers[1:]):
            lay.update_weights(s.optimizer, s.beta)

    def learn_one(s, in_data, target_data):
        """One training step: forward pass, back-propagation, weight update."""
        s.forward(in_data)
        s.backprop(target_data)
        s.update()

    def append_one(s, new_layer, check = False):
        """Append a layer; with check=True refuse it if its prev_size does not match the last layer."""
        if check and s.n_of_layers != 0:
            last_layer_size = s.layers[-1].size
            if last_layer_size != new_layer.prev_size:
                print("size not match, layer ", s.n_of_layers)
                return
        s.layers.append(new_layer)
        s.n_of_layers += 1
        return

    def check_layers_sizes(s, check_layers):
        """Return True when each layer's prev_size equals the size of the layer before it."""
        for i in range(1, len(check_layers)):
            if (check_layers[i-1].size != check_layers[i].prev_size):
                print("error between ", i-1, "and ", i)
                return False
        return True

    def append_layers(s, new_layers, to_print = False):
        """Append several layers after validating the size chain; returns True on success."""
        # Validate against the current last layer (if any) plus the new ones.
        test_layers = [s.layers[-1]] if s.n_of_layers != 0 else []
        test_layers.extend(new_layers)

        if (s.check_layers_sizes(test_layers)):
            for lay in new_layers:
                s.append_one(lay)
            if (to_print):
                print("added LAYERS succesfully")
            return True
        else:
            if (to_print):
                print("ERROR adding layers, check info above")
            return False

    def add_layer_by_pos(s, pos, new_layer):
        """Insert new_layer at index pos (1..n_of_layers); pos == n_of_layers appends.

        The layer that ends up after the inserted one has its input dimension
        adjusted to the new layer's size.
        """
        if (pos <= 0 or pos > s.n_of_layers): # if input or more than 'to last'
            print("ERROR addning layer: invalid layer number!")
            if (pos == 0):
                print("input layer cannot be replaced by different layer")
            return

        if (pos == s.n_of_layers): # if add to the last
            # was s.append(new_layer): plastic_nn has no such method
            s.append_one(new_layer, check = True)
            return

        if (new_layer.prev_size != s.layers[pos-1].size):
            print("ERROR addning layer: invalid prev_size!")
            return

        s.layers.insert(pos, new_layer)

        # update next layer prev_size and w matrix
        next_lay = s.layers[pos+1]
        next_lay.correct_prev_size(new_layer.size)
        s.n_of_layers += 1

    def delete_layer_by_pos(s, pos):
        """Remove the layer at index pos (never the input layer) and re-fit the following layer."""
        if (pos <= 0 or pos >= s.n_of_layers): # if input or more than 'to last'
            print("ERROR deleting layer: invalid layer number!")
            if (pos == 0):
                print("input layer cannot be deleted")
            return

        new_prev_size = s.layers[pos].prev_size
        if (pos != s.n_of_layers-1): #if not last
            next_lay = s.layers[pos+1]
            next_lay.correct_prev_size(new_prev_size)

        del s.layers[pos]
        s.n_of_layers -= 1

    def add_neuron(s, layer_number, n_of_neurons = 1):
        """Add n_of_neurons neurons to a layer and widen the next layer's input to match."""
        if (layer_number < 0 or layer_number >= s.n_of_layers):
            print("ERROR addning neuron: invalid layer number!")
            return

        main_lay = s.layers[layer_number]

        for i in range(n_of_neurons):
            main_lay.add_neuron()
            if (layer_number+1 != s.n_of_layers): # if main is not last
                # update next layer prev_size and w matrix
                next_lay = s.layers[layer_number+1]
                next_lay.add_new_prev_size()

    @staticmethod
    def _move_input_to_front(lay, col):
        """Reorder lay's input columns so that column `col` comes first."""
        n_cols = lay.w.shape[1]
        if col >= n_cols:
            return
        order = [col] + [i for i in range(n_cols) if i != col]
        # Keep per-weight optimizer accumulators (when present and in sync)
        # aligned with the reordered weight columns.
        for attr in ("w", "Vdw", "Sdw"):
            arr = getattr(lay, attr, None)
            if arr is not None and np.ndim(arr) == 2 and arr.shape[1] == n_cols:
                setattr(lay, attr, arr[:, order])

    def delete_neuron(s, layer_number, neuron_number):
        """Delete one neuron from a layer and the matching input column of the next layer."""
        if (layer_number < 0 or layer_number >= s.n_of_layers):
            print("ERROR deleting neuron: invalid layer number!")
            return

        main_lay = s.layers[layer_number]

        if neuron_number < 0:
            # accept python-style negative indices, as np.delete does
            neuron_number += main_lay.size
        if (neuron_number < 0 or neuron_number >= main_lay.size):
            print("ERROR deleting neuron: invalid neuron number!")
            return

        main_lay.delete_neuron(neuron_number)
        if (layer_number+1 != s.n_of_layers): # if main is not last
            next_lay = s.layers[layer_number+1]
            # delete_new_prev_size() always drops input column 0, so bring the
            # removed neuron's column to the front first; before this fix the
            # wrong column was removed whenever neuron_number != 0.
            s._move_input_to_front(next_lay, neuron_number)
            next_lay.delete_new_prev_size()

    def print_info(s):
        """Print the name, layer count and every layer's details."""
        print('NAME: ', s.name, ' (', s.n_of_layers, ')')
        for cnt in range(s.n_of_layers):
            print("#", cnt)
            s.layers[cnt].print_info()
            print("")

    def print_pic(s):
        """Print an ASCII sketch of the whole network."""
        print('NAME: ', s.name, ' (', s.n_of_layers, ')')
        for cnt, lay in enumerate(s.layers):
            lay.print_pic()
            print("#", cnt, end='')
        print("\nOUT |#|\nOUT  v")

    def optimizer_reset(s):
        """Reset the optimizer state of every non-input layer."""
        for lay in reversed(s.layers[1:]):
            lay.optimizer_reset()

    def save(s, file_path):
        """Write the network to a text file (overwriting it).

        Layout: name, layer count, input size, then per layer prev_size, size,
        weights, bias, activation name and learning rate, followed by a
        timestamp line.
        """
        # One context-managed handle instead of truncate-then-append, so the
        # file is closed even if writing fails part-way.
        with open(file_path, "w") as f:
            f.write("{}\n{}\n".format(s.name, s.n_of_layers))
            input_layer_size = s.layers[0].get_info()
            f.write("{}\n".format(input_layer_size))

            for lay in s.layers[1:]:
                prev_size, size, weights, bias, activation_type, lr = lay.get_info()
                f.write("{}\n".format(prev_size))
                f.write("{}\n".format(size))

                np.savetxt(f, weights)
                np.savetxt(f, bias)
                f.write("{}\n".format(activation_type))
                f.write("{}\n".format(lr))

            ct = datetime.datetime.now()
            f.write("\nct {}\n".format(ct))

    def load(s, file_path):
        """Replace the current layers with the network stored by save().

        Only the first whitespace-separated token of the name line is used.
        """
        s.layers = []
        s.n_of_layers = 0
        layers = []

        with open(file_path, "r") as f:
            name = f.readline().split()[0]
            total_n_of_layers = int(f.readline().split()[0])
            input_layer_size = int(f.readline().split()[0])

            layers.append(input_layer(input_layer_size))
            s.give_name(name)

            for i in range(total_n_of_layers-1):
                prev_size = int(f.readline().split()[0])
                size = int(f.readline().split()[0])
                weights = np.loadtxt(f, max_rows = size)
                bias = np.loadtxt(f, max_rows = size)
                activation_type = f.readline().split()[0]
                lr = float(f.readline().split()[0])

                layers.append(layer(lr = lr, prev_size = prev_size, my_size = size,
                                    activation_type = activation_type, weights = weights, bias = bias))

        s.append_layers(layers)






## Test


In [6]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import random
import torch
from torch import nn
import torch.nn.functional as F
import os
import datetime



In [7]:
# learning_rate_a = 0.0001
# in_states = 3
# h1_nodes = 7
# h2_nodes = 7
# out_actions = 5

# a_type1 = 'ReLU'
# a_type2 = 'Linear'

# layers_net = [input_layer(in_states), 
# layer(lr = learning_rate_a, prev_size = in_states, my_size=h1_nodes, activation_type=a_type1), 
# #layer(lr = learning_rate_a, prev_size = h1_nodes, my_size=h2_nodes, activation_type=a_type1), 
# layer(lr = learning_rate_a, prev_size = h2_nodes, my_size=out_actions, activation_type=a_type2)]

# target_dqn = plastic_nn(optimizer="Adam")
# target_dqn.append_layers(layers_net)

# target_dqn.print_info()

In [8]:
# add_layer = layer(lr = learning_rate_a, prev_size = h2_nodes, my_size=out_actions, activation_type=a_type2)
# target_dqn.delete_layer_by_pos(1)
# target_dqn.print_info()

In [9]:

# Define memory for Experience Replay
class ReplayMemory():
    """Bounded FIFO store of transitions for experience replay."""

    def __init__(self, maxlen):
        # A deque with maxlen drops its oldest entry once it is full.
        self.memory = deque(maxlen=maxlen)

    def append(self, transition):
        """Store one transition."""
        self.memory.append(transition)

    def sample(self, sample_size):
        """Return sample_size distinct stored transitions chosen at random."""
        return random.sample(self.memory, sample_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)




class DQN():
    """DQN training / evaluation harness around a gymnasium environment.

    Networks are passed in by the caller and only need forward(), learn_one(),
    save() and deep_copy(). Observations are discretised with np.digitize
    into `num_divisions` bins per dimension before being fed to the network.
    All artefacts (logs, plots, saved policies) go under `<ct>/<tag>/`
    relative to the current working directory.
    """

    # Hyperparameters (adjustable)
    def __init__(s, ct = 0, tag='tag', path = r'test',
                 game_name = 'MountainCar-v0',

                 mini_batch_size = 32,  num_divisions = 1,
                 replay_memory_size = 100000,
                 network_sync_rate = 50000, discount_factor_g = 0.9):
        """Store hyperparameters and create the output directories.

        ct: run identifier used as the top-level directory name; 0 means
            "use the current time as YYYY-MM-DD_HH-MM-SS".
        tag: sub-directory / file-name tag for this experiment.
        path: accepted for compatibility; not used.
        """
        if (ct == 0):
            # strftime instead of slicing str(now): the slice cut real digits
            # whenever the microsecond field happened to be 0.
            ct = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

        s.ct = ct
        final_directory_pics = os.path.join(os.getcwd(), s.ct, 'pics')
        os.makedirs(final_directory_pics, exist_ok=True)

        s.set_tag(tag)

        s.game_name = game_name
        s.discount_factor_g = discount_factor_g

        s.mini_batch_size = mini_batch_size
        s.num_divisions = num_divisions

        s.replay_memory_size = replay_memory_size
        s.network_sync_rate = network_sync_rate

    def set_tag(s, tag):
        """Switch to experiment `tag`: sets s.path to '<ct>/<tag>' and creates that directory."""
        s.tag = tag
        s.path = s.ct+'/'+s.tag
        os.makedirs(os.path.join(os.getcwd(), s.path), exist_ok=True)

    def plot_progress(self, rewards_per_episode_, epsilon_history_):
        """Save reward and epsilon curves as PNG files in the experiment directory."""
        plt.figure()
        plt.xlabel('epochs')
        plt.ylabel('reward')
        plt.plot(rewards_per_episode_)
        plt.savefig(f'{self.path}/info_rew_{self.tag}.png')
        plt.close()

        plt.figure()
        plt.xlabel('epochs')
        plt.ylabel('epsilon')
        plt.plot(epsilon_history_)
        plt.savefig(f'{self.path}/info_eps_{self.tag}.png')
        plt.close()

    def train(self, policy_dqn, target_dqn, episodes, render=False):
        """Train policy_dqn with epsilon-greedy exploration and experience replay.

        Runs episodes + 1 episodes. An episode ends when the environment
        reports termination or the accumulated reward drops to -1000.
        Optimisation (one mini-batch per episode) and epsilon decay start only
        after the goal has been reached at least once. The policy is saved
        every 1000 episodes, whenever a new best episode reward is seen, and
        at the end; the per-episode rewards are appended to the rewards file.
        """
        env = gym.make(self.game_name, render_mode='human' if render else None)

        # Bin edges used by state_to_dqn_input to discretise the observation.
        self.pos_space = np.linspace(env.observation_space.low[0], env.observation_space.high[0], self.num_divisions)    # Between -1.2 and 0.6
        self.vel_space = np.linspace(env.observation_space.low[1], env.observation_space.high[1], self.num_divisions)    # Between -0.07 and 0.07

        epsilon = 1 # 1 = 100% random actions
        memory = ReplayMemory(self.replay_memory_size)

        rewards_per_episode = []
        epsilon_history = []

        # Track number of steps taken. Used for syncing policy => target network.
        step_count = 0
        goal_reached = False
        best_rewards = -200

        for i in range(episodes+1):
            state = env.reset()[0]
            terminated = False      # True once the environment reports termination
            rewards = 0

            # Truncation is deliberately ignored; the reward floor bounds the episode.
            while(not terminated and rewards > -1000):
                if random.random() < epsilon:
                    action = env.action_space.sample() # actions: 0=left,1=idle,2=right
                else:
                    res = policy_dqn.forward(self.state_to_dqn_input(state))
                    action = res.argmax().item()

                new_state,reward,terminated,truncated,_ = env.step(action)
                rewards += reward
                memory.append((state, action, new_state, reward, terminated))
                state = new_state

                step_count+=1

            rewards_per_episode.append(rewards)

            if(terminated):
                goal_reached = True

            # Graph training progress
            if(i!=0 and i%1000==0):
                print(f'Episode {i} Epsilon {epsilon}')
                policy_dqn.save(f'{self.path}/mc_policy_{i}')
                self.add_log_data(f'Episode {i} Epsilon {epsilon}')
                self.plot_progress(rewards_per_episode, epsilon_history)

            if rewards>best_rewards:
                best_rewards = rewards
                print(f'Best rewards so far: {best_rewards}')
                self.add_log_data(f'Best rewards so far: {best_rewards}')
                policy_dqn.save(f'{self.path}/mc_policy_{i}')
                policy_dqn.save(f'{self.path}/mc_policy_best')

            # Check if enough experience has been collected
            if len(memory)>self.mini_batch_size and goal_reached:
                mini_batch = memory.sample(self.mini_batch_size)
                self.optimize(mini_batch, policy_dqn, target_dqn)

                # Decay epsilon linearly, never below 0.01
                epsilon = max(epsilon - 1/episodes, 0.01)
                epsilon_history.append(epsilon)

                # Copy policy network to target network after a certain number of steps
                if step_count > self.network_sync_rate:
                    target_dqn = policy_dqn.deep_copy()
                    step_count = 0

        env.close()
        policy_dqn.save(f'{self.path}/mc_policy_last_{self.tag}')
        if (best_rewards == -200):
            # no episode beat the initial threshold, so no "best" file exists yet
            policy_dqn.save(f'{self.path}/mc_policy_best')
        self.save_reward_data(rewards_per_episode)

    def optimize(self, mini_batch, policy_dqn, target_dqn):
        """Build Q-learning targets for a mini-batch and run one learn_one() step on policy_dqn.

        For each transition the target vector is target_dqn's output for the
        state, with the taken action's entry replaced by
        reward (+ discount * max target_dqn(new_state) when not terminated).
        """
        target_q_list = []
        input_list = []

        for state, action, new_state, reward, terminated in mini_batch:
            if terminated:
                target = reward
            else:
                target = reward + self.discount_factor_g * target_dqn.forward(self.state_to_dqn_input(new_state)).max()

            # Get the target set of Q values
            state_dsc = np.asarray(self.state_to_dqn_input(state))
            input_list.append(state_dsc)

            target_q = target_dqn.forward(state_dsc)

            # Adjust the specific action to the target that was just calculated
            target_q[action] = target
            target_q_list.append(target_q)

        # Stack the (features, 1) columns into (features, batch) matrices.
        x = np.asarray(input_list)[:, :, 0].T
        y = np.asarray(target_q_list)[:, :, 0].T

        policy_dqn.learn_one(x, y)

    def state_to_dqn_input(self, state):
        """Discretise (state[0], state[1]) into bin indices and return them as a (2, 1) column."""
        state_p = np.digitize(state[0], self.pos_space)
        state_v = np.digitize(state[1], self.vel_space)

        return np.asarray([[state_p], [state_v]])

    def test(self, policy_dqn, episodes, render = False):
        """Run the greedy policy for `episodes` episodes.

        An episode stops on termination or when the accumulated reward drops
        to -500. Returns (success percentage, mean episode reward, list of
        episode rewards); with no episodes the first two are 0.0.
        """
        env = gym.make(self.game_name, render_mode='human' if render else None)

        self.pos_space = np.linspace(env.observation_space.low[0], env.observation_space.high[0], self.num_divisions)    # Between -1.2 and 0.6
        self.vel_space = np.linspace(env.observation_space.low[1], env.observation_space.high[1], self.num_divisions)    # Between -0.07 and 0.07

        done_count = 0
        medium_reward = 0
        reward_list = []

        for i in range(episodes):
            state, info = env.reset()
            done = False      # True once the environment reports termination
            rewards = 0

            while(not done and rewards > -500):
                state = self.state_to_dqn_input(state)
                res = policy_dqn.forward(state)
                action = res.argmax().item()

                state, reward, done, truncated, _ = env.step(action)
                rewards+=reward
                if (done):
                    done_count += 1
                    break

            medium_reward += rewards
            reward_list.append(rewards)

        env.close()
        if not reward_list:
            # nothing was run: avoid dividing by zero
            return 0.0, 0.0, reward_list
        medium_reward = medium_reward / episodes
        return done_count*100.0/episodes, medium_reward, reward_list

    def save_info(s, info):
        """Append the run's hyperparameters and a free-form `info` string to the info log."""
        file_path = f'{s.path}/info_{s.tag}.txt'
        with open(file_path, "a") as f:
            f.write("data {}\n".format(s.ct))
            f.write("tag {}\n".format(s.tag))
            f.write("game_name {}\n".format(s.game_name))
            f.write("reward discount factor {}\n".format(s.discount_factor_g))
            f.write("minibatch size {}\n".format(s.mini_batch_size))
            f.write("num divisions{}\n".format(s.num_divisions))
            f.write("replay memory size {}\n".format(s.replay_memory_size))
            f.write("network sync rate {}\n".format(s.network_sync_rate))
            f.write("info {}\n".format(info))

    def add_log_data(s, data):
        """Append one line to the info log."""
        file_path = f'{s.path}/info_{s.tag}.txt'
        with open(file_path, "a") as f:
            f.write("{}\n".format(data))

    def save_reward_data(s, data):
        """Append one line to the rewards file."""
        file_path = f'{s.path}/rewards_{s.tag}.txt'
        with open(file_path, "a") as f:
            f.write("{}\n".format(data))


In [17]:
# Network hyperparameters for the MountainCar experiment.
learning_rate_a = 0.001
in_states = 2
h1_nodes = 3
h2_nodes = 3
out_actions = 3

a_type1 = 'ReLU'
a_type2 = 'Linear'

# Input layer, two ReLU hidden layers, linear output layer.
layers_net = [input_layer(in_states),
layer(lr = learning_rate_a, prev_size = in_states, my_size=h1_nodes, activation_type=a_type1),
layer(lr = learning_rate_a, prev_size = h1_nodes, my_size=h2_nodes, activation_type=a_type1),
layer(lr = learning_rate_a, prev_size = h2_nodes, my_size=out_actions, activation_type=a_type2)]

policy_dqn = plastic_nn(optimizer="Adam")
policy_dqn.append_layers(layers_net)

# The target network starts as an exact copy of the policy network; the
# throwaway plastic_nn() that used to be created first was immediately
# overwritten and has been dropped.
target_dqn = policy_dqn.deep_copy()


In [18]:

# Pass the tag to the constructor: it already calls set_tag(tag), so leaving
# the default there created a stray '<ct>/tag' directory before the switch
# to '00'.
mountaincar = DQN(tag = '00', game_name = 'MountainCar-v0', discount_factor_g = 0.9, mini_batch_size = 64,
                  num_divisions = 50, replay_memory_size = 100000, network_sync_rate = 50000)

In [19]:
# Record the network configuration in the run log, then train for 5000 episodes.
run_info = (
    f'lr: {learning_rate_a} \n'
    f'in:{in_states} \n'
    f'h:{h1_nodes}x2 (--) \n'
    f'out:{out_actions} \n'
    f'a1:{a_type1} \n'
    f'a2:{a_type2} \n'
)
mountaincar.save_info(run_info)
mountaincar.train(policy_dqn, target_dqn, 5000, False)


Episode 1000 Epsilon 1
Episode 2000 Epsilon 1
Episode 3000 Epsilon 1
Episode 4000 Epsilon 0.8942000000000117
Episode 5000 Epsilon 0.6942000000000337


In [20]:
# Switch to a new experiment tag so the next run writes to '<ct>/01'.
mountaincar.set_tag('01')

In [21]:
# Show the network, add one neuron to each hidden layer (indices 1 and 2),
# then show it again.
policy_dqn.print_info()

for hidden_layer_idx in (1, 2):
    policy_dqn.add_neuron(hidden_layer_idx)

policy_dqn.print_info()


NAME:  noname  ( 4 )
# 0
IN LAYER
size:  2

# 1
my size:  3
prev size:  2
w:  (3, 2) [[ 0.64758615  0.03189151]
 [ 0.35066547  0.64768358]
 [-0.10836567  0.12662346]] 

b:  (3, 1) [[-0.07047263]
 [ 0.51074756]
 [ 0.66746286]] 


# 2
my size:  3
prev size:  3
w:  (3, 3) [[-0.07386445  0.78649327  0.16749688]
 [ 0.03803286  0.38445125 -0.02373191]
 [ 0.63849218  0.0528816   0.69475779]] 

b:  (3, 1) [[ 0.50139307]
 [-0.2119452 ]
 [-0.04805645]] 


# 3
my size:  3
prev size:  3
w:  (3, 3) [[ 0.0185373   0.64281852  0.27252808]
 [ 0.19867912 -0.01490822  0.52957165]
 [ 0.05522255  0.58953189  0.25340823]] 

b:  (3, 1) [[0.70239928]
 [0.41880444]
 [0.47917633]] 


NAME:  noname  ( 4 )
# 0
IN LAYER
size:  2

# 1
my size:  4
prev size:  2
w:  (4, 2) [[ 0.64758615  0.03189151]
 [ 0.35066547  0.64768358]
 [-0.10836567  0.12662346]
 [ 0.1         0.1       ]] 

b:  (4, 1) [[-0.07047263]
 [ 0.51074756]
 [ 0.66746286]
 [ 0.1       ]] 


# 2
my size:  4
prev size:  4
w:  (4, 4) [[-0.07386445  0.786

In [22]:
# Clear the policy's optimizer state after the resize, then clone it so the
# target network starts from the same weights and the same fresh state.
policy_dqn.optimizer_reset()
target_dqn = policy_dqn.deep_copy()

In [None]:
# Log the hidden size the network actually has now. The old call passed 4 to
# .format(), but the f-string had already been interpolated, so the log
# still recorded the original h1_nodes value.
hidden_nodes = policy_dqn.layers[1].size
run_info = (
    f'lr: {learning_rate_a} \n'
    f'in:{in_states} \n'
    f'h:{hidden_nodes}x2 (--) \n'
    f'out:{out_actions} \n'
    f'a1:{a_type1} \n'
    f'a2:{a_type2} \n'
)
mountaincar.save_info(run_info)
mountaincar.train(policy_dqn, target_dqn, 5000, False)


Episode 1000 Epsilon 0.8154000000000203
Episode 2000 Epsilon 0.6154000000000424
Episode 3000 Epsilon 0.4154000000000644
Episode 4000 Epsilon 0.2154000000000816
