In [1]:
!pip install gymnasium
!pip install gymnasium[classic-control]

Defaulting to user installation because normal site-packages is not writeable




Defaulting to user installation because normal site-packages is not writeable




In [2]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import random
import torch
from torch import nn
import torch.nn.functional as F
import os
import datetime


import import_ipynb
from TrulyPlastic_allOpt_5 import plastic_nn
from TrulyPlastic_allOpt_5 import input_layer
from TrulyPlastic_allOpt_5 import layer

importing Jupyter notebook from TrulyPlastic_allOpt_5.ipynb


In [3]:
#np.random.seed(42)
#random.seed(42)

In [4]:

# Define memory for Experience Replay
class ReplayMemory():
    """Fixed-capacity FIFO buffer of transitions used for experience replay.

    Once ``maxlen`` items are stored, appending a new transition drops the
    oldest one (behaviour of ``collections.deque`` with ``maxlen``).
    """

    def __init__(self, maxlen):
        # An empty bounded deque; the bound is what evicts old transitions.
        self.memory = deque(maxlen=maxlen)

    def append(self, transition):
        """Store one transition at the newest end of the buffer."""
        self.memory.append(transition)

    def sample(self, sample_size):
        """Return ``sample_size`` distinct stored transitions, chosen at random.

        Raises ``ValueError`` (from ``random.sample``) when ``sample_size``
        exceeds the number of stored transitions.
        """
        batch = random.sample(self.memory, sample_size)
        return batch

    def __len__(self):
        """Number of transitions currently stored."""
        stored = self.memory
        return len(stored)




class DQN():
    """Deep Q-learning driver for a Gymnasium environment with a 2-component state.

    The two observation components (position and velocity for the default
    ``MountainCar-v0``) are each discretised into bins before being fed to the
    network. The networks are duck-typed: the policy network must provide
    ``forward``, ``learn_one``, ``save`` and ``deep_copy``; the target network
    must provide ``forward``.

    Artefacts are written below the current working directory:
    ``<ct>/pics`` receives a copy of the reward plot and ``<ct>/<tag>``
    receives checkpoints, plots and text logs of the run named ``tag``.
    """

    # Starting value of the "best episode return" tracker used by train().
    _INITIAL_BEST_REWARD = -200

    # Hyperparameters (adjustable)
    def __init__(self, ct = 0, tag='tag', path = r'test', 
                 game_name = 'MountainCar-v0', 
                  
                 mini_batch_size = 32,  num_divisions = 1, 
                 replay_memory_size = 100000, 
                 network_sync_rate = 50000, discount_factor_g = 0.9):
        """Create the output directories and store the hyperparameters.

        Args:
            ct: Name of the run's root directory. ``0`` (the default) means
                "use the current local time", formatted
                ``YYYY-MM-DD_HH-MM-SS``.
            tag: Name of the experiment; see :meth:`set_tag`.
            path: Unused. Kept so existing callers that pass it keep working;
                the effective path is always ``<ct>/<tag>``.
            game_name: Environment id handed to ``gym.make``.
            mini_batch_size: Number of transitions sampled per optimisation.
            num_divisions: Number of bin edges per state component.
            replay_memory_size: Capacity of the replay buffer.
            network_sync_rate: Environment steps between target-network syncs.
            discount_factor_g: Discount factor (gamma) of the Q-learning target.
        """
        if (ct == 0):
            # strftime always yields second resolution. The previous
            # str(now)[:-7] cut into the seconds whenever the microsecond
            # field happened to be 0 (str() then omits the fractional part).
            ct = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

        self.ct = ct
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(os.path.join(os.getcwd(), self.ct, 'pics'), exist_ok=True)

        self.set_tag(tag)

        self.game_name = game_name
        self.discount_factor_g = discount_factor_g

        self.mini_batch_size = mini_batch_size
        self.num_divisions = num_divisions

        self.replay_memory_size = replay_memory_size
        self.network_sync_rate = network_sync_rate

    def set_tag(self, tag):
        """Switch to experiment ``tag``: update ``self.path`` and create ``<ct>/<tag>``."""
        self.tag = tag
        self.path = self.ct+'/'+self.tag
        os.makedirs(os.path.join(os.getcwd(), self.path), exist_ok=True)

    def plot_progress(self, rewards_per_episode_, epsilon_history_):
        """Save the reward curve (to the tag folder and to ``<ct>/pics``) and the epsilon curve."""
        plt.figure()
        plt.xlabel('epochs')
        plt.ylabel('reward')
        plt.plot(rewards_per_episode_)
        plt.savefig(f'{self.path}/info_rew_{self.tag}.png')
        plt.savefig(f'{self.ct}/pics/info_rew_{self.tag}.png')
        plt.close()

        plt.figure()
        plt.xlabel('epochs')
        plt.ylabel('epsilon')
        plt.plot(epsilon_history_)
        plt.savefig(f'{self.path}/info_eps_{self.tag}.png')
        plt.close()

    def _build_state_bins(self, env):
        """Compute the bin edges used by state_to_dqn_input from the env's observation bounds."""
        low = env.observation_space.low
        high = env.observation_space.high
        # For MountainCar: position between -1.2 and 0.6, velocity between -0.07 and 0.07.
        self.pos_space = np.linspace(low[0], high[0], self.num_divisions)
        self.vel_space = np.linspace(low[1], high[1], self.num_divisions)

    def train(self, policy_dqn, target_dqn, episodes, render=False):
        """Run epsilon-greedy training for ``episodes + 1`` episodes.

        An episode ends when the environment reports ``terminated`` or when
        the accumulated reward drops to -1000; the environment's own
        truncation flag is deliberately not used as a stop condition.

        Optimisation (one mini-batch per episode), epsilon decay and target
        syncing only start once some episode has terminated and the replay
        buffer holds more than ``mini_batch_size`` transitions.

        Checkpoints are written via ``policy_dqn.save`` to ``self.path``:
        every 1000 episodes, on every new best episode return
        (``mc_policy_<i>`` and ``mc_policy_best``), and at the end
        (``mc_policy_last_<tag>``). The per-episode returns are appended to
        the rewards file at the end.
        """
        env = gym.make(self.game_name, render_mode='human' if render else None)

        epsilon = 1 # 1 = 100% random actions
        memory = ReplayMemory(self.replay_memory_size)

        rewards_per_episode = []
        epsilon_history = []

        # Track number of steps taken. Used for syncing policy => target network.
        step_count = 0
        goal_reached = False
        best_rewards = self._INITIAL_BEST_REWARD

        try:
            # Divide position and velocity into segments
            self._build_state_bins(env)

            for i in range(episodes+1):
                state = env.reset()[0]
                terminated = False      # True once the environment reports termination
                rewards = 0

                while(not terminated and rewards > -1000):
                    if random.random() < epsilon:
                        action = env.action_space.sample() # actions: 0=left,1=idle,2=right
                    else:
                        res = policy_dqn.forward(self.state_to_dqn_input(state))
                        action = res.argmax().item()

                    # Termination: the position of the car is >= 0.5 (goal on top of the right hill).
                    # Truncation (episode length 200) is reported by the env but ignored here.
                    new_state, reward, terminated, _truncated, _ = env.step(action)
                    rewards += reward
                    memory.append((state, action, new_state, reward, terminated))
                    state = new_state

                    step_count += 1

                rewards_per_episode.append(rewards)

                if terminated:
                    goal_reached = True

                # Graph training progress
                if(i!=0 and i%1000==0):
                    print(f'Episode {i} Epsilon {epsilon}')
                    policy_dqn.save(f'{self.path}/mc_policy_{i}')
                    self.add_log_data(f'Episode {i} Epsilon {epsilon}')
                    self.plot_progress(rewards_per_episode, epsilon_history)

                if rewards>best_rewards:
                    best_rewards = rewards
                    print(f'Best rewards so far: {best_rewards}')
                    self.add_log_data(f'Best rewards so far: {best_rewards}')
                    policy_dqn.save(f'{self.path}/mc_policy_{i}')
                    policy_dqn.save(f'{self.path}/mc_policy_best')

                # Check if enough experience has been collected
                if len(memory)>self.mini_batch_size and goal_reached:
                    mini_batch = memory.sample(self.mini_batch_size)
                    self.optimize(mini_batch, policy_dqn, target_dqn)

                    # Decay epsilon linearly, never below 0.01
                    epsilon = max(epsilon - 1/episodes, 0.01)
                    epsilon_history.append(epsilon)

                    # Copy policy network to target network after a certain number of steps
                    if step_count > self.network_sync_rate:
                        target_dqn = policy_dqn.deep_copy()
                        step_count = 0
        finally:
            # Release the environment (and its render window) even when training raises.
            env.close()

        policy_dqn.save(f'{self.path}/mc_policy_last_{self.tag}')
        if (best_rewards == self._INITIAL_BEST_REWARD):
            # No episode ever beat the initial threshold, so no "best" checkpoint exists yet.
            policy_dqn.save(f'{self.path}/mc_policy_best')
        self.save_reward_data(rewards_per_episode)

    def optimize(self, mini_batch, policy_dqn, target_dqn):
        """Build Q-learning targets for ``mini_batch`` and fit the policy network on them.

        For each transition the target network's output for ``state`` is taken
        and the entry of the taken action is overwritten with ``reward`` (if
        terminated) or ``reward + gamma * max(target_dqn(new_state))``.
        ``policy_dqn.learn_one`` receives inputs and targets with one column
        per transition.
        """
        if len(mini_batch) == 0:
            return

        target_q_list = []
        input_list = []

        for state, action, new_state, reward, terminated in mini_batch:
            if terminated:
                target = reward
            else:
                target = reward + self.discount_factor_g * target_dqn.forward(self.state_to_dqn_input(new_state)).max()

            # Get the target set of Q values
            state_dsc = np.asarray(self.state_to_dqn_input(state))
            input_list.append(state_dsc)

            # Defensive copy: the row below is edited in place, and forward()
            # may hand back an array the network still owns or reuses.
            target_q = np.array(target_dqn.forward(state_dsc))

            # Adjust the specific action to the target that was just calculated
            target_q[action] = target
            target_q_list.append(target_q)

        # Backprop and update on the mini-batch: drop the trailing unit axis
        # and transpose so that each column is one transition.
        x = np.asarray(input_list)[:, :, 0].T
        y = np.asarray(target_q_list)[:, :, 0].T

        policy_dqn.learn_one(x, y)

    def state_to_dqn_input(self, state):
        """Discretise ``state`` into bin indices; returns an array of shape (2, 1).

        Requires ``self.pos_space`` / ``self.vel_space``, which train() and
        test() set up before use.
        """
        state_p = np.digitize(state[0], self.pos_space)
        state_v = np.digitize(state[1], self.vel_space)

        return np.asarray([[state_p], [state_v]])

    def test(self, policy_dqn, episodes, render = False):
        """Run ``episodes`` greedy episodes with ``policy_dqn``.

        An episode stops when the environment reports termination or when the
        accumulated reward drops to -500.

        Returns:
            ``(percentage of terminated episodes, mean episode return,
            list of episode returns)``; ``(0.0, 0.0, [])`` when
            ``episodes`` is not positive.
        """
        if episodes <= 0:
            # Nothing to average; avoids a ZeroDivisionError below.
            return 0.0, 0.0, []

        env = gym.make(self.game_name, render_mode='human' if render else None)

        done_count = 0
        medium_reward = 0
        reward_list = []

        try:
            self._build_state_bins(env)

            for i in range(episodes):
                state, info = env.reset()
                done = False      # True once the environment reports termination
                rewards = 0

                while(not done and rewards > -500):
                    state = self.state_to_dqn_input(state)
                    res = policy_dqn.forward(state)
                    action = res.argmax().item()

                    # Termination: the position of the car is >= 0.5 (goal on top of the right hill).
                    # Truncation (episode length 200) is reported by the env but ignored here.
                    state, reward, done, _truncated, _ = env.step(action)
                    rewards += reward
                    if done:
                        done_count += 1

                medium_reward += rewards
                reward_list.append(rewards)
        finally:
            env.close()

        medium_reward = medium_reward / episodes
        return done_count*100.0/episodes, medium_reward, reward_list

    def _append_lines(self, file_name, lines):
        """Append ``lines`` to ``<self.path>/<file_name>``, closing the file even on error."""
        with open(f'{self.path}/{file_name}', "a") as f:
            f.writelines(lines)

    def save_info(self, info):
        """Append the hyperparameters and the free-text ``info`` to the tag's info file."""
        self._append_lines(f'info_{self.tag}.txt', [
            "data {}\n".format(self.ct),
            "tag {}\n".format(self.tag),
            "game_name {}\n".format(self.game_name),
            "reward discount factor {}\n".format(self.discount_factor_g),
            "minibatch size {}\n".format(self.mini_batch_size),
            "num divisions{}\n".format(self.num_divisions),
            "replay memory size {}\n".format(self.replay_memory_size),
            "network sync rate {}\n".format(self.network_sync_rate),
            "info {}\n".format(info),
        ])

    def add_log_data(self, data):
        """Append one line containing ``data`` to the tag's info file."""
        self._append_lines(f'info_{self.tag}.txt', ["{}\n".format(data)])

    def save_reward_data(self, data):
        """Append one line containing ``data`` to the tag's rewards file."""
        self._append_lines(f'rewards_{self.tag}.txt', ["{}\n".format(data)])


In [5]:
# #sizes_list = [4, 8, 12, 16, 20, 24, 28, 32, 36]
# sizes_list = [2]
# learning_rate_a = 0.001
# in_states = 2
# out_actions = 3
        
# a_type1 = 'ReLU'
# a_type2 = 'Linear'

# ct = datetime.datetime.now()
# ct = str(ct)
# ct = ct.replace(":", "-")
# ct = ct.replace(" ", "_")
# ct = ct[:-7]

# n_models = 5

# n_tests_for_model = 100
# epochs = 5000


# for each in sizes_list:
#     for i in range(n_models):
#         # if (i < 5):
#         #     learning_rate_a = 0.001 
#         # elif (i >= 10):# and i < 20):
#         #     learning_rate_a = 0.00005

#         # if (i < 5):
#         learning_rate_a = 0.001 
#         # else:
#         #     learning_rate_a = 0.01

#         exp_name = "{:02d}_{:02d}_{:02d}".format(each, each, i)
#         print(exp_name)
        
#         h1_nodes = each
#         h2_nodes = each

        
#         layers_net = [input_layer(in_states), 
#         layer(lr = learning_rate_a, prev_size = in_states, my_size=h1_nodes, activation_type=a_type1), 
#         layer(lr = learning_rate_a, prev_size = h1_nodes, my_size=h2_nodes, activation_type=a_type1), 
#         layer(lr = learning_rate_a, prev_size = h2_nodes, my_size=out_actions, activation_type=a_type2)]
        
#         policy_dqn = plastic_nn(optimizer="Adam")
#         policy_dqn.append_layers(layers_net)
        
#         target_dqn = plastic_nn()
#         target_dqn = policy_dqn.deep_copy()

 
#         folder_path = ct+'/'+exp_name

        
#         mountaincar = DQN(ct, tag = exp_name, path = folder_path, game_name = 'MountainCar-v0', 
#                           discount_factor_g = 0.9, mini_batch_size = 64,  num_divisions = 50, replay_memory_size = 100000, network_sync_rate = 50000)
        
#         mountaincar.save_info(ct, f'lr: {learning_rate_a} \nin:{in_states} \nh:{h1_nodes}x2 (--) \nout:{out_actions} \na1:{a_type1} \na2:{a_type2} \n'.format(
#         learning_rate_a, in_states, h1_nodes, out_actions, a_type1, a_type2))



        
#         mountaincar.train(policy_dqn, target_dqn, epochs, False)
        

#         policy_dqn.load(f'{folder_path}/mc_policy_best')
#         test_res, medium_reward, reward_list = mountaincar.test(policy_dqn, n_tests_for_model, render = False)

#         file_path = '{}/tests_{}.txt'.format(folder_path, exp_name)
#         f = open(file_path, "a") 
#         f.write("n_tests {}\n".format(n_tests_for_model))
#         f.write("done % {}\n".format(test_res))
#         f.write("medium_reward {}\n".format(medium_reward))   
#         f.write("reward_list {}\n".format(reward_list))   
        
#         f.close()
        
#         del policy_dqn
#         del target_dqn
#         del mountaincar
        



In [6]:
# learning_rate_a = 0.0001
# in_states = 2
# h1_nodes = 2
# h2_nodes = 2
# out_actions = 3

# a_type1 = 'ReLU'
# a_type2 = 'Linear'

# layers_net = [input_layer(in_states), 
# layer(lr = learning_rate_a, prev_size = in_states, my_size=h1_nodes, activation_type=a_type1), 
# layer(lr = learning_rate_a, prev_size = h1_nodes, my_size=h2_nodes, activation_type=a_type1), 
# layer(lr = learning_rate_a, prev_size = h2_nodes, my_size=out_actions, activation_type=a_type2)]

# policy_dqn = plastic_nn(optimizer="Adam")
# policy_dqn.append_layers(layers_net)

# target_dqn = plastic_nn()
# target_dqn = policy_dqn.deep_copy()


In [7]:
# ct = datetime.datetime.now()
# ct = str(ct)
# ct = ct.replace(":", "-")
# ct = ct.replace(" ", "_")
# ct = ct[:-7]

## Architecture search algorithm

In [8]:
def coin_flip():
    """Return a uniformly random float in [0, 100).

    Used by the stand-in ``test`` function in place of a real success rate.
    """
    draw = random.random()
    return 100 * draw

In [9]:

def test(n_tests):
    """Return the average of ``n_tests`` successive ``coin_flip()`` draws.

    Raises ``ZeroDivisionError`` when ``n_tests`` is 0.
    """
    total = 0
    for _ in range(n_tests):
        total = total + coin_flip()
    average = total / n_tests
    return average

In [100]:
# --- Configuration and starting network for the shape search below ---------
# Learning rate handed to every `layer` of the starting network.
learning_rate_a = 0.0001
# Network input size: DQN.state_to_dqn_input produces a 2x1 column
# (position bin, velocity bin).
in_states = 2
# Sizes of the two hidden layers of the starting network.
h1 = 2
h2 = 2
# Network output size: one value per action (0=left, 1=idle, 2=right).
out_actions = 3

# Activation names passed to `layer`: a_type1 for the hidden layers,
# a_type2 for the output layer.
a_type1 = 'ReLU'
a_type2 = 'Linear'

# Layer stack: input -> hidden(h1) -> hidden(h2) -> output(out_actions).
layers_net = [input_layer(in_states), 
layer(lr = learning_rate_a, prev_size = in_states, my_size=h1, activation_type=a_type1), 
layer(lr = learning_rate_a, prev_size = h1, my_size=h2, activation_type=a_type1), 
layer(lr = learning_rate_a, prev_size = h2, my_size=out_actions, activation_type=a_type2)]

policy_dqn = plastic_nn(optimizer="Adam")
policy_dqn.append_layers(layers_net)

# Empty network object; search() makes its own local copy of policy_dqn.
target_dqn = plastic_nn()




# Number of models search() evaluates for each candidate shape.
n_models = 2
# Layer sizes at which search() stops growing the shape.
final_shape = np.array([2, 4, 4, 3]) # [in, hidden 1, hidden 2, out]
# search() returns as soon as a model's test result is >= TSR.
TSR = 50.0 # target success rate
# Number of trials passed to test() for each model.
n_tests_per_model = 100
# Layer sizes search() starts from; matches the network built above.
start_net_shape = [in_states, h1, h2, out_actions]

# DQN helper used by search() for per-model folders and info files.
# Constructing it creates the run directories on disk.
model_ = DQN(game_name = 'MountainCar-v0', mini_batch_size = 64, num_divisions = 50)

def search(start_shape = 0, shape_steps = np.array([[0, 1, 1, 0]])):
    """Grow a network shape step by step until a model reaches the target success rate.

    For every candidate shape, ``n_models`` models are evaluated with
    ``test(n_tests_per_model)``; the search returns as soon as one result is
    ``>= TSR``. If none qualifies, the next row of ``shape_steps`` (cycled) is
    added to the shape. The loop stops when the shape equals ``final_shape``
    or after 100 model evaluations. Each evaluated model gets its own tag
    folder and info file via ``model_``.

    Args:
        start_shape: Layer sizes to start from (array-like). The default ``0``
            means "use the notebook-level ``start_net_shape``". The argument
            is copied, so the caller's array is never modified.
        shape_steps: 2-D array; each row is an increment added to the shape.

    Returns:
        ``(test_res, shape, m)`` on success, where ``m`` is the index of the
        successful model; otherwise ``(False, shape, model)``, where ``model``
        is the same array object as ``shape``.
    """
    # 0 is this notebook's "not provided" sentinel; previously it crashed on shape[1].
    if np.isscalar(start_shape) and start_shape == 0:
        start_shape = start_net_shape

    # Private copy: the in-place `+=` below must not leak into the caller's array.
    shape = np.array(start_shape)

    model = shape
    n_steps = shape_steps.shape[0]
    step_i = 0
    safety = 100     # upper bound on model evaluations, guards against looping forever
    evaluations = 0
    while not (np.array_equal(shape, final_shape)):

        for m in range(n_models):
            if (evaluations >= safety):
                print(f'search aborted: safety limit of {safety} model evaluations reached')
                return False, shape, model

            evaluations += 1
            print('model i: ', m)

            model_.set_tag(tag = str(shape[1])+'_'+str(m))
            model_.save_info(f'lr: {learning_rate_a} \nshape:{shape} \na1:{a_type1} \na2:{a_type2} \n')

            # Placeholder: the "model" is just its shape until real networks
            # are built and trained here.
            model = shape
            print(model)
            # Kept for the training call that is still to be plugged in below.
            target_dqn = policy_dqn.deep_copy()

            # mountaincar.train(policy_dqn, target_dqn, epochs, False)

            # test model
            test_res = test(n_tests_per_model)
            if (test_res >= TSR):
                print('success shape: ', shape)
                return test_res, shape, m

        # shape step: apply the next increment, cycling through shape_steps
        shape += shape_steps[step_i]
        step_i = (step_i + 1) % n_steps

    return False, shape, model

In [109]:
# Run the shape search from the starting layer sizes. A fresh array is passed
# because search() increments the shape with `+=`.
# Result: (test result or False, last shape, model index or shape).
test_res, shape, model = search(start_shape = np.array(start_net_shape))
print(test_res, shape, model)

model i:  0
[2 2 2 3]
model i:  1
[2 2 2 3]
model i:  0
[2 3 3 3]
model i:  1
[2 3 3 3]
False [2 4 4 3] [2 4 4 3]


In [110]:
# --- Network used for the actual MountainCar training runs below -----------
# NOTE: this rebinds learning_rate_a, layers_net, policy_dqn and target_dqn
# from the search cell above.
learning_rate_a = 0.001
# Input size: DQN.state_to_dqn_input produces a 2x1 column.
in_states = 2
# Sizes of the two hidden layers.
h1_nodes = 3
h2_nodes = 3
# Output size: one value per action (0=left, 1=idle, 2=right).
out_actions = 3

# Activation names: a_type1 for the hidden layers, a_type2 for the output layer.
a_type1 = 'ReLU'
a_type2 = 'Linear'

# Layer stack: input -> hidden(h1_nodes) -> hidden(h2_nodes) -> output(out_actions).
layers_net = [input_layer(in_states), 
layer(lr = learning_rate_a, prev_size = in_states, my_size=h1_nodes, activation_type=a_type1), 
layer(lr = learning_rate_a, prev_size = h1_nodes, my_size=h2_nodes, activation_type=a_type1), 
layer(lr = learning_rate_a, prev_size = h2_nodes, my_size=out_actions, activation_type=a_type2)]

policy_dqn = plastic_nn(optimizer="Adam")
policy_dqn.append_layers(layers_net)

# NOTE(review): the object created on the next line is replaced immediately
# by the deep copy below, so the first assignment looks redundant — confirm
# that plastic_nn() has no needed side effect before removing it.
target_dqn = plastic_nn()
# The target network starts as a copy of the policy network.
target_dqn = policy_dqn.deep_copy()


In [112]:

# Create the training driver. ct is left at its default, so the run directory
# is named after the current time and is created on disk right here.
mountaincar = DQN(game_name = 'MountainCar-v0', discount_factor_g = 0.9, mini_batch_size = 64, 
                  num_divisions = 50, replay_memory_size = 100000, network_sync_rate = 50000)
# Artefacts of the first training run go to <ct>/00.
mountaincar.set_tag('00')

In [114]:
# Record the hyperparameters of this run in its info file, then train for
# 5000 episodes without rendering.
# The f-string is already fully interpolated, so the former trailing
# .format(...) call (whose arguments were ignored) has been dropped.
mountaincar.save_info(f'lr: {learning_rate_a} \nin:{in_states} \nh:{h1_nodes}x2 (--) \nout:{out_actions} \na1:{a_type1} \na2:{a_type2} \n')
mountaincar.train(policy_dqn, target_dqn, 5000, False)


Episode 1000 Epsilon 1
Episode 2000 Epsilon 0.8586000000000156
Episode 3000 Epsilon 0.6586000000000376
Episode 4000 Epsilon 0.4586000000000596
Episode 5000 Epsilon 0.25860000000008165


In [115]:
# Switch to a new experiment tag: artefacts written from here on go to <ct>/01.
mountaincar.set_tag('01')

In [116]:
# policy_dqn.print_info()

# policy_dqn.add_neuron(1)
# policy_dqn.add_neuron(2)
# policy_dqn.print_info()


NAME:  noname  ( 4 )
# 0
IN LAYER
size:  2

# 1
my size:  3
prev size:  2
w:  (3, 2) [[ 0.67925268  0.13842754]
 [ 0.45634281 -0.06332325]
 [ 0.55661498  0.25473126]] 

b:  (3, 1) [[ 0.29784162]
 [-0.18513533]
 [ 0.10361444]] 


# 2
my size:  3
prev size:  3
w:  (3, 3) [[0.35846795 0.8414906  0.67718721]
 [0.55038868 0.23889477 0.22907045]
 [0.47092587 0.64333241 0.45709044]] 

b:  (3, 1) [[0.67079317]
 [0.14765448]
 [0.45500972]] 


# 3
my size:  3
prev size:  3
w:  (3, 3) [[ 0.29354209  0.34231282  0.23158635]
 [ 0.23449256 -0.00523411  0.57487218]
 [ 0.5044174   0.23017876  0.06636201]] 

b:  (3, 1) [[0.69387427]
 [0.14069825]
 [0.59514437]] 


NAME:  noname  ( 4 )
# 0
IN LAYER
size:  2

# 1
my size:  4
prev size:  2
w:  (4, 2) [[ 0.67925268  0.13842754]
 [ 0.45634281 -0.06332325]
 [ 0.55661498  0.25473126]
 [ 0.1         0.1       ]] 

b:  (4, 1) [[ 0.29784162]
 [-0.18513533]
 [ 0.10361444]
 [ 0.1       ]] 


# 2
my size:  4
prev size:  4
w:  (4, 4) [[0.35846795 0.8414906  0.677187

In [117]:
# Re-create the target network from the current policy network so both have
# the same layers before the next training run.
target_dqn = policy_dqn.deep_copy()

In [118]:
# Hidden-layer size after one neuron was added to each hidden layer.
# The original call passed 4 to a trailing .format(...), but an f-string has no
# placeholders left for .format to fill, so the info file recorded h1_nodes (3).
# NOTE(review): assumes the add_neuron cell above was actually executed.
grown_hidden_nodes = 4
mountaincar.save_info(f'lr: {learning_rate_a} \nin:{in_states} \nh:{grown_hidden_nodes}x2 (--) \nout:{out_actions} \na1:{a_type1} \na2:{a_type2} \n')
mountaincar.train(policy_dqn, target_dqn, 5000, False)


Episode 1000 Epsilon 1
Episode 2000 Epsilon 1
Episode 3000 Epsilon 1
Episode 4000 Epsilon 1


ValueError: operands could not be broadcast together with shapes (3,3) (3,4) 