In [1]:
!pip install gymnasium
!pip install gymnasium[classic-control]

Defaulting to user installation because normal site-packages is not writeable




Defaulting to user installation because normal site-packages is not writeable




In [2]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import random
import torch
from torch import nn
import torch.nn.functional as F
import os
import datetime


import import_ipynb
from TrulyPlastic_allOpt_4 import plastic_nn
from TrulyPlastic_allOpt_4 import input_layer
from TrulyPlastic_allOpt_4 import layer

importing Jupyter notebook from TrulyPlastic_allOpt_4.ipynb


In [3]:
#np.random.seed(42)
#random.seed(42)

In [4]:

# Define memory for Experience Replay
class ReplayMemory():
    """Fixed-capacity buffer of transitions used for experience replay.

    Transitions are kept in insertion order; once `maxlen` entries are stored,
    adding another one drops the oldest entry.
    """

    def __init__(self, maxlen):
        # A bounded deque performs the oldest-first eviction for us.
        self.memory = deque(maxlen=maxlen)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def append(self, transition):
        """Store one transition (any object) at the newest end of the buffer."""
        self.memory.append(transition)

    def sample(self, sample_size):
        """Return a list of `sample_size` distinct stored transitions, chosen at random.

        Raises ValueError (from `random.sample`) if `sample_size` exceeds the
        number of stored transitions.
        """
        drawn = random.sample(self.memory, k=sample_size)
        return drawn




class DQN():
    """Deep Q-learning trainer and evaluator for a Gymnasium task with a 2-D observation.

    The two observation components (position and velocity for MountainCar) are
    discretised into `num_divisions` bins before being fed to the network.
    Checkpoints, plots and log files are written below `path`.

    The network objects handed to `train`, `optimize` and `test` must provide
    `forward(x)`; `train`/`optimize` additionally use `learn_one(x, y)`,
    `save(path)` and `deep_copy()`.
    """

    # Starting value of the best episode return seen during training.
    _INITIAL_BEST_REWARD = -200
    # An episode is abandoned once its accumulated reward drops to this value.
    _TRAIN_REWARD_FLOOR = -1000
    _TEST_REWARD_FLOOR = -500
    # Interval (in episodes) between periodic checkpoints / progress plots.
    _CHECKPOINT_EVERY = 1000

    # Hyperparameters (adjustable)
    def __init__(self, ct, tag='tag', path=r'test', game_name='MountainCar-v0', discount_factor_g=0.9,
                 mini_batch_size=32,
                 num_divisions=20, replay_memory_size=100000, network_sync_rate=50000):
        """Store the hyperparameters and make sure the output directory exists.

        Args:
            ct: run identifier (a timestamp string in this notebook); written to the info file.
            tag: experiment label used in the output file names.
            path: output directory, resolved relative to the current working directory.
            game_name: Gymnasium environment id.
            discount_factor_g: discount factor (gamma) of the Q-learning target.
            mini_batch_size: number of transitions sampled per optimisation step.
            num_divisions: number of bin edges per observation component.
            replay_memory_size: capacity of the replay buffer.
            network_sync_rate: number of environment steps between target-network syncs.
        """
        self.ct = ct
        self.tag = tag
        self.path = path
        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(os.path.join(os.getcwd(), self.path), exist_ok=True)

        self.game_name = game_name
        self.discount_factor_g = discount_factor_g

        self.mini_batch_size = mini_batch_size
        self.num_divisions = num_divisions

        self.replay_memory_size = replay_memory_size
        self.network_sync_rate = network_sync_rate

    @staticmethod
    def _save_curve(values, y_label, file_path):
        """Plot `values` against their index and save the figure to `file_path`."""
        plt.figure()
        plt.xlabel('epochs')
        plt.ylabel(y_label)
        plt.plot(values)
        plt.savefig(file_path)
        plt.close()

    def plot_progress(self, rewards_per_episode_, epsilon_history_):
        """Save the reward curve and the epsilon curve as PNG files in the output directory."""
        self._save_curve(rewards_per_episode_, 'reward', f'{self.path}/info_rew_{self.tag}.png')
        self._save_curve(epsilon_history_, 'epsilon', f'{self.path}/info_eps_{self.tag}.png')

    def _make_env(self, render):
        """Create the environment and (re)build the discretisation bins from its bounds."""
        env = gym.make(self.game_name, render_mode='human' if render else None)
        low = env.observation_space.low
        high = env.observation_space.high
        # Divide position and velocity into segments.
        self.pos_space = np.linspace(low[0], high[0], self.num_divisions)
        self.vel_space = np.linspace(low[1], high[1], self.num_divisions)
        return env

    def train(self, policy_dqn, target_dqn, episodes, render=False):
        """Train `policy_dqn` with epsilon-greedy exploration and experience replay.

        Runs `episodes + 1` episodes. An episode ends when the environment
        reports termination or when the accumulated reward reaches -1000; the
        environment's own truncation flag is ignored on purpose. Optimisation
        (one mini-batch per episode) only starts after the goal has been
        reached at least once and the buffer holds more than one mini-batch.

        Checkpoints are written every 1000 episodes, on every new best episode
        return, and at the end of training; the per-episode rewards are
        appended to the rewards file.

        Args:
            policy_dqn: network being trained.
            target_dqn: network used to compute the Q-learning targets.
            episodes: number of training episodes (also sets the epsilon decay step).
            render: when True the environment is created with render_mode='human'.
        """
        env = self._make_env(render)

        epsilon = 1  # 1 = 100% random actions
        memory = ReplayMemory(self.replay_memory_size)

        rewards_per_episode = []
        epsilon_history = []

        # Track number of steps taken. Used for syncing policy => target network.
        step_count = 0
        goal_reached = False
        best_rewards = self._INITIAL_BEST_REWARD

        try:
            for i in range(episodes + 1):
                state = env.reset()[0]
                terminated = False  # True once the environment reports the goal was reached
                rewards = 0

                while not terminated and rewards > self._TRAIN_REWARD_FLOOR:
                    if random.random() < epsilon:
                        action = env.action_space.sample()  # actions: 0=left,1=idle,2=right
                    else:
                        q_values = policy_dqn.forward(self.state_to_dqn_input(state))
                        action = q_values.argmax().item()

                    # The truncation flag is deliberately unused: episodes may run
                    # past the environment's time limit until the reward floor is hit.
                    new_state, reward, terminated, _truncated, _ = env.step(action)
                    rewards += reward
                    memory.append((state, action, new_state, reward, terminated))
                    state = new_state

                    step_count += 1

                rewards_per_episode.append(rewards)

                if terminated:
                    goal_reached = True

                # Periodic checkpoint and progress plots
                if i != 0 and i % self._CHECKPOINT_EVERY == 0:
                    print(f'Episode {i} Epsilon {epsilon}')
                    policy_dqn.save(f'{self.path}/mc_policy_{i}')
                    self.add_log_data(f'Episode {i} Epsilon {epsilon}')
                    self.plot_progress(rewards_per_episode, epsilon_history)

                if rewards > best_rewards:
                    best_rewards = rewards
                    print(f'Best rewards so far: {best_rewards}')
                    self.add_log_data(f'Best rewards so far: {best_rewards}')
                    policy_dqn.save(f'{self.path}/mc_policy_{i}')
                    policy_dqn.save(f'{self.path}/mc_policy_best')

                # Check if enough experience has been collected
                if len(memory) > self.mini_batch_size and goal_reached:
                    mini_batch = memory.sample(self.mini_batch_size)
                    self.optimize(mini_batch, policy_dqn, target_dqn)

                    # Linear epsilon decay, never below 0.01
                    epsilon = max(epsilon - 1 / episodes, 0.01)
                    epsilon_history.append(epsilon)

                    # Copy policy network to target network after a certain number of steps
                    if step_count > self.network_sync_rate:
                        target_dqn = policy_dqn.deep_copy()
                        step_count = 0
        finally:
            # Release the environment even if an episode raised.
            env.close()

        policy_dqn.save(f'{self.path}/mc_policy_last_{self.tag}')
        if best_rewards == self._INITIAL_BEST_REWARD:
            # No episode beat the initial threshold: still write a "best" checkpoint
            # so that code loading mc_policy_best afterwards finds a file.
            policy_dqn.save(f'{self.path}/mc_policy_best')
        self.save_reward_data(rewards_per_episode)

    def optimize(self, mini_batch, policy_dqn, target_dqn):
        """Run one learning step of `policy_dqn` on a mini-batch of transitions.

        For every transition the target network's Q-values for `state` are
        taken as the base vector and the entry of the executed action is
        replaced by the Q-learning target: `reward` when the transition was
        terminal, otherwise `reward + gamma * max(target_dqn(new_state))`.

        Args:
            mini_batch: iterable of (state, action, new_state, reward, terminated) tuples.
            policy_dqn: network updated through `learn_one(x, y)`.
            target_dqn: network evaluated through `forward`.
        """
        if len(mini_batch) == 0:
            return  # nothing to learn from

        target_q_list = []
        input_list = []

        for state, action, new_state, reward, terminated in mini_batch:
            if terminated:
                target = reward
            else:
                next_q = target_dqn.forward(self.state_to_dqn_input(new_state))
                target = reward + self.discount_factor_g * next_q.max()

            state_dsc = self.state_to_dqn_input(state)
            input_list.append(state_dsc)

            # Defensive copy: the array returned by forward() is edited below, and
            # it may be shared with the network's internal state.
            target_q = np.array(target_dqn.forward(state_dsc))

            # Adjust the specific action to the target that was just calculated
            target_q[action] = target
            target_q_list.append(target_q)

        # Stack the column vectors: (batch, features, 1) -> (features, batch)
        x = np.asarray(input_list)[:, :, 0].T
        y = np.asarray(target_q_list)[:, :, 0].T

        policy_dqn.learn_one(x, y)

    def state_to_dqn_input(self, state):
        """Discretise a (position, velocity) observation into a 2x1 column of bin indices.

        Requires `self.pos_space` and `self.vel_space`, which `train` and `test` set.
        """
        state_p = np.digitize(state[0], self.pos_space)
        state_v = np.digitize(state[1], self.vel_space)

        return np.asarray([[state_p], [state_v]])

    def test(self, policy_dqn, episodes, render=False):
        """Evaluate `policy_dqn` greedily for `episodes` episodes.

        An episode ends when the environment reports termination or when the
        accumulated reward reaches -500; the truncation flag is ignored.

        Returns:
            (success_percent, mean_reward, reward_list), where success_percent is
            the percentage of episodes that ended with termination. With
            `episodes <= 0` the result is (0.0, 0.0, []).
        """
        env = self._make_env(render)

        done_count = 0
        reward_list = []

        try:
            for _ in range(episodes):
                state, _info = env.reset()
                terminated = False
                rewards = 0

                while not terminated and rewards > self._TEST_REWARD_FLOOR:
                    q_values = policy_dqn.forward(self.state_to_dqn_input(state))
                    action = q_values.argmax().item()

                    state, reward, terminated, _truncated, _ = env.step(action)
                    rewards += reward

                if terminated:
                    done_count += 1
                reward_list.append(rewards)
        finally:
            env.close()

        if episodes <= 0:
            return 0.0, 0.0, reward_list

        medium_reward = sum(reward_list) / episodes
        return done_count * 100.0 / episodes, medium_reward, reward_list

    @staticmethod
    def _append_lines(file_path, lines):
        """Append each entry of `lines` followed by a newline to `file_path`."""
        with open(file_path, "a") as f:
            for line in lines:
                f.write("{}\n".format(line))

    def save_info(self, ct, topology):
        """Append the run configuration and the network description to the info file.

        Args:
            ct: accepted for backward compatibility; the value stored at
                construction time (`self.ct`) is what gets written.
            topology: free-form description of the network, written last.
        """
        self._append_lines(f'{self.path}/info_{self.tag}.txt', [
            "data {}".format(self.ct),
            "tag {}".format(self.tag),
            self.game_name,
            self.discount_factor_g,
            self.mini_batch_size,
            self.num_divisions,
            self.replay_memory_size,
            self.network_sync_rate,
            topology,
        ])

    def add_log_data(self, data):
        """Append one line of text to the info file."""
        self._append_lines(f'{self.path}/info_{self.tag}.txt', [data])

    def save_reward_data(self, data):
        """Append `data` (the per-episode reward list) as one line to the rewards file."""
        self._append_lines(f'{self.path}/rewards_{self.tag}.txt', [data])


In [5]:
# Sweep over hidden-layer widths: for every width, train `n_models` independent
# networks (two hidden layers of that width), then evaluate the best checkpoint
# of each and write the results next to its training artefacts.

#sizes_list = [4, 8, 12, 16, 20, 24, 28, 32, 36]
sizes_list = [4, 8, 16, 32]
learning_rate_a = 0.001
in_states = 2
out_actions = 3

a_type1 = 'ReLU'
a_type2 = 'Linear'

# Run stamp used as the root output folder, e.g. 2024-05-31_12-45-39.
# strftime is used instead of slicing str(now): str() omits the fractional part
# when microseconds are 0, so a fixed-length slice would cut into the seconds.
ct = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

n_models = 5

n_tests_for_model = 100
epochs = 5000


for each in sizes_list:
    for i in range(n_models):
        exp_name = "{:02d}_{:02d}_{:02d}".format(each, each, i)
        print(exp_name)

        h1_nodes = each
        h2_nodes = each

        layers_net = [
            input_layer(in_states),
            layer(lr=learning_rate_a, prev_size=in_states, my_size=h1_nodes, activation_type=a_type1),
            layer(lr=learning_rate_a, prev_size=h1_nodes, my_size=h2_nodes, activation_type=a_type1),
            layer(lr=learning_rate_a, prev_size=h2_nodes, my_size=out_actions, activation_type=a_type2),
        ]

        policy_dqn = plastic_nn(optimizer="Adam")
        policy_dqn.append_layers(layers_net)

        # The target network starts as a copy of the policy network.
        target_dqn = policy_dqn.deep_copy()

        folder_path = ct + '/' + exp_name

        mountaincar = DQN(ct, tag=exp_name, path=folder_path, game_name='MountainCar-v0',
                          discount_factor_g=0.9, mini_batch_size=64, num_divisions=50,
                          replay_memory_size=100000, network_sync_rate=50000)

        mountaincar.save_info(
            ct,
            f'lr: {learning_rate_a} \nin:{in_states} \nh:{h1_nodes}x2 (--) \nout:{out_actions} \na1:{a_type1} \na2:{a_type2} \n')

        mountaincar.train(policy_dqn, target_dqn, epochs, False)

        # Evaluate the best checkpoint written during training, not the final weights.
        policy_dqn.load(f'{folder_path}/mc_policy_best')
        test_res, medium_reward, reward_list = mountaincar.test(policy_dqn, n_tests_for_model, render=False)

        file_path = '{}/tests_{}.txt'.format(folder_path, exp_name)
        # `with` guarantees the results file is closed even if a write fails.
        with open(file_path, "a") as f:
            f.write("n_tests {}\n".format(n_tests_for_model))
            f.write("done % {}\n".format(test_res))
            f.write("medium_reward {}\n".format(medium_reward))
            f.write("reward_list {}\n".format(reward_list))

        # Drop the references before the next model is built.
        del policy_dqn
        del target_dqn
        del mountaincar
        



02_02_00
Episode 1000 Epsilon 0.8470000000000169
Episode 2000 Epsilon 0.6470000000000389
Episode 3000 Epsilon 0.4470000000000609
Episode 4000 Epsilon 0.24700000000008251
Episode 5000 Epsilon 0.04700000000007733
02_02_01
Episode 1000 Epsilon 0.9012000000000109
Episode 2000 Epsilon 0.7012000000000329
Episode 3000 Epsilon 0.5012000000000549
Episode 4000 Epsilon 0.30120000000007696
Episode 5000 Epsilon 0.10120000000007834
02_02_02
Episode 1000 Epsilon 0.9962000000000004
Episode 2000 Epsilon 0.7962000000000224
Episode 3000 Epsilon 0.5962000000000445
Episode 4000 Epsilon 0.3962000000000665
Best rewards so far: -128.0
Episode 5000 Epsilon 0.19620000000008106
02_02_03
Episode 1000 Epsilon 1
Episode 2000 Epsilon 1
Episode 3000 Epsilon 0.809400000000021
Episode 4000 Epsilon 0.609400000000043
Best rewards so far: -180.0
Episode 5000 Epsilon 0.40940000000006505
02_02_04
Episode 1000 Epsilon 0.9786000000000024
Episode 2000 Epsilon 0.7786000000000244
Episode 3000 Epsilon 0.5786000000000464
Episode 4

In [16]:
# learning_rate_a = 0.0001
# in_states = 2
# h1_nodes = 2
# h2_nodes = 2
# out_actions = 3

# a_type1 = 'ReLU'
# a_type2 = 'Linear'

# layers_net = [input_layer(in_states), 
# layer(lr = learning_rate_a, prev_size = in_states, my_size=h1_nodes, activation_type=a_type1), 
# layer(lr = learning_rate_a, prev_size = h1_nodes, my_size=h2_nodes, activation_type=a_type1), 
# layer(lr = learning_rate_a, prev_size = h2_nodes, my_size=out_actions, activation_type=a_type2)]

# policy_dqn = plastic_nn(optimizer="Adam")
# policy_dqn.append_layers(layers_net)

# target_dqn = plastic_nn()
# target_dqn = policy_dqn.deep_copy()


In [17]:
# target_dqn.print_info()
# target_dqn.add_neuron(1)
# target_dqn.print_info()

NAME:  noname  ( 4 )
# 0
IN LAYER
size:  2

# 1
my size:  2
prev size:  2
w:  (2, 2) [[0.66347946 0.871685  ]
 [0.79404968 0.85809928]] 

b:  (2, 1) [[0.57904746]
 [0.12627123]] 


# 2
my size:  2
prev size:  2
w:  (2, 2) [[0.84139939 0.69102174]
 [0.07629438 0.65596232]] 

b:  (2, 1) [[0.12966704]
 [0.91321854]] 


# 3
my size:  3
prev size:  2
w:  (3, 2) [[0.65020909 0.47210349]
 [0.97109923 0.81246104]
 [0.2746013  0.7848105 ]] 

b:  (3, 1) [[0.37482507]
 [0.67176289]
 [0.62880545]] 




AttributeError: 'layer' object has no attribute 'weights'

In [7]:
# ct = datetime.datetime.now()
# ct = str(ct)
# ct = ct.replace(":", "-")
# ct = ct.replace(" ", "_")
# ct = ct[:-7]

In [8]:

# mountaincar = DQN(ct, path = ct, game_name = 'MountainCar-v0', discount_factor_g = 0.9, mini_batch_size = 64, 
#                   num_divisions = 50, replay_memory_size = 100000, network_sync_rate = 50000)


In [9]:
# mountaincar.save_info(ct, f'lr: {learning_rate_a} \nin:{in_states} \nh:{h1_nodes}x2 (--) \nout:{out_actions} \na1:{a_type1} \na2:{a_type2} \n'.format(
#     learning_rate_a, in_states, h1_nodes, out_actions, a_type1, a_type2))
# mountaincar.train(policy_dqn, target_dqn, 25000, False)


In [10]:
#myADAM_andENV/31_05/2024-05-31_12-45-39/mc_policy_last

In [11]:
#policy_dqn.load(f'{ct}/mc_policy_24204') #f'{ct}/mc_policy_21560'

In [12]:
#policy_dqn.load('2024-05-31_12-45-39/mc_policy_last ') #f'{ct}/mc_policy_21560'

In [13]:
#mountaincar.test(policy_dqn, 3, render = True)

In [14]:
#policy_dqn.print_info()

In [15]:
#policy_dqn.print_info()