In [1]:
# !pip install import-ipynb
# !pip install gymnasium
# !pip install gymnasium[classic-control]

In [2]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import random
import copy
import import_ipynb
from plastic_nn import plastic_nn
from plastic_nn import input_layer
from plastic_nn import layer
import time

importing Jupyter notebook from plastic_nn.ipynb


In [3]:
# np.random.seed(42)
# random.seed(42)
# num_of_inputs = 4
# hidden1 = 6 
# hidden2 = 6
# out_n = 2

# lr = 0.1


# layers_net = [input_layer(num_of_inputs), 
#           layer(lr = lr, prev_size = num_of_inputs, my_size=hidden1, activation_type="ReLU"), 
#           layer(lr = lr, prev_size = hidden1, my_size=hidden2, activation_type="ReLU"), 
#           layer(lr = lr, prev_size = hidden2, my_size=out_n, activation_type='Linear')] #, activation_type="ReLU")]

# brain = plastic_nn()
# brain.append(layers_net)

class replay_memory:
    def __init__(s, max_len):
        s.memory = deque([], maxlen=max_len)

    def append(s, val):
        s.memory.append(val)
        
    def clear(s):
        s.memory.clear()

    def get_sample(s, sample_size):
        return random.sample(s.memory, sample_size)

    def __len__(s):
        return len(s.memory)

    #cur_state, action, next_state, reward, terminated
    def print(s):
        for m in s.memory:
            print("state: ", cart_pole_state_to_human(m[0]))
            print("action: ", cart_pole_action_to_human(m[1]))
            print("next state: ", cart_pole_state_to_human(m[2]))
            print("reward: ", m[3])
            print("termninated?: ", m[4])
            print("\n")

In [26]:
np.random.seed(42)
random.seed(42)
num_of_inputs = 4
hidden1 = 64 
hidden2 = 42
out_n = 2

lr = 0.001


layers_net = [input_layer(num_of_inputs), 
          layer(lr = lr, prev_size = num_of_inputs, my_size=hidden1, activation_type="ReLU"), 
          layer(lr = lr, prev_size = hidden1, my_size=hidden2, activation_type="ReLU"), 
          layer(lr = lr, prev_size = hidden2, my_size=out_n, activation_type='Linear')] #, activation_type="ReLU")]

brain = plastic_nn()
brain.append(layers_net)
brain.give_name('ALEX')

def discrete_state(state):
    step_size = np.array([0.25, 0.25, 0.01, 0.01])
    ds = state/step_size + np.array([15,12,1,10])
    return tuple(ds.astype(int))
    #return state

memory = replay_memory(max_len = 1000)


def learn():
    global brain


    batch_size = min(len(memory), 32)
    batch = memory.get_sample(batch_size)
    
    learning_brain = brain.deep_copy()
    learning_brain.give_name('OLEG')

    print('JUST COPIED:')
    learning_brain.print_info()
    brain.print_info()    
    
    for prev_state, action, new_state, reward, done_s in batch:
        #print(prev_state, action, now_state, reward, done_s)
        
        if (done_s):
            new_q = reward
        else:
            expected_next_max_reward = brain.forward_nu(now_state).max()
            new_q = reward + gamma*expected_next_max_reward       

        currect_res = brain.forward_nu(prev_state) # no input update
        
        target_res = learning_brain.forward(prev_state)
        target_res[action] = new_q

        err = target_res - currect_res
        learning_brain.backprop_error(err)
        learning_brain.update_w()
    
    print('AFTER LEARNING')
    learning_brain.print_info()
    brain.print_info()
    
    brain = learning_brain.deep_copy()
    brain.give_name('ALEX')
    
    print('AFTER DEEP_COPY')
    learning_brain.print_info()
    brain.print_info()



seed_val = 42

epochs = 10
env = gym.make('CartPole-v0') #, render_mode="human")

explore_disc_rate = 0.999
explore_prob = 1
gamma = 0.95




for e in range(epochs+1):
    truncated = False
    terminated = False
    
    prev_state, info = env.reset(seed = seed_val) # seed = seed_val

    
    prev_reward = 0
    total_sum = 0
    rand_action_count = 0
    while not truncated and not terminated:
        #time.sleep(0.5)
        #print('\t',time_sum)
        
        if (random.random() < explore_prob): 
            rand_action_count +=1
            action = env.action_space.sample() # explore      
        else:

            last_res = brain.forward(discrete_state(prev_state))
            action = last_res.argmax().item()      

        
        
        now_state, reward, terminated, done_s, info = env.step(action)
        
        if not done_s and not terminated:
            total_sum += 1
            
        if done_s:
            reward = total_sum + 20
        elif terminated:
            reward = -1#total_sum
        else:
            reward = 1#total_sum
            
        memory.append((prev_state, action, now_state, reward, done_s))
        
        
        prev_state = now_state

        if terminated or done_s:
            print(len(memory))
            if (len(memory) >= 32):
                print('BEFORE ---------')
                brain.print_info()
                
                learn()
                
                print('AFTER ----------')
                brain.print_info()
                if (explore_prob > 0.01):
                    explore_prob *= explore_disc_rate

    
    if (e%1 == 0):
        print('\ne: ', e, ' explore_prob: ', explore_prob, 
              ' sum: ', total_sum, ' rand/total: ', rand_action_count/total_sum,'\n----------------------\n')        
        
        


env.close()
    

added LAYERS succesfully
18

e:  0  explore_prob:  1  sum:  17  rand/total:  1.0588235294117647 
----------------------

36
BEFORE LEARNING
NAME:  ALEX  ( 4 )
# 0
IN LAYER
size:  4
[0. 0. 0. 0.]

# 1
my size:  64
w:  [[0.37454012 0.95071431 0.73199394 0.59865848 0.15601864 0.15599452
  0.05808361 0.86617615 0.60111501 0.70807258 0.02058449 0.96990985
  0.83244264 0.21233911 0.18182497 0.18340451 0.30424224 0.52475643
  0.43194502 0.29122914 0.61185289 0.13949386 0.29214465 0.36636184
  0.45606998 0.78517596 0.19967378 0.51423444 0.59241457 0.04645041
  0.60754485 0.17052412 0.06505159 0.94888554 0.96563203 0.80839735
  0.30461377 0.09767211 0.68423303 0.44015249 0.12203823 0.49517691
  0.03438852 0.9093204  0.25877998 0.66252228 0.31171108 0.52006802
  0.54671028 0.18485446 0.96958463 0.77513282 0.93949894 0.89482735
  0.59789998 0.92187424 0.0884925  0.19598286 0.04522729 0.32533033
  0.38867729 0.27134903 0.82873751 0.35675333]
 [0.28093451 0.54269608 0.14092422 0.80219698 0.07455064

In [5]:
env.close()