CartPole-v2.py
import numpy as np
import gym
import time

L_R = 1  # Learning rate, a value in (0, 1]; it controls how strongly each Bellman update moves the stored value.
GAMMA = 1  # Discount factor; set to 1 so the expected reward of the next state counts fully.
ENV_NAME = "CartPole-v1"  # The environment that we want to load from the Gym library.
STATE_NUMBER = 5  # Each observation parameter is discretized into 5 buckets to keep the number of states limited.
EPSILON = 0.05  # Probability of exploring a random move instead of exploiting a move we have already learned.
TRAINING_LEN = 5000  # How many episodes we want to train our agent for.
SCORE_TO_ACHIEVE = 195  # The score that we want to achieve.
PENALTY = -300  # The penalty that we give our agent after losing.
Q_table = {}  # Maps each discretized state to the expected reward of each action.
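# For illustration (hypothetical values): after some training the table might hold
# an entry like Q_table["[2, 2, 1, 2]"] == [14.5, 9.1], meaning that in this
# discretized state, pushing left (action 0) has the higher expected reward.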


def update_q_table_value(observation, env, action, state, reward):
    # Update the Q_table using the Bellman equation:
    # Q_table[state][action] = Q_table[state][action] + learning_rate * (reward_from_that_move
    #     + GAMMA * best_expected_reward_from_the_next_state - Q_table[state][action])
    state = str(state)
    next_state = str(convert_observation_data(observation, env))
    if state not in Q_table:
        Q_table[state] = [0, 0]
    if next_state not in Q_table:
        Q_table[next_state] = [0, 0]
    old_value = Q_table[state][action]
    # Take the best expected reward over the two actions in the next state.
    Q_table[state][action] = old_value + L_R * (reward + GAMMA * max(Q_table[next_state]) - old_value)
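# A worked example of the update with hypothetical numbers: with L_R = 0.5,
# GAMMA = 1, an old value Q_table[s][a] = 0.2, a reward of 1, and a best
# next-state value of 0.8, the new value is 0.2 + 0.5 * (1 + 1 * 0.8 - 0.2) = 1.0.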


def choose_action_from_table(state):
    # Choose the action with the highest expected reward in the table;
    # the only moves are pushing left (0) or pushing right (1).
    state = str(state)
    if state not in Q_table:
        Q_table[state] = [0, 0]
    actions = Q_table[state]
    if actions[0] >= actions[1]:
        return 0
    return 1


def convert_observation_data(observation, env):
    # Discretize each continuous observation value into an integer bucket within a
    # specific range, limiting the number of states to 5^4 = 625.
    range_len = STATE_NUMBER
    low = [-2.4, -3, -0.418, -4]
    high = [2.4, 3, 0.418, 4]
    data = []
    for i in range(len(observation)):
        a = (observation[i] - low[i]) / (high[i] - low[i])
        a = int(a * range_len)
        # Clamp so values outside the assumed bounds still land in a valid bucket.
        a = min(max(a, 0), range_len - 1)
        data.append(a)
    return data
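# For example, the centered observation [0.0, 0.0, 0.0, 0.0] normalizes to 0.5 on
# every axis, and int(0.5 * 5) = 2, so it maps to the buckets [2, 2, 2, 2].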


def training(env):
    global EPSILON, L_R
    score = 0
    test_score = 0
    while test_score < 200:
        for i in range(TRAINING_LEN):
            total_reward = 0
            observation = env.reset()
            done = False
            while not done:
                state = convert_observation_data(observation, env)
                # Take a random move when a random number < EPSILON, and only during
                # the first 75% of training (exploring new moves).
                if np.random.uniform(0, 1) < EPSILON and i < TRAINING_LEN - (TRAINING_LEN / 4):
                    action = np.random.choice(env.action_space.n)
                # Otherwise take the best action from the table (exploiting).
                else:
                    action = choose_action_from_table(state)
                observation, reward, done, info = env.step(int(action))
                # The agent gets a reward of 1 for every move, even the losing one,
                # so we replace the reward of the final move with PENALTY.
                if done and total_reward < SCORE_TO_ACHIEVE:
                    reward = PENALTY
                update_q_table_value(observation, env, int(action), state, reward)
                total_reward += reward
                if done:
                    break
            if total_reward >= SCORE_TO_ACHIEVE:
                score += 1
            # Decay the learning rate and epsilon after each episode.
            EPSILON -= EPSILON / TRAINING_LEN
            L_R -= L_R / TRAINING_LEN
            if i and i % 1000 == 0:
                print("Number of games where we achieved the goal in the last 1000 games: {}".format(score))
                test_score = score
                score = 0
                if test_score >= 200:
                    break
    print("\nFinished training")


# Compute the average score over 100 games; it should exceed 195.
def calculate_average(env):
    note = 0
    average = 0
    for i in range(100):
        observation = env.reset()
        score = 0
        done = False
        while not done:
            state = convert_observation_data(observation, env)
            action = choose_action_from_table(state)
            observation, reward, done, info = env.step(action)
            score += reward
            if done:
                break
        if score >= 195:
            note += 1
        average += score
    average /= 100
    print("\nAVERAGE IS {} and note {}".format(average, note))


def simulate_our_agent(env):
    print("\nSTART SIMULATION\n")
    observation = env.reset()
    score = 0
    done = False
    while not done:
        env.render()
        time.sleep(0.05)
        state = convert_observation_data(observation, env)
        action = choose_action_from_table(state)
        observation, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    print("\nOUR SCORE IS {}".format(score))


# SET UP THE ENVIRONMENT
env = gym.make(ENV_NAME)
env.seed(0)
np.random.seed(0)
training(env)
calculate_average(env)
simulate_our_agent(env)
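
# Note: this script assumes the classic Gym API (gym < 0.26), where reset()
# returns only the observation and step() returns a 4-tuple. A minimal sketch
# of the changes needed to run on Gymnasium (or gym >= 0.26):
#
#     observation, info = env.reset(seed=0)
#     observation, reward, terminated, truncated, info = env.step(action)
#     done = terminated or truncated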