In [1]:
import tensorflow as tf #Tensorflow handles the Training and Testing
from tensorflow import keras #Keras handles the importing of Data
import numpy as np #NumPy does funny math good
import gym #imports OpenAI Gym which has a bunch of environments(games) to play with
import matplotlib.pyplot as plt
from statistics import mean, median 
from tqdm import tqdm
from keras.models import load_model
from keras.activations import relu, linear
from keras.optimizers import Adam
from keras.losses import mean_squared_error
import random
from collections import deque 

# Environment Notes:
  - `new_state` is an array of 8 observations
    - Num   Observation
    - 0     Lander X Coord
    - 1     Lander Y Coord
    - 2     Lander X Velocity
    - 3     Lander Y Velocity
    - 4     Lander Angle
    - 5     Lander Angular Velocity
    - 6     Left Lander Leg Grounded
    - 7     Right Lander Leg Grounded

  - `action_space` is an array of 4 actions 
    - Num   Action
    - 0	    Do Nothing/Coast
    - 1	    Fire Left Engine
    - 2	    Fire Bottom Engine
    - 3	    Fire Right Engine

  - The landing pad is always at coordinates (0,0). The coordinates are the first two numbers in the state vector. The reward for moving from the top of the screen to the landing pad and coming to zero speed is about 100..140 points. If the lander moves away from the landing pad, it loses that reward again. The episode finishes if the lander crashes or comes to rest, receiving an additional -100 or +100 points. Each leg ground contact is +10. Firing the main engine is -0.3 points each frame. Solved is 200 points. Landing outside the landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land on its first attempt. Episodes are terminated if the episode length is greater than `MAX_STEPS` or the lander exceeds the boundaries.
      
```
print("Number of States:", env.observation_space.shape[0])  ->  Number of States: 8
print("Number of Actions per State:", env.action_space.n)   ->  Number of Actions per State: 4
```

In [2]:
# Build the OpenAI Gym "LunarLander-v2" environment that the agent is constructed with
env = gym.make("LunarLander-v2")

In [3]:
# #Runs 10000 games of 300 frames each, firing the bottom thruster every 3rd frame (disabled demo)
# for _ in range(10000):
#     env.reset()
#     for s in range(300):
        
#         env.render(True) #Renders Environment. CAUTION: Rendering takes more time to train
        
#         #Picks Action from a fixed pattern: coast by default, fire the bottom engine every 3rd frame
#         action = 0
#         if s%3==0: #Fires every 3rd frame
#             action = 2
        
#         new_state, reward, done, _ = env.step(action) #Takes the action

#         #changes states
#         state = new_state

#         # #Handles if game finished
#         # if done:
#         #     break

In [4]:
#Setting up the DQN
class DQN():
    """Epsilon-greedy Deep Q-Network agent for a discrete-action Gym environment.

    The agent owns a fully connected Q-network (512-256-<num actions>), a
    bounded buffer of the most recent transitions, and the training loop that
    fills the buffer and fits the network on one random mini-batch per
    environment step.

    Args:
        env: Environment exposing `action_space.n`, `observation_space.shape`,
            `reset()` returning an observation and `step(action)` returning
            `(new_state, reward, done, info)`.
        lr: Learning rate handed to the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Factor epsilon is multiplied by after every episode
            while it is still above `epsilon_min` (0.01).
    """

    def __init__(self, env, lr, gamma, epsilon, epsilon_decay):
        # Hyperparameters
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = 0.01

        # Environment variables
        self.env = env
        self.action_space = env.action_space
        self.observation_space = env.observation_space
        self.num_action_space = env.action_space.n
        self.num_observation_space = env.observation_space.shape[0]

        # Training variables
        self.training_data = deque(maxlen=500000)  # only the newest 500,000 steps are kept
        self.rewards_list = []                     # one total reward per finished episode
        self.batch_size = 64
        self.high_score = -8000

        # Q-network: observation -> 512 -> 256 -> one linear output per action
        model = keras.Sequential()
        model.add(keras.layers.Dense(512, input_dim=self.num_observation_space, activation=relu))
        model.add(keras.layers.Dense(256, activation=relu))
        model.add(keras.layers.Dense(self.num_action_space, activation=linear))

        # Compiling model using MSE loss and the Adam optimizer
        # NOTE(review): newer Keras releases spell this argument `learning_rate` - confirm
        # against the installed version before renaming.
        model.compile(loss=mean_squared_error, optimizer=Adam(lr=self.lr))

        self.model = model
        # summary() prints the table itself and returns None, so it must not be
        # wrapped in print() (that emitted a stray "None" line).
        model.summary()

    def get_action(self, state):
        """Pick an action for `state` with an epsilon-greedy policy.

        Args:
            state: Observation batch of one row, as produced by
                `np.reshape(obs, [1, num_observation_space])`.

        Returns:
            int: A uniformly random action index with probability `epsilon`,
            otherwise the index of the largest predicted Q-value.
        """
        if np.random.rand() < self.epsilon:
            return random.randrange(self.num_action_space)
        # predict_on_batch avoids the per-call overhead of predict() for a single row
        q_values = np.asarray(self.model.predict_on_batch(state))
        return int(np.argmax(q_values[0]))

    def learn(self):
        """Fit the Q-network on one random mini-batch from the replay buffer.

        Does nothing while the buffer holds fewer than `batch_size` transitions,
        or once the mean reward of the last 10 finished episodes exceeds 180.
        """
        if len(self.training_data) < self.batch_size:
            return
        # The emptiness check matters during the very first episode:
        # np.mean([]) is nan and emits a RuntimeWarning.
        if self.rewards_list and np.mean(self.rewards_list[-10:]) > 180:
            return

        # Randomly samples `batch_size` stored steps
        sample = random.sample(self.training_data, self.batch_size)

        # Each stored step is (state, action, reward, new_state, done).
        # vstack stacks the (1, n) rows into (batch, n) and, unlike squeeze,
        # keeps the batch axis even when batch_size is 1.
        states = np.vstack([step[0] for step in sample])
        actions = np.array([step[1] for step in sample])
        rewards = np.array([step[2] for step in sample])
        new_states = np.vstack([step[3] for step in sample])
        not_done = 1.0 - np.array([step[4] for step in sample], dtype=np.float32)

        # Bellman target: reward plus the discounted best next-state value,
        # with the bootstrap term zeroed for terminal steps.
        next_q = np.asarray(self.model.predict_on_batch(new_states))
        targets = rewards + self.gamma * np.amax(next_q, axis=1) * not_done

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss.
        target_vec = np.array(self.model.predict_on_batch(states))
        target_vec[np.arange(self.batch_size), actions] = targets

        self.model.fit(states, target_vec, epochs=1, verbose=0)

    def train(self, episodes = 500, max_steps = 1000):
        """Play episodes in `self.env`, storing every step and learning after each one.

        Training stops early once the mean reward of the last 100 episodes
        exceeds 200.

        Args:
            episodes: Maximum number of episodes to play.
            max_steps: Maximum number of environment steps per episode.
        """
        # The context manager closes the progress bar even if training is interrupted
        with tqdm(total=episodes, position=0, leave=False) as progress:
            for _ in range(episodes):
                progress.update(1)

                # Use the environment this agent was built with, not a notebook global
                state = self.env.reset()
                state = np.reshape(state, [1, self.num_observation_space])
                episode_reward = 0

                for _step in range(max_steps):
                    #self.env.render()

                    action = self.get_action(state)

                    new_state, reward, done, _info = self.env.step(action)
                    new_state = np.reshape(new_state, [1, self.num_observation_space])

                    # Record the frame, then advance the game and train on a mini-batch
                    self.training_data.append((state, action, reward, new_state, done))
                    episode_reward += reward
                    state = new_state

                    self.learn()

                    if done:
                        break

                # Tracks rewards and keeps a high score
                self.rewards_list.append(episode_reward)
                if self.high_score < episode_reward:
                    self.high_score = episode_reward

                # Decay exploration once per episode, never dropping below epsilon_min
                if self.epsilon > self.epsilon_min:
                    self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

                if np.mean(self.rewards_list[-100:]) > 200: # Stops training if scores are above 200
                    print("Average Score: 200. Training Completed...")
                    break

                print(" || Reward: ", "%.2f" % episode_reward, "\t|| Average Reward: ", "%.2f" % np.mean(self.rewards_list[-100:]), "\t epsilon: ", "%.4f" % self.epsilon )

        print("Training Complete...")
        print("Highest Training Score:", self.high_score)

    def save(self, name):
        """Save the Q-network to `./LLModels/<name>`, creating the folder if needed.

        Args:
            name: File name of the saved model, e.g. "LLtrainedmodel.h5".
        """
        import os

        directory = "./LLModels"
        os.makedirs(directory, exist_ok=True)
        # Plain concatenation produced "./LLModels<name>" (no separator), which
        # does not match the "./LLModels/<name>" path the model is loaded from.
        # Leading separators are stripped so join() cannot discard the folder.
        self.model.save(os.path.join(directory, name.lstrip("/\\")))


In [5]:
# Hyperparameters for the agent
lr = 1e-3              # learning rate passed to the Adam optimizer
epsilon = 1.0          # initial probability of taking a random action
epsilon_decay = 0.995  # epsilon is multiplied by this after each episode
gamma = 0.99           # discount factor for the next-state value

model = DQN(
    env=env,
    lr=lr,
    gamma=gamma,
    epsilon=epsilon,
    epsilon_decay=epsilon_decay,
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               4608      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 1028      
Total params: 136,964
Trainable params: 136,964
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
# Train for up to 2000 episodes (training may stop earlier), then save the network
model.train(episodes=2000)
model.save("LLtrainedmodel.h5")

ward:  -579.78 	|| Average Reward:  -1697.23 	 epsilon:  0.0100
 92%|█████████▏| 1842/2000 [10:02:27<45:10, 17.16s/it] || Reward:  -367.02 	|| Average Reward:  -1697.04 	 epsilon:  0.0100
 92%|█████████▏| 1843/2000 [10:03:01<57:52, 22.12s/it] || Reward:  -1577.97 	|| Average Reward:  -1710.27 	 epsilon:  0.0100
 92%|█████████▏| 1844/2000 [10:03:22<56:59, 21.92s/it] || Reward:  -1862.88 	|| Average Reward:  -1727.33 	 epsilon:  0.0100
 92%|█████████▏| 1845/2000 [10:03:41<53:53, 20.86s/it] || Reward:  -2216.69 	|| Average Reward:  -1744.91 	 epsilon:  0.0100
 92%|█████████▏| 1846/2000 [10:03:49<43:32, 16.97s/it] || Reward:  -87.06 	|| Average Reward:  -1743.65 	 epsilon:  0.0100
 92%|█████████▏| 1847/2000 [10:04:28<1:00:16, 23.64s/it] || Reward:  -4672.42 	|| Average Reward:  -1784.35 	 epsilon:  0.0100
 92%|█████████▏| 1848/2000 [10:04:34<46:20, 18.29s/it]   || Reward:  -651.15 	|| Average Reward:  -1785.74 	 epsilon:  0.0100
 92%|█████████▏| 1849/2000 [10:04:43<39:07, 15.55s/it] || Rew

In [9]:
# Evaluate a saved model: play a few rendered episodes, always taking the
# action with the highest predicted Q-value (no exploration).
trained_model = load_model("./LLModels/LLtrainedmodel.h5") #chooses model to run

env = gym.make("LunarLander-v2")
num_observation_space = env.observation_space.shape[0]  # fixed for the environment, so computed once

rewards_list = []
high_score = -8000
MAX_EPISODES = 5
MAX_EPSIODES = MAX_EPISODES  # original (misspelled) name kept so existing references still work
MAX_STEPS = 1000

print("Starting Testing of the trained model...")
try:
    for e in range(MAX_EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, num_observation_space])
        episode_reward = 0

        for s in range(MAX_STEPS):
            env.render() # Renders Environment with Box2D

            # Executes the "best" action for the given state using trained_model's prediction.
            # predict_on_batch avoids the per-call overhead of predict() for a single row.
            q_values = np.asarray(trained_model.predict_on_batch(state))
            action = int(np.argmax(q_values[0]))

            new_state, reward, done, _ = env.step(action)
            state = np.reshape(new_state, [1, num_observation_space])

            episode_reward += reward # Reward Tally

            if done:
                break

        # Keeps track of Scores and High Score
        rewards_list.append(episode_reward)
        high_score = max(high_score, episode_reward)

        print(e, "\t: Episode || Reward: ", "%.2f" % episode_reward, "\t|| Average Reward: ", "%.2f" % np.mean(rewards_list))
finally:
    # Shuts down the environment (and its render window) even if an episode fails
    env.close()

print("Testing Complete...")
print("Highest Testing Score:", high_score)

Starting Testing of the trained model...
0 	: Episode || Reward:  -301.87 	|| Average Reward:  -301.87
1 	: Episode || Reward:  -2878.54 	|| Average Reward:  -1590.20
2 	: Episode || Reward:  -406.71 	|| Average Reward:  -1195.71
3 	: Episode || Reward:  -434.74 	|| Average Reward:  -1005.47
4 	: Episode || Reward:  -1995.58 	|| Average Reward:  -1203.49
Testing Complete...
Highest Testing Score: -301.8677143786214


# Saved Model Logs

1. Threshold Set to +300. No episodes accepted into training set. Model was making randomized actions
2. Threshold set to -200. 500 Episodes. Best model so far. It is able to control its vertical velocity well, but is still shaky on roll and targeting the pad (High Score: 270.715)
3. Threshold set to -200. 1000 Episodes.
4. Reformatted all the code. Training is now done per step rather than per episode. All steps are accepted, but training_data will only hold the 500,000 most recent steps. Great improvements in consistency.
5. Changed LR to .001, 500 episodes
6. Running 2000 Episodes, same hyperparameters. (High Score: 280.1251846858248). tried hopping along the floor. 6h:34m runtime
7. Running 4000 Episodes, ||. 16h:26m:28s runtime. (High Score: 298.89118380132743). Testing was POOOOOR. AVG Rew: (not recorded)
8. Fixed training. model is hot garbage