In [None]:
import math
import os
import random
from collections import deque

import gym
import imageio
import matplotlib.animation as animation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [None]:
class CartPoleDQN():
    '''
    Deep-Q learning agent for openai-gym's CartPole-v1.

    The agent owns the environment, a small fully connected Q-network
    (4 inputs -> 24 -> 24 -> 2 outputs), a replay memory of past transitions,
    and the per-episode score history collected by run().
    '''

    def __init__(self, episodes=1000, win_cond=195, gamma=.9,
                 epsilon=1, min_eps=.01, eps_decay=.995, alpha=.001, batch_size=32):
        '''
        An implementation of deep-Q learning to solve openai-gym's CartPole-v1


        episodes: max number of episodes to run
        win_cond: the scoring criteria to consider the environment solved
        gamma: discount factor on future rewards
        epsilon: agent chooses greedy action with probability(1-epsilon)
        min_eps: minimum epsilon value
        eps_decay: rate of decay for epsilon term (how quickly the agent shifts from exploration to exploitation)
        alpha: learning rate for tf.keras.optimizers.Adam
        batch_size: how many states to train on during experience replay
        '''
        self.episodes = episodes
        self.win_cond = win_cond
        self.gamma = gamma
        self.epsilon = epsilon
        self.min_eps = min_eps
        self.eps_decay = eps_decay
        self.alpha = alpha
        self.batch_size = batch_size
        # Replay memory; once full, the oldest transitions are discarded first.
        self.memory = deque(maxlen=100000)
        self.env = gym.make('CartPole-v1')

        # Q-network: one linear output per action.
        self.model = Sequential()
        self.model.add(Dense(24, input_dim=4, activation='relu'))
        self.model.add(Dense(24, activation='relu'))
        self.model.add(Dense(2, activation='linear'))
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.alpha))

        # Number of steps survived in each training episode, in order.
        self.history = []

    def save_memory(self, state, action, reward, new_state, done):
        '''Append one (state, action, reward, new_state, done) transition to the replay memory.'''
        self.memory.append((state, action, reward, new_state, done))

    def choose_action(self, state, epsilon):
        '''
        Epsilon-greedy action selection.

        state: observation already reshaped by reshape_state
        epsilon: probability of sampling a random action from the environment

        Returns the sampled action, or the index of the largest predicted Q-value.
        '''
        if np.random.random() < epsilon:
            return self.env.action_space.sample()
        # verbose=0 keeps Keras from printing a progress bar on every single step.
        return np.argmax(self.model.predict(state, verbose=0))

    def reshape_state(self, state):
        '''Return the observation as a (1, 4) array, the input shape the network was built with.'''
        return np.reshape(state, [1, 4])

    def replay(self, batch_size):
        '''
        Train on a random sample of stored transitions, then decay epsilon.

        Does nothing (no training, no decay) while the memory holds fewer than
        batch_size transitions. Each sampled transition is fitted individually,
        so later targets in the batch are computed with the already-updated network.
        '''
        if len(self.memory) < batch_size:
            return
        batch = random.sample(self.memory, batch_size)
        for state, action, reward, new_state, done in batch:
            # Terminal transitions have no future value to bootstrap from.
            new_q = reward
            if not done:
                new_q = reward + self.gamma*np.max(self.model.predict(new_state, verbose=0)[0])
            # Only the taken action's Q-value is moved toward the target;
            # the other output keeps its current prediction.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = new_q
            self.model.fit(state, q_values, verbose=0)

        if self.epsilon > self.min_eps:
            # Clamp so the last decay step cannot push epsilon below min_eps.
            self.epsilon = max(self.min_eps, self.epsilon * self.eps_decay)

    def run(self):
        '''
        Train for up to self.episodes episodes.

        The score of an episode is the number of steps taken before it ends.
        One replay pass is performed after each episode. Every 100 episodes
        (from episode 100 on) the mean of the last 100 scores is compared
        with win_cond.

        Returns the episode index at which that check succeeded, or None if
        the episode budget ran out first. The environment is closed on every
        exit path, including the early return and exceptions.
        '''
        scores = deque(maxlen=100)
        try:
            for episode in range(self.episodes):
                state = self.reshape_state(self.env.reset())
                done = False
                i = 0
                while not done:
                    action = self.choose_action(state, self.epsilon)
                    new_state, reward, done, _ = self.env.step(action)
                    new_state = self.reshape_state(new_state)
                    self.save_memory(state, action, reward, new_state, done)
                    state = new_state
                    i += 1

                scores.append(i)
                self.history.append(i)
                mean_score = np.mean(scores)
                if episode % 10 == 0:
                    print(f'Episode {episode}, Score: {i}, Epsilon: {self.epsilon}')
                if episode % 100 == 0 and episode >= 100:
                    print(f'Rolling mean (last 100 trials): {mean_score} after {episode} total trials')
                    if mean_score >= self.win_cond:
                        return episode
                self.replay(self.batch_size)
            return None
        finally:
            self.env.close()

    def test(self, trials=5):
        '''
        Play `trials` episodes with epsilon fixed at min_eps and record them.

        Every rendered frame is collected and written to './videos/solved.gif'
        (the directory is created if missing; nothing is written when no frame
        was captured). Transitions seen here are also added to the replay memory.
        The environment is closed even if an episode raises.
        '''
        frames = []
        try:
            for trial in range(1, trials+1):
                state = self.reshape_state(self.env.reset())
                done = False
                score = 0

                while not done:
                    frames.append(self.env.render(mode='rgb_array'))
                    action = self.choose_action(state, self.min_eps)
                    new_state, reward, done, _ = self.env.step(action)
                    new_state = self.reshape_state(new_state)
                    self.save_memory(state, action, reward, new_state, done)
                    state = new_state
                    score += 1
                print(f'Trial: {trial} Score: {score}')
        finally:
            self.env.close()
        if frames:
            # Create the output directory up front so the recording is not lost
            # to a missing-folder error after all episodes have been played.
            os.makedirs('./videos', exist_ok=True)
            imageio.mimsave('./videos/solved.gif', frames)
        

In [None]:
# Build the agent with a 1000-episode training budget; all other
# hyperparameters are left at the constructor defaults.
agent = CartPoleDQN(episodes=1000)

In [None]:
# Warm-start from a previously saved network when one is available. On a fresh
# checkout the checkpoint does not exist yet (it is only written by the save
# cell further down), so keep the freshly initialised model instead of failing.
if os.path.exists('./cartpole/'):
    agent.model = keras.models.load_model('./cartpole/')

In [None]:
# Train the agent. The cell output is run()'s return value: the episode index
# at which the 100-episode rolling mean reached win_cond, or nothing if it never did.
agent.run()

In [None]:
# One row per training episode; the single column holds that episode's score.
df = pd.DataFrame(agent.history)

In [None]:
# Preview the first few episode scores.
df.head()

In [None]:
# Persist the per-episode scores. Create the target directory first:
# to_csv does not create missing parent folders.
os.makedirs('./data', exist_ok=True)
df.to_csv('./data/cartpole.csv', index=False)

In [None]:
# Save the trained network to './cartpole' so a later session can reload it.
agent.model.save('./cartpole')

In [None]:
# Play a single evaluation episode; test() records the rendered frames
# to './videos/solved.gif'.
agent.test(1)

In [None]:
# Explicitly close the agent's environment. run() and test() already close it
# when they finish normally, so this is a safety net after interrupted runs.
agent.env.close()

In [None]:
# The environment lives on the agent; a bare `env` is never defined in this
# notebook and raised NameError on a fresh kernel.
agent.env.close()

In [None]:
# Use seaborn's 'darkgrid' style for the figure drawn below.
sns.set_style('darkgrid')

In [None]:
# Training curve: raw episode scores plus 10- and 100-episode rolling means,
# with the win criterion drawn as a dashed reference line.
fig, ax = plt.subplots(figsize=(16, 12))

ax.axhline(195, color='green', linestyle='--', label='Win Criterion')
ax.plot(df, alpha=.3, label='Actual Values', color='orange')

smoothed_10 = df.rolling(10).mean()
smoothed_100 = df.rolling(100).mean()
ax.plot(smoothed_10, alpha=.5, label='Rolling Mean (10)', color='blue')
ax.plot(smoothed_100, label='Rolling Mean (100)', color='green')

ax.legend(fontsize='x-large')
ax.set_xlabel('Episodes', fontsize=24)
ax.set_ylabel('Scores', fontsize=24)
# `ax` is the current axes, so the pyplot tick helpers act on it.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
ax.set_title('Cart Pole DQN', fontsize=30);