In [None]:
import os
import math
import random
import pandas as pd
import numpy as np
import gym

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, RMSprop

from sklearn.metrics import accuracy_score

In [None]:
env = gym.make('CartPole-v0')

In [None]:
env.action_space

In [None]:
env.observation_space

In [None]:
env.observation_space.low.astype(np.float16)

In [None]:
env.observation_space.high.astype(np.float16)

In [None]:
env.reset()

In [None]:
class NNAgent:
    
    def __init__(self):
        self.max = 0
        self.scores = list()
        self.memory = list()
        self.model = self._build_model()
        
    
    def _build_model(self):
        model = Sequential()
        model.add(
            Dense(24, input_dim=4, activation='sigmoid')
        )
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer=RMSprop(learning_rate=0.001))
        return model
    
    def act(self, state):
        if random.random() <= 0.5:
            return env.action_space.sample()
        action = np.where(
            self.model.predict(state, batch_size=None)[0, 0] > 0.5, 1, 0
        )
        return action
    
    def train_model(self, state, action):
        self.model.fit(state, np.array([action,]), epochs=1, verbose=False)
        
        
    def learn(self, episodes):
        for e in range(1, episodes + 1):
            state = env.reset()[0]
            for _ in range(201):
                state = np.reshape(state, [1, 4])
                action = self.act(state)
                next_state, reward, done, info, temp = env.step(action)
                
                if done:
                    score = _ + 1
                    self.scores.append(score)
                    self.max = max(score, self.max)
                    print(f'episode : {e}/{episodes} | score : {score} | max : {self.max}')
                    break
                
                self.memory.append((state, action))
                self.train_model(state, action)
                state = next_state
                
                            
    

In [None]:
episodes = 500

agent = NNAgent()
agent.learn(episodes)


In [None]:
np.mean(agent.scores)

In [None]:
np.std(agent.scores)

In [None]:
np.min(agent.scores)
np.max(agent.scores)

In [None]:
agent.memory[2]

In [None]:
f = np.array([m[0][0] for m in agent.memory])
f

In [None]:
l = np.array([m[1] for m in agent.memory])
l

In [None]:
accuracy_score(np.where(agent.model.predict(f)>0.5, 1, 0), l)

## DQL Agent

In [None]:
from collections import deque


class DQLAgent:
    
    def __init__(self, gamma=0.95, hu=24, opt=Adam, lr=0.001, finish=False):
        self.finish = finish
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.gamma = gamma
        self.batch_size = 32
        self.max_treward = 0
        self.averages = list()
        self.memory = deque(maxlen=2000)
        self.osn = env.observation_space.shape[0]
        self.model = self._build_model(hu, opt, lr)
        
    def _build_model(self, hu, opt, lr):
        model = Sequential()
        model.add(
            Dense(hu, input_dim=self.osn, activation="relu")
        )
        model.add(
            Dense(hu, activation='relu')
        )
        model.add(
            Dense(env.action_space.n, activation='linear')
        )
        model.compile(loss='mse', optimizer=opt(learning_rate=lr))
        return model
    
    def act(self, state):
        if random.random() <= self.epsilon:
            return env.action_space.sample()
        action = self.model.predict(state)[0]
        return np.argmax(action)
    
    def replay(self):
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            if not done:
                reward += self.gamma * np.amax(
                    self.model.predict(next_state)
                )
            target = self.model.predict(state)
            target[0, action] = reward
            self.model.fit(state, target, epochs=1, verbose=False)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
    
    def learn(self, episodes):
        trewards = []
        for e in range(1, episodes+1):
            state = env.reset()[0]
            state = np.reshape(state, [1, self.osn])
            for _ in range(5000):
                action = self.act(state)
                next_state, reward, done, info, temp = env.step(action)
                next_state = np.reshape(next_state, [1, self.osn])
                
                self.memory.append([state, action, reward, next_state, done])
                
                state = next_state
                if done:
                    treward = _ +1
                    trewards.append(treward)
                    av = np.mean(trewards[-25:])
                    self.averages.append(av)
                    
                    self.max_treward = max(self.max_treward, treward)
                    print(f"Episodes : {e}/{episodes} | treward : {treward}, av : {av}, max : {self.max_treward}")
                    break
            
            if av > 195 and self.finish:
                break
            
            if len(self.memory) > self.batch_size:
                self.replay()
                
    
    def test(self, episodes):
        trewards = []
        for e in range(1, episodes+1):
            state = env.reset()[0]
            for _ in range(5001):
                state = np.reshape(state, [1, self.osn])
                action = np.argmax(self.model.predict(state)[0])
                next_state, reward, done, info, temp = env.step(action)
                state = next_state
                
                if done:
                    treward = _ + 1
                    trewards.append(treward)
                    print(f"episodes : {e}/{episodes} | treward : {treward}")
                    break
        return trewards
                    

In [None]:
episodes = 1000

agent = DQLAgent(finish=True)
agent.learn(episodes)


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize = (10, 6))
x = range(len(agent.averages))
y=np.polyval(np.polyfit(x, agent.averages, deg=3), x)
plt.plot(agent.averages, label='trend')
plt.xlabel('episodes')
plt.plot(x, y, 'r--', label='moving average')
plt.legend()

In [54]:
trewards = agent.test(100)

episodes : 10/100 | treward : 193
episodes : 11/100 | treward : 170
episodes : 12/100 | treward : 187
episodes : 13/100 | treward : 615
episodes : 14/100 | treward : 169
episodes : 15/100 | treward : 188
episodes : 16/100 | treward : 168
episodes : 17/100 | treward : 162
episodes : 18/100 | treward : 712
episodes : 19/100 | treward : 156
episodes : 20/100 | treward : 209
episodes : 21/100 | treward : 181
episodes : 22/100 | treward : 672
episodes : 23/100 | treward : 169
episodes : 24/100 | treward : 171
episodes : 25/100 | treward : 172
episodes : 26/100 | treward : 185
episodes : 27/100 | treward : 513
episodes : 28/100 | treward : 544
episodes : 29/100 | treward : 187
episodes : 30/100 | treward : 189
episodes : 31/100 | treward : 179
episodes : 32/100 | treward : 177
episodes : 33/100 | treward : 686
episodes : 34/100 | treward : 493
episodes : 35/100 | treward : 212
episodes : 36/100 | treward : 227
episodes : 37/100 | treward : 156
episodes : 38/100 | treward : 184
episodes : 39/

In [56]:
np.mean(trewards)

print(f"min : {}")

285.23

In [59]:
np.std(trewards)

251.37318293724172

In [58]:
max(trewards)

1975

In [57]:
min(trewards)

156