In [None]:
'''
Jacob Berman
CSCI 3202
Cart-Pole Q-Learning
'''
import numpy as np
import gym
import math
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Smoke test: drive CartPole with uniformly random actions for 1000 rendered
# steps to confirm that the environment and its viewer work.
env = gym.make('CartPole-v0')
env.reset()

try:
    for _ in range(1000):
        env.render()
        action = env.action_space.sample()
        _obs, _reward, done, _info = env.step(action)
        if done:
            # Stepping a finished episode is undefined; start a new one instead.
            env.reset()
finally:
    # Always release the render window, even if the cell is interrupted.
    env.close()

In [None]:
# Module-level result buffers read by the plotting cell. Every call to
# Qlearn.run() appends to them, so training again without re-running this cell
# accumulates results from the earlier runs as well.
finalRewards = []    # total reward of every episode played
timeReq  = []        # not written to by Qlearn
meanScores = []      # rolling mean recorded when a run reaches episode index 100
meanScores200 = []   # rolling mean recorded when a run reaches episode index 200


class Qlearn:
    """Tabular Q-learning agent for gym's 'CartPole-v0' environment.

    Observations are mapped onto a grid of ``buckets`` so that action values
    fit in a dense NumPy table indexed by (discretised observation..., action).
    """

    # Per-episode step cap used when maxEnvSteps is not given.
    defaultStepCap = 500

    def __init__(self, buckets=(1, 1, 6, 12,), nEpisodes=1000, nWins=195, minAlpha=0.1, minEpsilon=0.1,
                 gamma=1.0, adaDivisor=25, maxEnvSteps=None, quiet=False, monitor=False, render=True):
        """Create the environment and a zero-initialised Q-table.

        buckets     -- number of discrete bins per observation component.
        nEpisodes   -- maximum number of episodes run() will play.
        nWins       -- rolling mean reward (last 100 episodes) that counts as solved.
        minAlpha    -- lower bound of the learning-rate schedule.
        minEpsilon  -- lower bound of the exploration-rate schedule.
        gamma       -- discount factor used in the Q update.
        adaDivisor  -- divisor inside the log schedule; larger values delay the decay.
        maxEnvSteps -- optional per-episode step limit (None keeps the env default).
        quiet       -- suppress all progress printing when True.
        monitor     -- wrap the env in gym.wrappers.Monitor writing to 'tmp/cartpole-1'.
        render      -- draw every environment step during training (original behaviour).
        """
        self.buckets = buckets
        self.nEpisodes = nEpisodes
        self.nWins = nWins
        self.minAlpha = minAlpha
        self.minEpsilon = minEpsilon

        self.gamma = gamma
        self.adaDivisor = adaDivisor
        self.quiet = quiet
        self.render = render
        self.maxEnvSteps = maxEnvSteps

        self.env = gym.make('CartPole-v0')

        if maxEnvSteps is not None:
            # The time-limit wrapper returned by gym.make reads `_max_episode_steps`;
            # any other attribute name is silently ignored.
            self.env._max_episode_steps = maxEnvSteps

        if monitor:
            self.env = gym.wrappers.Monitor(self.env, 'tmp/cartpole-1', force=True)

        self.Q = np.zeros(self.buckets + (self.env.action_space.n,))  # Initialize Q-table

    def discretize(self, obs):
        """Map a continuous observation to a tuple of bucket indices.

        Each component is scaled linearly between its lower and upper bound,
        rounded to the nearest bucket and clamped to [0, buckets[i] - 1].
        """
        space = self.env.observation_space
        # Components 1 and 3 use hand-picked limits instead of the env's own
        # (presumably because those are too wide to bucket usefully -- confirm).
        upperBounds = [space.high[0], 0.5, space.high[2], math.radians(50)]
        lowerBounds = [space.low[0], -0.5, space.low[2], -math.radians(50)]

        state = []
        for value, low, high, nBuckets in zip(obs, lowerBounds, upperBounds, self.buckets):
            # `value - low` is the distance from the lower bound for any sign of
            # `low`; it equals the former `value + abs(low)` whenever low <= 0.
            ratio = (value - low) / (high - low)
            index = int(round((nBuckets - 1) * ratio))
            state.append(min(nBuckets - 1, max(0, index)))

        return tuple(state)

    def chooseAction(self, state, epsilon):
        """Epsilon-greedy policy: random action with probability epsilon, else the greedy one."""
        if np.random.random() <= epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.Q[state])

    def updateQ(self, stateOld, action, reward, stateNew, alpha):
        """Apply one Q-learning update to Q[stateOld][action] with step size alpha."""
        target = reward + self.gamma * np.max(self.Q[stateNew])
        self.Q[stateOld][action] += alpha * (target - self.Q[stateOld][action])

    def _decayed(self, t, floor):
        """Shared schedule: 1.0 while t + 1 <= adaDivisor, then falling with
        log10((t + 1) / adaDivisor), never below ``floor``."""
        return max(floor, min(1.0, 1.0 - math.log10((t + 1) / self.adaDivisor)))

    def getEpsilon(self, t):
        """Exploration rate for episode index t."""
        return self._decayed(t, self.minEpsilon)

    def getAlpha(self, t):
        """Learning rate for episode index t."""
        return self._decayed(t, self.minAlpha)

    def run(self):
        """Train for up to nEpisodes episodes.

        Appends every episode's total reward to the module-level finalRewards
        list, and the rolling mean to meanScores / meanScores200 when episode
        index 100 / 200 is reached.

        Returns ``e - 100`` as soon as the mean reward of the last 100 episodes
        reaches nWins (checked from episode index 100 on); otherwise the index
        of the last episode played (0 when nEpisodes is 0).
        """
        scores = deque(maxlen=100)
        stepLimit = self.defaultStepCap if self.maxEnvSteps is None else self.maxEnvSteps
        e = 0  # keeps the final report and return value defined when nEpisodes == 0

        for e in range(self.nEpisodes):
            currentState = self.discretize(self.env.reset())
            alpha = self.getAlpha(e)
            epsilon = self.getEpsilon(e)
            episodeReward = 0

            for _ in range(stepLimit):
                if self.render:
                    self.env.render()
                action = self.chooseAction(currentState, epsilon)
                obs, reward, done, _info = self.env.step(action)
                newState = self.discretize(obs)
                self.updateQ(currentState, action, reward, newState, alpha)
                currentState = newState
                episodeReward += reward

                if done:
                    if not self.quiet:
                        # Report the accumulated reward, not the 0-based step index.
                        print('Episode:{}/{} finished with reward:{}'.format(e, self.nEpisodes, episodeReward))
                    break

            scores.append(episodeReward)
            finalRewards.append(episodeReward)

            # `scores` holds at most the last 100 episodes, so this is a rolling mean.
            meanScore = np.mean(scores)

            if meanScore >= self.nWins and e >= 100:
                if not self.quiet:
                    print('\nRan {} episodes. Solved after {} episodes'.format(e, e - 100))
                return e - 100

            if e == 100:
                if not self.quiet:
                    print('\n[Episode {}] - Mean time over last 100 episodes was {} ticks.'.format(e, meanScore))
                meanScores.append(meanScore)
            if e == 200:
                if not self.quiet:
                    print('\n[Episode {}] - Mean time over last 100 episodes was {} ticks.'.format(e, meanScore))
                meanScores200.append(meanScore)

        if not self.quiet:
            print('\nDid not solve after {} episodes'.format(e))
        return e
            
if __name__ == "__main__":
    # Train one agent with the default hyper-parameters. run() appends its
    # results to the module-level finalRewards / meanScores / meanScores200
    # lists that the plotting cell reads.
    qlearn = Qlearn()
    qlearn.run()
    # Uncomment the loop below to run 10 more training passes on the same agent
    # (its Q-table is kept between calls). It is commented out because the extra
    # runs interfere with the other graphs.
    #for (x) in range(10):
    #    qlearn.run()

In [None]:
def _plotSeries(values, title, xlabel, marker=None):
    """Draw ``values`` as a 10x4 inch line chart with a 'Rewards' y axis.

    ``marker`` makes individual points visible; without it a series holding a
    single value renders as an empty chart.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(values, marker=marker)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Rewards')
    return ax


# One point per episode.
_plotSeries(finalRewards, 'Q-Learning Performance', 'Episode')

# These two lists receive at most one value per call to run(), so the x axis
# counts runs (not episodes) and the points need a marker to show up at all.
_plotSeries(meanScores, 'Mean scores for 0-100', 'Run', marker='o')
_plotSeries(meanScores200, 'Mean scores for 100-200', 'Run', marker='o')

plt.show()

