In [23]:
import sys
import gym 
import numpy as np
from collections import defaultdict 
from utils import plot_blackjack_values,plot_policy

# Notebook-level Blackjack environment; later cells read this global as `env`.
env= gym.make('Blackjack-v0')

In [24]:
# Show the observation space first, then the action space, one per line.
for space in (env.observation_space, env.action_space):
  print(space)

Tuple(Discrete(32), Discrete(11), Discrete(2))
Discrete(2)


In [25]:
### Random policy
# Play three episodes, sampling every action from the action space, and
# print each visited state followed by the final reward and outcome.
for episode_idx in range(3):
  state = env.reset()
  while True:
    print(state)
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    if done:
      print("End Game : Reward -", reward)
      # A reward of 0 is a draw, so it must not be reported as a loss.
      if reward > 0:
        print("You won ! Hola :D")
      elif reward < 0:
        print("You lost ! :(")
      else:
        print("Draw !")
      break

(15, 5, False)
End Game : Reward - 1.0
You won ! Hola :D
(12, 3, False)
End Game : Reward - 1.0
You won ! Hola :D
(10, 10, False)
End Game : Reward - -1.0
You lost ! :(


In [26]:
## Monte Carlo prediction (for this environment, first-visit and every-visit MC give the same result)
## Stochastic episode generation

def generate_episode(env_instance):
  """Play one episode with a fixed stochastic policy and record every step.

  The policy looks only at the first component of the state (presumably the
  player's current sum in Blackjack -- confirm against the environment docs):
  when it is above 18, action 0 is chosen with probability 0.8; otherwise
  action 1 is chosen with probability 0.8.

  Args:
    env_instance: environment whose ``reset()`` returns an initial state and
      whose ``step(action)`` returns ``(next_state, reward, done, info)``.

  Returns:
    List of ``(state, action, reward)`` tuples in the order they occurred.
  """
  episode = []
  # Interact with the environment that was passed in, not the notebook-level
  # global `env`, so the function works with any environment instance.
  state = env_instance.reset()
  while True:
    probs = [0.8, 0.2] if state[0] > 18 else [0.2, 0.8]
    action = np.random.choice(np.arange(2), p=probs)
    next_state, reward, done, info = env_instance.step(action)
    # Store the state the action was taken in, not the resulting state.
    episode.append((state, action, reward))
    state = next_state
    if done:
      break

  return episode

In [27]:
##Monte-carlo every visit Control 

def every_visit_monte_carlo_control(env , num_episodes , generate_episode, gamma =1.0):
  """Estimate action values with every-visit Monte Carlo averaging.

  For every (state, action) pair occurring in an episode, the discounted
  return that followed it is added to a running total, and Q is set to the
  mean of all returns observed so far for that pair. The policy being
  evaluated is whatever ``generate_episode`` implements; this function does
  not improve the policy.

  Args:
    env: environment exposing ``action_space.n``; passed through to
      ``generate_episode``.
    num_episodes: number of episodes to sample.
    generate_episode: callable taking ``env`` and returning a list of
      ``(state, action, reward)`` tuples.
    gamma: discount factor applied per time step.

  Returns:
    defaultdict mapping each visited state to an array of length
    ``env.action_space.n`` holding the estimated action values.
  """
  nA = env.action_space.n
  returns_sum = defaultdict(lambda: np.zeros(nA))
  visit_count = defaultdict(lambda: np.zeros(nA))
  Q = defaultdict(lambda: np.zeros(nA))

  for i_episode in range(1, num_episodes+1):
    if i_episode % 10000 == 0:
      print("\rEpisode {}/{}.".format(i_episode,num_episodes),end = "")
    episode = generate_episode(env)
    if not episode:
      # Nothing to learn from an empty episode (and zip(*[]) cannot be unpacked).
      continue
    states, actions, rewards = zip(*episode)
    rewards = np.array(rewards, dtype=float)
    n_steps = len(rewards)

    # discounts[k] == gamma ** k, the weight of a reward received k steps later
    discounts = np.array([gamma ** k for k in range(n_steps)])

    for t, state in enumerate(states):
      action = actions[t]
      # Discounted return following time step t
      G = sum(rewards[t:] * discounts[:n_steps - t])
      # Index by the single visited `state`, not the whole tuple of states
      returns_sum[state][action] += G
      visit_count[state][action] += 1.0
      # Q is the sample mean, so assign it instead of accumulating with +=
      Q[state][action] = returns_sum[state][action] / visit_count[state][action]
  return Q

In [28]:
# Result: action-value estimates from 500000 episodes sampled with generate_episode
Q = every_visit_monte_carlo_control(
  env,
  num_episodes=500000,
  generate_episode=generate_episode,
)
# Uncomment to inspect every state and its action values:
# for k, v in Q.items():
#   print(k)
#   print(v)

Episode 500000/500000.

**Constant-Alpha Monte Carlo Control and GLIE (Greedy in the Limit with Infinite Exploration)**

In [32]:
def generate_episode_for_glie(env, Q, epsilon , nA):
  """Roll out one episode, acting epsilon-greedily with respect to Q.

  For a state that is already a key of Q, the action is drawn from the
  distribution returned by ``get_probs``; for any other state it comes from
  ``env.action_space.sample()``.

  Returns:
    List of ``(state, action, reward)`` tuples in the order they occurred.
  """
  trajectory = []
  current = env.reset()
  finished = False
  while not finished:
    if current in Q:
      action_probs = get_probs(Q[current], epsilon, nA)
      chosen = np.random.choice(np.arange(nA), p=action_probs)
    else:
      chosen = env.action_space.sample()
    successor, reward, finished, info = env.step(chosen)
    # Record the state the action was taken in before moving on
    trajectory.append((current, chosen, reward))
    current = successor

  return trajectory

def get_probs(Q_s , epsilon , nA):
  """Return epsilon-greedy action probabilities for one state's action values.

  Every action receives ``epsilon / nA``; the action with the largest value
  in ``Q_s`` (the first one on ties) receives ``1 - epsilon + epsilon / nA``.
  """
  exploration_share = epsilon / nA
  probabilities = np.full(nA, exploration_share, dtype=float)
  greedy_index = np.argmax(Q_s)
  probabilities[greedy_index] = 1 - epsilon + exploration_share
  return probabilities

In [36]:
def update_step_q(env,episode,Q,alpha, gamma):
  """Apply a constant-alpha Monte Carlo update to Q for every step of an episode.

  For each (state, action) pair at time step t, the discounted return G that
  followed it is computed and the estimate is moved toward it:
  ``Q[state][action] += alpha * (G - Q[state][action])``.

  Args:
    env: unused; kept so existing callers do not break.
    episode: list of ``(state, action, reward)`` tuples.
    Q: mapping from state to a mutable sequence of action values; updated
      in place.
    alpha: step size.
    gamma: discount factor applied per time step.

  Returns:
    The same ``Q`` object that was passed in.
  """
  if not episode:
    # zip(*[]) cannot be unpacked into three names; there is nothing to update.
    return Q

  states, actions, rewards = zip(*episode)
  rewards = np.array(rewards, dtype=float)
  n_steps = len(rewards)
  # discounts[k] == gamma ** k, the weight of a reward received k steps later
  discounts = np.array([gamma ** k for k in range(n_steps)])
  for t, state in enumerate(states):
    action = actions[t]
    G = sum(rewards[t:] * discounts[:n_steps - t])
    # Index by the single visited `state`, not the whole tuple of states
    prior_Q = Q[state][action]
    Q[state][action] = prior_Q + alpha * (G - prior_Q)

  return Q



def constant_alpha_monte_carlo_control(env, num_episodes, alpha, gamma=1.0, eps_start=1.0, eps_decay=0.99999, eps_min=0.05):
  """Run constant-alpha Monte Carlo control with a decaying epsilon.

  Before each episode epsilon is multiplied by ``eps_decay`` and floored at
  ``eps_min``. An episode is then generated with
  ``generate_episode_for_glie`` and Q is updated with ``update_step_q``.

  Returns:
    Tuple ``(policy, Q)``: ``policy`` maps each key of Q to the index of its
    largest action value, and ``Q`` is the defaultdict of action values.
  """
  n_actions = env.action_space.n
  Q = defaultdict(lambda: np.zeros(n_actions))
  epsilon = eps_start

  for episode_number in range(1, num_episodes + 1):
    # Progress report every 10000 episodes, overwriting the same output line
    if not episode_number % 10000:
      print("\rEpisode {}/{}.".format(episode_number, num_episodes), end="")

    decayed = epsilon * eps_decay
    epsilon = decayed if decayed > eps_min else eps_min

    sampled_episode = generate_episode_for_glie(env, Q, epsilon, n_actions)
    Q = update_step_q(env, sampled_episode, Q, alpha, gamma)

  policy = {key: np.argmax(values) for key, values in Q.items()}
  return policy, Q
             

In [37]:
# Learn a policy and its action values from 500000 episodes with step size 0.02
policy, Q = constant_alpha_monte_carlo_control(env, num_episodes=500000, alpha=0.02)

Episode 500000/500000.

In [42]:
# plot = dict((k,np.max(v)) for k, v in Q.items())
# plot_blackjack_values(plot)

In [43]:
# plot_policy(policy)