In [2]:
#@title Download data + packages
# Every command below redirects stdout and stderr to /dev/null, so a failed
# download or install produces no visible error; only the final `ls` shows
# what actually ended up in the working directory.
!echo Downloading stuff...
# Save the Dropbox shared link's payload as the file `temp`, then unpack it
# into ./rl (presumably the `rl` package imported by the next cell -- confirm).
!wget https://www.dropbox.com/sh/dlcrvrujfc8ypqf/AACcKH5Mno_qBMAuGQFe2vyqa?dl=0 -O temp > /dev/null 2>&1
!unzip temp -d rl > /dev/null 2>&1

!echo Installing packages... 

# The commented-out installs below are currently disabled; only the apt index
# refresh and the plotly upgrade actually run.
#!pip install Box2D > /dev/null 2>&1
#!pip install gym[box2d] pyvirtualdisplay > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
#!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install plotly --upgrade > /dev/null 2>&1

!echo Done!
# List the working directory so the downloaded/unpacked artifacts can be checked.
!ls

Downloading stuff...
Installing packages...
Done!
rl  sample_data  temp  training_results  training_results.zip


In [3]:
#@title Load libraries

import gym 
import numpy as np
import warnings
from collections import Counter, defaultdict
import pandas as pd
from tqdm import tqdm
import os
import random
from gym.wrappers import Monitor

from rl.agents.dqn import DQNAgent
from rl.callbacks import Callback, FileLogger, ModelIntervalCheckpoint
from rl.memory import SequentialMemory
from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy, LinearAnnealedPolicy, Policy
from rl.processors import Processor
warnings.filterwarnings("ignore")

In [None]:
def run_blackjack(policy, show=False):
    """Play one episode of gym's ``Blackjack-v0`` using ``policy``.

    Args:
        policy: Callable that receives the current observation and returns
            the action handed to ``env.step``.
        show: If truthy, print the episode outcome (based on the last
            reward) followed by ``env.player`` and ``env.dealer``.

    Returns:
        A list with one ``(obs, move, new_obs, done, reward)`` tuple per step,
        in the order the steps were taken.

    Raises:
        Whatever ``policy`` or the environment raises. The environment is
        closed before the exception propagates.
    """
    env = gym.make("Blackjack-v0")
    # try/finally guarantees the environment is released even when the policy
    # fails mid-episode (e.g. an interactive policy receiving bad input).
    try:
        obs = env.reset()  # initial observation of a fresh episode
        history = []
        done = False
        while not done:
            move = policy(obs)
            new_obs, reward, done, _ = env.step(move)
            history.append((obs, move, new_obs, done, reward))
            obs = new_obs

        if show:
            # The reward of the final step decides the outcome of the episode.
            result = history[-1][-1]
            if result > 0:
                print("Congrats! You win")
            elif result == 0:
                print("Tie!")
            elif result < 0:
                print("You lose!")
            print("Your cards: ", env.player)
            print("Dealers cards", env.dealer)
    finally:
        env.close()
    return history

def human_blackjack_player(obs):
    """Interactive policy: show the observation and ask the user for a move.

    Args:
        obs: Indexable observation; ``obs[0]`` is printed as the player's
            total, ``obs[1]`` as the dealer's card and ``obs[2]`` as the
            usable-ace flag.

    Returns:
        ``0`` (stay) or ``1`` (hit), as typed by the user.

    Anything that is not the integer 0 or 1 is rejected and the prompt is
    repeated, so a typo no longer aborts the episode with ``ValueError`` and
    an out-of-range number is never passed on to the environment.
    """
    print(f"Your total: {obs[0]}")
    print(f"Dealer card: {obs[1]}")
    print(f"Usable ace?: {obs[2]}")
    while True:
        answer = input("0 for stay 1 for hit: ")
        try:
            move = int(answer)
        except ValueError:
            move = None  # not a number at all
        if move in (0, 1):
            return move
        print("Please enter 0 or 1.")

In [4]:
def q_policy(s, q):
    """Greedy action for state ``s`` under the Q-table ``q``.

    ``q`` is indexed with ``(state, action)`` keys. Action 0 is returned
    unless action 1 has a strictly larger value, so ties go to action 0.
    """
    # Look up action 0 first, then action 1 (the order matters when ``q`` is
    # a defaultdict, because a lookup inserts the missing key).
    stay_value = q[(s, 0)]
    hit_value = q[(s, 1)]
    return 0 if stay_value >= hit_value else 1

def print_hist(h):
    """Print one history record, one labelled field per line.

    ``h`` is indexed from 0 to 4 and is expected to be laid out as
    ``(state, action, next_state, done, reward)``.
    """
    labels = (
        "The state was: ",
        "The action was: ",
        "The state is now: ",
        "The episode is over: : ",
        "Reward for this action: ",
    )
    for position, label in enumerate(labels):
        print(f"{label}{h[position]}")

In [5]:
# Play one episode with the interactive policy (it prompts for each move),
# print the outcome, then show the first recorded step of that episode.
history = run_blackjack(human_blackjack_player, show=True)
print_hist(history[0])

Your total: 13
Dealer card: 3
Usable ace?: False
0 for stay 1 for hit: 0
Congrats! You win
Your cards:  [5, 8]
Dealers cards [3, 3, 4, 5, 7]
The state was: (13, 3, False)
The action was: 0
The state is now: (13, 3, False)
The episode is over: : True
Reward for this action: 1.0


In [6]:
def eps_policy(s, q, eps):
    """Epsilon-greedy action selection.

    Draws one number from ``random.random()``; if ``eps`` is strictly greater,
    a random action from ``[0, 1]`` is returned, otherwise the greedy action
    ``q_policy(s, q)`` is returned.
    """
    explore = eps > random.random()
    if not explore:
        return q_policy(s, q)
    return random.choice([0, 1])


In [7]:
def q_learn_blackjack(eps, gamma, alpha, epochs):
    """Learn a tabular Q-function by playing ``epochs`` episodes.

    Each episode is generated by ``run_blackjack`` with an epsilon-greedy
    policy over the table being learned, then replayed from its last step to
    its first, blending each visited ``(state, action)`` entry towards its
    target with weight ``alpha``.

    Args:
        eps: Exploration parameter forwarded to ``eps_policy``.
        gamma: Multiplier applied to the best next-state value.
        alpha: Weight of the new target in each update.
        epochs: Number of episodes to play.

    Returns:
        ``(q_table, training_results)``: the ``defaultdict`` keyed by
        ``(state, action)`` and the list of final rewards, one per episode.
    """
    q_table = defaultdict(int)
    training_results = []

    def behaviour_policy(obs):
        # Epsilon-greedy with respect to the table as it currently stands.
        return eps_policy(obs, q_table, eps)

    for _ in tqdm(range(epochs)):
        history = run_blackjack(behaviour_policy)
        # Walk the episode backwards so later steps are updated first.
        for s, a, s_next, done, r in history[::-1]:
            key = (s, a)
            old = q_table[key]
            if done:
                # Terminal step: the target is just the reward.
                target = r
            else:
                value_0 = q_table[(s_next, 0)]
                value_1 = q_table[(s_next, 1)]
                target = r + gamma * max(value_0, value_1)
            q_table[key] = (1 - alpha) * old + alpha * target
        training_results.append(history[-1][-1])
    return q_table, training_results

In [8]:
# Hyper-parameters for the tabular Q-learning run below.
EPS = 0.7  # passed as `eps`: eps_policy takes a random action when eps > random.random()
GAMMA = 1  # passed as `gamma`: multiplier on the best next-state Q-value
ALPHA = 0.2  # passed as `alpha`: weight given to the new target in each update
EPOCHS = 50000  # number of episodes played during training
q_table, results = q_learn_blackjack(EPS, GAMMA, ALPHA, EPOCHS)

100%|██████████| 50000/50000 [00:28<00:00, 1734.14it/s]


In [9]:
# Display the learned table; keys are (state, action) pairs, values are Q-values.
q_table

defaultdict(int,
            {((4, 1, False), 0): -0.46578227200000005,
             ((4, 1, False), 1): -0.37462317525773614,
             ((4, 2, False), 0): -0.48800000000000004,
             ((4, 2, False), 1): -0.18572556575405358,
             ((4, 3, False), 0): -0.7091552165888002,
             ((4, 3, False), 1): -0.17942767726145747,
             ((4, 4, False), 0): 0.8483876544512001,
             ((4, 4, False), 1): 0.09587112199169294,
             ((4, 5, False), 0): -0.14752000000000004,
             ((4, 5, False), 1): 0.08271819980548926,
             ((4, 6, False), 0): -0.42213419008,
             ((4, 6, False), 1): 0.11943767680023408,
             ((4, 7, False), 0): -0.30560564166656,
             ((4, 7, False), 1): -0.0360433607440372,
             ((4, 8, False), 0): -0.8926258176000001,
             ((4, 8, False), 1): 0.025096366595403218,
             ((4, 9, False), 0): -0.4702848,
             ((4, 9, False), 1): -0.27222033173824617,
             ((4, 10

In [15]:
#@title Running it multiple times to test policy
# Each execution of this cell plays ONE episode with the greedy policy over
# the learned q_table; re-run the cell to sample further episodes.
# `show=1` is truthy, so the outcome and both hands are printed.
run_blackjack(lambda s : q_policy(s, q_table), show=1)

You lose!
Your cards:  [5, 7, 10]
Dealers cards [5, 10]


[((12, 5, False), 1, (22, 5, False), True, -1.0)]