In [1]:
import argparse
import gymnasium as gym
import numpy as np
import random
import pickle
from tensorboardX import SummaryWriter




if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--outfolder', type=str, default='.', help='Output folder')
    parser.add_argument('--mapsize', type=int, default=8, help='Size of the FrozenLake map')
    parser.add_argument('--episodes', type=int, default=300, help='Number of training episodes')
    parser.add_argument('--is_slippery', action='store_true', help='Enable slippery terrain')
    args = parser.parse_args()
    main(args.outfolder, args.mapsize, args.episodes, args.is_slippery)


In [2]:
range1 = range(32)  # Discrete(32)
range2 = range(11)  # Discrete(11)
range3 = range(2)   # Discrete(2)


In [3]:
idx = 0
lookup = {}
for r1 in range1:
    for r2 in range2:
        for r3 in range3:
            lookup[(r1,r2,r3)] = idx
            idx+=1

In [4]:
#state_space = env.observation_space
action_space = 2
state_space = len(lookup)
Q = np.zeros((state_space, action_space))

In [5]:
outfolder = '.'
random.seed(96)
env_args = dict(natural=False, sab=False)
env = gym.make('Blackjack-v1', **env_args, render_mode="ansi")

  logger.warn(


In [6]:
writer = SummaryWriter(logdir=f'{outfolder}/runs/BlackJack_QLearning')
alpha = 0.8
gamma = 0.95
epsilon = 0.2
steps = 0
episodes = 200000

In [7]:
avg_return = 0
for episode in range(episodes):
    state = env.reset()[0]
    done = False
    total_reward = 0
    while not done:
        steps +=1
        if random.uniform(0,1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[lookup[state]])
        new_state, reward, done, truncated, info = env.step(action)
        Q[lookup[state]][action] += alpha * (reward + gamma * np.max(Q[lookup[new_state]]) - Q[lookup[state]][action])
        state = new_state
        total_reward += reward
    avg_return +=total_reward

    if episode%1000==0:
        writer.add_scalar('Return',avg_return/999,steps)
        print(f'Episode:{episode}, Avg Return:{avg_return/999}')
        avg_return=0
env.close()
writer.close()

Episode:0, Avg Return:-0.001001001001001001
Episode:1000, Avg Return:-0.19019019019019018
Episode:2000, Avg Return:-0.2602602602602603
Episode:3000, Avg Return:-0.22122122122122123
Episode:4000, Avg Return:-0.1891891891891892
Episode:5000, Avg Return:-0.18118118118118118
Episode:6000, Avg Return:-0.1961961961961962
Episode:7000, Avg Return:-0.22922922922922923
Episode:8000, Avg Return:-0.21621621621621623
Episode:9000, Avg Return:-0.2002002002002002
Episode:10000, Avg Return:-0.16616616616616617
Episode:11000, Avg Return:-0.25125125125125125
Episode:12000, Avg Return:-0.24624624624624625
Episode:13000, Avg Return:-0.2092092092092092
Episode:14000, Avg Return:-0.1971971971971972
Episode:15000, Avg Return:-0.14014014014014015
Episode:16000, Avg Return:-0.22022022022022023
Episode:17000, Avg Return:-0.2012012012012012
Episode:18000, Avg Return:-0.24024024024024024
Episode:19000, Avg Return:-0.1921921921921922
Episode:20000, Avg Return:-0.22722722722722724
Episode:21000, Avg Return:-0.1751

In [8]:
# Save Q-table
q_path = f"Q_table_BlackJack.pkl"
with open(q_path, 'wb') as f:
    pickle.dump(Q, f)
print(f"Q-table saved to {q_path}")


Q-table saved to Q_table_BlackJack.pkl


In [9]:
q_path = f"Q_table_BlackJack.pkl"
with open(q_path, 'rb') as f:
    Q = pickle.load(f)


In [10]:
from time import sleep

In [11]:
env = gym.make('Blackjack-v1',natural=False,sab = False, render_mode="ansi")

In [12]:
successes = 0
for episode in range(1000000):
    state = env.reset()[0]
    done = False
    ret  = 0
    while not done:
        action = np.argmax(Q[lookup[state]])
        state, reward, done, truncated, info = env.step(action)
        #env.render()
        ret += reward
        if reward >=1:
            successes+=1
    if episode%1000==0:
        print(f'episode:{episode}, successes:{successes}')
        writer.add_scalar('Winrate',successes/(episode+1),episode)




episode:0, successes:0
episode:1000, successes:363
episode:2000, successes:722
episode:3000, successes:1092
episode:4000, successes:1458
episode:5000, successes:1849
episode:6000, successes:2253
episode:7000, successes:2622
episode:8000, successes:3023
episode:9000, successes:3405
episode:10000, successes:3785
episode:11000, successes:4178
episode:12000, successes:4562
episode:13000, successes:4953
episode:14000, successes:5341
episode:15000, successes:5733
episode:16000, successes:6121
episode:17000, successes:6470
episode:18000, successes:6836
episode:19000, successes:7204
episode:20000, successes:7604
episode:21000, successes:7977
episode:22000, successes:8339
episode:23000, successes:8710
episode:24000, successes:9084
episode:25000, successes:9453
episode:26000, successes:9838
episode:27000, successes:10189
episode:28000, successes:10571
episode:29000, successes:10952
episode:30000, successes:11328
episode:31000, successes:11719
episode:32000, successes:12103
episode:33000, success

In [13]:
print(f"Win rate:{successes/1000000}%")

Win rate:0.376442%


In [14]:
env.close()
