In [None]:
import gym
import matplotlib.pyplot as plt
import random
import numpy as np

In [None]:
# Create the Blackjack environment shared by every later cell.
# NOTE(review): later cells index a 4-element observation (state[3]), unpack
# five values from env.step() and read env.player, so this presumably resolves
# to a customised Blackjack environment -- confirm which one is registered.
env = gym.make('Blackjack-v0')
# Deal an initial hand; the returned observation is not stored here.
env.reset()

In [None]:
class Agent():
    """
    Tabular, epsilon-greedy Q-learning agent.

    Q-values and visit counts are stored in dictionaries keyed by state and
    then by action. Exploration decays from ~1 towards 0 as `timestep`
    approaches `num_episodes_to_train`; past that point the agent is greedy.

    States must be hashable and indexable; `state[3]` is read as a flag that
    decides whether the full action set or only the first two actions are
    available (see `create_Q_if_new_state`).
    """

    def __init__(self, env, epsilon=1.0, gamma=0.2, num_episodes_to_train=30000, timestep=0):
        """
        Parameters
        ----------
        env : object exposing `action_space.n` (number of discrete actions).
        epsilon : initial exploration probability.
        gamma : discount factor applied to the best next-state Q-value.
        num_episodes_to_train : value of `timestep` after which epsilon is
            forced to 0 (pure exploitation).
        timestep : starting value of the training counter. The agent never
            increments it itself; the caller is expected to do so.
        """
        self.env = env
        self.valid_actions = list(range(self.env.action_space.n))
        # Q-table: {state: {action: value}}
        self.Q = dict()
        # Random exploration factor
        self.epsilon = epsilon
        # Discount factor
        self.gamma = gamma
        # Keeps track of number of visits to the same (state, action) pair.
        # Used to calculate the dynamic learning rate in `updateQ`.
        self.visits = dict()
        # Once 'timestep' exceeds this value, the agent exploits entirely
        self.num_episodes_to_train = num_episodes_to_train
        self.timestep = timestep

    def updateEpsilon(self):
        """
        Recompute epsilon from the current timestep.

        epsilon = 1 - exp((timestep - num_episodes_to_train) / 100), which is
        close to 1 early in training and reaches exactly 0 when `timestep`
        equals `num_episodes_to_train`.
        """
        epsilonStart = 1
        epsilonEnd = 0
        scalingFactor = 100
        self.epsilon = epsilonStart + (epsilonEnd-epsilonStart)*np.exp(
                       (self.timestep-self.num_episodes_to_train)/scalingFactor)

    def update_parameters(self):
        """
        Refresh epsilon: decayed while still training, 0 afterwards.
        """
        if self.timestep <= self.num_episodes_to_train:
            self.updateEpsilon()
        else:
            self.epsilon = 0

    def create_Q_if_new_state(self, state):
        """
        Set `self.valid_actions` for `state` and, if the state has not been
        seen before, initialise its Q-values to 0.0 and visit counts to 1.
        """
        all_actions = list(range(self.env.action_space.n))
        # If not first round (state[3] is falsy), double is not allowed:
        # only the first two actions remain available.
        if not state[3]:
            self.valid_actions = all_actions[:2]
        else:
            self.valid_actions = all_actions

        if state not in self.Q:
            self.Q[state] = {action: 0.0 for action in self.valid_actions}
            self.visits[state] = {action: 1 for action in self.valid_actions}

    def get_maxQ(self, state):
        """
        Return the largest Q-value stored for `state`, creating the entry
        first if the state is new.
        """
        self.create_Q_if_new_state(state)
        return max(self.Q[state].values())

    def choose_action(self, state):
        """
        Choose which action to take, based on the observation.
        If the observation is seen for the first time, its Q-values are
        initialised to 0.0.

        With probability epsilon a random valid action is returned; otherwise
        the action with the highest Q-value (ties broken at random). The
        visit count of the chosen pair is incremented and epsilon refreshed.
        """
        self.create_Q_if_new_state(state)

        # Agent will explore with a probability of epsilon
        if random.random() > self.epsilon:
            maxQ = self.get_maxQ(state)

            # Chooses random action if there is a tie
            action = random.choice([k for k in self.Q[state].keys()
                                    if self.Q[state][k] == maxQ])
        else:
            action = random.choice(self.valid_actions)

        self.visits[state][action] += 1
        self.update_parameters()

        return action

    def updateQ(self, state, action, reward, next_state, done=False):
        """
        Q-learning update, called after the agent completes an action and
        receives a reward.

        Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (reward + gamma * max Q(s'))
        with alpha = 1 / (1 + visits[s][a]).

        Parameters
        ----------
        state, action : the pair being updated; it must already have an
            entry in `self.visits` (i.e. come from `choose_action`).
        reward : reward received for the transition.
        next_state : observation returned after the action.
        done : when True the transition ended the episode, so no future
            value is bootstrapped from `next_state` and the target is just
            `reward`. Defaults to False, which always bootstraps.
        """
        alpha = 1/(1+self.visits[state][action])
        # A terminal transition has no successor to bootstrap from.
        future = 0.0 if done else self.gamma*self.get_maxQ(next_state)
        self.Q[state][action] = (1-alpha)*self.Q[state][action] + (
                                (alpha)*(reward+future))

In [None]:
# Train the agent by letting it play `num_samples` batches of `num_rounds`
# rounds each, learning from every step, and record the payout per batch.
agent = Agent(env=env, epsilon=1.0, gamma=0.1, num_episodes_to_train=15000, timestep=0)

num_rounds = 1000   # rounds (episodes) played per sample
num_samples = 1000  # number of samples, i.e. points on the plot

# One entry per sample: the summed reward over that sample's 'num_rounds' rounds
average_rewards = []

state = env.reset()
for sample in range(num_samples):
    episode = 1
    total_reward = 0 # to store total payout over 'num_rounds'
    # Act on the agent's Q-table and learn from every step. Exploration decays
    # as 'agent.timestep' grows and stops once it exceeds 'num_episodes_to_train'.
    while episode <= num_rounds:
        action = agent.choose_action(state)
        # This environment's step() is unpacked into five values here;
        # the last two are not needed for training.
        next_state, reward, is_done, _, _ = env.step(action)
        agent.updateQ(state, action, reward, next_state)
        total_reward += reward
        state = next_state
        if is_done:
            agent.timestep += 1
            state = env.reset() # Environment deals new cards to player and dealer
            episode += 1
    average_rewards.append(total_reward)

# Plot payout per 'num_rounds' rounds for each value of 'sample'
plt.plot(average_rewards)
plt.xlabel('num_samples')
plt.ylabel('rewards after {} rounds'.format(num_rounds))
plt.show()

print ("Average reward after {} rounds is {}".format(num_rounds, sum(average_rewards)/(num_samples)))

In [None]:
def play_game(state):
    """
    Let the trained agent play a single action from `state` and narrate it.

    Uses the module-level `agent` to pick the action and the module-level
    `env` to apply it, printing the decision, the reward, the player's hand
    and, once the round is over, the dealer's hand.

    Returns
    -------
    (next_observation, payout, is_done) as produced by `env.step`.
    """
    announcements = {
        2: 'Agent decides to DOUBLE',
        1: 'Agent decides to HIT',
        0: 'Agent decides to STAND',
    }

    chosen = agent.choose_action(state)
    # Unknown action codes are silently skipped, only the blank line is printed.
    if chosen in announcements:
        print(announcements[chosen])
    print('\n')

    # step() is unpacked into five values here; the fifth is shown as the dealer's hand.
    step_result = env.step(chosen)
    next_observation, payout, is_done, _, dealer_cards = step_result

    print('Reward: ', payout, 'Your Hand:', env.player,
          '\nIs the game done?', is_done)
    print('\n')

    if is_done == True:
        print('\n')
        print('Dealer\'s Hand:', dealer_cards)

    return next_observation, payout, is_done
    

In [None]:
# Interactive round: the user states what they would do, then the trained
# agent actually plays the hand so the two decisions can be compared.

#Get first observation:
observation1 = env.reset()

#Values of the first hand:
is_done = False

if observation1[0]==21:
    print('Blackjack!!')

while not is_done:

    if observation1[0]==21:
        # No decision to ask for on 21; the agent still plays the hand below.
        print('Blackjack!')

    else:
        print('Player\'s Cards: ',env.player,
              '\nIs your hand soft (has ace and can be used as 1 or 11)?',observation1[2],
              '\nDealer\'s Face Up Card: ', observation1[1],
              '\n')

        print('What would you do? DOUBLE=2, HIT=1, STAND=0')
        print('\n')
        a = int(input())

        #Player decision:
        # Echoed only when a choice was actually read this turn, so a 21
        # never hits an undefined or stale 'a'.
        if a==2:
            print('You chose to DOUBLE (2)')
        elif a==1:
            print('You chose to HIT (1)')
        elif a==0:
            print('You chose to STAND (0)')
        else:
            print('Blackjack Ended')

    print('\n')

    #Agent Decision:
    observation2, reward, is_done = play_game(observation1)
    observation1 = observation2

    # When the round is over, report the result; the loop condition then ends the loop.
    if is_done == True:
        if reward >=1 :
            print('Agent have won! $%d!\nDo your actions match the agents?'%reward)
        elif reward < 0 :
            print('Agent have lost! -$%d'%abs(reward))
        else:
            print('A draw!')