## Deep RL

Going to explore solving various games using Deep RL techniques. Starting with blackjack because how hard could solving blackjack be really?

Some Components we need:
    BlackJack State Space = [dealer_card , my total , Ace] So simple vector of length 3. It would be nice to incorporate bankroll and bet size somehow later on to try and learn how much to bet. 

In [1]:
import sys
import os

# Adding path to dealer / player
module_path = os.path.abspath(os.path.join('/Users/befeltingu/'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from DeepRL.BlackJack import Dealer

In [3]:
# We have a dealer that allows us a couple helper functions and also the dealer 
# has a set strategy.

In [5]:
# Lets build out our Deep RL player we are going to use the DQN model as described in Deepminds
# paper https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf
class DeepPlayer:
    # simple DQN agent
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.actions = {'HIT':0,'STAY':1}
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(4*self.state_size, input_dim=self.state_size, activation='relu'))
        model.add(Dense(4*self.state_size, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        return model
    
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action
    
    def get_action_number(self,action_string):
        return self.actions[action_string]
    
    def get_player_state(self,cards):
        
        usable_ace = 0
        sum_cards = np.array(cards).sum()
        
        if 1 in cards:
            sum_cards_use_ace = sum_cards + 10
            if sum_cards_use_ace <= 21:
                usable_ace = 1
                sum_cards = sum_cards_use_ace
                
        return sum_cards, usable_ace
    
    def load(self, name):
        self.model.load_weights(name)
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save(self, name):
        self.model.save_weights(name)

In [6]:
####### LOOP #####################
# a.) generate an episode (single deal)
# b.) For each state s in episode append returned value
# to Returns(s)

# INIT STEPS
player = DeepPlayer(3,2)
dealer = dealer.Dealer(1)
# number of games/episodes to play
game_sample_size = 500 # number hands to deal to given policy before updating
episodes = 1000

for episode in range(episodes):

    for i in range(num_games):

        # player cards
        pc1 = dealer.deal_and_replace()
        pc2 = dealer.deal_and_replace()
        player_cards = [pc1,pc2]
        player.current_score = 0
        player.episode_states = []
        # dealer cards
        dc1 = dealer.deal_and_replace() # Use this as the visible card
        dc2 = dealer.deal_and_replace()
        dealer_cards = [dc1,dc2]

        game_state = "HIT"
        # deal cards until the player says Stay
        while(game_state=="HIT"):

            game_state = player.make_play(dc1,cards=player_cards)

            if game_state == "STAY":
                break

            elif game_state == "BUST":
                break

            else:

                player_sum, usable_ace = player.get_player_sate(player_cards)
                current_state = [dc1,player_sum,usable_ace]
                action = 0 # the player hit
                reward = 0 # no actual reward yet

                new_card = dealer.deal_and_replace()
                player_cards += [new_card]

                player_new_sum, usable_ace = player.get_player_state(player_cards)

                next_state = [dc1,player_new_sum,usable_ace]
                done = False
                player.remember(current_state, action, reward, next_state, done)

        if game_state == "BUST":
            player_sum, usable_ace = player.get_player_sate(player_cards)
            current_state = [dc1,player_sum,usable_ace]
            action = 0 # the player hit
            reward = -1 # We lose :(
            next_state = [] # no next step episode is over
            done = True
            player.remember(current_state, action, reward, next_state, done)
            continue
        # now do the same loop for the dealer
        # the dealer is using a different policy
        dealer_state = "HIT"
        while(dealer_state=="HIT"):

            dealer_state = dealer.make_play(dealer_cards)

            if dealer_state == "STAY":
                break

            elif dealer_state == "BUST":
                break

            else:
                new_card = dealer.deal_and_replace()
                dealer_cards += [new_card]


        if dealer_state == "BUST":
            # dealer busted so record the current state as a win
            player_sum, usable_ace = player.get_player_sate(player_cards)
            current_state = [dc1,player_sum,usable_ace]
            action = player.get_action_number(game_state) # could have hit or stayed to have got here
            reward = 1 # we won!
            next_state = [] # no next step episode is over
            done = True
            player.remember(current_state, action, reward, next_state, done)

        player_score = player.current_score
        dealer_score = dealer.current_score

        reward = 0

        if player_score > dealer_score:
            reward = 1
        elif dealer_score > player_score:
            reward = -1

        player_sum, usable_ace = player.get_player_sate(player_cards)
        current_state = [dc1,player_sum,usable_ace]
        action = player.get_action_number(game_state) # could have hit or stayed to have got here
        reward = 1 # we won!
        next_state = [] # no next step episode is over
        done = True
        player.remember(current_state, action, reward, next_state, done)
        
    # finished current hand sample
    if len(player.memory) > batch_size:
        player.replay(batch_size)


NameError: global name 'deque' is not defined