In [94]:
import os
import math
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import collections
from tensorflow import keras

import torch
import torch.nn as nn
import torch.optim as optim
from datetime import datetime
import keras
import keras.callbacks
from keras.callbacks import TensorBoard

In [95]:
from pypokerengine.players import BasePokerPlayer
from pypokerengine.api.emulator import Emulator
from pypokerengine.utils.game_state_utils import restore_game_state
from pypokerengine.utils.card_utils import estimate_hole_card_win_rate


In [96]:
# Program settings
# (Plain module-level assignments: a `global` statement at module scope is a
# no-op, so none are needed here.)
DEBUG = False       # when True, declare_action prints the feature-vector size/shape
WATCH_GAME = True   # when True, the round-result handler prints per-round details


# Load saved model or create new
save_model_to_file = False    # write the model to disk after a round
load_saved_model = False      # start from the weights stored at model_pathname
num_saves = 0                 # NOTE(review): not read anywhere in the visible notebook
model_pathname = 'pathname'   # placeholder; point this at a real checkpoint file

# Model info
model_input_size = 21    # length of the feature vector fed to the network
model_output_size = 4    # one output per action: fold, call, min raise, max raise
MEMORY_SIZE = 10000      # capacity of the experience-replay buffer
BATCH_SIZE = 32          # transitions sampled per replay update

# Training info
NUM_EPISODES = 10000       # number of complete poker games to play
TARGET_LAG_FACTOR = 7500   # rounds between target-network refreshes

In [97]:
# Deep RL constants

# Discount factor: weight given to the best next-state Q-value in the TD target
gamma = 0.2

# Step size used when updating the estimate of Q(S, A)
alpha = 1

# Epsilon-greedy policy:
#   epsilon       - probability of taking a random action instead of the
#                   action with the highest Q-value
#   epsilon_min   - exploration stops decaying once epsilon reaches this value
#   epsilon_decay - multiplicative decay applied after a training step
epsilon, epsilon_min, epsilon_decay = 0.1, 0.05, 0.99


# Target-model catch-up interval
# NOTE(review): same value as TARGET_LAG_FACTOR; presumably one of the two is redundant.
target_n_val = 7500

In [98]:
class DQNetwork(nn.Module):
    """Fully connected Q-network (input -> 64 -> 64 -> output) with a target copy.

    The online layers (`fc1`..`fc3`) are the ones being optimised.
    `target_model` is a frozen network of the same architecture that supplies
    the bootstrap values in `train_step`; it only changes when
    `update_target()` is called.

    Args:
        input_size: Number of features in a state vector.
        output_size: Number of discrete actions (one Q-value per action).
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        # Online network.
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_size)

        # Target network. It is built from plain layers: constructing another
        # DQNetwork here would call this __init__ again and recurse forever.
        self.target_model = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, output_size),
        )
        for param in self.target_model.parameters():
            param.requires_grad_(False)
        self.update_target()
        self.target_model.eval()

        # Only the online layers are optimised; the target copy is registered
        # as a submodule, so self.parameters() would include it as well.
        online_params = [p for layer in self._online_layers() for p in layer.parameters()]
        self.optimizer = optim.Adam(online_params, lr=0.001)
        self.criterion = nn.MSELoss()

    def _online_layers(self):
        """Return the trainable linear layers in forward order."""
        return (self.fc1, self.fc2, self.fc3)

    def update_target(self):
        """Copy the online weights into the target network."""
        # Indices 0, 2, 4 are the Linear layers of the Sequential (1 and 3 are ReLU).
        target_layers = (self.target_model[0], self.target_model[2], self.target_model[4])
        for online, target in zip(self._online_layers(), target_layers):
            target.load_state_dict(online.state_dict())

    def model_forward(self, x):
        """Return the online network's Q-values for a batch of states `x`."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

    def forward(self, x):
        """nn.Module entry point; same computation as `model_forward`."""
        return self.model_forward(x)

    def predict(self, x):
        """Return Q-values for `x` without tracking gradients."""
        with torch.no_grad():
            return self.model_forward(torch.as_tensor(x, dtype=torch.float32))

    def train_step(self, state, action, reward, next_state, done):
        """Run one optimisation step on a single transition or a batch.

        Every argument may be a scalar/1-D value describing one transition or
        a sequence with one entry per transition. States are reshaped to
        (batch, input_size), so (1, n) row vectors are accepted as well.

        Args:
            state: State(s) the action was taken in.
            action: Index (indices) of the action taken.
            reward: Reward(s) received.
            next_state: State(s) observed afterwards.
            done: Truthy where the transition ended the episode; the
                bootstrap term is dropped for those entries.

        Returns:
            The MSE loss of this step as a Python float.
        """
        # epsilon is the module-level exploration rate; it is reassigned below,
        # which needs the global declaration.
        global epsilon

        n_features = self.fc1.in_features
        state = torch.as_tensor(np.asarray(state), dtype=torch.float32).reshape(-1, n_features)
        next_state = torch.as_tensor(np.asarray(next_state), dtype=torch.float32).reshape(-1, n_features)
        # gather() needs integer indices shaped (batch, 1).
        action = torch.as_tensor(np.asarray(action), dtype=torch.long).reshape(-1, 1)
        reward = torch.as_tensor(np.asarray(reward), dtype=torch.float32).reshape(-1)
        done = torch.as_tensor(np.asarray(done), dtype=torch.float32).reshape(-1)

        # Q(s, a) for the action that was actually taken in each transition.
        q_taken = self.model_forward(state).gather(1, action).squeeze(1)

        # TD target from the frozen network: r + gamma * max_a' Q_target(s', a').
        with torch.no_grad():
            best_next = self.target_model(next_state).max(dim=1).values
            target = reward + gamma * best_next * (1.0 - done)

        loss = self.criterion(q_taken, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay exploration rate, never dropping below the configured floor.
        if epsilon > epsilon_min:
            epsilon = max(epsilon_min, epsilon * epsilon_decay)

        return loss.item()

    def save(self, file_name):
        """Write the network's state dict to ./models/<file_name>."""
        directory = "./models"
        os.makedirs(directory, exist_ok=True)
        # Strip a leading separator so a name like "/dqn.pth" stays inside ./models.
        file_path = os.path.join(directory, file_name.lstrip("/\\"))
        torch.save(self.state_dict(), file_path)

In [None]:
class PokerAgent(BasePokerPlayer):
    """PyPokerEngine player that picks its actions with a DQNetwork.

    The network's four outputs are interpreted as
    0 -> fold, 1 -> call, 2 -> minimum raise, 3 -> maximum raise.

    Args:
        train_model: When True (default) the agent records the transitions it
            observes and trains the network on them; when False it only plays.
    """

    # Network output index -> PyPokerEngine action name.
    ACTION_MAP = {0: 'fold', 1: 'call', 2: 'raise', 3: 'raise'}

    def __init__(self, train_model=True):
        super().__init__()

        # Initialize the model; optionally restore saved weights. A checkpoint
        # written with torch.save(model.state_dict(), ...) holds a state dict,
        # not a model object, so it is loaded into a freshly built network.
        self.model = DQNetwork(model_input_size, model_output_size)
        if load_saved_model:
            self.model.load_state_dict(torch.load(model_pathname))

        self.train_model = train_model

        # Experience replay
        self.memory = collections.deque(maxlen=MEMORY_SIZE)
        self.batch_size = BATCH_SIZE

        # Rounds seen so far; drives the periodic target-network refresh.
        self.num_rounds = 0
        # (state, action) of the latest decision whose outcome is still unknown.
        self._pending = None

        # Printed by receive_round_result_message when WATCH_GAME is set.
        # NOTE(review): nothing in this class fills these in yet.
        self.Y = None
        self.Q = None

    def select_action(self, state):
        """Epsilon-greedy choice: return an action index in [0, model_output_size)."""
        if random.uniform(0, 1) < epsilon:
            return random.randint(0, model_output_size - 1)
        with torch.no_grad():
            state = torch.as_tensor(state, dtype=torch.float32).reshape(1, -1)
            q_values = self.model.model_forward(state)
        return torch.argmax(q_values).item()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_short_mem(self, state, action, reward, next_state, done):
        """Train the network on a single transition."""
        self.model.train_step(state, action, reward, next_state, done)

    def train_long_mem(self, state=None, action=None, reward=None, next_state=None, done=None):
        """Train the network on a random batch drawn from the replay buffer.

        The parameters are unused and kept only so existing calls that pass a
        transition keep working. Does nothing while the buffer is empty.
        """
        if not self.memory:
            return
        transitions = list(self.memory)
        if len(transitions) > self.batch_size:
            transitions = random.sample(transitions, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*transitions)
        self.model.train_step(states, actions, rewards, next_states, dones)

    def _finish_transition(self, reward, next_state, done):
        """Complete the pending decision with its outcome and learn from it."""
        if self._pending is None:
            return
        state, action = self._pending
        self._pending = None
        if not self.train_model:
            return
        self.remember(state, action, reward, next_state, done)
        self.train_short_mem(state, action, reward, next_state, done)

    def declare_action(self, valid_actions, hole_card, round_state):
        """Choose the action to play and return it as (action_name, amount).

        `valid_actions` is read as a list of {'action': name, 'amount': ...}
        entries where the 'raise' amount is a {'min': ..., 'max': ...} dict.
        A raise whose limit is missing or negative (raising is not allowed)
        is played as a call instead.
        """
        # Prepare feature vector based on the game state
        feature_vector = self._extract_features(hole_card, round_state)

        if DEBUG:
            print("input size: " + str(len(feature_vector)))
            print("input shape: " + str(feature_vector.shape))

        # The state seen now is the successor of this agent's previous
        # decision in the round (if any); no reward is known at this point.
        self._finish_transition(0.0, feature_vector, False)

        # Use the model to pick the action index:
        ## 0 -> fold
        ## 1 -> call
        ## 2 -> min raise
        ## 3 -> max raise
        action_num = self.select_action(feature_vector)
        self._pending = (feature_vector, action_num)

        amounts = {entry['action']: entry['amount'] for entry in valid_actions}
        action = self.ACTION_MAP.get(action_num, 'fold')

        if action == 'raise':
            limits = amounts.get('raise') or {}
            amount = limits.get('min' if action_num == 2 else 'max', -1)
            if amount >= 0:
                return action, amount
            # Raising is not possible right now; fall back to calling.
            action = 'call'

        if action == 'call':
            return action, amounts.get('call', 0)

        return 'fold', 0

    def receive_game_start_message(self, game_info):
        """Keep the game rules on the instance for later use."""
        rule = game_info['rule']
        self.player_num = game_info['player_num']
        self.max_round = rule['max_round']
        self.small_blind_amount = rule['small_blind_amount']
        self.ante_amount = rule['ante']
        self.blind_structure = rule['blind_structure']

    def _extract_features(self, hole_card, round_state):
        """Build the (1, n) feature row describing the current game state.

        NOTE(review): `_hand_strength_sim`, `_street_to_feature` and
        `_aggregate_action_histories` are called here but are not defined in
        this class as shown; they must be provided before the agent can act.
        NOTE(review): the resulting length has to equal model_input_size; the
        counts given in the comments below (1 + 8 + 8) do not add up to the
        configured 21, so confirm what the helpers return.
        """
        # Hand strength estimate from simulation
        hand_strength = self._hand_strength_sim(hole_card, round_state['community_card'])

        # 8 Standard features
        standard_features = [
            round_state['round_count'],
            round_state['pot']['main']['amount'],
            sum(side_pot['amount'] for side_pot in round_state['pot']['side']),
            round_state['dealer_btn'],
            round_state['small_blind_pos'],
            round_state['big_blind_pos'],
            round_state['small_blind_amount'],
            self._street_to_feature(round_state['street'])
        ]

        # Action history features (described originally as # raises and # calls
        # for each betting stage: preflop, flop, turn, river)
        action_history_features = self._aggregate_action_histories(round_state['action_histories'])

        # Combine everything into one flat vector, then shape it as a single row
        features = flatten([hand_strength] + standard_features + action_history_features)
        features = np.array(features, dtype=np.float32)
        return features.reshape(1, -1)

    # Not necessarily useful
    def receive_round_start_message(self, round_count, hole_card, seats):
        pass

    # Not necessarily useful
    def receive_street_start_message(self, street, round_state):
        pass

    # Can incorporate player observation in model updated with each move
    def receive_game_update_message(self, new_action, round_state):
        pass

    def receive_round_result_message(self, winners, hand_info, round_state):
        """Score the finished round, learn from it and do periodic upkeep.

        The reward is 1 when this agent is among `winners` and 0 otherwise.
        `self.uuid` is presumably assigned by the engine through
        BasePokerPlayer before play starts.
        """
        won = any(winner['uuid'] == self.uuid for winner in winners)
        if won:
            # Player won the round
            print("Player ", self.uuid, " won the round")
        gain = 1 if won else 0

        # Close the last decision of the round as a terminal transition. The
        # successor state of a terminal step is never used for bootstrapping,
        # so an all-zero placeholder of the right shape is enough.
        if self._pending is not None:
            terminal_state = np.zeros_like(self._pending[0])
            self._finish_transition(gain, terminal_state, True)

        self.num_rounds += 1
        if self.num_rounds % TARGET_LAG_FACTOR == 0:
            # Refresh the lagging target network when the model offers a hook for it.
            refresh_target = getattr(self.model, "update_target", None)
            if callable(refresh_target):
                refresh_target()

        if WATCH_GAME:
            print("Round actions: player " + self.uuid)
            print("Y: ", self.Y)
            print("Q: ", self.Q)

        # Update model with a replay batch
        if self.train_model:
            self.train_long_mem()

        if save_model_to_file:
            # Saved as a state dict to the same path __init__ loads from.
            # NOTE(review): every agent writes to this one path.
            torch.save(self.model.state_dict(), model_pathname)

    

In [99]:
# Helper methods
def flatten(x):
    """Recursively flatten arbitrarily nested iterables into one flat list.

    Strings and bytes are treated as single values rather than being split
    into characters (a one-character string is itself iterable, so descending
    into it would never terminate). A non-iterable argument is returned as a
    one-element list.
    """
    # collections.Iterable was removed in Python 3.10; the ABC lives in collections.abc.
    from collections.abc import Iterable

    if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
        return [a for i in x for a in flatten(i)]
    return [x]
        

In [104]:
from pypokerengine.api.game import setup_config, start_poker

# Create the agents to train

player1, player2, player3 = PokerAgent(), PokerAgent(), PokerAgent()

for e in range(NUM_EPISODES):

    # Build a fresh game configuration for this episode and seat the same
    # three agents in it
    config = setup_config(max_round=1000, initial_stack=100, small_blind_amount=5)
    for _seat_name, _agent in (('p1', player1), ('p2', player2), ('p3', player3)):
        config.register_player(name=_seat_name, algorithm=_agent)

    # Play one complete poker game

    game_result = start_poker(config, verbose=0)


NameError: name 'PokerAgent' is not defined