In [256]:
import os
import math
import random
import numpy as np
import collections
from typing import Iterable

import torch
import torch.nn as nn
import torch.optim as optim

In [257]:
from pypokerengine.players import BasePokerPlayer
from pypokerengine.api.emulator import Emulator
from pypokerengine.utils.game_state_utils import restore_game_state
from pypokerengine.utils.card_utils import estimate_hole_card_win_rate, gen_cards


In [258]:
# Program settings
global DEBUG
DEBUG = False
global WATCH_GAME
WATCH_GAME = True


# Load saved model or create new
global save_model_to_file
global load_saved_model
global num_saves
global model_pathname
save_model_to_file = False
load_saved_model = False
num_saves = 0
model_pathname = 'pathname'

# Model info
global model_input_size
model_input_size = 21
model_output_size = 4
MEMORY_SIZE = 10000
BATCH_SIZE = 32

#Training Info
NUM_EPISODES = 10000
TARGET_LAG_FACTOR = 7500
INITIAL_STACK = 100

In [259]:
# Deep RL constrants
gamma = 0.2 

# Step length taken to update the estimation of Q(S, A)
alpha = 1

# Greedy policy
# Probability of choosing any action at random (vs. action with highest Q value)
epsilon = 0.1
epsilon_min = 0.05
epsilon_decay = 0.99


# Target Model Ketchup
target_n_val = 7500

In [260]:
class Linear_QNet(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_size, 64)
        self.fc2 = torch.nn.Linear(64, 64)
        self.fc3 = torch.nn.Linear(64, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
    
    def save(self, file_name='model.pth'):
        model_folder_path = './model'
        if not os.path.exists(model_folder_path):
            os.makedirs(model_folder_path)

        file_name = os.path.join(model_folder_path, file_name)
        torch.save(self.state_dict(), file_name)

In [261]:
class DQNetwork:
    def __init__(self):

        # model
        self.model = Linear_QNet(model_input_size, model_output_size)
        # set optimizer and loss functions for models
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.criterion = nn.MSELoss()

    def get_model(self):
        return self.model
    
    def train_step(self, state, action, reward, next_state, done):
        state = torch.tensor(state, dtype=torch.float32)
        next_state = torch.tensor(next_state, dtype=torch.float32)
        action = torch.tensor(action, dtype=torch.float32)
        reward = torch.tensor(reward, dtype=torch.float32)

        if len(state.shape) == 1:
            state = torch.unsqueeze(state, 0)
            next_state = torch.unsqueeze(next_state, 0)
            action = torch.unsqueeze(action, 0)
            reward = torch.unsqueeze(reward,0)
            done = (done, )
        
        q_values = self.model.forward(state)
        next_q_values = q_values.clone()
        for idx in range(len(done)):
            q_new = reward[idx]
            if not done[idx]:
                q_new = reward[idx] + self.gamma * torch.max(self.model(next_state[idx]))

            next_q_values[idx][torch.argmax(action[idx]).item()] = q_new

        loss = self.criterion(next_q_values, q_values)
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        

In [262]:
# Helper methods
def flatten(x):
    if isinstance(x, Iterable):
        return [a for i in x for a in flatten(i)]
    else:
        return [x]      

In [263]:
class PokerAgent(BasePokerPlayer):

    def __init__(self):
        
        # Initialize model
        self.model = DQNetwork()       

        # Experience replay
        self.memory = collections.deque(maxlen = MEMORY_SIZE)
        self.batch_size = BATCH_SIZE
        self.first_move = True
        self.last_action = [0] * model_output_size
        self.last_state = [0] * model_input_size
        self.curr_stack = INITIAL_STACK
    
    def select_action(self, state):
        if random.uniform(0, 1) < epsilon:
            return random.randint(0, model_output_size - 1)
        else:
            with torch.no_grad():
                state = torch.tensor(state, dtype=torch.float32).view(1, -1)
                q_values = self.model.get_model().forward(state)
                return torch.argmax(q_values).item()
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def train_short_mem(self, state, action, reward, next_state, done):
        self.model.train_step (state, action, reward, next_state, done)

    def train_long_mem(self):
        print("Long training")
        sample = None
        if len(self.memory) > BATCH_SIZE:
            sample = random.sample(self.memory, BATCH_SIZE)
        else:
            sample = self.memory
        states, actions, rewards, next_states, dones = zip(*sample)
        self.model.train_step(states, actions, rewards, next_states, dones)
        

    def declare_action(self, valid_actions, hole_card, round_state):
        # Prepare feature vector based on the game state
        feature_vector = self._extract_features(hole_card, round_state)

        if not self.first_move:
            self.train_short_mem(self.last_state, self.last_action, 0, feature_vector, False)
            self.remember(self.last_state, self.last_action, 0, feature_vector, False)

        self.first_move = False

        if DEBUG:
            print("input size: " + str(len(feature_vector)))
            print("input shape: " + str(feature_vector.shape))
        
        
        # Use the model to predict the action
        # action will be number 1-4
        ## 1 -> fold
        ## 2 -> call
        ## 3 -> min raise
        ## 4 -> max raise
        action_num = self.select_action(feature_vector)

        action_map = {0: 'fold', 1: 'call', 2: 'raise', 3: 'raise'}

        action = action_map.get(action_num, 0)
        amount = 0

        #get call val
        if action_num == 1:
            amount = valid_actions[1]['amount']

        # get min raise val
        if action_num == 2:
            amount = valid_actions[2]['amount']['min']
        
        # get max raise val
        if action_num == 3:
            amount = valid_actions[2]['amount']['min']
        
        self.last_action = [0, 0, 0, 0]
        self.last_action[action_num] = 1
        self.last_state = feature_vector
        
        return action, amount
    
    def receive_game_start_message(self, game_info):
    
        self.roudstart_stack = INITIAL_STACK
    
    def _extract_features(self, hole_card, round_state, win = None):

        
        #simulate hand against 10000 flops extracting hand strength estimate, unless round over
        hand_strength = 0
        if win != None:
            if win:
                hand_strength = 1000
            else:
                hand_strength = 0
        else:
            hand_strength = estimate_hole_card_win_rate(1000, 3, gen_cards(hole_card), gen_cards(round_state['community_card']))

        # 8 Standard features
        
        standard_features = [
            round_state['round_count'],
            round_state['pot']['main']['amount'],
            sum([side_pot['amount'] for side_pot in round_state['pot']['side']]),
            round_state['dealer_btn'],
            round_state['small_blind_pos'],
            round_state['big_blind_pos'],
            round_state['small_blind_amount'],
            self._street_to_feature(round_state['street'])
        ]

        # 8 Action history features (2 {# raises, # calls} for each betting stage: preflop, flop, turn, river)
        action_history_features = self._aggregate_action_histories(round_state['action_histories'])

        # Combine all features into a single fixed-size feature vector of length 34
        # Flatten the list of lists
        features = flatten([hand_strength] + standard_features + action_history_features)
        features = np.array(features)
        features = features.reshape(1, -1)
        return features
    
    # Not neccesarily useful
    def receive_round_start_message(self, round_count, hole_card, seats):
        for seat in seats:
            if seat['uuid']==self.uuid:
                self.roudstart_stack = seat['stack']
        self.first_move = True


    # Not neccesarily useful
    def receive_street_start_message(self, street, round_state):
        pass

    def _street_to_feature(self, street):
        # Convert street to a numerical feature
        streets = {'preflop': 1, 'flop': 2, 'turn': 3, 'river': 4, 'showdown': 5}
        return streets.get(street, 0)
    


    def _aggregate_action_histories(self, action_histories):
        '''
        # Aggregate action histories into a fixed-length vector
        # Example: Count the number of raises, calls, etc.
        raise_count = sum(1 for action in action_histories.get('preflop', []) if action['action'] == 'raise')
        call_count = sum(1 for action in action_histories.get('preflop', []) if action['action'] == 'call')
        # Add more aggregated features as needed
        # Ensure the length of this vector is fixed
        return [raise_count, call_count]
        '''
        
        # Initialize counts
        raise_count = [0, 0, 0, 0]  # Preflop, Flop, Turn, River
        call_count = [0, 0, 0, 0]
        fold_count = [0, 0, 0, 0]

        # Define rounds
        rounds = ['preflop', 'flop', 'turn', 'river']

        # Count actions in each round
        for i, round in enumerate(rounds):
            for action in action_histories.get(round, []):
                if action['action'] == 'raise':
                    raise_count[i] += 1
                elif action['action'] == 'call':
                    call_count[i] += 1
                elif action['action'] == 'fold':
                    fold_count[i] += 1

        # Flatten and return
        return raise_count + call_count + fold_count

    # Can incorporate player observation in model updated with each move
    def receive_game_update_message(self, new_action, round_state):
        pass
    
    def receive_round_result_message(self, winners, hand_info, round_state):
        # Calculate net chip gain from round
        reward = 0
        win = False
        for w in winners:
            if w['uuid'] == self.uuid:
                win = True
        for player in round_state['seats']:
            if player['uuid'] == self.uuid:
                new_stack = player['stack']
                reward = new_stack - self.curr_stack
                self.curr_stack = new_stack
        
        final_state = self._extract_features(None, round_state, win)

        #train model with reward as net chip gain
        self.train_short_mem(self.last_state, self.last_action, reward, final_state, True)        
        self.remember(self.last_state, self.last_action, reward, final_state, True)     

    

In [264]:
from pypokerengine.api.game import setup_config, start_poker

#creat models to train

player1 = PokerAgent()
player2 = PokerAgent()
player3 = PokerAgent()

for e in range(NUM_EPISODES):

    # Declare game setup paramers
    config = setup_config(max_round=100, initial_stack=INITIAL_STACK, small_blind_amount=5)
    config.register_player(name = 'p1', algorithm=player1)
    config.register_player(name = 'p2', algorithm=player2)
    config.register_player(name = 'p3', algorithm=player3)

    # play poker game 
    
    game_result = start_poker(config, verbose=0)

    player1.train_long_mem()
    player2.train_long_mem()
    player3.train_long_mem()


TypeError: object of type 'bool' has no len()