In [65]:
import random
import numpy as np
from collections import defaultdict
import itertools
import pickle
from typing import List, Dict, Tuple, Set
from tqdm import tqdm
import os

class Card:
    """A playing card identified by a one-character rank and suit.

    Ranks are '2'-'9', 'T', 'J', 'Q', 'K', 'A'; suits are single letters
    such as 'h', 'd', 'c', 's'.  Cards compare equal when both rank and
    suit match, and are hashable so they can be stored in sets.
    """

    # Numeric strength of every rank; the ace is high (14).
    RANK_VALUES = {'2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7,
                   '8': 8, '9': 9, 'T': 10, 'J': 11, 'Q': 12, 'K': 13, 'A': 14}

    def __init__(self, rank: str, suit: str):
        self.rank = rank
        self.suit = suit

    def __repr__(self):
        return f"{self.rank}{self.suit}"

    def __eq__(self, other):
        # Returning NotImplemented (instead of touching other.rank) lets
        # comparisons against unrelated objects evaluate to False rather
        # than raising AttributeError.
        if not isinstance(other, Card):
            return NotImplemented
        return self.rank == other.rank and self.suit == other.suit

    def __hash__(self):
        return hash((self.rank, self.suit))

    def get_value(self):
        """Return the numeric strength of this card's rank (2-14).

        Raises:
            KeyError: if the rank is not one of the known rank characters.
        """
        return self.RANK_VALUES[self.rank]

class Deck:
    """A shuffled 52-card deck; cards are dealt from the end of ``cards``."""

    def __init__(self):
        rank_symbols = "23456789TJQKA"
        suit_symbols = "hdcs"
        # product() walks suits fastest, i.e. every suit of '2', then '3', ...
        self.cards = [
            Card(symbol, suit)
            for symbol, suit in itertools.product(rank_symbols, suit_symbols)
        ]
        self.shuffle()

    def shuffle(self):
        """Randomly reorder the remaining cards in place."""
        random.shuffle(self.cards)

    def deal(self) -> Card:
        """Remove and return the last card; IndexError if the deck is empty."""
        top_card = self.cards.pop()
        return top_card

def _score_five_cards(combo) -> Tuple[int, List[int]]:
    """Score exactly five cards as ``(category, tiebreakers)``."""
    values = sorted((card.get_value() for card in combo), reverse=True)
    flush = len({card.suit for card in combo}) == 1

    # Five distinct values spanning exactly four steps form a straight.
    # The wheel (A-2-3-4-5) is the one exception: the ace plays low and the
    # straight is five-high.
    distinct = sorted(set(values))
    straight = False
    high_card = 0
    if len(distinct) == 5:
        if distinct[-1] - distinct[0] == 4:
            straight, high_card = True, distinct[-1]
        elif distinct == [2, 3, 4, 5, 14]:
            straight, high_card = True, 5

    value_counts = defaultdict(int)
    for value in values:
        value_counts[value] += 1
    # Largest groups first, higher value first within equal group size, so
    # this ordering is already the tiebreaker order for every paired hand.
    groups = sorted(value_counts.items(), key=lambda g: (-g[1], -g[0]))
    grouped_values = [value for value, _ in groups]
    counts = [count for _, count in groups]

    if straight and flush:
        return 8, [high_card]
    if counts[0] == 4:
        return 7, grouped_values          # quad value, kicker
    if counts[0] == 3 and counts[1] == 2:
        return 6, grouped_values          # trip value, pair value
    if flush:
        return 5, values
    if straight:
        return 4, [high_card]
    if counts[0] == 3:
        return 3, grouped_values          # trip value, two kickers
    if counts[0] == 2 and counts[1] == 2:
        return 2, grouped_values          # high pair, low pair, kicker
    if counts[0] == 2:
        return 1, grouped_values          # pair value, three kickers
    return 0, values


def evaluate_hand(cards: "List[Card]") -> Tuple[int, List[int]]:
    """Return the best five-card poker hand that can be made from ``cards``.

    Every 5-card combination is scored and the strongest one is returned.

    Args:
        cards: five or more cards; each must expose ``get_value()`` and
            a ``suit`` attribute.

    Returns:
        ``(category, tiebreakers)`` where category is 0 (high card),
        1 (pair), 2 (two pair), 3 (three of a kind), 4 (straight),
        5 (flush), 6 (full house), 7 (four of a kind) or 8 (straight
        flush), and tiebreakers is a list of card values ordered from most
        to least significant.  Results compare correctly as plain tuples.

    Raises:
        ValueError: if fewer than five cards are supplied.
    """
    if len(cards) < 5:
        raise ValueError("Need at least 5 cards to evaluate a hand")

    return max(_score_five_cards(combo) for combo in itertools.combinations(cards, 5))

class PokerGameState:
    """Mutable table state for a simplified no-limit Texas Hold'em hand.

    Tracks chip stacks, the bets of the current betting round, the pot, the
    deck, hole and community cards and whose turn it is.  ``round`` counts
    betting rounds: 0 = pre-flop, 1 = flop, 2 = turn, 3 = river; once it
    reaches 4 the hand is at showdown.
    """

    def __init__(self, num_players: int = 4, initial_chips: int = 1000):
        self.num_players = num_players
        self.players_chips = [initial_chips for _ in range(num_players)]
        self.current_bets = [0] * num_players   # chips put in during this betting round
        self.pot = 0
        self.community_cards = []
        self.deck = Deck()
        self.player_hands = [[] for _ in range(num_players)]
        self.current_player = 0
        self.round = 0
        self.last_raise = 0                      # size of the latest raise increment
        self.small_blind = 10
        self.big_blind = 20
        self.folded = [False] * num_players
        self.actions = []                        # (player, action, amount) for this round
        self.dealer_position = 0
        self.seen_cards = set()  # Track all cards seen in showdowns

    def start_new_hand(self):
        """Reset per-hand state, move the button, post blinds and deal hole cards."""
        self.community_cards = []
        self.player_hands = [[] for _ in range(self.num_players)]
        self.current_bets = [0] * self.num_players
        self.pot = 0
        self.round = 0
        self.folded = [False] * self.num_players
        self.actions = []
        self.deck = Deck()

        # Remove seen cards from the new deck
        # NOTE(review): seen_cards only ever grows, so each later deck is
        # smaller than the previous one -- confirm this is intended.
        self.deck.cards = [card for card in self.deck.cards if card not in self.seen_cards]
        self.deck.shuffle()

        self.dealer_position = (self.dealer_position + 1) % self.num_players
        sb_pos = (self.dealer_position + 1) % self.num_players
        bb_pos = (sb_pos + 1) % self.num_players

        self.players_chips[sb_pos] -= self.small_blind
        self.players_chips[bb_pos] -= self.big_blind
        self.current_bets[sb_pos] = self.small_blind
        self.current_bets[bb_pos] = self.big_blind
        self.pot = self.small_blind + self.big_blind
        self.last_raise = self.big_blind
        # First to act pre-flop is the seat after the big blind.
        self.current_player = (bb_pos + 1) % self.num_players

        for _ in range(2):
            for i in range(self.num_players):
                card = self.deck.deal()
                self.player_hands[i].append(card)

    def get_legal_actions(self, player: int) -> List[Tuple[str, int]]:
        """List the actions ``player`` may take right now.

        Returns:
            A list of ``(name, amount)`` pairs.  ``amount`` is the number of
            chips the player would add; for ``"raise"`` it is a
            ``(minimum, maximum)`` tuple instead of a single number.  The
            list is empty when the player has folded.
        """
        if self.folded[player]:
            return []

        actions = []
        current_bet = self.current_bets[player]
        max_bet = max(self.current_bets)
        chips = self.players_chips[player]

        actions.append(("fold", 0))

        if current_bet == max_bet:
            actions.append(("check", 0))

        if current_bet < max_bet:
            call_amount = min(max_bet - current_bet, chips)
            actions.append(("call", call_amount))

        if chips > 0:
            min_raise = max(self.last_raise, self.big_blind)
            # Chips needed to call plus the minimum raise increment.
            min_raise_amount = max_bet - current_bet + min_raise
            # A sized raise is offered only when it would leave chips
            # behind; otherwise pushing the whole stack is the only way
            # to put more money in.
            if min_raise_amount < chips:
                actions.append(("raise", (min_raise_amount, chips)))
            actions.append(("all-in", chips))

        return actions

    def _commit_chips(self, player: int, amount: int) -> int:
        """Move up to ``amount`` chips from the player's stack into the pot."""
        committed = min(amount, self.players_chips[player])
        self.players_chips[player] -= committed
        self.current_bets[player] += committed
        self.pot += committed
        return committed

    def apply_action(self, player: int, action: str, amount: int = 0):
        """Apply ``action`` for ``player``, then advance the turn or the round.

        Args:
            player: seat index of the acting player; ignored if folded.
            action: "fold", "check", "call", "raise" or "all-in".
            amount: chips to add for call/raise/all-in (capped at the stack).
        """
        if self.folded[player]:
            return

        self.actions.append((player, action, amount))

        # Measured before any chips move: once the player's own bet has been
        # updated it may itself be the maximum, which would make the call
        # portion look like zero.
        to_call = max(self.current_bets) - self.current_bets[player]

        if action == "fold":
            self.folded[player] = True
        elif action == "call":
            self._commit_chips(player, amount)
        elif action == "raise":
            committed = self._commit_chips(player, amount)
            # The raise increment is what was put in beyond the call.
            self.last_raise = max(committed - to_call, 0)
        elif action == "all-in":
            committed = self._commit_chips(player, amount)
            # Only an all-in that exceeds the call counts as a raise.
            if committed > to_call:
                self.last_raise = committed - to_call
        # "check" moves no chips.

        self.current_player = (self.current_player + 1) % self.num_players
        while self.folded[self.current_player] and not self.is_round_over():
            self.current_player = (self.current_player + 1) % self.num_players

        if self.is_round_over():
            self.advance_round()

    def is_round_over(self) -> bool:
        """Return True when the current betting round is complete.

        The round ends when at most one player remains, or when every
        active player has acted since the last "raise" (with equal bets);
        without any raise it ends once every active player has acted.
        """
        if sum(1 for f in self.folded if not f) <= 1:
            return True

        active_players = [i for i in range(self.num_players) if not self.folded[i]]
        if len(self.actions) == 0:
            return False

        last_raise_index = -1
        for i, (p, a, _) in enumerate(self.actions):
            if a == "raise":
                last_raise_index = i

        if last_raise_index == -1:
            acted_players = set(p for p, _, _ in self.actions)
            return all(p in acted_players for p in active_players)
        else:
            # Everyone still in must have responded after the last raise.
            required_players = active_players.copy()
            for p, _, _ in self.actions[last_raise_index+1:]:
                if p in required_players:
                    required_players.remove(p)
            return len(required_players) == 0 and all(
                self.current_bets[p] == self.current_bets[active_players[0]]
                for p in active_players
            )

    def advance_round(self):
        """Start the next betting round and deal its community cards."""
        self.round += 1
        self.actions = []
        self.last_raise = 0
        self.current_bets = [0] * self.num_players

        if self.round == 1:
            self.community_cards.extend([self.deck.deal() for _ in range(3)])
        elif self.round in [2, 3]:
            self.community_cards.append(self.deck.deal())

        active_players = [i for i in range(self.num_players) if not self.folded[i]]
        if active_players:
            # Post-flop action starts with the first live seat after the button.
            self.current_player = (self.dealer_position + 1) % self.num_players
            while self.folded[self.current_player]:
                self.current_player = (self.current_player + 1) % self.num_players

    def is_hand_over(self) -> bool:
        """Return True after the river round or when at most one player remains."""
        return self.round >= 4 or sum(1 for f in self.folded if not f) <= 1

    def get_winner(self) -> List[int]:
        """Return the seat indices of the winning player(s).

        If all but one player folded that player wins outright; otherwise
        the best hand among the remaining players wins and ties share.
        """
        if sum(self.folded) == self.num_players - 1:
            return [i for i, folded in enumerate(self.folded) if not folded]

        active_players = [i for i in range(self.num_players) if not self.folded[i]]
        best_hand = None
        winners = []

        for player in active_players:
            all_cards = self.player_hands[player] + self.community_cards
            hand_rank = evaluate_hand(all_cards)

            if best_hand is None or hand_rank > best_hand:
                best_hand = hand_rank
                winners = [player]
            elif hand_rank == best_hand:
                winners.append(player)

        return winners

    def distribute_pot(self, winners: List[int]):
        """Split the pot evenly among ``winners`` and record the shown cards.

        With an empty ``winners`` list nothing is paid out and the pot is
        left untouched.  Any remainder of an uneven split is discarded.
        """
        if winners:
            win_amount = self.pot // len(winners)
            for winner in winners:
                self.players_chips[winner] += win_amount
            self.pot = 0

        # Remember all shown cards
        for player in range(self.num_players):
            if not self.folded[player]:
                for card in self.player_hands[player]:
                    self.seen_cards.add(card)
        for card in self.community_cards:
            self.seen_cards.add(card)

class PokerQLearningAgent:
    """Tabular Q-learning poker player that never bluffs.

    Q-values are stored per ``state_key`` and per action key.  When acting
    greedily each Q-value is scaled by a factor derived from the estimated
    hand strength, so strong hands favour raising and weak hands folding.

    Action keys: ``(name, amount)`` for fold/check/call; a raise is keyed
    by its minimum legal size as ``("raise", min_amount)`` and an all-in is
    keyed as ``("raise", stack)``, whatever amount is actually wagered.
    """

    def __init__(self, player_index: int, alpha: float = 0.1, gamma: float = 0.9,
                 epsilon: float = 0.2, initial_q_value: float = 0.0):
        self.player_index = player_index
        self.alpha = alpha                  # learning rate
        self.gamma = gamma                  # discount factor
        self.epsilon = epsilon              # exploration probability
        self.initial_q_value = initial_q_value
        self.q_table = defaultdict(lambda: defaultdict(lambda: initial_q_value))
        self.last_state = None
        self.last_action = None             # (name, concrete amount) last returned
        self.last_action_key = None         # Q-table key of that action
        self.hand_strength_memory = {}  # Stores hand strength estimates

    @staticmethod
    def _action_key(action, amount):
        """Map a legal action to the key used for it in the Q-table."""
        if action == "raise":
            return ("raise", amount[0] if isinstance(amount, tuple) else amount)
        if action == "all-in":
            return ("raise", amount)  # Treat all-in as max raise
        return (action, amount)

    def estimate_hand_strength(self, game_state: "PokerGameState") -> float:
        """Calculate hand strength without bluffing (0-1 scale)"""
        hand = game_state.player_hands[self.player_index]
        community = game_state.community_cards

        if not community:  # Pre-flop
            # Simplified pre-flop heuristic.  The two values are ordered
            # first so the estimate does not depend on deal order.
            high, low = sorted((hand[0].get_value(), hand[1].get_value()), reverse=True)
            suited = hand[0].suit == hand[1].suit

            if high == low:  # Pair
                strength = min(0.7 + high/30, 0.95)
            elif high >= 12 and low >= 10 and (high - low <= 2 or suited):
                strength = min(0.5 + high/40, 0.85)
            elif high >= 10 and low >= 8:
                strength = min(0.3 + high/50, 0.7)
            else:
                strength = min(0.1 + high/80, 0.4)
            return strength

        # Post-flop - use actual evaluation
        all_cards = hand + community
        rank, _ = evaluate_hand(all_cards)
        return min(0.1 + rank * 0.1, 0.9)  # Convert rank to 0.1-0.9 scale

    def get_state_key(self, game_state: "PokerGameState") -> str:
        """State representation with hand strength and seen cards info"""
        hand = game_state.player_hands[self.player_index]
        h1, h2 = hand[0].get_value(), hand[1].get_value()
        suited = "s" if hand[0].suit == hand[1].suit else "o"

        # Hand strength category (discretized)
        hand_strength = self.estimate_hand_strength(game_state)
        strength_category = min(int(hand_strength * 10), 9)  # 0-9

        # Community cards info
        cc = game_state.community_cards
        cc_count = len(cc)

        # Pot and bet info
        to_call = max(game_state.current_bets) - game_state.current_bets[self.player_index]
        pot_odds = to_call / (game_state.pot + to_call) if (game_state.pot + to_call) > 0 else 0
        pot_odds_category = min(int(pot_odds * 10), 9)  # 0-9

        # Seat offset from the dealer button
        position = (self.player_index - game_state.dealer_position) % game_state.num_players

        # Seen cards impact (how many of our cards have been seen before)
        seen_count = sum(1 for card in hand if card in game_state.seen_cards)

        state_parts = [
            f"HS{strength_category}",
            f"H{max(h1,h2)}{min(h1,h2)}{suited}",
            f"CC{cc_count}",
            f"PO{pot_odds_category}",
            f"POS{position}",
            f"SC{seen_count}",
            f"RD{game_state.round}"
        ]
        return "|".join(state_parts)

    def get_action(self, game_state: "PokerGameState", training: bool = True) -> Tuple[str, int]:
        """Choose an action for the current state.

        With probability ``epsilon`` (only when ``training``) a random legal
        action is taken; otherwise the action with the highest
        strength-adjusted Q-value is chosen, ties broken at random.

        Returns:
            ``(name, amount)`` with a concrete integer amount; a raise is
            sized uniformly at random within its legal range.
        """
        legal_actions = game_state.get_legal_actions(self.player_index)
        if not legal_actions:
            return ("fold", 0)

        state_key = self.get_state_key(game_state)
        hand_strength = self.estimate_hand_strength(game_state)

        if training and random.random() < self.epsilon:
            # Exploration
            action, amount = random.choice(legal_actions)
        else:
            # Action selection based on Q-values and hand strength
            action_values = []
            for action, amount in legal_actions:
                q_value = self.q_table[state_key][self._action_key(action, amount)]

                # Hand strength modifier - no bluffing means strong
                # correlation between hand strength and action
                if action == "fold":
                    q_value *= (1 - hand_strength)
                elif action in ["call", "check"]:
                    q_value *= (0.5 + hand_strength * 0.5)
                elif action in ["raise", "all-in"]:
                    q_value *= (0.3 + hand_strength * 0.7)

                action_values.append((action, amount, q_value))

            best_value = max(v for _, _, v in action_values)
            best_actions = [(a, amt) for a, amt, v in action_values if v == best_value]
            action, amount = random.choice(best_actions)

        # Capture the key before the raise range is collapsed to a concrete
        # amount, so learn() updates the same entry this method reads.
        action_key = self._action_key(action, amount)
        if action in ["raise", "all-in"] and isinstance(amount, tuple):
            amount = random.randint(amount[0], amount[1])

        self.last_state = state_key
        self.last_action = (action, amount)
        self.last_action_key = action_key
        return (action, amount)

    def learn(self, next_state: "PokerGameState", reward: float):
        """Apply one Q-learning update for the most recent action.

        Does nothing if no action has been recorded.  ``reward`` is scaled
        by the hand strength estimated in ``next_state`` before the update.
        When ``next_state`` ends the hand, the remembered action is cleared
        so it cannot be credited again for a later hand.
        """
        if self.last_state is None or self.last_action is None:
            return

        next_state_key = self.get_state_key(next_state)

        # Enhanced reward based on hand strength
        hand_strength = self.estimate_hand_strength(next_state)
        reward = reward * (0.5 + hand_strength * 0.5)  # Scale reward by hand strength

        action_key = self.last_action_key
        if action_key is None:
            action_key = self._action_key(*self.last_action)

        current_q = self.q_table[self.last_state][action_key]

        hand_over = next_state.is_hand_over()
        next_legal_actions = next_state.get_legal_actions(self.player_index)
        if next_legal_actions and not hand_over:
            max_next_q = max(
                self.q_table[next_state_key][self._action_key(a, amt)]
                for a, amt in next_legal_actions
            )
        else:
            max_next_q = 0

        new_q = current_q + self.alpha * (reward + self.gamma * max_next_q - current_q)
        self.q_table[self.last_state][action_key] = new_q

        # Decay epsilon to reduce exploration over time
        self.epsilon = max(0.05, self.epsilon * 0.999)

        if hand_over:
            self.last_state = None
            self.last_action = None
            self.last_action_key = None

    def save_q_table(self, filename: str):
        """Pickle the Q-table to ``filename`` as plain nested dicts."""
        with open(filename, 'wb') as f:
            q_table_regular = {k: dict(v) for k, v in self.q_table.items()}
            pickle.dump(q_table_regular, f)

    def load_q_table(self, filename: str):
        """Replace the Q-table with the one pickled in ``filename``.

        Raises:
            FileNotFoundError: if ``filename`` does not exist.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that this program wrote itself.
        with open(filename, 'rb') as f:
            q_table_regular = pickle.load(f)
            self.q_table = defaultdict(lambda: defaultdict(lambda: self.initial_q_value))
            for k, v in q_table_regular.items():
                self.q_table[k].update(v)

def train_agents(episodes=10000, agents=None):
    """Train Q-learning agents by self-play and save their Q-tables.

    Each episode plays one hand on a fresh table.  Afterwards every agent
    receives a single reward: winners get their pot share scaled by hand
    category, losers are penalised in proportion to the chips they put in.

    Args:
        episodes: number of hands to play.
        agents: optional existing agents to keep training; ``agents[i]``
            must have ``player_index == i``.  When omitted, four new
            agents are created.

    Returns:
        The list of trained agents.  Each Q-table is also written to
        ``no_bluff_agent_{i}.pkl`` in the working directory.
    """
    if agents is None:
        agents = [PokerQLearningAgent(i) for i in range(4)]
    num_players = len(agents)

    # Create progress bar
    with tqdm(total=episodes, desc="Training agents") as pbar:
        for episode in range(episodes):
            game = PokerGameState(num_players=num_players)
            # Snapshot the stacks before blinds are posted: per-round bets
            # are reset whenever a betting round ends, so the chips a
            # player invested can only be recovered from the stack change.
            starting_chips = list(game.players_chips)
            game.start_new_hand()

            while not game.is_hand_over():
                current_player = game.current_player
                agent = agents[current_player]

                action, amount = agent.get_action(game)
                game.apply_action(current_player, action, amount)

            # Calculate rewards
            winners = game.get_winner()
            for i, agent in enumerate(agents):
                if i in winners:
                    # Reward based on both winning and hand strength
                    all_cards = game.player_hands[i] + game.community_cards
                    # A hand won by folds before the flop has only the two
                    # hole cards, which cannot be evaluated as a poker hand.
                    rank = evaluate_hand(all_cards)[0] if len(all_cards) >= 5 else 0
                    reward = (game.pot / len(winners)) * (0.5 + rank * 0.1)
                else:
                    # Penalize based on investment and hand strength
                    hand_strength = agent.estimate_hand_strength(game)
                    invested = starting_chips[i] - game.players_chips[i]
                    reward = -invested * (1.5 - hand_strength * 0.5)

                agent.learn(game, reward)

            # Update progress bar every episode
            pbar.update(1)

            # Optional: Update description with current epsilon (exploration rate)
            if (episode + 1) % 100 == 0:
                pbar.set_description(f"Training agents (ε={agents[0].epsilon:.2f})")

    # Save trained agents
    for i, agent in enumerate(agents):
        agent.save_q_table(f"no_bluff_agent_{i}.pkl")

    return agents

def visible_poker_game(agents: List[PokerQLearningAgent], num_hands: int = 3, initial_chips: int = 1000):
    """Play hands between four agents and print every step to stdout.

    Agents act greedily (no exploration).  Chip stacks are reset whenever a
    player has no chips left at the start of a hand, and the function waits
    for Enter between hands.

    Args:
        agents: the four agents, indexed by seat.
        num_hands: number of hands to play.
        initial_chips: starting stack of every player.
    """
    round_names = {0: 'Pre-flop', 1: 'Flop', 2: 'Turn'}
    hand_names = [
        "High Card", "One Pair", "Two Pair", "Three of a Kind",
        "Straight", "Flush", "Full House", "Four of a Kind",
        "Straight Flush"
    ]

    game = PokerGameState(num_players=4, initial_chips=initial_chips)
    hand_count = 0

    while hand_count < num_hands:
        # Reset if any player is broke
        if any(chips <= 0 for chips in game.players_chips):
            print("\n=== Resetting chip counts (player went broke) ===")
            game = PokerGameState(num_players=4, initial_chips=initial_chips)

        hand_count += 1
        print(f"\n=== Starting Hand #{hand_count} ===")
        print(f"Chip counts: {[f'P{i}: ${chips}' for i, chips in enumerate(game.players_chips)]}")
        game.start_new_hand()

        print(f"\nDealer: Player {game.dealer_position}")
        print(f"Small Blind: Player {(game.dealer_position + 1) % 4} (${game.small_blind})")
        print(f"Big Blind: Player {(game.dealer_position + 2) % 4} (${game.big_blind})")

        while not game.is_hand_over():
            print("\n" + "="*50)
            print(f"Round {round_names.get(game.round, 'River')}")

            if game.community_cards:
                print(f"\nCommunity Cards: {', '.join(str(card) for card in game.community_cards)}")
            else:
                print("\nCommunity Cards: None yet")

            for i in range(4):
                status = [
                    f"Player {i}:",
                    f"Chips: ${game.players_chips[i]}",
                    f"Current Bet: ${game.current_bets[i]}",
                    "[FOLDED]" if game.folded[i] else "",
                ]
                # Only the player to act has their hole cards revealed.
                if game.is_hand_over() or i == game.current_player:
                    status.append(f"Hand: {', '.join(str(card) for card in game.player_hands[i])}")
                else:
                    status.append("Hand: [Hidden]")
                print(" ".join(status))

            current_player = game.current_player
            print(f"\nPlayer {current_player}'s turn")
            legal_actions = game.get_legal_actions(current_player)

            print("Available actions:")
            for i, (action, amount) in enumerate(legal_actions):
                if action == "raise":
                    print(f"  {i+1}. Raise (${amount[0]} to ${amount[1]})")
                elif action == "all-in":
                    print(f"  {i+1}. ALL-IN (${amount})")
                else:
                    print(f"  {i+1}. {action.capitalize()} {f'${amount}' if amount > 0 else ''}")

            # get_action already returns a concrete chip amount.
            action, amount = agents[current_player].get_action(game, training=False)

            print(f"Player {current_player} chooses to {action.upper()} {f'${amount}' if amount > 0 else ''}")
            game.apply_action(current_player, action, amount)

        # Showdown
        print("\n" + "="*50)
        print("=== Hand Results ===")

        print("\nFinal Board:")
        print(f"Community Cards: {', '.join(str(card) for card in game.community_cards)}")

        print("\nPlayer Hands:")
        for i in range(4):
            hand_desc = f"Player {i}: {', '.join(str(card) for card in game.player_hands[i])}"
            if game.folded[i]:
                hand_desc += " [FOLDED]"
            print(hand_desc)

        winners = game.get_winner()
        if winners:
            win_amount = game.pot // len(winners)
            print(f"\nWinner(s): {', '.join(f'Player {w}' for w in winners)}")
            print(f"Each wins: ${win_amount}")

            for winner in winners:
                all_cards = game.player_hands[winner] + game.community_cards
                if len(all_cards) >= 5:
                    rank, _ = evaluate_hand(all_cards)
                    print(f"Player {winner} has: {hand_names[rank]}")
                else:
                    # Everyone else folded before the flop: there is no
                    # five-card hand to name.
                    print(f"Player {winner} wins uncontested before the flop")

            game.distribute_pot(winners)
        else:
            # Nothing to pay out; skip the pot split, which would divide
            # by the number of winners.
            print("\nNo winners - all players folded")

        print("\nUpdated Chip Counts:")
        for i in range(4):
            print(f"Player {i}: ${game.players_chips[i]}")

        # Pause between hands
        if hand_count < num_hands:
            input("\nPress Enter to continue to next hand...")

if __name__ == "__main__":
    # Script entry point: load any saved agents, optionally (re)train, then
    # play a few hands with printed commentary.
    agents = []
    try:
        # One agent per seat; fall back to an untrained agent when no
        # saved Q-table exists for that seat.
        for seat in range(4):
            candidate = PokerQLearningAgent(seat)
            try:
                candidate.load_q_table(f"no_bluff_agent_{seat}.pkl")
            except FileNotFoundError:
                print(f"No saved agent found for player {seat}, creating new one")
                agents.append(PokerQLearningAgent(seat))
            else:
                agents.append(candidate)
                print(f"Loaded agent {seat} from file")

        # Ask before training only if at least one saved file exists;
        # with no saved files, training always runs.
        # NOTE(review): the call below does not pass the loaded agents on,
        # so answering 'y' replaces them with whatever train_agents returns.
        have_saved_agent = any(
            os.path.exists(f"no_bluff_agent_{seat}.pkl") for seat in range(4)
        )
        if (not have_saved_agent
                or input("Some agents were loaded. Continue training? (y/n): ").lower() == 'y'):
            agents = train_agents(episodes=10000)
    except Exception as e:
        print(f"Error loading agents: {e}")
        print("Creating new agents...")
        agents = train_agents(episodes=10000)

    print("\nStarting visible poker game...")
    visible_poker_game(agents, num_hands=5, initial_chips=1000)

Loaded agent 0 from file
Loaded agent 1 from file
Loaded agent 2 from file
Loaded agent 3 from file


KeyboardInterrupt: Interrupted by user

In [None]:
# Run a 10,000-episode training session and keep the agents it returns.
agents = train_agents(episodes=10000)


In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque, namedtuple
import itertools
import os
from tqdm import tqdm

In [None]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque, namedtuple
import itertools
import os
from tqdm import tqdm

# Define the neural network for Deep Q-Learning
class DQN(nn.Module):
    """Fully connected Q-network: input -> 256 -> 256 -> 128 -> output.

    ReLU follows each of the three hidden layers; the output layer is
    linear.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        wide, narrow = 256, 128
        self.fc1 = nn.Linear(input_size, wide)
        self.fc2 = nn.Linear(wide, wide)
        self.fc3 = nn.Linear(wide, narrow)
        self.fc4 = nn.Linear(narrow, output_size)

    def forward(self, x):
        """Return one output value per action for input tensor ``x``."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = torch.relu(hidden_layer(x))
        return self.fc4(x)

# Experience replay: one stored step of interaction and the buffer holding them
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory:
    """Fixed-capacity buffer of transitions; the oldest entries drop out first."""

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Store a transition built from (state, action, next_state, reward)."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        batch = random.sample(self.memory, batch_size)
        return batch

    def __len__(self):
        return len(self.memory)

class PokerDQNAgent:
    """Deep Q-learning poker player with epsilon-greedy action selection.

    Holds a policy network, a periodically synchronised target network, an
    Adam optimizer and a replay memory. Actions are indexed in the order
    fold, check, call, raise, all-in.
    """

    # Action names in network-output / mask order, and suit order for one-hots.
    _ACTIONS = ('fold', 'check', 'call', 'raise', 'all-in')
    _SUITS = ('h', 'd', 'c', 's')

    def __init__(self, player_index, state_size, action_size, device):
        """Create the networks, optimizer, replay memory and hyper-parameters.

        Args:
            player_index: Seat of this agent in the game state's per-player lists.
            state_size: Length of the vector built by get_state_representation.
            action_size: Number of network outputs (one per action).
            device: torch device the networks and tensors live on.
        """
        self.player_index = player_index
        self.state_size = state_size
        self.action_size = action_size
        self.device = device

        # Main network
        self.policy_net = DQN(state_size, action_size).to(device)
        # Target network starts as a frozen copy of the policy network
        self.target_net = DQN(state_size, action_size).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=0.001)
        self.memory = ReplayMemory(10000)
        self.batch_size = 128
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.05  # Higher minimum exploration
        self.epsilon_decay = 0.9999  # Slower decay
        self.update_target_every = 100
        self.steps_done = 0
        self.bluff_stats = {'attempted': 0, 'successful': 0, 'failed': 0}
        # Set by select_action when a bluff is attempted; the training loop
        # reads it to score the bluff and resets it after every hand.
        self.is_bluff = False

    def get_state_representation(self, game_state):
        """Convert the game state into a (1, N) float tensor on self.device.

        Layout: hole cards (104), community cards (52), own bet, other bets,
        pot, own chips, other chips, last-3-action history (15), position
        one-hot (4), round one-hot (4), pot commitment, pot growth and hand
        strength. With four players this is 191 values.
        """
        # 1. Hole cards (104 dim one-hot): 13 ranks * 4 suits * 2 cards
        hole_cards = game_state.player_hands[self.player_index]
        hole_card_rep = np.zeros(104)
        for i, card in enumerate(hole_cards):
            rank_idx = card.get_value() - 2  # 2->0, A->12
            suit_idx = self._SUITS.index(card.suit)
            hole_card_rep[i * 52 + rank_idx * 4 + suit_idx] = 1

        # 2. Community cards (52 dim one-hot)
        community_rep = np.zeros(52)
        for card in game_state.community_cards:
            rank_idx = card.get_value() - 2
            suit_idx = self._SUITS.index(card.suit)
            community_rep[rank_idx * 4 + suit_idx] = 1

        # 3. Current bets (normalized)
        current_bets = np.array(game_state.current_bets) / 1000
        my_bet = current_bets[self.player_index]
        other_bets = np.delete(current_bets, self.player_index)

        # 4. Pot size (normalized)
        pot_size = game_state.pot / 1000

        # 5. Chips (normalized)
        chips = np.array(game_state.players_chips) / 1000
        my_chips = chips[self.player_index]
        other_chips = np.delete(chips, self.player_index)

        # 6. Action history: one 5-slot group per recent action (last 3).
        # The slot holds the normalized amount, so zero-amount actions
        # (fold/check) leave their group all zeros.
        action_history = np.zeros(15)
        for i, (player, action, amount) in enumerate(game_state.actions[-3:]):
            action_idx = self._ACTIONS.index(action)
            action_history[i * 5 + action_idx] = amount / 1000 if amount > 0 else 0

        # 7. Position (seats after the dealer) and betting round
        position = (self.player_index - game_state.dealer_position) % game_state.num_players
        position_onehot = np.zeros(4)
        position_onehot[position] = 1
        round_onehot = np.zeros(4)
        round_onehot[game_state.round] = 1

        # 8. Pot commitment and growth (growth is 1.0 when the caller has
        # not attached an `initial_pot` attribute to the game state)
        pot_commitment = sum(game_state.current_bets) / (game_state.pot + 1e-6)
        if hasattr(game_state, 'initial_pot'):
            pot_growth = game_state.pot / (game_state.initial_pot + 1e-6)
        else:
            pot_growth = 1.0

        # 9. Hand strength estimate
        hand_strength = self.evaluate_hand_strength(hole_cards, game_state.community_cards)

        # Concatenate all features
        state_vector = np.concatenate([
            hole_card_rep,
            community_rep,
            [my_bet],
            other_bets,
            [pot_size],
            [my_chips],
            other_chips,
            action_history,
            position_onehot,
            round_onehot,
            [pot_commitment],
            [pot_growth],
            [hand_strength]
        ])

        return torch.tensor(state_vector, dtype=torch.float32, device=self.device).unsqueeze(0)

    def evaluate_hand_strength(self, hole_cards, community_cards):
        """Estimate hand strength (roughly 0-1) for bluffing decisions.

        Pre-flop uses a heuristic on pair / suited / high card; once
        community cards exist the evaluate_hand rank is divided by 8.
        """
        if not community_cards:  # Pre-flop
            card1, card2 = hole_cards
            high_card = max(card1.get_value(), card2.get_value())
            suited = card1.suit == card2.suit
            paired = card1.get_value() == card2.get_value()

            if paired:
                return 0.6 + high_card * 0.02
            if suited:
                return 0.4 + high_card * 0.015
            return 0.3 + high_card * 0.01

        # Post-flop: use actual hand ranking
        all_cards = hole_cards + community_cards
        rank, _ = evaluate_hand(all_cards)
        return rank / 8  # Normalize to 0-1 (8 being straight flush)

    def get_legal_actions_mask(self, game_state):
        """Return a length-action_size 0/1 tensor marking the legal actions."""
        legal_actions = game_state.get_legal_actions(self.player_index)
        mask = torch.zeros(self.action_size, device=self.device)

        for action, amount in legal_actions:
            # Unknown action names are ignored.
            if action in self._ACTIONS:
                mask[self._ACTIONS.index(action)] = 1

        return mask

    def select_action(self, game_state, training=True):
        """Choose an (action, amount) pair for this player.

        With probability epsilon (training only) a uniformly random legal
        action is taken; otherwise the legal action with the highest Q-value.
        Raise sizes are drawn at random between the minimum raise and the
        player's stack. Sets self.is_bluff and counts an attempted bluff
        when raising or shoving with a weak hand.
        """
        state = self.get_state_representation(game_state)
        legal_actions_mask = self.get_legal_actions_mask(game_state)

        if training and random.random() < self.epsilon:
            # Random action (exploration)
            legal_indices = torch.nonzero(legal_actions_mask == 1).flatten().tolist()
            action_idx = random.choice(legal_indices)
        else:
            # Greedy action (exploitation). Illegal actions are pushed to
            # -inf: multiplying by the mask would give them a value of 0,
            # which beats every legal action whose Q-value is negative.
            with torch.no_grad():
                q_values = self.policy_net(state).squeeze(0)
                q_values = q_values.masked_fill(legal_actions_mask == 0, float('-inf'))
                action_idx = q_values.argmax().item()

        # Convert action index to poker action
        action = self._ACTIONS[action_idx]

        if action == 'raise':
            current_bet = game_state.current_bets[self.player_index]
            max_bet = max(game_state.current_bets)
            chips = game_state.players_chips[self.player_index]

            min_raise = max(game_state.last_raise, game_state.big_blind)
            max_amount = chips
            # A stack smaller than the minimum raise would make randint raise
            # ValueError; cap the lower bound at the stack instead.
            min_amount = min(max_bet - current_bet + min_raise, max_amount)
            amount = random.randint(min_amount, max_amount)

            # Bluffing logic - more aggressive with weak hands
            hand_strength = self.evaluate_hand_strength(
                game_state.player_hands[self.player_index],
                game_state.community_cards
            )
            if hand_strength < 0.4 and random.random() < 0.3:
                amount = min(max_amount, int(min_amount * 1.5))
                self.bluff_stats['attempted'] += 1
                self.is_bluff = True
        elif action == 'all-in':
            amount = game_state.players_chips[self.player_index]
            # Track all-ins with weak hands as bluffs
            hand_strength = self.evaluate_hand_strength(
                game_state.player_hands[self.player_index],
                game_state.community_cards
            )
            if hand_strength < 0.5:
                self.bluff_stats['attempted'] += 1
                self.is_bluff = True
        elif action == 'call':
            amount = max(0, max(game_state.current_bets) - game_state.current_bets[self.player_index])
        else:  # fold, check
            amount = 0

        return action, amount

    def learn(self):
        """Run one DQN update on a random batch from replay memory.

        Does nothing until the memory holds at least batch_size transitions.
        Also syncs the target network every update_target_every updates and
        decays epsilon.
        """
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # Convert to tensors
        state_batch = torch.cat(batch.state)
        action_batch = torch.tensor(batch.action, device=self.device)
        reward_batch = torch.tensor(batch.reward, device=self.device, dtype=torch.float32)

        # Compute Q(s_t, a)
        state_action_values = self.policy_net(state_batch).gather(1, action_batch.unsqueeze(1))

        # Compute V(s_{t+1}); terminal transitions (next_state None) stay 0
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        non_final_mask = torch.tensor(
            [s is not None for s in batch.next_state],
            device=self.device, dtype=torch.bool
        )
        non_final_next_states = [s for s in batch.next_state if s is not None]
        # torch.cat rejects an empty list, so skip the target network when
        # every sampled transition is terminal.
        if non_final_next_states:
            with torch.no_grad():
                next_state_values[non_final_mask] = self.target_net(
                    torch.cat(non_final_next_states)
                ).max(1)[0]

        # Compute expected Q values
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        # Compute loss
        loss = nn.MSELoss()(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.steps_done += 1
        if self.steps_done % self.update_target_every == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

        # Decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def save_model(self, path):
        """Save the model and training state"""
        torch.save({
            'policy_net_state_dict': self.policy_net.state_dict(),
            'target_net_state_dict': self.target_net.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epsilon': self.epsilon,
            'bluff_stats': self.bluff_stats,
            'steps_done': self.steps_done
        }, path)
        print(f"Model saved to {path}")

    def load_model(self, path, reset_epsilon=False):
        """Load a saved model if ``path`` exists; otherwise keep fresh weights.

        Args:
            path: Checkpoint file written by save_model.
            reset_epsilon: When True, restart exploration at 1.0 instead of
                restoring the saved epsilon.
        """
        if os.path.exists(path):
            # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
            checkpoint = torch.load(path, map_location=self.device)
            self.policy_net.load_state_dict(checkpoint['policy_net_state_dict'])
            self.target_net.load_state_dict(checkpoint['target_net_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            self.epsilon = 1.0 if reset_epsilon else checkpoint['epsilon']
            self.bluff_stats = checkpoint.get('bluff_stats', {'attempted': 0, 'successful': 0, 'failed': 0})
            self.steps_done = checkpoint.get('steps_done', 0)
            print(f"Model loaded from {path}")
        else:
            print(f"No model found at {path}, starting from scratch")

def train_dqn_agents(episodes=10000, save_dir="saved_agents"):
    """Train four DQN agents against each other, one poker hand per episode.

    Existing checkpoints named ``dqn_agent_{i}.pth`` in ``save_dir`` are
    loaded first. Checkpoints are rewritten every 1000 episodes and a
    ``dqn_agent_{i}_final.pth`` copy is written at the end.

    Args:
        episodes: Number of hands to play.
        save_dir: Directory for checkpoints (created if missing).

    Returns:
        The list of trained PokerDQNAgent objects, indexed by seat.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_players = 4
    state_size = 104 + 52 + 1 + 3 + 1 + 1 + 3 + 15 + 4 + 4 + 3  # Updated state size
    action_size = 5  # fold, check, call, raise, all-in
    action_names = ['fold', 'check', 'call', 'raise', 'all-in']

    # Create directory for saved agents
    os.makedirs(save_dir, exist_ok=True)

    agents = [PokerDQNAgent(i, state_size, action_size, device) for i in range(num_players)]

    # Try to load existing models
    for i, agent in enumerate(agents):
        agent.load_model(f"{save_dir}/dqn_agent_{i}.pth")

    with tqdm(total=episodes, desc="Training DQN agents") as pbar:
        for episode in range(episodes):
            game = PokerGameState(num_players=num_players)
            game.start_new_hand()
            game.initial_pot = game.pot  # Track initial pot for pot growth feature

            # Track starting chips for reward calculation
            starting_chips = list(game.players_chips)

            # Steps taken in THIS hand, per seat. They are committed to replay
            # memory only after the hand ends, so the final reward always
            # lands on a transition from this hand. Overwriting memory[-1]
            # would hit a previous hand's step when a player never acted.
            hand_steps = [[] for _ in range(num_players)]

            while not game.is_hand_over():
                current_player = game.current_player
                agent = agents[current_player]

                state = agent.get_state_representation(game)
                action, amount = agent.select_action(game)
                action_idx = action_names.index(action)

                # Execute action
                game.apply_action(current_player, action, amount)

                # Get new state
                new_state = agent.get_state_representation(game) if not game.is_hand_over() else None

                hand_steps[current_player].append((state, action_idx, new_state))

            # Calculate rewards after hand ends
            # NOTE(review): unlike visible_poker_game_dqn, distribute_pot is
            # not called here, so chip_diff may exclude winnings - confirm
            # whether get_winner() already pays out the pot.
            winners = game.get_winner() or []
            recent_actions = [a[1] for a in game.actions[-3:]]
            for i, agent in enumerate(agents):
                # Calculate chip difference
                chip_diff = game.players_chips[i] - starting_chips[i]
                bluffed = getattr(agent, 'is_bluff', False)

                # Assign rewards
                if game.players_chips[i] <= 0:  # Bankruptcy
                    reward = -100
                elif i in winners:
                    # Bonus for winning via aggression
                    aggression_score = sum(1 for a in recent_actions if a in ("raise", "all-in"))
                    reward = 50 + chip_diff * 2 + aggression_score * 5

                    # Track successful bluffs
                    if bluffed:
                        agent.bluff_stats['successful'] += 1
                else:
                    # Penalize passive play when ahead
                    if chip_diff > 0 and all(a == "check" for a in recent_actions):
                        reward = chip_diff * 0.2  # Heavy penalty for passive play
                    else:
                        reward = chip_diff * 1

                    # Track failed bluffs
                    if bluffed:
                        agent.bluff_stats['failed'] += 1

                # Commit this hand's steps. Intermediate steps carry no
                # reward; the last one carries the hand outcome and is marked
                # terminal (next_state None) so it is not bootstrapped.
                steps = hand_steps[i]
                for state, action_idx, next_state in steps[:-1]:
                    agent.memory.push(state, action_idx, next_state, 0.0)
                if steps:
                    last_state, last_action, _ = steps[-1]
                    agent.memory.push(last_state, last_action, None, float(reward))

                # Train the agent
                agent.learn()
                agent.is_bluff = False  # Reset bluff flag

            # Update progress bar
            pbar.update(1)
            if (episode + 1) % 100 == 0:
                # Show epsilon and bluff stats
                bluff_rate = agents[0].bluff_stats['attempted'] / (episode + 1) * num_players
                success_rate = 0
                if agents[0].bluff_stats['attempted'] > 0:
                    success_rate = agents[0].bluff_stats['successful'] / agents[0].bluff_stats['attempted'] * 100
                pbar.set_description(f"Training (ε={agents[0].epsilon:.2f}, Bluffs: {bluff_rate:.1f}%, Success: {success_rate:.1f}%)")

            # Save models periodically
            if (episode + 1) % 1000 == 0:
                for i, agent in enumerate(agents):
                    agent.save_model(f"{save_dir}/dqn_agent_{i}.pth")

    # Final save
    for i, agent in enumerate(agents):
        agent.save_model(f"{save_dir}/dqn_agent_{i}_final.pth")

    return agents

def visible_poker_game_dqn(agents, num_hands=3, initial_chips=1000):
    """Play four-player hands with the given agents, printing every step.

    Agents act greedily (``training=False``). Before each hand the game is
    recreated with fresh stacks if any player has no chips left. Waits for
    Enter between hands.

    Args:
        agents: One agent per seat, each exposing ``select_action(game, training=False)``.
        num_hands: Number of hands to play.
        initial_chips: Starting stack passed to PokerGameState.
    """
    round_names = {0: 'Pre-flop', 1: 'Flop', 2: 'Turn'}  # anything else prints as River
    hand_names = [
        "High Card", "One Pair", "Two Pair", "Three of a Kind",
        "Straight", "Flush", "Full House", "Four of a Kind",
        "Straight Flush"
    ]

    game = PokerGameState(num_players=4, initial_chips=initial_chips)
    hand_count = 0

    while hand_count < num_hands:
        # Reset if any player is broke
        if any(chips <= 0 for chips in game.players_chips):
            print("\n=== Resetting chip counts (player went broke) ===")
            game = PokerGameState(num_players=4, initial_chips=initial_chips)

        hand_count += 1
        game.start_new_hand()
        game.initial_pot = game.pot  # read by the agents' pot-growth feature

        print(f"\n=== Starting Hand #{hand_count} ===")
        print(f"Chip counts: {[f'P{i}: ${chips}' for i, chips in enumerate(game.players_chips)]}")
        print(f"Dealer: Player {game.dealer_position}")
        print(f"Blinds: P{(game.dealer_position + 1) % 4} (${game.small_blind}), P{(game.dealer_position + 2) % 4} (${game.big_blind})")

        while not game.is_hand_over():
            print("\n" + "="*50)
            print(f"Round {round_names.get(game.round, 'River')}")

            if game.community_cards:
                print(f"\nCommunity Cards: {', '.join(str(card) for card in game.community_cards)}")
            else:
                print("\nCommunity Cards: None yet")

            for i in range(4):
                status = [
                    f"Player {i}:",
                    f"Chips: ${game.players_chips[i]}",
                    f"Current Bet: ${game.current_bets[i]}",
                    "[FOLDED]" if game.folded[i] else "",
                ]
                # Only the player to act has their hole cards shown.
                if game.is_hand_over() or i == game.current_player:
                    status.append(f"Hand: {', '.join(str(card) for card in game.player_hands[i])}")
                else:
                    status.append("Hand: [Hidden]")
                print(" ".join(status))

            current_player = game.current_player
            print(f"\nPlayer {current_player}'s turn")

            action, amount = agents[current_player].select_action(game, training=False)

            print(f"Player {current_player} chooses to {action.upper()} {f'${amount}' if amount > 0 else ''}")
            game.apply_action(current_player, action, amount)

        # Showdown
        print("\n" + "="*50)
        print("=== Hand Results ===")

        print("\nFinal Board:")
        print(f"Community Cards: {', '.join(str(card) for card in game.community_cards)}")

        print("\nPlayer Hands:")
        for i in range(4):
            hand_desc = f"Player {i}: {', '.join(str(card) for card in game.player_hands[i])}"
            if game.folded[i]:
                hand_desc += " [FOLDED]"
            print(hand_desc)

        winners = game.get_winner()
        if winners:
            win_amount = game.pot // len(winners)
            print(f"\nWinner(s): {', '.join(f'Player {w}' for w in winners)}")
            print(f"Each wins: ${win_amount}")

            for winner in winners:
                all_cards = game.player_hands[winner] + game.community_cards
                # evaluate_hand raises ValueError below five cards, which
                # happens when the hand ends by folds before the board is
                # complete; there is no hand to name in that case.
                if len(all_cards) < 5:
                    print(f"Player {winner} wins without a showdown")
                    continue
                rank, _ = evaluate_hand(all_cards)
                print(f"Player {winner} has: {hand_names[rank]}")
        else:
            print("\nNo winners - all players folded")

        game.distribute_pot(winners if winners else [])

        print("\nUpdated Chip Counts:")
        for i in range(4):
            print(f"Player {i}: ${game.players_chips[i]}")

        # Pause between hands
        if hand_count < num_hands:
            input("\nPress Enter to continue to next hand...")

if __name__ == "__main__":
    # Train DQN agents for 10,000 hands. train_dqn_agents first tries to
    # resume from checkpoints in its default save_dir and writes new ones as
    # it goes. The result stays in the module-level name `trained_agents`.
    trained_agents = train_dqn_agents(episodes=10000)
    
    # Play five hands with the trained agents, printing every decision;
    # the game pauses for Enter between hands.
    print("\nStarting visible poker game with trained DQN agents...")
    visible_poker_game_dqn(trained_agents, num_hands=5, initial_chips=1000)

No model found at saved_agents/dqn_agent_0.pth, starting from scratch
No model found at saved_agents/dqn_agent_1.pth, starting from scratch
No model found at saved_agents/dqn_agent_2.pth, starting from scratch
No model found at saved_agents/dqn_agent_3.pth, starting from scratch


Training (ε=0.91, Bluffs: 1.4%, Success: 0.0%):  10%|█         | 1011/10000 [00:11<01:53, 79.41it/s]

Model saved to saved_agents/dqn_agent_0.pth
Model saved to saved_agents/dqn_agent_1.pth
Model saved to saved_agents/dqn_agent_2.pth
Model saved to saved_agents/dqn_agent_3.pth


Training (ε=0.82, Bluffs: 1.4%, Success: 0.0%):  20%|██        | 2014/10000 [00:23<01:45, 76.02it/s]

Model saved to saved_agents/dqn_agent_0.pth
Model saved to saved_agents/dqn_agent_1.pth
Model saved to saved_agents/dqn_agent_2.pth
Model saved to saved_agents/dqn_agent_3.pth


Training (ε=0.75, Bluffs: 1.4%, Success: 0.0%):  30%|███       | 3008/10000 [00:36<01:37, 71.99it/s]

Model saved to saved_agents/dqn_agent_0.pth
Model saved to saved_agents/dqn_agent_1.pth
Model saved to saved_agents/dqn_agent_2.pth
Model saved to saved_agents/dqn_agent_3.pth


Training (ε=0.67, Bluffs: 1.3%, Success: 0.0%):  40%|████      | 4013/10000 [00:49<01:20, 74.81it/s]

Model saved to saved_agents/dqn_agent_0.pth
Model saved to saved_agents/dqn_agent_1.pth
Model saved to saved_agents/dqn_agent_2.pth
Model saved to saved_agents/dqn_agent_3.pth


Training (ε=0.61, Bluffs: 1.3%, Success: 0.0%):  50%|█████     | 5014/10000 [01:02<01:06, 75.02it/s]

Model saved to saved_agents/dqn_agent_0.pth
Model saved to saved_agents/dqn_agent_1.pth
Model saved to saved_agents/dqn_agent_2.pth
Model saved to saved_agents/dqn_agent_3.pth


Training (ε=0.55, Bluffs: 1.2%, Success: 0.0%):  60%|██████    | 6010/10000 [01:16<00:58, 67.98it/s]

Model saved to saved_agents/dqn_agent_0.pth
Model saved to saved_agents/dqn_agent_1.pth
Model saved to saved_agents/dqn_agent_2.pth
Model saved to saved_agents/dqn_agent_3.pth


Training (ε=0.50, Bluffs: 1.2%, Success: 0.0%):  70%|███████   | 7008/10000 [01:30<00:42, 70.21it/s]

Model saved to saved_agents/dqn_agent_0.pth
Model saved to saved_agents/dqn_agent_1.pth
Model saved to saved_agents/dqn_agent_2.pth
Model saved to saved_agents/dqn_agent_3.pth


Training (ε=0.45, Bluffs: 1.1%, Success: 0.0%):  80%|████████  | 8007/10000 [01:44<00:31, 62.99it/s]

Model saved to saved_agents/dqn_agent_0.pth
Model saved to saved_agents/dqn_agent_1.pth
Model saved to saved_agents/dqn_agent_2.pth
Model saved to saved_agents/dqn_agent_3.pth


Training (ε=0.41, Bluffs: 1.1%, Success: 0.0%):  90%|█████████ | 9010/10000 [01:58<00:14, 66.15it/s]

Model saved to saved_agents/dqn_agent_0.pth
Model saved to saved_agents/dqn_agent_1.pth
Model saved to saved_agents/dqn_agent_2.pth
Model saved to saved_agents/dqn_agent_3.pth


Training (ε=0.37, Bluffs: 1.0%, Success: 0.0%): 100%|██████████| 10000/10000 [02:11<00:00, 75.85it/s]


Model saved to saved_agents/dqn_agent_0.pth
Model saved to saved_agents/dqn_agent_1.pth
Model saved to saved_agents/dqn_agent_2.pth
Model saved to saved_agents/dqn_agent_3.pth
Model saved to saved_agents/dqn_agent_0_final.pth
Model saved to saved_agents/dqn_agent_1_final.pth
Model saved to saved_agents/dqn_agent_2_final.pth
Model saved to saved_agents/dqn_agent_3_final.pth

Starting visible poker game with trained DQN agents...

=== Starting Hand #1 ===
Chip counts: ['P0: $1000', 'P1: $1000', 'P2: $990', 'P3: $980']
Dealer: Player 1
Blinds: P2 ($10), P3 ($20)

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $1000 Current Bet: $0  Hand: Jh, 4c
Player 1: Chips: $1000 Current Bet: $0  Hand: [Hidden]
Player 2: Chips: $990 Current Bet: $10  Hand: [Hidden]
Player 3: Chips: $980 Current Bet: $20  Hand: [Hidden]

Player 0's turn
Player 0 chooses to FOLD 

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $1000 Current Bet: $0 [FOLDED] Hand: [Hidden]
Player 1: Chips: $1000 C


Press Enter to continue to next hand... 



=== Resetting chip counts (player went broke) ===

=== Starting Hand #2 ===
Chip counts: ['P0: $1000', 'P1: $1000', 'P2: $990', 'P3: $980']
Dealer: Player 1
Blinds: P2 ($10), P3 ($20)

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $1000 Current Bet: $0  Hand: 2h, 5d
Player 1: Chips: $1000 Current Bet: $0  Hand: [Hidden]
Player 2: Chips: $990 Current Bet: $10  Hand: [Hidden]
Player 3: Chips: $980 Current Bet: $20  Hand: [Hidden]

Player 0's turn
Player 0 chooses to FOLD 

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $1000 Current Bet: $0 [FOLDED] Hand: [Hidden]
Player 1: Chips: $1000 Current Bet: $0  Hand: 6d, 5h
Player 2: Chips: $990 Current Bet: $10  Hand: [Hidden]
Player 3: Chips: $980 Current Bet: $20  Hand: [Hidden]

Player 1's turn
Player 1 chooses to FOLD 

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $1000 Current Bet: $0 [FOLDED] Hand: [Hidden]
Player 1: Chips: $1000 Current Bet: $0 [FOLDED] Hand: [Hidden]
Player 2: Chips: $990 Current 


Press Enter to continue to next hand... 



=== Resetting chip counts (player went broke) ===

=== Starting Hand #3 ===
Chip counts: ['P0: $1000', 'P1: $1000', 'P2: $990', 'P3: $980']
Dealer: Player 1
Blinds: P2 ($10), P3 ($20)

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $1000 Current Bet: $0  Hand: 6d, 7d
Player 1: Chips: $1000 Current Bet: $0  Hand: [Hidden]
Player 2: Chips: $990 Current Bet: $10  Hand: [Hidden]
Player 3: Chips: $980 Current Bet: $20  Hand: [Hidden]

Player 0's turn
Player 0 chooses to FOLD 

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $1000 Current Bet: $0 [FOLDED] Hand: [Hidden]
Player 1: Chips: $1000 Current Bet: $0  Hand: Tc, Qh
Player 2: Chips: $990 Current Bet: $10  Hand: [Hidden]
Player 3: Chips: $980 Current Bet: $20  Hand: [Hidden]

Player 1's turn
Player 1 chooses to FOLD 

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $1000 Current Bet: $0 [FOLDED] Hand: [Hidden]
Player 1: Chips: $1000 Current Bet: $0 [FOLDED] Hand: [Hidden]
Player 2: Chips: $990 Current 


Press Enter to continue to next hand... 



=== Starting Hand #4 ===
Chip counts: ['P0: $980', 'P1: $1000', 'P2: $980', 'P3: $1010']
Dealer: Player 2
Blinds: P3 ($10), P0 ($20)

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $980 Current Bet: $20  Hand: [Hidden]
Player 1: Chips: $1000 Current Bet: $0  Hand: Qs, 3s
Player 2: Chips: $980 Current Bet: $0  Hand: [Hidden]
Player 3: Chips: $1010 Current Bet: $10  Hand: [Hidden]

Player 1's turn
Player 1 chooses to FOLD 

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $980 Current Bet: $20  Hand: [Hidden]
Player 1: Chips: $1000 Current Bet: $0 [FOLDED] Hand: [Hidden]
Player 2: Chips: $980 Current Bet: $0  Hand: 6d, 2h
Player 3: Chips: $1010 Current Bet: $10  Hand: [Hidden]

Player 2's turn
Player 2 chooses to CALL $20

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $980 Current Bet: $20  Hand: [Hidden]
Player 1: Chips: $1000 Current Bet: $0 [FOLDED] Hand: [Hidden]
Player 2: Chips: $960 Current Bet: $20  Hand: [Hidden]
Player 3: Chips: $1010 Current 

In [31]:
# Set the exploration rate of the notebook-global `agent` to 1.0.
# NOTE(review): no top-level `agent` is assigned in the visible cells, so
# confirm which agent object this refers to.
agent.epsilon = 1.0

In [61]:
# Play five visible hands with the notebook-global `trained_agents`,
# starting every player on 1000 chips.
visible_poker_game_dqn(trained_agents, num_hands=5, initial_chips=1000)


=== Starting Hand #1 ===
Chip counts: ['P0: $1000', 'P1: $1000', 'P2: $1000', 'P3: $1000']

Dealer: Player 1
Small Blind: Player 2 ($10)
Big Blind: Player 3 ($20)

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $1000 Current Bet: $0  Hand: 5h, 4d
Player 1: Chips: $1000 Current Bet: $0  Hand: [Hidden]
Player 2: Chips: $990 Current Bet: $10  Hand: [Hidden]
Player 3: Chips: $980 Current Bet: $20  Hand: [Hidden]

Player 0's turn
Player 0 chooses to RAISE $462

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $538 Current Bet: $462  Hand: [Hidden]
Player 1: Chips: $1000 Current Bet: $0  Hand: 9h, 2c
Player 2: Chips: $990 Current Bet: $10  Hand: [Hidden]
Player 3: Chips: $980 Current Bet: $20  Hand: [Hidden]

Player 1's turn
Player 1 chooses to RAISE $927

Round Pre-flop

Community Cards: None yet
Player 0: Chips: $538 Current Bet: $462  Hand: [Hidden]
Player 1: Chips: $73 Current Bet: $927  Hand: [Hidden]
Player 2: Chips: $990 Current Bet: $10  Hand: 3d, Qh
Player 3: 

KeyboardInterrupt: 