In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam
from einops import rearrange
import numpy as np
import math

# Easy Mahjong scratch work
- only balls and sticks
- only numbers

In [2]:
tiles_list = [f"{n}{t}" for t in ["b","s"] for n in range(1,10)]
tiles_list

['1b',
 '2b',
 '3b',
 '4b',
 '5b',
 '6b',
 '7b',
 '8b',
 '9b',
 '1s',
 '2s',
 '3s',
 '4s',
 '5s',
 '6s',
 '7s',
 '8s',
 '9s']

In [3]:
tiles_to_id = {t: id for id, t in enumerate(tiles_list)}
tiles_to_id

{'1b': 0,
 '2b': 1,
 '3b': 2,
 '4b': 3,
 '5b': 4,
 '6b': 5,
 '7b': 6,
 '8b': 7,
 '9b': 8,
 '1s': 9,
 '2s': 10,
 '3s': 11,
 '4s': 12,
 '5s': 13,
 '6s': 14,
 '7s': 15,
 '8s': 16,
 '9s': 17}

In [12]:
vocabulary_size = len(tiles_list)
hp = {
    'vocabulary_size': vocabulary_size,
    'embedding_size': vocabulary_size-int(math.log10(vocabulary_size)), # unique 100% orthogonal directions, we don't get benefit of mostly orthogonal until 100+ dims
}
hp

{'vocabulary_size': 18, 'embedding_size': 17}

In [10]:
torch.log10(torch.tensor(10))

tensor(1.)

In [165]:
class PinoyMahjongNet(nn.Module):
    def __init__(self,hp):
        super().__init__()
        self.embedding_size = embedding_size = hp['embedding_size']
        self.vocabulary_size = vocabulary_size = hp['vocabulary_size']
        self.embeddings = nn.Embedding(vocabulary_size, embedding_size)
        self.tokenization = nn.Linear(embedding_size,vocabulary_size,bias=False)
        # consider adding dropout later, but want simple 2 hidden layer network
        self.linear = nn.Sequential(
            # input:
            # - hand embedding (embedding_size)
            # - discard tile embedding (embedding_size)
            nn.Linear(embedding_size*2, embedding_size*2),
            nn.ReLU(),
            nn.Linear(embedding_size*2,embedding_size*2),
            nn.ReLU(),
            nn.Linear(embedding_size*2,embedding_size*2),
            nn.ReLU(),
            # output:
            # - logits all tiles discard (embedding_size)
            # - logits draw, pickup (2)
            nn.Linear(embedding_size*2,embedding_size+2), # not a typo
        )

    def forward(self,hand_ids,top_discard_id):
        assert hand_ids.dim() == 2
        # assume embedding dim is high enough (and orthogonal enough) so that adding embeddings together can represent a hand
        hand_e = self.embeddings(hand_ids).sum(dim=1)
        # I'm doing adding since there isn't anything unique about position in the hand
        x = torch.cat([hand_e,self.embeddings(top_discard_id)],dim=-1)
        out = self.linear(x)
        discard_e, draw_pickup_logits = torch.split(out,[self.embedding_size,2],dim=-1)
        return self.tokenization(discard_e), draw_pickup_logits

# challenge:
# - we need to get probability via softmax
#   - softmax will be across logits for LEGAL moves/actions only (requires filtering of LEGAL logits (for LEGAL actions) somewhere/somehow)
#   - then we extract the probability for the action we ACTUALLY took (need to record this and pass through to here)
# - is it possible to do this with pure torch/tensors of regular action sequence lengths across all batches? (ie not varying action sequences for different batches?)
# - also need advantage computation

# simplify re-think
# - just have player_agent store log_probs for it's actions so we don't need to figure this out later
def grpo_loss(per_player_reward, per_player_log_probs):
    # because of game symmetry, each win is always 3, and each loss is -1, no need to normalize, as long as net reward is always 0
    loss = -per_player_reward['n']*torch.hstack(per_player_log_probs['n']).sum()
    for player in ['e','s','w']:
        log_probs = per_player_log_probs[player]
        if log_probs:
            loss -= per_player_reward[player]*torch.hstack(log_probs).sum()
    return loss

In [237]:
# test
vocabulary_size = 8
embedding_size = 4
hand_size = 5
batch_size = 2
hp = {
    'vocabulary_size': vocabulary_size,
    'embedding_size': embedding_size,
    'batch_size': batch_size,
}
net = PinoyMahjongNet(hp)

hand = torch.randint(vocabulary_size,(batch_size, hand_size))
top_discard = torch.randint(vocabulary_size,(batch_size,))
hand, top_discard
discard_tile, draw_pickup = net(hand,top_discard)
discard_tile, draw_pickup

(tensor([[ 0.1161,  0.0445, -0.0216,  0.1236, -0.0127,  0.0729,  0.0948, -0.1137],
         [ 0.1101,  0.0358,  0.0086,  0.1218, -0.0154,  0.0873,  0.0914, -0.0699]],
        grad_fn=<MmBackward0>),
 tensor([[ 0.2390, -0.0518],
         [ 0.2859, -0.0469]], grad_fn=<SplitWithSizesBackward0>))

In [234]:
from random import shuffle
directions = ['n','e','s','w']
next_direction = {'n': 'e', 'e': 's', 's': 'w', 'w': 'n'}
def random_init(tiles):
    deck = tiles[:]*4
    shuffle(deck)
    # need a per player display (showing chow tiles), having upper case directions represent that
    return {'deck': deck, 'discard': [], 'n': [], 'e': [], 's': [], 'w': [], 'N': [], 'E': [], 'S': [], 'W': []}

def deal(state):
    deck = state['deck']
    for player in directions:
        for i in range(4):
            state[player].append(deck.pop())
    return state

def first_draw(state):
    deck = state['deck']
    state['n'].append(deck.pop())
    state['turn'] = 'n'
    return state

def parse_tile(tile):
    num_s, suit = tile
    num = int(num_s)
    return num, suit

# mahjong is just chow + eyes
def is_hand_mahjong(hand):
    # check pair
    counts = {}
    for tile in hand:
        counts[tile] = counts.get(tile,0)+1
    paired_tiles = set()
    for tile, count in counts.items():
        if count >= 2:
            paired_tiles.add(tile)
    if not paired_tiles:
        return False
    for paired_tile in paired_tiles:
        size = 2
        tmp_hand = []
        for tile in hand:
            if size and paired_tile == tile:
                size -= 1
                continue
            tmp_hand.append(tile)
        piles = {'b': [], 's': [], 'c': []}
        for tile in tmp_hand:
            num, suit = parse_tile(tile)
            piles[suit].append(num)
        for suit, pile in piles.items():
            if len(pile) < 3:
                continue
            pile.sort()
            a, b, c = pile
            if a+2 == b+1 == c:
                return True
    return False
    
def smallest_chow(hand, pickup):
    if len(hand) < 2:
        return None
    matching_suit = []
    d_num, d_suit = parse_tile(pickup)
    for tile in hand:
        num, suit = parse_tile(tile)
        if suit == d_suit:
            matching_suit.append(num)
    matching_suit.sort()
    for i in range(len(matching_suit)-1):
        t1 = matching_suit[i]
        t2 = matching_suit[i+1]
        if d_num+2 == t1+1 == t2: # pickup starts chow
            return [str(n)+d_suit for n in [d_num, t1, t2]]
        if t1+2 == t2+1 == d_num: # pickup ends chow
            return [str(n)+d_suit for n in [t1, t2, d_num]]
    return None

# either a draw or a pickup
# what do we do if we run out of tiles? maybe handle this as part of orchestrator
def action1(state,choice):
    turn = state['turn']
    hand = state[turn]
    display = state[turn.upper()]
    if not state['discard'] or choice == 'draw':
        hand.append(state['deck'].pop())
    else:
        top_discard = state['discard'][-1]
        if is_hand_mahjong(hand+state['discard'][-1:]+display):
            hand.append(state['discard'].pop())
            return state
        s_chow = smallest_chow(hand,top_discard)
        if s_chow:
            state[turn.upper()] += s_chow
            s_chow.remove(top_discard)
            for tile in s_chow:
                hand.remove(tile)
            state['discard'].pop()
        else:
            assert False
    return state

# discard from hand
def action2(state,choice):
    turn = state['turn']
    assert choice in state[turn]
    state[turn].remove(choice)
    state['discard'].append(choice)
    # for easy mode, we can determine next turn already, but in advanced, we have between turn actions
    state['turn'] = next_direction[turn]
    return state

# really, there should be an action 3: which chow/(pong for later) to show, cause sometimes you have a choice to pick what goes out of play
# - to keep it simple for easy mode, we should have it default to smaller chow
# - eg. 1,2, [3], 4,5  -> can have 1,2,3 or 3,4,5


# player must draw
def force_action1(state):
    turn = state['turn']
    hand = state[turn]
    display = state[turn.upper()]
    deck = state['deck']
    assert len(hand)+len(display) == 4
    hand.append(deck.pop())
    return state
    
def find_winner(state):
    for player in directions:
        hand = state[player]
        if is_hand_mahjong(hand):
            return player, hand
    return False

# during action1, is it legal to pickup?
# pickup can be from chow, or winning
def can_pickup(state):
    turn = state['turn']
    hand = state[turn]
    display = state[turn.upper()]
    assert len(hand)+len(display) == 4
    top_discard_tile = state['discard'][-1]
    if is_hand_mahjong(hand+display+[top_discard_tile]):
        return True
    # is chow
    d_num, d_suit = parse_tile(top_discard_tile)
    matching_tiles = []
    for tile in hand:
        num, suit = parse_tile(tile)
        if suit == d_suit:
            matching_tiles.append(num)
    unique_matching_tiles = list(set(matching_tiles))
    unique_matching_tiles.sort()
    for i in range(len(unique_matching_tiles)-1):
        t1 = unique_matching_tiles[i]
        t2 = unique_matching_tiles[i+1]
        if t1+1 != t2:
            continue
        if d_num+1 == t1 or d_num-1 == t2:
            return True
    return False
        
    

# move game forward, declaring what's next step if any (is game done?)
def narrator(state):
    is_winner = find_winner(state)
    turn = state['turn']
    if is_winner:
        return {'game_over': True, 'winning_player': is_winner[0], 'winning_hand': is_winner[1], 'message': f"game over: winner {is_winner[0]}", 'turn': turn}
    if not state['deck']:
        return {'game_over': True, 'winning_player': 'draw', 'message': "game is draw, no tiles left", 'turn': turn}
    hand = state[turn]
    display = state[turn.upper()]
    if len(hand)+len(display) == 4:
        if can_pickup(state):
            return {'game_over': False, 'message': f"awaiting player {turn} to draw or pickup", 'turn': turn, 'await_action': 1}
        else:
            return {'game_over': False, 'message': f"player {turn} must draw", 'turn': turn, 'force_action': 1}
    if len(hand)+len(display) == 5:
        return {'game_over': False, 'message': f"awaiting player {turn} to discard", 'turn': turn, 'await_action': 2}

import random
class SimpleMahjongAgent:
    def __init__(self):
        pass
    # given state (extract top_discard_tile, hand) -> draw or pickup aka chow
    def a1(self,state):
        # always pickup
        return 'pickup'
    def a2(self,state):
        # discard random tile
        turn = state['turn']
        return random.choice(state[turn])

class DumbMahjongAgent:
    def __init__(self):
        pass
    # given state (extract top_discard_tile, hand) -> draw or pickup aka chow
    def a1(self,state):
        # always draw
        return 'draw'
    def a2(self,state):
        # discard random tile
        turn = state['turn']
        return random.choice(state[turn])

#    def forward(self,hand_ids,top_discard_id,mask=None):
#        assert hand_ids.dim() == 2
#        hand_e = self.embeddings(hand_ids).sum(dim=1) # assume embedding dim is high enough (and orthogonal enough) so that adding embeddings together can represent a hand
#        # I'm doing adding since there isn't anything unique about position in the hand
#        x = torch.cat([hand_e,self.embeddings(top_discard_id)],dim=-1)
#        out = self.linear(x)
#        discard_e, draw_pickup_logits = torch.split(out,[self.embedding_size,2],dim=-1)
#        return self.tokenization(discard), draw_pickup

# per game agent
draw_pickup_choice = ['draw','pickup']
class NetMahjongAgentInference:
    def __init__(self,model):
        self.model = model
    def a1(self,state):
        turn = state['turn']
        hand = state[turn]
        display = state[turn.upper()]
        top_discard = state['discard'][-1]
        hand_ids = [tiles_to_id[tile] for tile in hand+display]
        top_discard_id = tiles_to_id[top_discard]
        _discard_id_logits, draw_pickup_logits = self.model(torch.tensor([hand_ids]),torch.tensor([top_discard_id]))
        # might be the case that can't use torch to do batching cause each trajectory will be different and have interleaved python cpu logic
        # or maybe it is possible, even if different actions via different masking/gathering
        dist = torch.distributions.Categorical(logits=draw_pickup_logits[0])
        choice = dist.sample()
        return draw_pickup_choice[choice]
    def a2(self,state):
        turn = state['turn']
        hand = state[turn]
        display = state[turn.upper()]
        top_discard = state['discard'][-1] if state['discard'] else hand[0] # don't know how to encode dummy value to represent empty yet, just picking first tile for now
        hand_ids = [tiles_to_id[tile] for tile in hand]
        all_ids = [tiles_to_id[tile] for tile in hand+display]
        top_discard_id = tiles_to_id[top_discard]
        discard_id_logits, _draw_pickup_logits = self.model(torch.tensor([all_ids]),torch.tensor([top_discard_id]))
        unique_hand_ids = list(set(hand_ids))
        unique_hand_ids.sort()
        # sample only from your hand, not just any possible tile
        legal_discard_id_logits = discard_id_logits.gather(dim=1,index=torch.tensor(unique_hand_ids).unsqueeze(0))
        dist = torch.distributions.Categorical(logits=legal_discard_id_logits[0])
        unique_index = dist.sample()
        tile_id = unique_hand_ids[unique_index]
        return tiles_list[tile_id]

class NetMahjongAgentTrainingV1:
    def __init__(self,model):
        self.model = model
        self.per_player_logits_and_actions = {'n': [], 'e': [], 's': [], 'w': []}
    def a1(self,state):
        turn = state['turn']
        hand = state[turn]
        top_discard = state['discard'][-1]
        hand_ids = [tiles_to_id[tile] for tile in hand]
        top_discard_id = tiles_to_id[top_discard]
        _discard_id_logits, draw_pickup_logits = self.model(torch.tensor([hand_ids]),torch.tensor([top_discard_id]))
        # might be the case that can't use torch to do batching cause each trajectory will be different and have interleaved python cpu logic
        # or maybe it is possible, even if different actions via different masking/gathering
        dist = torch.distributions.Categorical(logits=draw_pickup_logits[0])
        choice = dist.sample()
        self.per_player_logits_and_actions[turn].append((draw_pickup_logits[0,:],choice))
        return draw_pickup_choice[choice]
    def a2(self,state):
        turn = state['turn']
        hand = state[turn]
        top_discard = state['discard'][-1] if state['discard'] else hand[0] # don't know how to encode dummy value to represent empty yet, just picking first tile for now
        hand_ids = [tiles_to_id[tile] for tile in hand]
        top_discard_id = tiles_to_id[top_discard]
        discard_id_logits, _draw_pickup_logits = self.model(torch.tensor([hand_ids]),torch.tensor([top_discard_id]))
        unique_hand_ids = list(set(hand_ids))
        unique_hand_ids.sort()
        # sample only from your hand, not just any possible tile
        legal_discard_id_logits = discard_id_logits.gather(dim=1,index=torch.tensor(unique_hand_ids).unsqueeze(0))
        dist = torch.distributions.Categorical(logits=legal_discard_id_logits[0])
        unique_index = dist.sample()
        tile_id = unique_hand_ids[unique_index]
        return tiles_list[tile_id]
# Thinking its easier to just accumulate the probabilities here and just have a training version of the agent and an inference version
class NetMahjongAgentTrainingV2:
    def __init__(self,model):
        self.model = model
        self.per_player_log_probs = {'n': [], 'e': [], 's': [], 'w': []}
    def a1(self,state):
        turn = state['turn']
        hand = state[turn]
        display = state[turn.upper()]
        top_discard = state['discard'][-1]
        #hand_ids = [tiles_to_id[tile] for tile in hand]
        all_ids = [tiles_to_id[tile] for tile in hand+display]
        top_discard_id = tiles_to_id[top_discard]
        _discard_id_logits, draw_pickup_logits = self.model(torch.tensor([all_ids]),torch.tensor([top_discard_id]))
        # might be the case that can't use torch to do batching cause each trajectory will be different and have interleaved python cpu logic
        # or maybe it is possible, even if different actions via different masking/gathering
        dist = torch.distributions.Categorical(logits=draw_pickup_logits[0])
        choice = dist.sample()
        log_probs = F.log_softmax(draw_pickup_logits[0],dim=-1)
        self.per_player_log_probs[turn].append(log_probs[choice])
        return draw_pickup_choice[choice]
    def a2(self,state):
        turn = state['turn']
        hand = state[turn]
        display = state[turn.upper()]
        top_discard = state['discard'][-1] if state['discard'] else hand[0] # don't know how to encode dummy value to represent empty yet, just picking first tile for now
        hand_ids = [tiles_to_id[tile] for tile in hand]
        all_ids = [tiles_to_id[tile] for tile in hand+display]
        top_discard_id = tiles_to_id[top_discard]
        discard_id_logits, _draw_pickup_logits = self.model(torch.tensor([all_ids]),torch.tensor([top_discard_id]))
        unique_hand_ids = list(set(hand_ids))
        unique_hand_ids.sort()
        # sample only from your hand, not just any possible tile
        legal_discard_id_logits = discard_id_logits.gather(dim=1,index=torch.tensor(unique_hand_ids).unsqueeze(0))
        dist = torch.distributions.Categorical(logits=legal_discard_id_logits[0])
        unique_index = dist.sample()
        tile_id = unique_hand_ids[unique_index]
        log_probs = F.log_softmax(legal_discard_id_logits,dim=-1)[0]
        self.per_player_log_probs[turn].append(log_probs[unique_index])
        return tiles_list[tile_id]

# sets up game
# checks with narrator
# gets actions from agent
def game_manager(player_agent,game_statistics=None,training=True):
    #print('start')
    s0 = first_draw(deal(random_init(tiles_list)))
    # ensure we didn't just get a win without moving
    while find_winner(s0):
        s0 = first_draw(deal(random_init(tiles_list)))
    state = s0
    narration = narrator(state)
    i = 0
    per_player_reward = {'n': -1.0, 'e': -1.0, 's': -1.0, 'w': -1.0}
    while not narration['game_over']:
        turn = state['turn']
        #print(i,"'"+narration['message']+"'",turn,state[turn],state['discard'][-1:])
        i += 1
        force_action = narration.get('force_action',False)
        if force_action:
            if force_action == 1:
                force_action1(state)
            narration = narrator(state)
            continue
        await_action = narration['await_action']
        if await_action == 1:
            choice = player_agent.a1(state)
            action1(state,choice)
        elif await_action == 2:
            choice = player_agent.a2(state)
            action2(state,choice)
        else:
            assert False
        narration = narrator(state)
    #print(i,narration,turn,state[turn])
    if training:
        winning_player = narration['winning_player']
        if winning_player != 'draw':
            per_player_reward[winning_player] = 3.0
        else:
            game_statistics['game_lengths_draws'].append(i)
        game_statistics['game_lengths_all'].append(i)
        return {'per_player_reward': per_player_reward, 'per_player_log_probs': player_agent.per_player_log_probs}

def multi_agent_game_manager(player_agent,opponent_agent,game_statistics=None,training=True):
    #print('start')
    s0 = first_draw(deal(random_init(tiles_list)))
    # ensure we didn't just get a win without moving
    while find_winner(s0):
        s0 = first_draw(deal(random_init(tiles_list)))
    state = s0
    narration = narrator(state)
    i = 0
    player_direction = random.choice(['n','e','s','w'])
    player_to_agent = {'n': opponent_agent, 'e': opponent_agent, 's': opponent_agent, 'w': opponent_agent}
    player_to_agent[player_direction] = player_agent
    while not narration['game_over']:
        turn = state['turn']
        active_agent = player_to_agent[turn]
        print(i,"'"+narration['message']+"'",turn,state[turn],state['discard'][-1:])
        i += 1
        force_action = narration.get('force_action',False)
        if force_action:
            if force_action == 1:
                force_action1(state)
            narration = narrator(state)
            continue
        await_action = narration['await_action']
        if await_action == 1:
            choice = active_agent.a1(state)
            action1(state,choice)
        elif await_action == 2:
            choice = active_agent.a2(state)
            action2(state,choice)
        else:
            assert False
        narration = narrator(state)
    print(i,player_direction, narration,turn,state[turn])
    reward = -1.0
    if training:
        winning_player = narration['winning_player']
        win = 0
        if winning_player == 'draw':
            reward = -0.5
            game_statistics['game_lengths_draws'].append(i)
        elif winning_player == player_direction:
            reward = 3.0
            win = 1
        game_statistics['game_lengths_all'].append(i)
        return {'win': win, 'reward': reward, 'log_probs': player_agent.per_player_log_probs[player_direction]}

def simple_trainer(model,j=0):
    game_statistics = {'game_lengths_all': [], 'game_lengths_draws': []}

    loss = None
    optimizer = Adam(model.parameters(),lr=0.1)
    model.train()
    total_loss = None
    for i in range(100):
        no_data = True
        while no_data:
            agent = NetMahjongAgentTrainingV2(model)
            training_data = game_manager(agent,game_statistics)
            if training_data is None: # if we want to exclude this game from training, game_manager should return None
                continue
            else:
                no_data = False
            loss = grpo_loss(**training_data)
        #print(f"step {i} loss is {loss}")
        if total_loss is None:
            total_loss = loss
        else:
            total_loss += loss
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()
    game_lengths_all = game_statistics['game_lengths_all']
    game_length_avg = sum(game_lengths_all)/len(game_lengths_all)
    game_lengths_draw = game_statistics['game_lengths_draws']
    num_draws = len(game_lengths_draw)
    draw_length_avg = sum(game_lengths_draw)/num_draws
    print(f"{j} end iteration loss: {total_loss}, game_length_avg: {game_length_avg}, draw_length_avg: {draw_length_avg}, num_draws: {num_draws}/{len(game_lengths_all)}")

def multi_agent_trainer(model):
    optimizer = Adam(model.parameters(),lr=0.001)
    model.train()
    opponent = SimpleMahjongAgent()
    #opponent = DumbMahjongAgent()
    # get rollouts/episodes
    for epoch in range(100):
        game_statistics = {'game_lengths_all': [], 'game_lengths_draws': []}
        wins = 0
        rewards = []
        log_probs_totals = []
        for i in range(400):
            no_data = True
            while no_data:
                agent = NetMahjongAgentTrainingV2(model)
                training_data = multi_agent_game_manager(agent,opponent,game_statistics)
                if training_data is None: # if we want to exclude this game from training, game_manager should return None
                    continue
                else:
                    no_data = False
                wins += training_data['win']
                if training_data['log_probs']:
                    rewards.append(training_data['reward'])
                    log_probs_totals.append(torch.hstack(training_data['log_probs']).sum())
        # compute standardized advantage
        reward_tensor = torch.tensor(rewards)
        reward_mean = reward_tensor.mean()
        reward_std = reward_tensor.std()
        advantage = (reward_tensor-reward_mean)/reward_std
        log_probs_totals_tensor = torch.hstack(log_probs_totals)
        # grpo loss
        loss = -(advantage*log_probs_totals_tensor).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        game_lengths_all = game_statistics['game_lengths_all']
        game_length_avg = sum(game_lengths_all)/len(game_lengths_all)
        game_lengths_draw = game_statistics['game_lengths_draws']
        num_draws = len(game_lengths_draw)
        draw_length_avg = sum(game_lengths_draw)/num_draws
        #print(log_probs_totals_tensor)
        #print(advantage)
        print(f"{epoch} end iteration loss: {loss}, wins: {wins}, game_length_avg: {game_length_avg}, draw_length_avg: {draw_length_avg}, num_draws: {num_draws}/{len(game_lengths_all)}")


In [139]:
vocabulary_size = len(tiles_list)
embedding_size = vocabulary_size-int(math.log10(vocabulary_size))
hand_size = 5
batch_size = 1
hp = {
    'vocabulary_size': vocabulary_size,
    'embedding_size': embedding_size,
    'batch_size': batch_size,
}
print(hp)
model = PinoyMahjongNet(hp)
model

{'vocabulary_size': 18, 'embedding_size': 17, 'batch_size': 1}


PinoyMahjongNet(
  (embeddings): Embedding(18, 17)
  (tokenization): Linear(in_features=17, out_features=18, bias=False)
  (linear): Sequential(
    (0): Linear(in_features=34, out_features=34, bias=True)
    (1): ReLU()
    (2): Linear(in_features=34, out_features=34, bias=True)
    (3): ReLU()
    (4): Linear(in_features=34, out_features=34, bias=True)
    (5): ReLU()
    (6): Linear(in_features=34, out_features=19, bias=True)
  )
)

In [220]:
model = PinoyMahjongNet(hp)
multi_agent_trainer(model)

0 end iteration loss: -352.9567565917969, wins: 21, game_length_avg: 90.91, draw_length_avg: 110.61764705882354, num_draws: 136/200
1 end iteration loss: -360.40057373046875, wins: 18, game_length_avg: 92.85, draw_length_avg: 110.53521126760563, num_draws: 142/200
2 end iteration loss: -152.31893920898438, wins: 12, game_length_avg: 92.24, draw_length_avg: 110.76811594202898, num_draws: 138/200
3 end iteration loss: -424.6858215332031, wins: 19, game_length_avg: 95.22, draw_length_avg: 110.79194630872483, num_draws: 149/200
4 end iteration loss: -197.89993286132812, wins: 24, game_length_avg: 92.71, draw_length_avg: 110.94202898550725, num_draws: 138/200
5 end iteration loss: -198.65623474121094, wins: 16, game_length_avg: 93.35, draw_length_avg: 110.67142857142858, num_draws: 140/200
6 end iteration loss: -251.2693328857422, wins: 18, game_length_avg: 93.24, draw_length_avg: 110.68055555555556, num_draws: 144/200
7 end iteration loss: -415.1628112792969, wins: 20, game_length_avg: 91.

In [141]:
%debug

> [0;32m/var/folders/_9/0bl8t2fd4dnfgskt322pjttc0000gn/T/ipykernel_15839/224881488.py[0m(99)[0;36maction1[0;34m()[0m
[0;32m     97 [0;31m                [0mhand[0m[0;34m.[0m[0mremove[0m[0;34m([0m[0mtile[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     98 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 99 [0;31m            [0;32massert[0m [0;32mFalse[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m    [0;32mreturn[0m [0mstate[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    101 [0;31m[0;34m[0m[0m
[0m


ipdb>  hand


['1b', '4s', '2b', '3s']


ipdb>  top_discard


'3b'


ipdb>  s_chow
ipdb>  q


In [231]:
multi_agent_trainer(model)

0 end iteration loss: -16.0616512298584, wins: 108, game_length_avg: 82.895, draw_length_avg: 114.19047619047619, num_draws: 189/400
1 end iteration loss: -48.29631805419922, wins: 89, game_length_avg: 86.88, draw_length_avg: 114.2085308056872, num_draws: 211/400
2 end iteration loss: 0.8633313179016113, wins: 111, game_length_avg: 87.575, draw_length_avg: 114.32835820895522, num_draws: 201/400
3 end iteration loss: -6.712360382080078, wins: 111, game_length_avg: 84.21, draw_length_avg: 114.17647058823529, num_draws: 204/400
4 end iteration loss: -41.66496276855469, wins: 104, game_length_avg: 88.44, draw_length_avg: 114.01941747572816, num_draws: 206/400
5 end iteration loss: -10.485408782958984, wins: 111, game_length_avg: 85.25, draw_length_avg: 114.29292929292929, num_draws: 198/400
6 end iteration loss: -52.822059631347656, wins: 103, game_length_avg: 86.175, draw_length_avg: 114.34482758620689, num_draws: 203/400
7 end iteration loss: -42.11555480957031, wins: 107, game_length_av

In [150]:
%debug

> [0;32m/var/folders/_9/0bl8t2fd4dnfgskt322pjttc0000gn/T/ipykernel_15839/806113209.py[0m(99)[0;36maction1[0;34m()[0m
[0;32m     97 [0;31m                [0mhand[0m[0;34m.[0m[0mremove[0m[0;34m([0m[0mtile[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     98 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 99 [0;31m            [0;32massert[0m [0;32mFalse[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m    [0;32mreturn[0m [0mstate[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    101 [0;31m[0;34m[0m[0m
[0m


ipdb>  matching_suit


*** NameError: name 'matching_suit' is not defined


ipdb>  hand


['5b']


ipdb>  pickup


*** NameError: name 'pickup' is not defined


ipdb>  s_chow
ipdb>  choice


'pickup'


ipdb>  top_discard


'5b'


ipdb>  state


{'deck': ['1s', '4b', '6s', '8s', '2b', '5b', '6b', '2b', '1s', '5s', '9s', '6b', '7s', '9b', '7s', '3s', '4b', '2b', '5b', '5s', '5s', '1b', '2s', '8b', '4s', '2b', '9b', '2s', '2s', '3b', '4b', '9s', '4b', '7s', '7s', '7b', '6s', '1b', '7b', '9s', '9b', '1s', '1b', '8b', '8s', '4s', '1s', '8b', '8s', '6s', '6b', '9s', '6s'], 'discard': ['6b', '3s'], 'n': ['7b', '3b', '3b', '3b'], 'e': ['5b'], 's': ['7b', '9b', '8s', '1b'], 'w': ['8b'], 'N': [], 'E': ['3s', '4s', '5s'], 'S': [], 'W': ['2s', '3s', '4s'], 'turn': 'e'}


ipdb>  narration


*** NameError: name 'narration' is not defined


ipdb>  q


In [233]:
# save model
torch.save(model.state_dict(), 'prototype_18t_v0.pth')  


In [236]:
game_statistics = {'game_lengths_all': [], 'game_lengths_draws': []}
multi_agent_game_manager(NetMahjongAgentInference(model),SimpleMahjongAgent(),game_statistics,False)

0 'awaiting player n to discard' n ['1b', '7b', '6b', '6s', '9b'] []
1 'player e must draw' e ['6b', '3s', '2b', '3s'] ['1b']
2 'awaiting player e to discard' e ['6b', '3s', '2b', '3s', '9b'] ['1b']
3 'player s must draw' s ['5b', '9s', '6s', '7s'] ['9b']
4 'awaiting player s to discard' s ['5b', '9s', '6s', '7s', '5s'] ['9b']
5 'player w must draw' w ['7s', '2b', '8b', '3b'] ['5s']
6 'awaiting player w to discard' w ['7s', '2b', '8b', '3b', '7s'] ['5s']
7 'player n must draw' n ['7b', '6b', '6s', '9b'] ['3b']
8 'awaiting player n to discard' n ['7b', '6b', '6s', '9b', '9b'] ['3b']
9 'player e must draw' e ['6b', '3s', '2b', '3s'] ['9b']
10 'awaiting player e to discard' e ['6b', '3s', '2b', '3s', '5b'] ['9b']
11 'player s must draw' s ['5b', '9s', '6s', '7s'] ['5b']
12 'awaiting player s to discard' s ['5b', '9s', '6s', '7s', '8b'] ['5b']
13 'player w must draw' w ['7s', '2b', '8b', '7s'] ['5b']
14 'awaiting player w to discard' w ['7s', '2b', '8b', '7s', '5s'] ['5b']
15 'player n mus

In [123]:
narrator(e1)

{'game_over': False,
 'message': 'awaiting player n to discard',
 'turn': 'n',
 'await_action': 2}

In [124]:
e2 = action2(e1,'1s')
e2

{'deck': ['4b',
  '3b',
  '4s',
  '5b',
  '2b',
  '8s',
  '3s',
  '4b',
  '8b',
  '7s',
  '6b',
  '4s',
  '1s',
  '3b',
  '7s',
  '4b',
  '5s',
  '1s',
  '9b',
  '3b',
  '8b',
  '5s',
  '9s',
  '3s',
  '4b',
  '2s',
  '5b',
  '6s',
  '8b',
  '8s',
  '5b',
  '4s',
  '3s',
  '9b',
  '1b',
  '8s',
  '3b',
  '1b',
  '6s',
  '6b',
  '9s',
  '2b',
  '9b',
  '7s',
  '7b',
  '2b',
  '2s',
  '1b',
  '5s',
  '2b',
  '3s',
  '2s',
  '7b',
  '6b',
  '9b'],
 'discard': ['1s'],
 'n': ['9s', '5b', '4s', '8b'],
 'e': ['1b', '6b', '6s', '1s'],
 's': ['5s', '7s', '6s', '2s'],
 'w': ['7b', '7b', '9s', '8s'],
 'turn': 'e'}

In [129]:
can_pickup(e2)

False

In [126]:
e0 = first_draw(deal(random_init(tiles_list)))
e0['deck'] = []
narrator(e0)

{'game_over': True,
 'winning_player': 'draw',
 'message': 'game is draw, no tiles left',
 'turn': 'n'}