In [5]:
import pymahjong
import numpy as np

# Heuristic bots to benchmark against # 

In [101]:
TERMINAL_HONOR_INDICES = set([
    0, 8, 9, 17, 18, 26, 27, 28, 29, 30, 31, 32, 33 # 1s, 9s, and honors
])

WIN_INDICES = set([49,50]) # ron, tsumo

# heuristic bots to test against

# THW: takes the win action if available, otherwise discards a terminal honor tile if available, otherwise picks a random action
class THW():

  def select_action(self, obs_np, valid_actions, training=False):
    win_actions = [a for a in valid_actions if a in WIN_INDICES]
    if win_actions:
        return np.random.choice(win_actions)
    terminal_honor_discards = [a for a in valid_actions if a in TERMINAL_HONOR_INDICES]

    if terminal_honor_discards:
        return np.random.choice(terminal_honor_discards)
    else:
        return np.random.choice(valid_actions)

# TH: discards a terminal honor tile if available, otherwise picks a random action
class TH():

  def select_action(self, obs_np, valid_actions):
    terminal_honor_discards = [a for a in valid_actions if a in TERMINAL_HONOR_INDICES]
    if terminal_honor_discards:
        return np.random.choice(terminal_honor_discards)
    else:
        return np.random.choice(valid_actions)

# RAND: picks a random action from the valid actions
class RAND():
  def select_action(self, obs_np, valid_actions, training = False):
    return np.random.choice(valid_actions)


In [81]:
# run a game of 10,000 rounds with the 4 given agents
def run_game(agents, num_rounds=1000):
  reward_sum = 0
  rank_sum = 0
  num_1sts = 0
  nonzero_rounds = 0
  for _ in range(num_rounds):
    env = pymahjong.MahjongEnv()
    obs = env.reset()
    while True:
        curr_pid = env.get_curr_player_id()
        valid_actions = env.get_valid_actions()  # e.g., [0, 3, 4, 20, 21]
        executor_obs = env.get_obs(curr_pid)

        a = agents[curr_pid].select_action(executor_obs, valid_actions)
        env.step(curr_pid, a)
       
        if env.is_over():
            payoffs = env.get_payoffs() # payoffs = [p0, p1, p2, p3]
            if any(p != 0 for p in payoffs):
                nonzero_rounds += 1
                rank = 1 + sum(payoffs[0] < p for p in payoffs[1:])
                if rank == 1:
                    num_1sts += 1
                rank_sum += rank
            reward_sum += payoffs[0]
            break
  print("total payoff = {} after {} rounds".format(reward_sum, num_rounds))
  print("average payoff = {}".format(reward_sum / num_rounds))
  print("average placement = {}".format(rank_sum / nonzero_rounds))
  print("num 1st place = {} / {}".format(num_1sts, nonzero_rounds))

In [74]:
print("terminal / honor discard agent")
run_game(agents=[TH(), RAND(), RAND(), RAND()], num_rounds=1000)
print("terminal / honor / win discard agent")
run_game(agents=[THW(), RAND(), RAND(), RAND()], num_rounds=1000)

terminal / honor discard agent
total payoff = 343700.0 after 1000 rounds
average payoff = 343.7
average placement = 1.4107142857142858
num 1st place = 145 / 224
terminal / honor / win discard agent
total payoff = 417000.0 after 1000 rounds
average payoff = 417.0
average placement = 1.5056179775280898
num 1st place = 153 / 267


# v1. Trying a CNN #

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# cnn archtecture v1
class MahjongCNNPolicy(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 93 * 34, 512)
        self.fc2 = nn.Linear(512, 54)  # 54 possible actions

    def forward(self, x): 
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)  # logits
    
# cnn architecture v2
class MahjongCNNPolicy_v2(nn.Module):
    def __init__(self, kernel_size=3):
        super().__init__()
        padding = 0  

        self.conv1 = nn.Conv2d(1, 64, kernel_size=kernel_size, padding=padding)
        self.bn1 = nn.BatchNorm2d(64)
        self.drop1 = nn.Dropout(0.5)

        self.conv2 = nn.Conv2d(64, 64, kernel_size=kernel_size, padding=padding)
        self.bn2 = nn.BatchNorm2d(64)
        self.drop2 = nn.Dropout(0.5)

        self.conv3 = nn.Conv2d(64, 64, kernel_size=kernel_size, padding=padding)
        self.bn3 = nn.BatchNorm2d(64)
        self.drop3 = nn.Dropout(0.5)

        self.conv4 = nn.Conv2d(64, 32, kernel_size=kernel_size, padding=padding)
        self.bn4 = nn.BatchNorm2d(32)
        self.drop4 = nn.Dropout(0.5)

        # We'll infer the flatten size dynamically
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(32 * self._calc_flatten_size(kernel_size), 256)
        # self.bn_fc1 = nn.BatchNorm1d(256)
        self.bn_fc1 = nn.LayerNorm(256)
        self.drop_fc1 = nn.Dropout(0.5)

        self.out = nn.Linear(256, 54)  # Output: 54 logits (actions)
    
    def _calc_flatten_size(self, k):
    # Starting from H=93, W=34 and padding=0 each time
        h, w = 93, 34
        for _ in range(4):
            h = h - (k - 1)
            w = w - (k - 1)
        return h * w

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.drop1(x)

        x = F.relu(self.bn2(self.conv2(x)))
        x = self.drop2(x)

        x = F.relu(self.bn3(self.conv3(x)))
        x = self.drop3(x)

        x = F.relu(self.bn4(self.conv4(x)))
        x = self.drop4(x)

        x = self.flatten(x)
        x = self.drop_fc1(F.relu(self.bn_fc1(self.fc1(x))))
        return self.out(x)  # logits (no softmax for RL)

In [76]:
class CNNAgent:
    def __init__(self, model, device="cpu"):
        self.model = model.to(device)
        self.device = device

    def select_action(self, obs_np, valid_actions, training=False):
        obs_tensor = torch.tensor(obs_np, dtype=torch.float32, device=self.device).unsqueeze(0).unsqueeze(0)

        if not training:
          with torch.no_grad():
            logits = self.model(obs_tensor)[0]
        else:
            logits = self.model(obs_tensor)[0] 

        logits = logits[0]

        # Mask invalid actions
        mask = torch.full((54,), float('-inf'), device=self.device)
        mask[valid_actions] = 0  # allow valid actions only

        masked_logits = logits + mask
        probs = F.softmax(masked_logits, dim=0)

        dist = torch.distributions.Categorical(probs)
        action = dist.sample()

        if training:
            return action.item(), dist.log_prob(action)
        else:
            return action.cpu().item()

In [None]:
# hyperameters: 
# num_games 
# learning_rate
# batch_size
def train_cnn_agent_w_randoms(model = MahjongCNNPolicy(), num_games=10000, learning_rate=1e-5, save_as="trained_cnn_model.pth"):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    agent = CNNAgent(model, device=device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for episode in range(num_games):
        env = pymahjong.MahjongEnv()
        obs = env.reset()

        log_probs = []

        while True:
            pid = env.get_curr_player_id()
            valid_actions = env.get_valid_actions()
            executor_obs = env.get_obs(pid)
            if pid == 0: #cnn agent
                a, log_prob = agent.select_action(executor_obs, valid_actions, training=True)
                # print(f"cnn turn action: {a}")
                log_probs.append(log_prob)
            else: # 3 other random agents
                a = np.random.choice(valid_actions)

            env.step(pid, a)

            if env.is_over():
              payoffs = env.get_payoffs() # payoffs = [p0, p1, p2, p3]
              break

        reward = payoffs[0]

        # Training step (REINFORCE)
        optimizer.zero_grad()
        loss = 0
        for log_prob in log_probs:
            loss += -log_prob * reward
        loss.backward()
        optimizer.step()

        if episode % 100 == 0:
            print(f"Episode [{episode}] - log prob[0]: {log_probs[0].item()} reward: {reward}, loss: {loss.item()}")

        # Save the model
    torch.save(model.state_dict(), save_as)
    print("Training complete and model saved.")

    return model

In [102]:
model = train_cnn_agent_w_randoms(model = MahjongCNNPolicy(), num_games=10000, learning_rate=1e-5, save_as="trained_cnn_rand_model.pth")

Using device: cuda
Episode [0] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [100] - log prob[0]: -2.4849066734313965 reward: -2000.0, loss: -97461.421875
Episode [200] - log prob[0]: -2.5649495124816895 reward: 0.0, loss: 0.0
Episode [300] - log prob[0]: -2.5649495124816895 reward: 0.0, loss: 0.0
Episode [400] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [500] - log prob[0]: -2.6390573978424072 reward: -1000.0, loss: -49978.578125
Episode [600] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [700] - log prob[0]: -2.397895336151123 reward: 0.0, loss: 0.0
Episode [800] - log prob[0]: -2.397895336151123 reward: 0.0, loss: 0.0
Episode [900] - log prob[0]: -2.397895336151123 reward: 0.0, loss: 0.0
Episode [1000] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [1100] - log prob[0]: -2.5649492740631104 reward: 0.0, loss: 0.0
Episode [1200] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [1300] - lo

KeyboardInterrupt: 

^^ Ignore the error above, we already had trained this model!!!

In [82]:
model = MahjongCNNPolicy()
model.load_state_dict(torch.load("trained_cnn_rand_model.pth", map_location="cuda"))  # or "cuda" if using GPU
model.eval()
trained_agent = CNNAgent(model, device="cuda") 
# run trained model against random agents
print("Cnn vs 3 random agents")
run_game(agents=[trained_agent, RAND(), RAND(), RAND()], num_rounds=1000)
print("Cnn vs 3 terminal honor win bots")
run_game(agents=[trained_agent, THW(), THW(), THW()], num_rounds=1000)
print("Cnn vs 3 terminal honor bots")
run_game(agents=[trained_agent, TH(), TH(), TH()], num_rounds=1000)

  model.load_state_dict(torch.load("trained_cnn_rand_model.pth", map_location="cuda"))  # or "cuda" if using GPU


Cnn vs 3 random agents
total payoff = -700.0 after 1000 rounds
average payoff = -0.7
average placement = 1.7733333333333334
num 1st place = 43 / 150
Cnn vs 3 terminal honor win bots
total payoff = -364100.0 after 1000 rounds
average payoff = -364.1
average placement = 2.363112391930836
num 1st place = 14 / 347
Cnn vs 3 terminal honor bots
total payoff = -290100.0 after 1000 rounds
average payoff = -290.1
average placement = 2.1506849315068495
num 1st place = 23 / 365


# Compare the bots moves to moves made by above-average human players #

In [64]:
# get offline data
import scipy.io as sio
def convert_actions(actions_arr): # for newer support for red dora
    for i in range(len(actions_arr)):
        action = actions_arr[i]
        if 34 <= action <= 36:
            actions_arr[i] = action + 3 # 34, 35, 36 -> 37, 38, 39
        elif action == 37: # pon
            actions_arr[i] = 43
        elif 38 <= action <= 46:
            actions_arr[i] = action + 7 
    return actions_arr


def get_human_accuracy(agent, offline_data_file = "", num_steps=500000):
    data = sio.loadmat(offline_data_file)
    num_correct, num_total, num_discards = 0, 0, 0

    for i in range(num_steps):
        
        if data["A"][0][i] == 255:
            continue
        num_total += 1
        obs = data["X"][i]
        assert obs.shape == (93, 34)
        valid_actions = data["M"][i]
        assert len(valid_actions) == 47
        valid_actions = [i for i in range(47) if valid_actions[i] == 1]
        action = [data["A"][0][i]]
        
        valid_actions = convert_actions(valid_actions)
        human_action = convert_actions(action)
        assert human_action[0] in valid_actions, "action not in valid actions"

        our_action = agent.select_action(obs, valid_actions, training=False)
        # print("our action: {}, human action: {}, action space: {}".format(our_action, human_action[0], len(valid_actions)))
        if our_action == human_action[0]:
            if 0 <= our_action <= 33:
                num_discards += 1
            num_correct += 1
        
        if i % 1000 == 0:
            print("step {}. num correct = {}. num discards correct = {}. num total = {}".format(i, num_correct, num_discards, num_total))
    
    print("num correct = {}. num total = {}".format(num_correct, num_total))
    print("top 1 accuracy = {}".format(num_correct / num_total))
    print("correct discards / correct total = {}".format(num_discards / num_correct))
    

In [80]:
get_human_accuracy(trained_agent, offline_data_file="pymahjong-offline-data-20M/mahjong-offline-data-batch-0.mat", num_steps=30000)

step 0. num correct = 1. num discards correct = 1. num total = 1
step 1000. num correct = 164. num discards correct = 60. num total = 942
step 3000. num correct = 509. num discards correct = 180. num total = 2820
step 4000. num correct = 672. num discards correct = 257. num total = 3760
step 5000. num correct = 852. num discards correct = 330. num total = 4700
step 6000. num correct = 1052. num discards correct = 402. num total = 5639
step 7000. num correct = 1249. num discards correct = 465. num total = 6580
step 8000. num correct = 1431. num discards correct = 546. num total = 7503
step 10000. num correct = 1789. num discards correct = 697. num total = 9360
step 11000. num correct = 1958. num discards correct = 755. num total = 10298
step 12000. num correct = 2137. num discards correct = 821. num total = 11233
step 13000. num correct = 2300. num discards correct = 886. num total = 12168
step 14000. num correct = 2506. num discards correct = 957. num total = 13104
step 15000. num corr

# v1.5 Self-play training # 

In [74]:
def train_cnn_agent_selfplay(num_games=10000, learning_rate=1e-5, save_as="trained_cnn_model.pth"):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    model = MahjongCNNPolicy()
    agent = CNNAgent(model, device=device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    agents = [agent for _ in range(4)]
    for episode in range(num_games):
        payoffs = [0, 0, 0, 0]
        env = pymahjong.MahjongEnv()
        obs = env.reset()
        log_probs_by_player = [[] for _ in range(4)]

        while True:
            pid = env.get_curr_player_id()
            valid_actions = env.get_valid_actions()
            executor_obs = env.get_obs(pid)
            
            a, log_prob = agents[pid].select_action(executor_obs, valid_actions, training=True)
            log_probs_by_player[pid].append(log_prob)

            env.step(pid, a)

            if env.is_over():
              payoffs = env.get_payoffs() # payoffs = [p0, p1, p2, p3]
              break


        # Training step (REINFORCE)
        optimizer.zero_grad()
        loss = 0
        for pid in range(4):
            for log_prob in log_probs_by_player[pid]:
                loss += -log_prob * payoffs[pid]  # REINFORCE
        loss.backward()
        optimizer.step()

        if episode % 100 == 0 or episode > 0:
            print(f"Ep [{episode}] loss: {loss.item():.2f} | payofsf: {payoffs}")

        # Save the model
    torch.save(model.state_dict(), save_as)
    print("Training complete and model saved ")

    return model

In [75]:
train_cnn_agent_selfplay(num_games=10000, learning_rate=1e-5, save_as="trained_cnn_selfplay_model.pth")

Using device: cuda
Ep [0] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [1] loss: 4929.67 | payofsf: [-1000.  3000. -1000. -1000.]
Ep [2] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [3] loss: -14155.23 | payofsf: [ 3000. -1000. -1000. -1000.]
Ep [4] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [5] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [6] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [7] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [8] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [9] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [10] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [11] loss: -21260.22 | payofsf: [-1000. -1000. -1000.  3000.]
Ep [12] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [13] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [14] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [15] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [16] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [17] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [18] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [19] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [20] loss: 0.00 | payofsf: [0. 0. 0. 0.]
Ep [21

MahjongCNNPolicy(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=202368, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=54, bias=True)
)

In [83]:

model = MahjongCNNPolicy()
model.load_state_dict(torch.load("trained_cnn_selfplay_model.pth", map_location="cuda"))  # or "cuda" if using GPU
model.eval()
trained_selfplay_agent = CNNAgent(model, device="cuda") 
get_human_accuracy(trained_selfplay_agent, offline_data_file="pymahjong-offline-data-20M/mahjong-offline-data-batch-0.mat", num_steps=30000)

  model.load_state_dict(torch.load("trained_cnn_selfplay_model.pth", map_location="cuda"))  # or "cuda" if using GPU


step 0. num correct = 0. num discards correct = 0. num total = 1
step 1000. num correct = 192. num discards correct = 79. num total = 942
step 3000. num correct = 562. num discards correct = 237. num total = 2820
step 4000. num correct = 736. num discards correct = 310. num total = 3760
step 5000. num correct = 901. num discards correct = 371. num total = 4700
step 6000. num correct = 1102. num discards correct = 442. num total = 5639
step 7000. num correct = 1298. num discards correct = 510. num total = 6580
step 8000. num correct = 1447. num discards correct = 576. num total = 7503
step 10000. num correct = 1811. num discards correct = 722. num total = 9360
step 11000. num correct = 1989. num discards correct = 790. num total = 10298
step 12000. num correct = 2165. num discards correct = 853. num total = 11233
step 13000. num correct = 2351. num discards correct = 935. num total = 12168
step 14000. num correct = 2531. num discards correct = 997. num total = 13104
step 15000. num corr

In [84]:
run_game(agents=[trained_selfplay_agent, THW(), THW(), THW()], num_rounds=1000)

total payoff = -336400.0 after 1000 rounds
average payoff = -336.4
average placement = 2.3398533007334965
num 1st place = 27 / 409


# v2 Trying different model architecture (from Meowjong) #

In [21]:
model = MahjongCNNPolicy_v2()
train_cnn_agent_w_randoms(model = model, num_games=10000, learning_rate=1e-5, save_as="trained_cnn_rand_model_v2.pth")

Using device: cuda
Episode [0] - log prob[0]: -2.5649492740631104 reward: -1000.0, loss: -45705.66796875
Episode [1] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [2] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [3] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [4] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [5] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [6] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [7] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [8] - log prob[0]: -2.4849066734313965 reward: -1000.0, loss: -47258.2109375
Episode [9] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [10] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [11] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [12] - log prob[0]: -2.4849066734313965 reward: 0.0, loss: 0.0
Episode [13] - log prob[0]: -2.484906

MahjongCNNPolicy_v2(
  (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1))
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop1): Dropout(p=0.5, inplace=False)
  (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop2): Dropout(p=0.5, inplace=False)
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop3): Dropout(p=0.5, inplace=False)
  (conv4): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1))
  (bn4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop4): Dropout(p=0.5, inplace=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=70720, out_features=256, bias=True)
  (bn_fc1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (drop_fc1): Dropout(p=0.5, inplace=Fal

In [22]:
model = MahjongCNNPolicy_v2()
model.load_state_dict(torch.load("trained_cnn_rand_model_v2.pth", map_location="cuda"))  # or "cuda" if using GPU
model.eval()
trained_selfplay_agent = CNNAgent(model, device="cuda") 
get_human_accuracy(trained_selfplay_agent, offline_data_file="pymahjong-offline-data-20M/mahjong-offline-data-batch-0.mat", num_steps=30000)

  model.load_state_dict(torch.load("trained_cnn_rand_model_v2.pth", map_location="cuda"))  # or "cuda" if using GPU


step 0. num correct = 0. num discards correct = 0. num total = 1
step 1000. num correct = 179. num discards correct = 77. num total = 942
step 3000. num correct = 548. num discards correct = 221. num total = 2820
step 4000. num correct = 733. num discards correct = 300. num total = 3760
step 5000. num correct = 927. num discards correct = 379. num total = 4700
step 6000. num correct = 1087. num discards correct = 427. num total = 5639
step 7000. num correct = 1258. num discards correct = 484. num total = 6580
step 8000. num correct = 1430. num discards correct = 545. num total = 7503
step 10000. num correct = 1740. num discards correct = 671. num total = 9360
step 11000. num correct = 1909. num discards correct = 732. num total = 10298
step 12000. num correct = 2096. num discards correct = 799. num total = 11233
step 13000. num correct = 2260. num discards correct = 865. num total = 12168
step 14000. num correct = 2427. num discards correct = 918. num total = 13104
step 15000. num corr

In [45]:
run_game(agents=[trained_selfplay_agent, THW(), THW(), THW()], num_rounds=1000)

total payoff = -353500.0 after 1000 rounds
average payoff = -353.5
average placement = 2.2883597883597884
num 1st place = 24 / 378


# v3. Tyler's architecture (Trained 750k games) # 

In [99]:
OBS_CHANNELS = 93
HEIGHT = 34
WIDTH = 1
NUM_ACTIONS = 136
class MahjongCNN_v3(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(OBS_CHANNELS, 64, kernel_size=(3, 1), padding=0),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=(3, 1), padding=0),
            nn.ReLU(),
            nn.Flatten()
        )
        self.output_dim = self._get_output_dim()

    def _get_output_dim(self):
        dummy = torch.zeros((1, OBS_CHANNELS, HEIGHT, WIDTH))
        return self.conv(dummy).shape[1]

    def forward(self, x):
        return self.conv(x)

class DiscreteHead(nn.Module):
    def __init__(self, base, output_size):
        super().__init__()
        self.base = base
        self.head = nn.Sequential(
            nn.Linear(base.output_dim, 256),
            nn.ReLU(),
            nn.Linear(256, output_size)
        )

    def forward(self, x):
        return self.head(self.base(x))
    
def encode_game_state(obs):
    obs = np.array(obs, dtype=np.float32)
    return torch.tensor(obs).unsqueeze(0).unsqueeze(-1).to(device)
    
class TrainedBot:
    def __init__(self, pid, model):
        self.pid = pid
        self.model = model

    def select_action(self, obs, valid_actions, training = False):
        state = encode_game_state(obs)  # shape: (1, 93, 34, 1)
        logits = self.model(state).squeeze(0)
        valid_logits = logits[valid_actions]
        probs = F.softmax(valid_logits, dim=0).detach().cpu().numpy()
        index = np.random.choice(len(valid_actions), p=probs)
        return valid_actions[index]

In [95]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base = MahjongCNN_v3().to(device)
model = DiscreteHead(base, 136).to(device)
model.load_state_dict(torch.load("./checkpoints/ep_750000/action.pt", map_location=torch.device("cpu")))
model.eval()

  model.load_state_dict(torch.load("./checkpoints/ep_750000/action.pt", map_location=torch.device("cpu")))


DiscreteHead(
  (base): MahjongCNN_v3(
    (conv): Sequential(
      (0): Conv2d(93, 64, kernel_size=(3, 1), stride=(1, 1))
      (1): ReLU()
      (2): Conv2d(64, 64, kernel_size=(3, 1), stride=(1, 1))
      (3): ReLU()
      (4): Flatten(start_dim=1, end_dim=-1)
    )
  )
  (head): Sequential(
    (0): Linear(in_features=1920, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=136, bias=True)
  )
)

In [96]:
run_game(agents=[TrainedBot(0, model), RAND(), RAND(), RAND()], num_rounds=1000)

total payoff = 792200.0 after 1000 rounds
average payoff = 792.2
average placement = 1.240506329113924
num 1st place = 252 / 316


Pretty good results ^__^

In [97]:
run_game(agents=[TrainedBot(0, model), THW(), THW(), THW()], num_rounds=1000)

total payoff = 131300.0 after 1000 rounds
average payoff = 131.3
average placement = 1.9130434782608696
num 1st place = 166 / 437


In [100]:
get_human_accuracy(TrainedBot(0, model), offline_data_file="pymahjong-offline-data-20M/mahjong-offline-data-batch-0.mat", num_steps=30000)

step 0. num correct = 1. num discards correct = 1. num total = 1
step 1000. num correct = 119. num discards correct = 71. num total = 942
step 3000. num correct = 372. num discards correct = 218. num total = 2820
step 4000. num correct = 509. num discards correct = 300. num total = 3760
step 5000. num correct = 654. num discards correct = 396. num total = 4700
step 6000. num correct = 785. num discards correct = 476. num total = 5639
step 7000. num correct = 910. num discards correct = 548. num total = 6580
step 8000. num correct = 1014. num discards correct = 598. num total = 7503
step 10000. num correct = 1244. num discards correct = 709. num total = 9360
step 11000. num correct = 1345. num discards correct = 770. num total = 10298
step 12000. num correct = 1477. num discards correct = 836. num total = 11233
step 13000. num correct = 1585. num discards correct = 896. num total = 12168
step 14000. num correct = 1699. num discards correct = 941. num total = 13104
step 15000. num correc