In [3]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import math

# Side length of the square board; the environment allocates a BOARD_SIZE x BOARD_SIZE grid.
BOARD_SIZE = 4
# Action indices accepted by move(): 0 = up, 1 = down, 2 = left, 3 = right.
ACTIONS = [0, 1, 2, 3]  # up, down, left, right

# Game Environment
Notes:

`0` marks an empty cell.\
Every other tile is stored and displayed as $\log_2(\text{tile value})$ for readability (e.g. a `3` on the board is the 8 tile).

In [34]:
def add_tile(board):
    """Spawn one new tile on a random empty cell of ``board``.

    The board is modified in place and also returned. The new tile is a 1
    (i.e. the "2" tile in log2 encoding) with probability 0.9 and a 2
    (the "4" tile) otherwise. A board without empty cells is returned
    untouched.
    """
    free_cells = np.argwhere(board == 0).tolist()  # [[row, col], ...] in row-major order
    if len(free_cells) == 0:
        # Nothing to fill.
        return board
    row, col = random.choice(free_cells)
    spawn_big = random.random() >= 0.9
    board[row][col] = 2 if spawn_big else 1
    return board

def move_right(board):
    """Slide and merge every row of ``board`` toward column 0.

    NOTE: despite its name this function packs tiles to the LEFT (low column
    indices). ``move`` relies on exactly that behaviour and rotates/mirrors
    the board to obtain the other directions, so the name is kept for
    compatibility.

    Tiles are log2-encoded: two equal neighbours ``k, k`` merge into a single
    ``k + 1`` tile, and each tile takes part in at most one merge per call.

    Args:
        board: 2-D integer array (any number of rows/columns); not modified.

    Returns:
        (new_board, reward): a new array of the same shape and dtype, and the
        summed face value ``2 ** (k + 1)`` of all tiles created by merges, as
        a plain Python ``int``.
    """
    new_board = np.zeros_like(board)
    reward = 0
    # Iterate over the rows the board actually has instead of a global size.
    for row_idx, row in enumerate(board):
        tiles = row[row != 0]  # non-zero tiles, left to right
        merged = []
        i = 0
        while i < len(tiles):
            if i + 1 < len(tiles) and tiles[i] == tiles[i + 1]:
                merged.append(tiles[i] + 1)
                # int() keeps the reward a Python int rather than a NumPy scalar.
                reward += 2 ** (int(tiles[i]) + 1)
                i += 2  # the partner tile was consumed by this merge
            else:
                merged.append(tiles[i])
                i += 1
        new_board[row_idx, :len(merged)] = merged
    return new_board, reward

def move(board, direction):
    """Apply one 2048 move and return ``(new_board, reward)``.

    ``direction`` is 0 = up, 1 = down, 2 = left, 3 = right. Every direction is
    reduced to the single sliding primitive ``move_right`` (which packs tiles
    toward column 0) by rotating or mirroring the board, sliding, and undoing
    the transform. Any other ``direction`` raises ``ValueError``.
    """
    if direction == 2:  # left: the primitive already slides this way
        return move_right(board)

    if direction == 3:  # right: mirror, slide, mirror back
        slid, reward = move_right(np.fliplr(board))
        return np.fliplr(slid), reward

    if direction == 0:  # up
        quarter_turns = 1
    elif direction == 1:  # down
        quarter_turns = -1
    else:
        raise ValueError("Invalid direction")

    # Rotate so the target edge becomes column 0, slide, then rotate back.
    slid, reward = move_right(np.rot90(board, quarter_turns))
    return np.rot90(slid, -quarter_turns), reward

def is_game_over(board):
    """Return True when no action in ``ACTIONS`` changes ``board``.

    Each action is tried via ``move``; the check stops at the first action
    that produces a different board.
    """
    return all(
        np.array_equal(move(board, action)[0], board)
        for action in ACTIONS
    )

class Game2048Env:
    """Minimal 2048 environment with log2-encoded tiles (0 = empty cell).

    Usage: call ``reset()`` to start an episode, then ``step(action)``
    repeatedly until it reports ``done``.
    """

    # Divisor used by get_state(): log2 of the largest tile, 2**17.
    MAX_TILE_LOG2 = 17.0

    def __init__(self):
        # Start with a valid board so step()/get_state() work even if the
        # caller forgets to call reset() first.
        self.reset()

    def reset(self):
        """Start a new game with two random tiles and return the initial state."""
        self.board = np.zeros((BOARD_SIZE, BOARD_SIZE), dtype=int)
        self.board = add_tile(add_tile(self.board))
        return self.get_state()

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        Args:
            action: direction index understood by ``move`` (0-3).

        Returns:
            state: normalised flattened board (see ``get_state``).
            reward: ``1.0`` if the move itself raised the largest tile on the
                board, else ``0.0``.
            done: True when no action can change the board any more.
        """
        old_board = self.board.copy()
        old_max_tile = np.max(old_board)

        # NOTE: the merge score returned by move() is not part of the reward;
        # only growth of the largest tile is rewarded.
        new_board, _merge_score = move(old_board, action)

        # Measure the max tile BEFORE spawning, so that a randomly spawned
        # tile can never be credited to the agent's action.
        new_max_tile = np.max(new_board)

        changed = not np.array_equal(new_board, old_board)
        if changed:  # only add a tile if the board changed
            new_board = add_tile(new_board)
        self.board = new_board

        reward = float(new_max_tile > old_max_tile)
        done = is_game_over(self.board)
        return self.get_state(), reward, done

    def get_state(self):
        """Return the board flattened to 1-D and divided by ``MAX_TILE_LOG2``."""
        return self.board.flatten() / self.MAX_TILE_LOG2

# Environment Testing
An interactive, keyboard-driven play loop for manually debugging the environment.

In [None]:
# Manual smoke test: play the environment from the keyboard.
env = Game2048Env()
state = env.reset()
print("Initial board:")
print(env.board)

# Key -> action index, in the same order as ACTIONS (up, down, left, right).
# Built once instead of on every loop iteration.
move_map = {'w': 0, 's': 1, 'a': 2, 'd': 3}

done = False
total_reward = 0

while not done:
    print("\nCurrent board:")
    print(env.board)

    move_str = input("Enter move (w=up, s=down, a=left, d=right, q=quit): ")
    if move_str == 'q':
        break
    if move_str not in move_map:
        print("Invalid input.")
        continue

    action = move_map[move_str]
    state, reward, done = env.step(action)
    new_max = np.max(env.board)  # log2 of the largest tile on the board

    print(f"Action: {move_str.upper()} | Reward: {reward:.2f} | Max tile: {2 ** new_max}")
    total_reward += reward

# Reached both when the game ends and when the user quits with 'q'.
print("\nGame Over.")
print("Final board:")
print(env.board)
print(f"Total reward: {total_reward:.2f}")

# DQN

In [35]:
class DQN(nn.Module):
    """Fully connected Q-network: 16 board cells in, 4 action values out.

    Architecture: 16 -> 256 -> 128 -> 64 -> 4, with a ReLU after every layer
    except the last.
    """

    def __init__(self):
        super().__init__()
        widths = (16, 256, 128, 64)  # input (flattened 4x4 board) + hidden sizes
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], 4))  # one output per action
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values produced by the layer stack for input ``x``."""
        q_values = self.model(x)
        return q_values

# Training Loop

In [37]:
# Reproducibility: the environment, action sampling and replay sampling draw
# from `random`; network weight initialisation draws from torch's RNG.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

env = Game2048Env()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The policy network is the one being trained; the target network is a
# periodically refreshed copy used to compute the bootstrap targets.
policy_net = DQN().to(device)
target_net = DQN().to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=1e-4)
memory = deque(maxlen=200000)  # replay buffer; oldest transitions are dropped first
BATCH_SIZE = 128    # batch size for training
GAMMA = 0.999    # discount factor for future rewards
EPSILON = 1.0   # exploration rate (probability of a random action)
EPSILON_DECAY = 0.9995   # multiplicative decay applied to EPSILON once per episode
EPSILON_MIN = 0.05   # minimum exploration rate
TARGET_UPDATE = 10  # sync the target network every TARGET_UPDATE episodes

def sample_action(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``EPSILON`` a uniformly random entry of ``ACTIONS`` is
    returned; otherwise the index of the largest Q-value predicted by
    ``policy_net`` for this state.
    """
    explore = random.random() < EPSILON
    if explore:
        return random.choice(ACTIONS)

    # Greedy branch: no gradients are needed for action selection.
    with torch.no_grad():
        batch = torch.FloatTensor(state).unsqueeze(0).to(device)  # add batch dim
        best_action = policy_net(batch).argmax()
        return best_action.item()

def optimize(): # training
    """Run one DQN gradient step on a random minibatch from the replay memory.

    Reads the module-level ``memory``, ``BATCH_SIZE``, ``GAMMA``,
    ``policy_net``, ``target_net``, ``optimizer`` and ``device``. Does nothing
    (returns ``None``) until the memory holds at least ``BATCH_SIZE``
    transitions of the form ``(state, action, reward, next_state, done)``.
    """
    if len(memory) < BATCH_SIZE:    # not enough samples to train
        return
    batch = random.sample(memory, BATCH_SIZE)
    states, actions, rewards, next_states, dones = zip(*batch)

    # Stack each field into one contiguous NumPy array first: building a
    # tensor directly from a tuple of ndarrays converts it element by element,
    # which is very slow and triggers a PyTorch warning.
    states = torch.as_tensor(np.asarray(states, dtype=np.float32), device=device)
    actions = torch.as_tensor(np.asarray(actions, dtype=np.int64), device=device).unsqueeze(1)
    rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=device).unsqueeze(1)
    next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32), device=device)
    dones = torch.as_tensor(np.asarray(dones, dtype=np.bool_), device=device).unsqueeze(1)

    # Q(s, a) for the actions that were actually taken.
    q_values = policy_net(states).gather(1, actions)

    # The bootstrap target is a constant w.r.t. the policy parameters, so do
    # not build an autograd graph for the target network at all.
    with torch.no_grad():
        next_q_values = target_net(next_states).max(1, keepdim=True)[0]
        # (~dones) zeroes the bootstrap term for terminal transitions.
        target_q = rewards + GAMMA * next_q_values * (~dones)

    loss = nn.functional.mse_loss(q_values, target_q)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Main Training

In [39]:
# Main training loop: one iteration of the outer loop is one episode (game).
for i in range(10000):
    state = env.reset() # start a fresh game
    total_reward = 0
    for t in range(1000):   # cap each episode at 1000 steps
        action = sample_action(state)
        next_state, reward, done = env.step(action)
        # Store the transition in the replay memory, then train on a random minibatch.
        memory.append((state, action, reward, next_state, done))
        state = next_state
        total_reward += reward
        optimize()
        if done:
            break
    biggest_tile = np.max(env.board)    # log2-encoded, i.e. the exponent of the largest tile
    # Decay exploration once per episode; sample_action reads this module-level value.
    EPSILON = max(EPSILON_MIN, EPSILON * EPSILON_DECAY)
    if i % TARGET_UPDATE == 0:  # sync the target network with the current policy weights
        target_net.load_state_dict(policy_net.state_dict())
    if i % 10 == 0: # log progress every 10 episodes
        print(f"Episode {i}, Total reward: {total_reward:.2f}, Biggest tile: {biggest_tile}, Epsilon: {EPSILON:.2f}")

Episode 0, Total reward: 5.00, Biggest tile: 7, Epsilon: 0.97
Episode 10, Total reward: 6.00, Biggest tile: 7, Epsilon: 0.96
Episode 20, Total reward: 4.00, Biggest tile: 6, Epsilon: 0.96
Episode 30, Total reward: 7.00, Biggest tile: 8, Epsilon: 0.95
Episode 40, Total reward: 5.00, Biggest tile: 6, Epsilon: 0.95
Episode 50, Total reward: 5.00, Biggest tile: 6, Epsilon: 0.94
Episode 60, Total reward: 5.00, Biggest tile: 6, Epsilon: 0.94
Episode 70, Total reward: 5.00, Biggest tile: 6, Epsilon: 0.93
Episode 80, Total reward: 6.00, Biggest tile: 7, Epsilon: 0.93
Episode 90, Total reward: 7.00, Biggest tile: 8, Epsilon: 0.92
Episode 100, Total reward: 4.00, Biggest tile: 6, Epsilon: 0.92
Episode 110, Total reward: 6.00, Biggest tile: 7, Epsilon: 0.91
Episode 120, Total reward: 5.00, Biggest tile: 7, Epsilon: 0.91
Episode 130, Total reward: 6.00, Biggest tile: 7, Epsilon: 0.90
Episode 140, Total reward: 5.00, Biggest tile: 6, Epsilon: 0.90
Episode 150, Total reward: 4.00, Biggest tile: 6, E

# Export to ONNX

In [42]:
import onnx

# Export the trained policy network to ONNX, traced with a placeholder batch
# of one flattened 4x4 board.
dummy_input = torch.randn(1, 16).to(device)
torch.onnx.export(
    policy_net,
    dummy_input,
    "2048_ai.onnx",
    input_names=["input"],
    output_names=["output"],
)

# Convert ONNX to Tensorflow

The package versions must match exactly, otherwise the conversion fails.

make a new virtual env with the following:\
`python3.10 -m venv tfenv`

then install these versions:\
`pip install tensorflow==2.13.0 keras==2.13.1`\
`pip install onnx==1.14.0 onnx-tf==1.10.0 protobuf==3.20.3`\
`pip install tensorflow-probability==0.20.0`

Then run this shell command (the input must be the file written by the export cell above, `2048_ai.onnx`):\
`onnx-tf convert -i 2048_ai.onnx -o 2048_rl_tf`