# Imitation Learning with RL Finetuning through Self-Play

# Import and Split Training Data

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import json
import os

data_dir = "../data/"
X = []
Y = []

# Walk data_dir/<subdir>/<file>; every file is read as JSON Lines, and every
# record that carries both a "state" and an "action" key becomes one sample.
# os.listdir() returns entries in arbitrary order, so the listings are sorted
# to keep the sample order (and therefore the seeded split below) reproducible.
for subdir in sorted(os.listdir(data_dir)):
    subdir_path = os.path.join(data_dir, subdir)
    if not os.path.isdir(subdir_path):  # skip stray files such as .DS_Store
        continue
    for file_name in sorted(os.listdir(subdir_path)):
        file_path = os.path.join(subdir_path, file_name)
        if not os.path.isfile(file_path):  # skip nested directories
            continue
        with open(file_path, "r") as file:
            for line in file:
                line = line.strip()
                if not line:  # json.loads("") raises, so ignore blank lines
                    continue
                data = json.loads(line)
                if "state" in data and "action" in data:
                    X.append(data["state"])
                    Y.append(data["action"])
X = np.array(X)
# Replace every positive entry by its base-2 logarithm; zeros are left as 0.
# NOTE(review): the assignment keeps X's dtype, so with integer states the
# result is truncated to int — confirm the states are exact powers of two.
X[X > 0] = np.log2(X[X > 0])

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.33, random_state=26)
X_test = np.array(X_test)
y_test = np.array(y_test)

# Overwrite the training split with the full dataset.
# NOTE: X_test/y_test are therefore contained in the training data, so any
# accuracy measured on them is not an out-of-sample estimate.
X_train = np.array(X)
y_train = np.array(Y)

# Convert NumPy Arrays to PyTorch Tensors

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

#convert data to torch tensors
class Data(Dataset):
    """Dataset wrapper that exposes NumPy samples as PyTorch tensors.

    Args:
        X: NumPy array of samples; it is cast to float32 and reshaped to
            (-1, 1, 4, 4), i.e. the sample count is inferred and each sample
            becomes a single-channel 4x4 grid for the CNN.
        y: NumPy array of labels; it is cast to float32 (the training loop
            converts labels to integers itself).
    """

    def __init__(self, X, y):
        features = torch.from_numpy(X.astype(np.float32))
        labels = torch.from_numpy(y.astype(np.float32))
        self.X = features.reshape(-1, 1, 4, 4)
        self.y = labels
        # number of samples after the reshape
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        """Return the (sample, label) pair stored at ``index``."""
        sample = self.X[index]
        label = self.y[index]
        return sample, label

    def __len__(self):
        """Return the number of samples."""
        return self.len
   
batch_size = 64

#instantiate training and test data
train_data = Data(X_train, y_train)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

test_data = Data(X_test, y_test)
# evaluation metrics do not depend on sample order, so the test loader is not shuffled
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

#sanity check: print the shapes of the first training batch only
# (batch variables get their own names so the notebook-level X array is not overwritten)
for batch, (X_batch, y_batch) in enumerate(train_dataloader):
    print(f"Batch: {batch+1}")
    print(f"X shape: {X_batch.shape}")
    print(f"y shape: {y_batch.shape}")
    break

Batch: 1
X shape: torch.Size([64, 1, 4, 4])
y shape: torch.Size([64])


# Neural Network Implementation

In [3]:
import torch
from torch import nn
from torch import optim

# Layer sizes. In the code shown in this notebook, input_dim / hidden_dim1 /
# hidden_dim2 are referenced only by the commented-out fully connected network
# below; output_dim is the value passed to CNN(output_dim=...).
input_dim = 16
hidden_dim1 = 256
hidden_dim2 = 128
output_dim = 4

# class NeuralNetwork(nn.Module):
#     def __init__(self, input_dim, hidden_dim1, hidden_dim2, output_dim):
#         super(NeuralNetwork, self).__init__()
#         self.layer_1 = nn.Linear(input_dim, hidden_dim1)
#         nn.init.kaiming_uniform_(self.layer_1.weight, nonlinearity="relu")
#         self.layer_2 = nn.Linear(hidden_dim1, hidden_dim2)
#         nn.init.kaiming_uniform_(self.layer_2.weight, nonlinearity="relu")
#         self.layer_3 = nn.Linear(hidden_dim2, output_dim)
    
#     def forward(self, x):
#         x = torch.nn.functional.relu(self.layer_1(x))
#         x = torch.nn.functional.relu(self.layer_2(x))
#         x = self.layer_3(x)

#         return x
    
# model = NeuralNetwork(input_dim, hidden_dim1, hidden_dim2, output_dim)

class CNN(nn.Module):
    """Small convolutional classifier for a single-channel 4x4 grid.

    A CNN is used because the input is image-like (a 4x4 grid). Two 2x2
    convolutions with stride 1 shrink the grid 4x4 -> 3x3 -> 2x2, which is
    why the first linear layer takes 128 * 2 * 2 features.

    Args:
        output_dim: number of values produced per sample by the last layer.
    """

    def __init__(self, output_dim=4):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=2, stride=1)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=2, stride=1)
        # flattened size of the conv2 output: 128 channels on a 2x2 grid
        self.fc1 = nn.Linear(in_features=128 * 2 * 2, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=output_dim)

    def forward(self, x):
        """Map a (batch, 1, 4, 4) tensor to (batch, output_dim) raw scores."""
        features = nn.functional.relu(self.conv1(x))
        features = nn.functional.relu(self.conv2(features))
        # keep the batch dimension, flatten channels and spatial positions
        flat = features.view(features.size(0), -1)
        hidden = nn.functional.relu(self.fc1(flat))
        return self.fc2(hidden)

# Instantiate the CNN defined above and print its layer summary.
model = CNN(output_dim=output_dim)
print(model)    

CNN(
  (conv1): Conv2d(1, 64, kernel_size=(2, 2), stride=(1, 1))
  (conv2): Conv2d(64, 128, kernel_size=(2, 2), stride=(1, 1))
  (fc1): Linear(in_features=512, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=4, bias=True)
)


# Training

In [4]:
learning_rate = 0.001

# The model outputs raw scores; CrossEntropyLoss expects integer class labels,
# hence the .long() conversion of the float labels inside the loop.
loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

num_epochs = 30

model.train()  # make the mode explicit so re-running this cell is idempotent
for epoch in range(num_epochs):
    epoch_loss = 0.0
    batch_count = 0
    # batch variables get their own names so the notebook-level X array is not overwritten
    for X_batch, y_batch in train_dataloader:
        optimizer.zero_grad()
        pred = model(X_batch)
        loss = loss_fn(pred, y_batch.long())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        batch_count += 1
    # report the mean batch loss so training progress can actually be monitored
    mean_loss = epoch_loss / max(batch_count, 1)
    print(f"Epoch {epoch+1}/{num_epochs} - mean loss: {mean_loss:.4f}")


print("DONE!!! :3")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
DONE!!! :3


# Predictions

In [5]:
import itertools

y_pred = []
# Deliberately not named y_test: that name holds the label array created by the
# data-loading cell, and overwriting it with a list breaks re-running the
# dataloader cell (Data() calls .astype on it).
y_true = []
correct = 0
total = 0
results = [0,0,0,0]  # how often each of the four classes was predicted

# We're not training so we don't need to calculate the gradients for our outputs
with torch.no_grad():
    # batch variables get their own names so the notebook-level X array is not overwritten
    for X_batch, y_batch in test_dataloader:
        outputs = model(X_batch)  # Get model outputs
        _, predicted = torch.max(outputs, 1)  # index of the highest score per sample
        y_pred.extend(predicted.tolist())
        y_true.extend(y_batch.tolist())
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)
        for predicted_class in predicted.tolist():
            results[predicted_class] += 1

# guard against an empty test loader instead of dividing by zero
accuracy = 100 * correct // total if total else 0
print(f'Accuracy: {accuracy}%')
print(f'Prediction distribution:')
print(f'0:  {results[0]}')
print(f'1:  {results[1]}')
print(f'2:  {results[2]}')
print(f'3:  {results[3]}')

Accuracy: 85%
Prediction distribution:
0:  32119
1:  33445
2:  30383
3:  32468


# Game Environment

In [6]:
import random
from collections import deque
import math

BOARD_SIZE = 4  # side length of the square board created in Game2048Env.reset
ACTIONS = [0, 1, 2, 3]  # up, down, left, right (the direction codes accepted by move)

def add_tile(board):
    """Place one new tile on a randomly chosen empty (zero) cell.

    The board is modified in place and also returned. The new cell is set
    to 1 with probability 0.9 and to 2 otherwise. A board without any zero
    cell is returned unchanged.
    """
    free_cells = list(zip(*np.nonzero(board == 0)))
    if len(free_cells) == 0:   # no empty cells
        return board
    row, col = random.choice(free_cells)
    if random.random() < 0.9:
        board[row][col] = 1
    else:
        board[row][col] = 2
    return board

def move_right(board):
    """Slide and merge the tiles of every row toward column 0.

    NOTE: despite its name, this packs each row toward index 0, i.e. the
    LEFT edge; ``move`` obtains the other directions by rotating or flipping
    the board before and after calling it.

    Within a row, two equal neighbouring tiles merge into one tile of
    value + 1, and every merge adds ``2 ** (value + 1)`` to the reward. A tile
    produced by a merge is not merged again in the same call. Rows are taken
    from the board itself, so any 2-D board shape is supported.

    Args:
        board: 2-D NumPy array; 0 marks an empty cell. It is not modified.

    Returns:
        (new_board, reward): a new array shaped like ``board`` and the summed
        merge reward.
    """
    new_board = np.zeros_like(board)
    reward = 0
    for row_index, row in enumerate(board):
        tiles = row[row != 0]  # collect non-zero tiles
        merged = []
        i = 0
        while i < len(tiles):
            if i + 1 < len(tiles) and tiles[i] == tiles[i + 1]:
                merged.append(tiles[i] + 1)
                reward += 2 ** (tiles[i] + 1)  # calculate reward
                i += 2  # the partner tile was consumed by the merge
            else:
                merged.append(tiles[i])
                i += 1
        if merged:
            new_board[row_index, :len(merged)] = merged
    return new_board, reward

def move(board, direction):
    """Apply one move to the board and return ``(new_board, reward)``.

    ``direction`` is 0 (up), 1 (down), 2 (left) or 3 (right); any other value
    raises ``ValueError``. Every direction is reduced to a single call of
    ``move_right`` by rotating or mirroring the board first and undoing that
    transform on the result.
    """
    if direction not in (0, 1, 2, 3):
        raise ValueError("Invalid direction")

    if direction == 2:  # left: the helper already handles this orientation
        return move_right(board)

    if direction == 3:  # right: mirror, slide, mirror back
        slid, reward = move_right(np.fliplr(board))
        return np.fliplr(slid), reward

    # up rotates by +1 quarter turn, down by -1; the inverse turn restores the layout
    quarter_turns = 1 if direction == 0 else -1
    slid, reward = move_right(np.rot90(board, quarter_turns))
    return np.rot90(slid, -quarter_turns), reward

def is_game_over(board):
    """Return True when none of the moves in ACTIONS changes the board."""
    return all(
        np.array_equal(move(board, action)[0], board)
        for action in ACTIONS
    )

class Game2048Env:
    """Minimal game environment built on add_tile / move / is_game_over.

    ``reset`` must be called before ``step``: it is the only place where
    ``self.board`` is created.
    """

    def reset(self):
        """Start a new game with two tiles and return the initial state."""
        empty_board = np.zeros((BOARD_SIZE, BOARD_SIZE), dtype=int)
        self.board = add_tile(add_tile(empty_board))
        return self.get_state()

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        A new tile is added only if the move changed the board. ``reward`` is
        the value returned by ``move``; ``done`` is ``is_game_over`` evaluated
        on the resulting board.
        """
        board_before = self.board.copy()
        self.board, reward = move(self.board, action)
        if not np.array_equal(self.board, board_before):
            # only add a tile if the board changed
            self.board = add_tile(self.board)
        done = is_game_over(self.board)
        return self.get_state(), reward, done

    def get_state(self):
        """Return the board as a new float32 array of shape (1, 1, 4, 4).

        Non-positive cells are reported as 0. The result does not share
        memory with ``self.board``.
        """
        clipped = np.where(self.board > 0, self.board, 0)
        return clipped.astype(np.float32).reshape(1, 1, 4, 4)

# Finetuning

In [7]:
from torch.distributions import Categorical

model.eval()
env = Game2048Env()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
num_episodes = 10000
baseline = 1500     # running score baseline, updated after every episode
batch_size = 25     # number of episodes accumulated per optimizer step
batch_log_probs = []
batch_improvs = []

for episode in range(num_episodes):
    state = env.reset()
    done = False

    valid_log_probs = []
    finalscore = 0

    while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32).reshape(1, 1, 4, 4)
        logits = model(state_tensor)
        dist = Categorical(logits=logits)  # built once per step, not once per candidate action

        ranked_actions = torch.argsort(logits, dim=1, descending=True)[0]   #sort by how likely move is

        original_board = env.board.copy()
        final_action = None
        selected_log_prob = None

        # take the highest-ranked action that actually changes the board
        for action in ranked_actions:
            test_board, _ = move(original_board.copy(), action.item())
            if not np.array_equal(test_board, original_board):
                final_action = action.item()
                selected_log_prob = dist.log_prob(action)
                break

        if final_action is None:    #game is stuck, skip (shouldn't happen)
            print("SOMETHING WRONG AAAAAUUEEUAGHGEUGHHH")
            break

        state, score, done = env.step(final_action)
        valid_log_probs.append(selected_log_prob)
        finalscore += score

    # use baseline to force games to improve
    # (the baseline is updated first, so it already contains 5% of this episode's score)
    baseline = 0.95 * baseline + 0.05 * finalscore
    improvement = finalscore - baseline + 0.1 * len(valid_log_probs)

    #use batches for more stable training
    batch_log_probs.extend(valid_log_probs)
    batch_improvs.extend([improvement] * len(valid_log_probs))

    if (episode+1) % batch_size == 0:
        # An empty batch (every episode aborted before its first move) has
        # nothing to optimise; skipping avoids calling backward() on a plain int.
        if batch_log_probs:
            # one vectorised expression instead of one autograd op per move
            log_probs = torch.cat(batch_log_probs)
            weights = torch.tensor(batch_improvs, dtype=log_probs.dtype)
            loss = -(log_probs * weights).sum()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        batch_log_probs = []
        batch_improvs = []

    if (episode+1) % 50 == 0:
        print(f"Episode {episode+1}, Max Tile: {env.board.max()}, Final Score: {finalscore},  Baseline: {baseline:.2f}, Improvement: {improvement:.2f}")

Episode 50, Max Tile: 6, Final Score: 712,  Baseline: 975.73, Improvement: -254.53
Episode 100, Max Tile: 6, Final Score: 596,  Baseline: 984.17, Improvement: -380.17
Episode 150, Max Tile: 6, Final Score: 568,  Baseline: 898.30, Improvement: -322.50
Episode 200, Max Tile: 7, Final Score: 1392,  Baseline: 1039.05, Improvement: 367.25
Episode 250, Max Tile: 8, Final Score: 2412,  Baseline: 1162.07, Improvement: 1270.73
Episode 300, Max Tile: 6, Final Score: 784,  Baseline: 928.51, Improvement: -134.31
Episode 350, Max Tile: 6, Final Score: 560,  Baseline: 931.18, Improvement: -363.58
Episode 400, Max Tile: 7, Final Score: 1492,  Baseline: 1019.30, Improvement: 487.90
Episode 450, Max Tile: 6, Final Score: 828,  Baseline: 982.07, Improvement: -143.97
Episode 500, Max Tile: 7, Final Score: 1032,  Baseline: 916.30, Improvement: 126.60
Episode 550, Max Tile: 6, Final Score: 668,  Baseline: 857.96, Improvement: -180.86
Episode 600, Max Tile: 5, Final Score: 288,  Baseline: 923.55, Improvemen

# Export to ONNX

In [8]:
import torch.onnx
# Export needs an example input; the shape is (batch, channels, height, width) for the CNN.
dummy_input = torch.randn(1, 1, 4, 4)
torch.onnx.export(
    model,
    dummy_input,
    "2048_fine.onnx",
    input_names=["input"],
    output_names=["output"],
)

# Convert ONNX to TensorFlow to TensorFlow.js

The package versions must match the ones below exactly, otherwise the conversion fails.

Create a new virtual environment with the following command:\
`python3.9 -m venv tfenv`

Run this shell command to convert the ONNX model to TensorFlow:\
`pip install tensorflow==2.13.0 keras==2.13.1 onnx==1.14.0 onnx-tf==1.10.0 protobuf==3.20.3 tensorflow-probability==0.20.0 && onnx-tf convert -i 2048_fine.onnx -o 2048_fine_tf`

Then run this shell command to convert from TensorFlow to TensorFlow.js:\
`pip install tensorflowjs==4.18.0 && tensorflowjs_converter --input_format=tf_saved_model --output_format=tfjs_graph_model 2048_fine_tf/ 2048_fine_tfjs/`