In [None]:
# Training the model
# This script generates random-walk sequences on grid graphs, trains a
# byte-level BPE tokenizer on them, and trains a GPT-2 model on the walks.

import os
import shutil
from tqdm import tqdm
from utils.grid_utils import *
from tokenizers import ByteLevelBPETokenizer
import json

# Data Generation
def generate_and_save_walks(num_walks, walk_length, grid_size, output_file):
    """Write random-walk strings to ``output_file``, one walk per line.

    Each iteration draws a fresh set of ``grid_size * grid_size`` node
    names, builds a grid graph from them with ``size=grid_size``, and walks
    it starting from a randomly chosen node. A walk whose string form is
    empty (falsy) is skipped, so the file may end up with fewer than
    ``num_walks`` lines.

    Args:
        num_walks: Number of walks to attempt.
        walk_length: Length forwarded to ``generate_random_walk``.
        grid_size: Value forwarded to ``get_grid_graph`` as ``size``.
        output_file: Path of the UTF-8 text file to create or overwrite.
    """
    node_count = grid_size * grid_size

    with open(output_file, "w", encoding="utf-8") as sink:
        for _ in tqdm(range(num_walks)):
            # The helpers are called in a fixed order; they presumably all
            # consume shared random state, so reordering would change output.
            names = generate_random_names(node_count)
            grid = get_grid_graph(names, size=grid_size)
            start_node = random.choice(names)
            visited = generate_random_walk(grid, start_node, walk_length)
            line = walk_to_string(visited, grid)
            if not line:
                continue
            sink.write(line + "\n")

# Configuration
NUM_TRAIN_WALKS = 1_000_000  # walks attempted for data/train.txt
NUM_TEST_WALKS = 10_000  # walks attempted for data/test.txt
WALK_LENGTH = 120  # walk length handed to generate_and_save_walks
GRID_SIZE = 4  # grid size; each graph gets GRID_SIZE * GRID_SIZE node names
set_seed(42)  # seeded once, before any data is generated

# Generate data
os.makedirs("data", exist_ok=True)
# The train split is written before the test split; keep this order, since
# both runs presumably draw from the same seeded random stream.
for _walk_count, _walk_path in (
    (NUM_TRAIN_WALKS, "data/train.txt"),
    (NUM_TEST_WALKS, "data/test.txt"),
):
    generate_and_save_walks(_walk_count, WALK_LENGTH, GRID_SIZE, _walk_path)

# Train custom tokenizer
# The tokenizer is fitted on the training walks only; the test file is not
# used here.
special_tokens = ["[PAD]", "[UNK]", "[EOS]", "[SOS]"]
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(
    files=["data/train.txt"],
    vocab_size=2000,
    min_frequency=2,
    special_tokens=special_tokens,
)

# Make sure the output directory exists before saving the trained model.
os.makedirs("tokenizer", exist_ok=True)
tokenizer.save_model("tokenizer")

# Create config files
# Minimal tokenizer configuration written into the "tokenizer" directory,
# the same directory later passed to run_clm.py via --tokenizer_name.
# The special-token strings match the ones given to the tokenizer trainer.
config = {
    "tokenizer_class": "GPT2Tokenizer",
    "bos_token": "[SOS]",
    "eos_token": "[EOS]",
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "model_max_length": 1024
}

# An explicit encoding keeps the file independent of the platform locale and
# consistent with how the walk data files are written.
with open("tokenizer/tokenizer_config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)

# Train model using HuggingFace script
def train_model():
    """Run the HuggingFace ``run_clm.py`` script on the generated walk data.

    The script is launched as a child process from the current working
    directory, with ``data/train.txt`` and ``data/test.txt`` as the training
    and validation files, the ``tokenizer`` directory as the tokenizer, and
    ``model`` as the output directory. ``foraging_chunked_config`` is
    resolved by ``run_clm.py`` itself; it is not defined in this file.

    The call blocks until the child process exits. A non-zero exit status is
    reported on stderr instead of being silently discarded.

    Returns:
        int: Exit status of the training process (0 means success).
    """
    # Imported locally so the function carries its own dependencies.
    import subprocess
    import sys

    # An argument list avoids shell parsing entirely. sys.executable runs
    # the script with the interpreter that is executing this code, rather
    # than whichever "python" happens to be first on PATH.
    command = [
        sys.executable or "python",
        "run_clm.py",
        "--config_name", "foraging_chunked_config",
        "--tokenizer_name", "tokenizer",
        "--train_file", "data/train.txt",
        "--validation_file", "data/test.txt",
        "--do_train",
        "--do_eval",
        "--output_dir", "model",
        "--per_device_train_batch_size", "16",
        "--per_device_eval_batch_size", "16",
        "--num_train_epochs", "5",
        "--save_strategy", "steps",
        "--save_steps", "500",
        "--save_total_limit", "20",
        "--eval_strategy", "steps",
        "--eval_steps", "500",
        "--learning_rate", "1e-04",
        "--weight_decay", "0.1",
        "--bf16", "True",
    ]

    # check=False: a failed run is reported through the return value rather
    # than raised, so existing callers that ignore the result keep working.
    completed = subprocess.run(command, check=False)
    if completed.returncode != 0:
        print(
            f"run_clm.py exited with status {completed.returncode}",
            file=sys.stderr,
        )
    return completed.returncode



In [None]:
# Launch training; this call blocks until the training command has finished.
train_model()