In [None]:
import wandb
wandb.login()
import os
os.environ['WANDB_PROJECT'] = 'generalist_model'
import pandas as pd
import networkx as nx
import logging
from random import shuffle
import pandas as pd
from matplotlib import pyplot as plt
import csrgraph as cg
import numpy as np
import random
import string
from itertools import combinations
import pickle
import gc
import os
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer

#os.environ['WANDB_MODE'] = 'disabled'

In [None]:
# Set the working directory; every relative path used later in the notebook
# (data, tokenizer, config, model output) resolves against it.
# The location can be overridden with the MSC_THESIS_DIR environment variable
# so the notebook also runs on machines without the original path.
PROJECT_DIR = os.environ.get(
    "MSC_THESIS_DIR",
    "/cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS",
)
os.chdir(PROJECT_DIR)

**DATA GENERATION**

In [None]:
import os
import shutil
import random
import networkx as nx
import numpy as np

# --- 1. Graph and Naming Utilities ---

def get_graph(nodes):
    """Build a directed square-grid graph over ``nodes``.

    ``nodes`` is laid out row-major on a ``size x size`` grid, where ``size``
    is the integer square root of ``len(nodes)``. Each pair of horizontally or
    vertically adjacent cells is linked by two directed edges (one per
    direction). Every edge carries a ``direction`` attribute: ``'EAST'`` /
    ``'WEST'`` for column +1 / -1 and ``'SOUTH'`` / ``'NORTH'`` for
    row +1 / -1.

    Args:
        nodes: Sequence of unique, hashable node names whose length is a
            perfect square.

    Returns:
        An ``nx.DiGraph`` containing every entry of ``nodes`` and the grid
        edges described above.

    Raises:
        ValueError: If ``len(nodes)`` is not a perfect square, or if ``nodes``
            contains duplicate names.
    """
    size = int(np.sqrt(len(nodes)))
    if size * size != len(nodes):
        raise ValueError("Number of nodes must be a perfect square.")
    if len(set(nodes)) != len(nodes):
        # Duplicate names would silently merge distinct grid cells into one node.
        raise ValueError("Node names must be unique.")

    G = nx.DiGraph()
    for r in range(size):
        for c in range(size):
            idx = r * size + c
            # East
            if c < size - 1:
                G.add_edge(nodes[idx], nodes[idx + 1], direction='EAST')
            # West
            if c > 0:
                G.add_edge(nodes[idx], nodes[idx - 1], direction='WEST')
            # South
            if r < size - 1:
                G.add_edge(nodes[idx], nodes[idx + size], direction='SOUTH')
            # North
            if r > 0:
                G.add_edge(nodes[idx], nodes[idx - size], direction='NORTH')

    # Edges only register nodes that have a neighbour, so a 1x1 grid would
    # otherwise come back empty. Added last so that the node order of larger
    # grids (where every node already exists) is left untouched.
    G.add_nodes_from(nodes)
    return G

def generate_unique_names(count):
    """Generate ``count`` distinct 2-character lowercase names.

    Names are returned in the order they were first drawn. The original
    implementation returned ``list(set_of_names)``, whose order depends on
    Python's per-process string hash randomisation, so the result (and the
    grid layout built from it) was not reproducible across interpreter
    restarts even with ``random.seed`` set.

    Args:
        count: Number of names to generate.

    Returns:
        A list of ``count`` unique strings, each two ASCII lowercase letters.
        An empty list when ``count`` is zero or negative.

    Raises:
        ValueError: If ``count`` exceeds the number of distinct 2-letter
            names (26 * 26 = 676); previously this looped forever.
    """
    alphabet = string.ascii_lowercase
    max_names = len(alphabet) ** 2
    if count > max_names:
        raise ValueError(
            f"Cannot generate {count} unique 2-character names; "
            f"only {max_names} exist."
        )

    seen = set()   # fast membership test
    names = []     # preserves draw order for determinism
    while len(names) < count:
        candidate = ''.join(random.choices(alphabet, k=2))
        if candidate not in seen:
            seen.add(candidate)
            names.append(candidate)
    return names

# --- 2. Path Generation Logic ---

def generate_random_walk(G, start_node, length):
    """Random-walk over ``G`` starting at ``start_node``.

    At each step one successor of the current node is picked uniformly with
    ``random.choice``. The walk stops once it holds ``length`` nodes, or
    earlier if the current node has no successors.

    Args:
        G: Graph object exposing ``successors(node)``.
        start_node: Node the walk begins at; always the first element.
        length: Desired number of nodes in the walk (integer).

    Returns:
        List of visited nodes. It always contains at least ``start_node``,
        even when ``length`` is below 1.
    """
    walk = [start_node]
    for _ in range(length - 1):
        options = list(G.successors(walk[-1]))
        if len(options) == 0:
            # Dead end: return the partial walk.
            break
        walk.append(random.choice(options))
    return walk

def format_path_to_string(path, G):
    """Render a node path as ``'NODE DIRECTION NODE ... NODE'``.

    For each consecutive pair ``(u, v)`` the ``direction`` attribute of edge
    ``u -> v`` in ``G`` is looked up and emitted after ``u``. If the lookup
    raises ``KeyError``, a warning is printed and that segment (including
    ``u``) is left out. The final node of ``path`` is always appended.

    Args:
        path: Sequence of nodes.
        G: Graph whose ``edges[u, v]['direction']`` gives the step label.

    Returns:
        ``""`` for an empty path, the single node itself for a one-node
        path, otherwise the space-separated string described above.
    """
    if not path:
        return ""
    if len(path) < 2:
        return path[0]

    segments = []
    for u, v in zip(path, path[1:]):
        try:
            segments.append(f"{u} {G.edges[u, v]['direction']} ")
        except KeyError:
            print(f"Warning: Edge ({u}, {v}) not found in graph. Skipping segment.")
    return "".join(segments) + path[-1]

# --- 3. Core Data Generation Function ---

def generate_training_example(G, config):
    """Generate one ``(prompt, target)`` training example on graph ``G``.

    The prompt is ``"[START] <context walk> [SEP] <task instruction> [PLAN]"``
    and the target is ``"<solution path> [END]"``. The context is a random
    walk from a random node. The task is chosen by a fair coin flip:

    * ``[PATH]``: a shortest path between two distinct random nodes that
      contains at least ``config['min_shortest_path_nodes']`` nodes.
    * ``[WALK]``: a random walk of a random length from a random start node.

    Args:
        G: Directed graph whose edges carry a ``direction`` attribute.
        config: Dict with keys ``min_context_len``, ``max_context_len``,
            ``min_shortest_path_nodes``, ``min_walk_len``, ``max_walk_len``.
            The optional key ``max_path_attempts`` (default 10000) bounds the
            number of node pairs tried for the ``[PATH]`` task.

    Returns:
        Tuple ``(prompt, target)`` of strings.

    Raises:
        RuntimeError: If no sampled node pair yields a long enough shortest
            path within ``max_path_attempts`` tries. Previously this case
            (e.g. a 2x2 grid with ``min_shortest_path_nodes=4``) looped
            forever.
    """
    nodes = list(G.nodes())

    # A. Generate the CONTEXT Block
    context_len = random.randint(config['min_context_len'], config['max_context_len'])
    context_start_node = random.choice(nodes)
    context_path = generate_random_walk(G, context_start_node, context_len)
    context_str = format_path_to_string(context_path, G)

    # B. Flip a Coin to Choose the TASK
    if random.random() < 0.5:
        # --- TASK 1: Pathfinding ([PATH]) ---
        min_path_nodes = config['min_shortest_path_nodes']
        max_attempts = config.get('max_path_attempts', 10000)

        shortest_path = None
        # Rejection-sample node pairs, but give up after a bounded number of
        # tries so an unsatisfiable configuration fails loudly.
        for _ in range(max_attempts):
            start_node, goal_node = random.sample(nodes, 2)
            try:
                candidate = next(nx.all_shortest_paths(G, source=start_node, target=goal_node))
            except (nx.NetworkXNoPath, StopIteration):
                continue
            if len(candidate) >= min_path_nodes:
                shortest_path = candidate
                break

        if shortest_path is None:
            raise RuntimeError(
                f"No shortest path with at least {min_path_nodes} nodes found "
                f"after {max_attempts} attempts; the graph may be too small "
                f"for this configuration."
            )

        task_instruction = f"[PATH] [START_NODE] {start_node} [GOAL_NODE] {goal_node}"
        target_str = format_path_to_string(shortest_path, G)

    else:
        # --- TASK 2: Foraging ([WALK]) ---
        start_node = random.choice(nodes)
        walk_len = random.randint(config['min_walk_len'], config['max_walk_len'])
        task_instruction = f"[WALK] [START_NODE] {start_node} [LENGTH] {walk_len}"
        walk_path = generate_random_walk(G, start_node, walk_len)
        target_str = format_path_to_string(walk_path, G)

    # C. Assemble the final prompt and target strings
    prompt = f"[START] {context_str} [SEP] {task_instruction} [PLAN]"
    target = f"{target_str} [END]"

    return prompt, target

# --- 4. Orchestration and File I/O ---

def generate_and_save_dataset(n_examples, grid_size, output_file_path, config=None):
    """Generate a dataset on one freshly built grid graph and write it to disk.

    A new set of random node names and a new ``grid_size x grid_size`` graph
    are created on every call, so separate calls (e.g. train and test) use
    different graphs. Each example is written as one line of the form
    ``"<prompt>\\t<target>"``.

    Args:
        n_examples: Number of examples to generate.
        grid_size: Side length of the square grid.
        output_file_path: Destination text file (overwritten if it exists).
            Its parent directory is created when missing.
        config: Optional dict overriding any of the default generation
            settings (``min_context_len``, ``max_context_len``,
            ``min_shortest_path_nodes``, ``min_walk_len``, ``max_walk_len``).
            ``None`` keeps the defaults.

    Returns:
        None. The dataset is written to ``output_file_path``.
    """
    default_config = {
        'min_context_len': 30,
        'max_context_len': 150,
        'min_shortest_path_nodes': 4,
        'min_walk_len': 5,
        'max_walk_len': 15,
    }
    # Caller-supplied values win; anything not supplied falls back to defaults.
    config = {**default_config, **(config or {})}

    print(f"--- Generating {n_examples} examples for {output_file_path} ---")
    print(f"Grid Size: {grid_size}x{grid_size}, Context Length: {config['min_context_len']}-{config['max_context_len']}")

    num_nodes = grid_size * grid_size

    # Generate one graph environment to be used for this dataset split
    node_names = generate_unique_names(num_nodes)
    G = get_graph(node_names)
    print(f"Using graph with nodes: {node_names[:5]}...")

    # Build everything in memory first so a failure during generation does
    # not leave a partially written file behind.
    dataset = [generate_training_example(G, config) for _ in range(n_examples)]

    parent_dir = os.path.dirname(output_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    print(f"Writing {len(dataset)} examples to {output_file_path}...")
    with open(output_file_path, "w", encoding="utf-8") as f:
        for prompt, target in dataset:
            # Use a tab separator for easy parsing later
            f.write(f"{prompt}\t{target}\n")


In [None]:
# Entry point for data generation: seeds the RNGs, recreates the output
# directory from scratch, then writes train.txt and test.txt into it.
if __name__ == "__main__":
    # --- Configuration ---
    # Total number of examples to generate for each split.
    NUM_TRAIN_EXAMPLES = 50000 
    NUM_TEST_EXAMPLES = 5000
    GRID_SIZE = 4

    # Seed both RNGs. The generation functions visible in this notebook draw
    # from the `random` module; NumPy's global RNG is seeded as well.
    random.seed(42)
    np.random.seed(42)

    # --- Setup Output Directory ---
    # WARNING: any existing output directory (relative to the current working
    # directory) is deleted together with its contents before regeneration.
    output_dir = "generalist_data"
    if os.path.exists(output_dir):
        print(f"Removing existing directory: {output_dir}")
        shutil.rmtree(output_dir)
    print(f"Creating new directory: {output_dir}")
    os.makedirs(output_dir)

    # --- Generate Training Data ---
    train_file_path = os.path.join(output_dir, "train.txt")
    generate_and_save_dataset(
        n_examples=NUM_TRAIN_EXAMPLES,
        grid_size=GRID_SIZE,
        output_file_path=train_file_path
    )

    # --- Generate Testing Data ---
    # NOTE: generate_and_save_dataset() draws new random node names and builds
    # a new graph on every call, and the RNG state has advanced since seeding.
    # The test split is therefore generated on a different set of node names
    # than the training split, not on the same environment with fewer
    # examples. Grid size and generation settings are the same for both.
    test_file_path = os.path.join(output_dir, "test.txt")
    generate_and_save_dataset(
        n_examples=NUM_TEST_EXAMPLES,
        grid_size=GRID_SIZE,
        output_file_path=test_file_path
    )

    print("\nProcess finished successfully.")

**MODEL TRAINING**

In [None]:
class GPT:
    """Small wrapper pairing a GPT-2 tokenizer with a GPT-2 LM-head model.

    Attributes:
        base_model: Path or model id passed to ``from_pretrained``, or None.
        base_model_name: Stored as given; not used by this class.
        vocab_size: Stored as given; not used by this class.
        tokenizer: Loaded ``GPT2Tokenizer``, or None when ``base_model`` is None.
        model: Loaded ``GPT2LMHeadModel``, or None when ``base_model`` is None.
    """

    def __init__(self, base_model=None, base_model_name='gpt2', vocab_size=100):
        """Load tokenizer and model from ``base_model`` when one is given.

        Args:
            base_model: Checkpoint path or model id. When None, nothing is
                loaded and ``tokenizer`` / ``model`` stay None.
            base_model_name: Informational name, stored unchanged.
            vocab_size: Informational vocabulary size, stored unchanged.
        """
        self.base_model = base_model
        self.base_model_name = base_model_name
        self.vocab_size = vocab_size

        # Always define these so that using an unloaded wrapper produces a
        # clear error instead of an AttributeError.
        self.tokenizer = None
        self.model = None

        if self.base_model is not None:
            self.tokenizer = GPT2Tokenizer.from_pretrained(base_model)
            self.model = GPT2LMHeadModel.from_pretrained(base_model)
            # Reuse the end-of-sequence token as the padding token.
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def continue_input(self, input_sequence, max_new_tokens=5, num_return_sequences=1, no_repeat_ngram_size=0,
                       do_sample=False, temperature=0.7, num_beams=1):
        """Generate a continuation of ``input_sequence`` and return it decoded.

        Args:
            input_sequence: Prompt text.
            max_new_tokens: Maximum number of tokens to generate.
            num_return_sequences: Forwarded to ``generate``; note that only
                the first returned sequence is decoded.
            no_repeat_ngram_size: Forwarded to ``generate``.
            do_sample: Whether to sample instead of decoding deterministically.
            temperature: Sampling temperature; only forwarded when
                ``do_sample`` is True.
            num_beams: Forwarded to ``generate``.

        Returns:
            The decoded text of the first generated sequence.

        Raises:
            RuntimeError: If no tokenizer/model has been loaded or assigned.
        """
        if self.tokenizer is None or self.model is None:
            raise RuntimeError(
                "No model loaded: construct GPT with base_model=<path or id> "
                "or assign .tokenizer and .model before calling continue_input()."
            )

        input_ids = self.tokenizer.encode(input_sequence, return_tensors='pt')

        generation_kwargs = {
            'max_new_tokens': max_new_tokens,
            'num_return_sequences': num_return_sequences,
            'num_beams': num_beams,
            'no_repeat_ngram_size': no_repeat_ngram_size,
            'do_sample': do_sample,
        }
        if do_sample:
            # Temperature is a sampling-only setting; passing it for
            # non-sampling decoding has no effect and triggers a warning.
            generation_kwargs['temperature'] = temperature

        # Generate text
        output = self.model.generate(input_ids, **generation_kwargs)

        # Decode the first returned sequence
        return self.tokenizer.decode(output[0].tolist())
    
    

In [None]:
def load_pkl(pth):
    """Read the pickle file at ``pth`` and return the deserialised object.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be used on files from a trusted source.
    """
    with open(pth, 'rb') as handle:
        return pickle.load(handle)

def is_valid_path(sequence, graphs):
    """Check whether ``sequence`` is a valid walk in at least one graph.

    ``sequence`` is whitespace-split into alternating tokens
    ``NODE DIRECTION NODE DIRECTION ... NODE``: nodes sit at even positions
    and direction labels at odd positions. A step ``u DIRECTION v`` is valid
    in a graph when the edge ``u -> v`` exists and, if that edge has a
    ``direction`` attribute, the attribute matches ``DIRECTION``
    (case-insensitively). Edges without a ``direction`` attribute are checked
    for existence only.

    The original implementation parsed the direction tokens but never
    compared them, so a path with correct neighbours but wrong directions
    was accepted.

    Args:
        sequence: Space-separated path string.
        graphs: Iterable of graphs exposing ``has_edge(u, v)`` and
            ``get_edge_data(u, v)``.

    Returns:
        True if every step is valid in at least one single graph, otherwise
        False. A sequence with fewer than two nodes has no steps and is
        therefore accepted by any graph; with no graphs the result is False.
    """
    tokens = sequence.split()

    # Nodes are at even indices, direction labels at odd indices.
    nodes = tokens[::2]
    directions = [token.lower() for token in tokens[1::2]]

    # One (source, target, label) triple per step; a trailing direction with
    # no following node is ignored.
    steps = list(zip(nodes, nodes[1:], directions))

    def _walk_exists_in(graph):
        """Return True when every step exists in ``graph`` with its label."""
        for source, target, direction in steps:
            if not graph.has_edge(source, target):
                return False
            attrs = graph.get_edge_data(source, target) or {}
            expected = attrs.get('direction')
            if expected is not None and str(expected).lower() != direction:
                return False
        return True

    return any(_walk_exists_in(graph) for graph in graphs)

In [None]:

# **CREATING CUSTOM MODEL CONFIGURATION**

from transformers import AutoConfig, GPT2Tokenizer

# --- Configuration ---
TOKENIZER_PATH = "./generalist_tokenizer"
CONFIG_SAVE_PATH = "./generalist_config"

# 1. Load the custom tokenizer to get its properties
my_custom_tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_PATH)

# 2. Load the base gpt2 config as a blueprint
print("Loading base gpt2 config...")
config = AutoConfig.from_pretrained('gpt2')

# 3. CRITICAL: Override the config with our custom tokenizer's properties.
# Use len(tokenizer) rather than tokenizer.vocab_size: vocab_size counts only
# the base vocabulary, while len() also includes tokens added afterwards
# (presumably the bracketed task tokens such as [START] / [SEP], if they were
# registered as added tokens - verify against how the tokenizer was built).
# An embedding matrix sized from vocab_size alone would be too small for them.
full_vocab_size = len(my_custom_tokenizer)
print(f"Overriding config vocab_size. Old: {config.vocab_size}, New: {full_vocab_size}")
config.vocab_size = full_vocab_size

# Also update the special token IDs
print("Updating special token IDs in the config...")
config.bos_token_id = my_custom_tokenizer.bos_token_id
config.eos_token_id = my_custom_tokenizer.eos_token_id
# If you have pad_token, etc., you can set them here too
# config.pad_token_id = my_custom_tokenizer.pad_token_id

# 4. Save the new, modified config to its own directory
os.makedirs(CONFIG_SAVE_PATH, exist_ok=True)
config.save_pretrained(CONFIG_SAVE_PATH)

print(f"Custom model config saved to: {CONFIG_SAVE_PATH}")

In [None]:
import os
import gc

def train_model_script(num_epochs=1,
                       output_dir='generalist_model',
                       lr=5e-05):
    """Launch ``run_clm.py`` as a subprocess to train the model from scratch.

    The script is run with the custom config and tokenizer directories and
    the train/test files under ``./generalist_data``. The command is passed
    as an argument list (no shell), and it uses the interpreter running this
    notebook so the same environment's packages are used.

    Args:
        num_epochs: Value for ``--num_train_epochs``.
        output_dir: Value for ``--output_dir`` (overwritten by the script).
        lr: Value for ``--learning_rate``.

    Returns:
        The exit code of the training process (0 on success). A message is
        printed when it is non-zero; previously the status was discarded.
    """
    # Local imports keep this block self-contained.
    import shlex
    import subprocess
    import sys

    gc.collect()

    save_steps = 500
    save_total_limit = 30

    # --- Define Clean Paths ---
    data_dir = 'generalist_data'
    tokenizer_dir = 'generalist_tokenizer'
    config_dir = 'generalist_config'

    train_path = f'./{data_dir}/train.txt'
    test_path = f'./{data_dir}/test.txt'

    # NOTE(review): --eval_steps only takes effect when the evaluation
    # strategy is "steps"; no strategy flag is passed here, so presumably only
    # the final --do_eval pass runs. The flag's name differs between
    # transformers versions, so it is not added here - confirm and add it if
    # periodic evaluation is wanted.
    command = [
        sys.executable or 'python3', './run_clm.py',
        '--config_name', config_dir,
        '--tokenizer_name', tokenizer_dir,
        '--train_file', train_path,
        '--validation_file', test_path,
        '--per_device_train_batch_size', '1',
        '--per_device_eval_batch_size', '1',
        '--do_train',
        '--do_eval',
        '--output_dir', str(output_dir),
        '--overwrite_output_dir',
        '--num_train_epochs', str(num_epochs),
        '--save_strategy', 'steps',
        '--save_steps', str(save_steps),
        '--save_total_limit', str(save_total_limit),
        '--eval_steps', '2000',
        '--learning_rate', str(lr),
        '--report_to', 'wandb',
    ]

    # Print the command to verify it's correct
    print("--- Running Command ---")
    print(' '.join(shlex.quote(part) for part in command))
    print("-----------------------")

    # Execute without a shell so arguments are never re-interpreted.
    completed = subprocess.run(command)
    if completed.returncode != 0:
        print(f"Training command failed with exit code {completed.returncode}.")
    return completed.returncode

In [None]:
import os

# Keep the Hugging Face datasets cache and the wandb run files inside the
# current working directory. Both locations are exported as environment
# variables, which child processes started from this kernel inherit.
project_cache_path = os.path.join(os.getcwd(), '.cache', 'huggingface_datasets')
wandb_path = os.path.join(os.getcwd(), 'wandb_local_runs')

# Hugging Face `datasets` cache location
os.makedirs(project_cache_path, exist_ok=True)
os.environ['HF_DATASETS_CACHE'] = project_cache_path
print(f"Hugging Face datasets cache is now set to: {os.environ['HF_DATASETS_CACHE']}")

# wandb local run directory
os.makedirs(wandb_path, exist_ok=True)
os.environ['WANDB_DIR'] = wandb_path
print(f"Wandb local run directory set to: {os.environ['WANDB_DIR']}")

In [None]:
# !rm -rf generalist_model/
# Create the model output directory. Unlike `!mkdir generalist_model/`, this
# does not error when the directory already exists, so the cell can be re-run.
os.makedirs('generalist_model', exist_ok=True)

In [None]:

# Start training: one epoch, writing to ./generalist_model with a learning
# rate of 5e-05 (these match the function's own defaults).
train_model_script(num_epochs=1,
                   output_dir='generalist_model',
                   lr=5e-05)
