In [49]:
import os

import torch
import tokenizers
import transformers
import tqdm
import matplotlib.pyplot as plt
import numpy as np
import datasets

In [50]:
def is_strongly_connected(adj):
    """Return True if the directed graph described by ``adj`` is strongly connected.

    Args:
        adj: square (n, n) adjacency tensor. A non-zero entry ``adj[i, j]``
            means there is an edge i -> j. Bool and integer tensors both work.

    Returns:
        bool: True when every node can reach every other node. Every node is
        considered to reach itself, so a single-node graph is strongly
        connected even without a self-loop. For graphs with two or more nodes
        this gives the same answer as requiring a cycle through each node.
    """
    n = adj.size(-1)
    # Work on a boolean reachability matrix so that edge weights or
    # multiplicities cannot leak into the bitwise OR below.
    reach = adj.to(torch.int64) != 0
    # Reflexive closure: each node trivially reaches itself.
    reach = reach | torch.eye(n, dtype=torch.int64, device=adj.device).bool()

    while True:
        # Squaring the reachability matrix doubles the walk length covered,
        # so the fixed point (the transitive closure) is hit in O(log n) rounds.
        # The product is done in int64 rather than on bool tensors.
        hops = reach.to(torch.int64)
        expanded = reach | ((hops @ hops) > 0)
        if torch.equal(expanded, reach):
            break
        reach = expanded

    return bool(reach.all())

def edge_mat_to_adj(edges):
    """Convert an (n, k) edge matrix into an (n, n) boolean adjacency matrix.

    ``edges[i, a]`` is the destination node reached from node ``i`` via slot
    ``a``. The sentinel value ``-1`` marks a missing edge and is skipped.

    Returns:
        torch.BoolTensor of shape (n, n) with ``out[i, j]`` True iff some
        non-sentinel entry of row ``i`` equals ``j``.
    """
    num_nodes = edges.size(0)
    adjacency = torch.zeros(num_nodes, num_nodes, dtype=torch.bool)

    # (source node, slot) coordinates of every entry that is a real edge.
    src, slot = torch.nonzero(edges != -1, as_tuple=True)

    # Mark source -> destination for each of those entries.
    adjacency[src, edges[src, slot]] = True

    return adjacency

In [51]:
#hyper parameters
n_nodes=5
n_actions=5
n_ablate="0.2"  # str -> fraction of all edges to ablate; number -> absolute edge count
seed=0
#
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
n_edges=n_nodes*n_actions
if isinstance(n_ablate, str):
    n_ablate = int(float(n_ablate)*n_edges)
print("n_ablate", n_ablate)
# Rejection sampling: draw a random graph plus a random ablation until both the
# full graph and the ablated graph are strongly connected.
while True:
    # edge_mat[i, a] is the node reached from node i by taking action a.
    edge_mat=torch.randint(0,n_nodes,(n_nodes, n_actions),dtype=torch.int64)
    adj=edge_mat_to_adj(edge_mat)
    # Give every edge a random score and ablate the n_ablate lowest-scoring ones.
    scores=torch.rand(n_nodes,n_actions)
    ablated=torch.argsort(scores.flatten())[:int(n_ablate)]
    # mask is True for edges that are kept and False for ablated edges.
    mask=torch.ones(n_edges,dtype=torch.bool)
    mask[ablated]=False
    mask=mask.view(n_nodes,n_actions)
    edge_mat_masked=edge_mat.clone()
    edge_mat_masked[~mask]=-1  # -1 is the "no edge" sentinel
    adj_masked=edge_mat_to_adj(edge_mat_masked)
    if is_strongly_connected(adj) and is_strongly_connected(adj_masked):
        # Every node must keep at least one usable action. Compare against the
        # -1 sentinel explicitly: destination id 0 is a valid edge and must not
        # be mistaken for "no edge", while -1 must not count as an edge.
        assert (edge_mat != -1).any(1).all(), "edge mat has no edges"
        assert (edge_mat_masked != -1).any(1).all(), "masked edge mat has no edges"
        break

def count_paths(adj, max_length=10):
    """Sum the entries of ``adj**1 + adj**2 + ... + adj**max_length``.

    Entry (i, j) of the L-th power of the adjacency matrix is the number of
    walks with exactly L edges from i to j, so the returned total counts all
    walks of length 1..max_length (nodes and edges may repeat).

    Args:
        adj: adjacency tensor; cast to int64 before multiplying.
        max_length: largest walk length to include.

    Returns:
        int: the total count (0 when ``max_length`` is 0).
    """
    step = adj.to(torch.int64)
    walks = step.clone()  # walks == step ** current_length
    running_total = 0
    for _ in range(max_length):
        running_total += walks.sum().item()
        walks = walks @ step
    return running_total

# Example usage:
# NOTE(review): count_paths sums the entries of successive powers of the
# adjacency matrix, so the printed number counts walks (nodes and edges may
# repeat), not distinct simple paths as the label suggests.
total = count_paths(adj, max_length=10)
print("Total unique paths of length 1 to 10:", total)

n_ablate 5
Total unique paths of length 1 to 10: 2547335


In [52]:
#generate a problem
n_problems=262144
data={}
edge_mat_full=edge_mat.clone()
# "train" walks may only follow edges that survived ablation. "test_rl" and
# "test" are two separately sampled sets of walks on the full graph.
for key,edge_mat_used in zip(["train","test_rl","test"],[edge_mat_masked,edge_mat,edge_mat]):
    edge_mat_used=edge_mat_used.clone()
    i_starts=torch.randint(0,n_nodes,(n_problems,),dtype=torch.int64)
    # torch.randint's upper bound is exclusive, so walk lengths are 1..9.
    lengths=torch.randint(1,10,(n_problems,),dtype=torch.int64)
    paths=[]
    actionss=[]
    prompts=[]
    completions=[]
    num_maskeds=[]
    for i in tqdm.trange(n_problems):
        i_curr=i_starts[i].item()
        # Plain int so range() and the length check below do not run on 0-dim tensors.
        length=lengths[i].item()
        path=[i_curr]
        actions=[]
        for _ in range(length):
            # Actions whose edge exists (-1 marks an ablated edge).
            avail_actions=torch.where(edge_mat_used[i_curr]!= -1)[0]
            #randomly select an action
            i_action=torch.randint(0,len(avail_actions),(1,),dtype=torch.int64)
            i_action=avail_actions[i_action].item()
            #take the action
            i_next=edge_mat_used[i_curr,i_action].item()
            path.append(i_next)
            actions.append(i_action)
            i_curr=i_next
        # Count how many steps of this walk use an edge that is ablated in the
        # training graph (always 0 for the "train" split).
        action_dests=edge_mat_masked[torch.tensor(path[:-1]),torch.tensor(actions)]
        num_masked=torch.sum(action_dests == -1).item()
        assert len(path) == length+1 and len(actions) == length, "path and actions have different lengths"
        paths.append(path)
        actionss.append(actions)
        # Prompt: start state followed by the action sequence, e.g. "S0 a1 a3 : ".
        prompt=f"S{path[0]} "+"".join(f"a{action} " for action in actions)+": "
        # Completion: every visited state including the start, e.g. "S0 S2 S4 ".
        completion="".join(f"S{state} " for state in path)
        prompts.append(prompt)
        completions.append(completion)
        num_maskeds.append(num_masked)
    data[key] = {
        'paths': paths,
        'actionss': actionss,
        'prompts': prompts,
        'completions': completions,
        'num_maskeds': num_maskeds,
        'edge_mat': edge_mat_used.tolist(),
    }

100%|██████████| 262144/262144 [00:28<00:00, 9354.11it/s]
100%|██████████| 262144/262144 [00:28<00:00, 9150.86it/s]
100%|██████████| 262144/262144 [00:29<00:00, 8860.60it/s]


In [53]:
# Persist the generated splits. The output directory is created first so the
# save also works on a fresh checkout where ./data/raw_data does not exist yet.
raw_data_path = f"./data/raw_data/data-nn_{n_nodes}-na_{n_actions}-nab_{n_ablate}-seed_{seed}.pt"
os.makedirs(os.path.dirname(raw_data_path), exist_ok=True)
torch.save(data, raw_data_path)

In [54]:
# Reload the raw splits from disk and wrap each split in a Hugging Face Dataset.
# NOTE: weights_only=False lets torch.load unpickle arbitrary objects; only use
# it on files generated by this notebook.
data=torch.load(f"./data/raw_data/data-nn_{n_nodes}-na_{n_actions}-nab_{n_ablate}-seed_{seed}.pt",weights_only=False)
dataset_data={}
for key,split in data.items():
    # Keep only the text-level columns; paths/actions/edge_mat stay in the raw file.
    columns={name:split[name] for name in ("prompts","completions","num_maskeds")}
    # Full training text = prompt immediately followed by its completion.
    columns['texts']=[
        prompt+completion
        for prompt,completion in zip(columns["prompts"],columns["completions"])
    ]
    dataset_data[key]=datasets.Dataset.from_dict(columns)
dataset=datasets.DatasetDict(dataset_data)
dataset

DatasetDict({
    train: Dataset({
        features: ['prompts', 'completions', 'num_maskeds', 'texts'],
        num_rows: 262144
    })
    test_rl: Dataset({
        features: ['prompts', 'completions', 'num_maskeds', 'texts'],
        num_rows: 262144
    })
    test: Dataset({
        features: ['prompts', 'completions', 'num_maskeds', 'texts'],
        num_rows: 262144
    })
})

In [55]:
# Hub repository id for the dataset; it encodes the graph hyper parameters
# (nodes, actions, ablated edge count) and the seed used above.
dataset_name=f"cfpark00/toy-multistep-nn_{n_nodes}-na_{n_actions}-nab_{n_ablate}-seed_{seed}"

In [56]:
# Upload every split of the DatasetDict to the Hub repository named above.
# NOTE(review): presumably requires being logged in with write access to that
# namespace — confirm before re-running.
dataset.push_to_hub(dataset_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/263 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/263 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/263 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/cfpark00/toy-multistep-nn_5-na_5-nab_5-seed_0/commit/b281b7758d7dc79e23498c9d3f52d072eb06eeee', commit_message='Upload dataset', commit_description='', oid='b281b7758d7dc79e23498c9d3f52d072eb06eeee', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/cfpark00/toy-multistep-nn_5-na_5-nab_5-seed_0', endpoint='https://huggingface.co', repo_type='dataset', repo_id='cfpark00/toy-multistep-nn_5-na_5-nab_5-seed_0'), pr_revision=None, pr_num=None)

In [1]:
import torch
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast
import transformers
from transformers import Qwen2Config
import glob

In [7]:
# Patch the tokenizer of every saved SFT checkpoint so that its EOS token is the
# pad token, then write the tokenizer back into the same folder it was loaded from.
for sft_fol in glob.glob("./data/sft/*/final_model"):
    print("sft_fol", sft_fol)
    tokenizer = transformers.AutoTokenizer.from_pretrained(sft_fol)
    # Set both the token string and its id from the pad token.
    tokenizer.eos_token = tokenizer.pad_token
    tokenizer.eos_token_id = tokenizer.pad_token_id
    tokenizer.save_pretrained(sft_fol)

sft_fol ./data/sft/toy-multistep-nn_10-na_10-nab_20-seed_0/final_model
sft_fol ./data/sft/toy-multistep-nn_10-na_20-nab_60-seed_1/final_model
sft_fol ./data/sft/toy-multistep-nn_20-na_20-nab_120-seed_1/final_model
sft_fol ./data/sft/toy-multistep-nn_10-na_5-nab_15-seed_1/final_model
sft_fol ./data/sft/toy-multistep-nn_10-na_5-nab_15-seed_2/final_model
sft_fol ./data/sft/toy-multistep-nn_20-na_10-nab_60-seed_0/final_model
sft_fol ./data/sft/toy-multistep-nn_5-na_5-nab_2-seed_2/final_model
sft_fol ./data/sft/toy-multistep-nn_20-na_20-nab_40-seed_1/final_model
sft_fol ./data/sft/toy-multistep-nn_5-na_5-nab_2-seed_1/final_model
sft_fol ./data/sft/toy-multistep-nn_10-na_10-nab_20-seed_1/final_model
sft_fol ./data/sft/toy-multistep-nn_20-na_20-nab_80-seed_2/final_model
sft_fol ./data/sft/toy-multistep-nn_20-na_20-nab_80-seed_1/final_model
sft_fol ./data/sft/toy-multistep-nn_20-na_10-nab_40-seed_0/final_model
sft_fol ./data/sft/toy-multistep-nn_5-na_20-nab_20-seed_2/final_model
sft_fol ./data

In [9]:
# Load the tokenizer of the first checkpoint folder that glob returns and stop;
# used only to inspect the patched tokenizer in the next cell.
for sft_fol in glob.glob("./data/sft/*/final_model"):
    print("sft_fol", sft_fol)
    tokenizer = transformers.AutoTokenizer.from_pretrained(sft_fol)
    break

sft_fol ./data/sft/toy-multistep-nn_10-na_10-nab_20-seed_0/final_model


In [11]:
# Display the EOS token of the tokenizer loaded in the previous cell.
tokenizer.eos_token

'<pad>'

In [None]:


if __name__ == "__main__":
    # One tokenizer/model pair is published for every (n_nodes, n_actions) combination.
    argss=[
        {"n_nodes":n_nodes,"n_actions":n_actions}
        for n_nodes in [5,10,20]
        for n_actions in [5,10,20]
    ]

    for args in argss:
        n_nodes=args["n_nodes"]
        n_actions=args["n_actions"]

        # Token ids follow list order: the three angle-bracket tokens, the ":"
        # separator, then one token per state (S*) and one per action (a*).
        tokens=["<unk>","<bos>","<pad>",":"]
        tokens+=[f"S{i}" for i in range(n_nodes)]
        tokens+=[f"a{i}" for i in range(n_actions)]
        vocab={token:token_id for token_id,token in enumerate(tokens)}

        model_name=f"cfpark00/toy-multistep-nn_{n_nodes}-na_{n_actions}"

        # Word-level tokenizer with the Whitespace pre-tokenizer.
        word_level=WordLevel(vocab=vocab, unk_token="<unk>")
        raw_tokenizer=Tokenizer(word_level)
        raw_tokenizer.pre_tokenizer = Whitespace()
        # "<pad>" is registered as both the pad and the EOS token.
        # NOTE(review): "<bos>" and "<unk>" are in the vocab but are not passed
        # as special tokens here — confirm that is intended.
        tokenizer=PreTrainedTokenizerFast(
            tokenizer_object=raw_tokenizer,
            pad_token="<pad>",
            eos_token="<pad>",
        )
        tokenizer.push_to_hub(model_name)

        # Small Qwen2-architecture model built from the config (not loaded from a
        # checkpoint); the embedding size matches the tokenizer vocabulary.
        model_config=Qwen2Config(
            vocab_size=len(tokenizer.vocab),
            hidden_size=512,
            intermediate_size=2048,
            num_hidden_layers=4,
            num_attention_heads=4,
            num_key_value_heads=4,
        )
        model=transformers.AutoModelForCausalLM.from_config(model_config)
        model.push_to_hub(model_name)