In [21]:
import os
import pickle
from contextlib import nullcontext
import torch
import numpy as np
import tiktoken
from model import GPTConfig, GPT
from torch.nn import functional as F

import matplotlib.pyplot as plt

In [4]:
init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
out_dir = 'out-rare_history' # ignored if init_from is not 'resume'
bias = 0.5
num_samples = 1000 # number of samples to draw
max_new_tokens = 100 # number of tokens generated in each sample
temperature = 1 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
compile_bool = False # use PyTorch 2.0 to compile the model to be faster

In [5]:
print(out_dir, bias)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

out-rare_history 0.5


In [6]:
ckpt_path = os.path.join(out_dir, 'ckpt.pt')
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = GPTConfig(**checkpoint['model_args'])
model = GPT(gptconf)
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
model.load_state_dict(state_dict)

number of parameters: 0.84M
number of parameters: 841536.00


<All keys matched successfully>

In [7]:
model.eval()
model.to(device)

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(3, 108)
    (wpe): Embedding(3, 108)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=108, out_features=324, bias=False)
          (c_proj): Linear(in_features=108, out_features=108, bias=False)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=108, out_features=432, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=432, out_features=108, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=108, out_features=3, bias=False)
)

In [15]:
load_meta = False
if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these...
    meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
    load_meta = os.path.exists(meta_path)
print(load_meta)

True


In [16]:
print(f"Loading meta from {meta_path}...")
with open(meta_path, 'rb') as f:
    meta = pickle.load(f)
# TODO want to make this more general to arbitrary encoder/decoder schemes
stoi, itos = meta['stoi'], meta['itos']
print(stoi, itos)
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])
markov = {}
size = 5
rng = np.random.default_rng(42)
for s in range(1, size+1):
    # print(s)
    for i in range(2**s):
        curr_seed = format(i, f'0{s}b')
        # print(i, curr_seed)
        bias = rng.choice([0.25, 0.5, 0.75])
        markov[curr_seed] = bias
markov[''] = 0.5
print(markov)

Loading meta from data/rare_history/meta.pkl...
{'\n': 0, '0': 1, '1': 2} {0: '\n', 1: '0', 2: '1'}
{'0': 0.25, '1': 0.75, '00': 0.5, '01': 0.5, '10': 0.5, '11': 0.75, '000': 0.25, '001': 0.75, '010': 0.25, '011': 0.25, '100': 0.5, '101': 0.75, '110': 0.75, '111': 0.75, '0000': 0.75, '0001': 0.75, '0010': 0.5, '0011': 0.25, '0100': 0.75, '0101': 0.5, '0110': 0.5, '0111': 0.5, '1000': 0.25, '1001': 0.75, '1010': 0.75, '1011': 0.5, '1100': 0.5, '1101': 0.75, '1110': 0.5, '1111': 0.5, '00000': 0.5, '00001': 0.25, '00010': 0.25, '00011': 0.5, '00100': 0.75, '00101': 0.25, '00110': 0.75, '00111': 0.75, '01000': 0.25, '01001': 0.5, '01010': 0.25, '01011': 0.75, '01100': 0.75, '01101': 0.5, '01110': 0.25, '01111': 0.75, '10000': 0.5, '10001': 0.75, '10010': 0.75, '10011': 0.75, '10100': 0.75, '10101': 0.25, '10110': 0.5, '10111': 0.5, '11000': 0.5, '11001': 0.25, '11010': 0.5, '11011': 0.25, '11100': 0.75, '11101': 0.75, '11110': 0.75, '11111': 0.75, '': 0.5}


In [17]:
start = "\n"
start_ids = encode(start)
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

In [18]:
# true_prob_array = []
# model_prob_array = []
prob_diff_array = []
with torch.no_grad():
    with ctx:
        for k in range(1):
            print(k)
            while True:
                y = model.generate_probs(x, max_new_tokens, temperature=temperature, top_k=None)
                final_out = decode(y[0][0].tolist())
                print(final_out)
                invalid = False
                for i in range(1, len(final_out)):
                    c = final_out[i]
                    if c not in '01':
                        print(f"{c}, {i}")
                        invalid = True
                        break
                if invalid:
                    continue
                sample = y[0]
                probs = y[1]
                # true_probs = []
                # model_probs = []
                prob_diffs = []
                for i in range(len(probs)):
                    context = final_out[1:i+1][-size:]
                    zero_p = probs[i][1] / (probs[i][1] + probs[i][2])
                    # true_probs.append(markov[context])
                    # model_probs.append(zero_p)
                    print(zero_p, markov[context])
                    prob_diffs.append(zero_p - markov[context])
                i = np.argmax(np.abs(prob_diffs))
                break
                # print(final_out)
                # print('---------------')
            # true_prob_array.append(true_probs)
            # model_prob_array.append(model_probs)
            prob_diff_array.append(prob_diffs)
# true_prob_array = np.array(true_prob_array)
# model_prob_array = np.array(model_prob_array)
prob_diff_array = np.array(prob_diff_array)
print(np.argmax(prob_diff_array))


0

1010101101101100100100001101011011011001010110100000010101101100111001011011000011000001010011100101
0.50732374 0.5
0.75527614 0.75
0.50585914 0.5
0.5058591 0.75
0.5053709 0.75
0.5058591 0.25
0.5053709 0.25
0.5058591 0.25
0.755637 0.75
0.50488263 0.5
0.5058591 0.5
0.755637 0.25
0.50488263 0.5
0.5058591 0.5
0.755637 0.25
0.50488263 0.5
0.5068355 0.75
0.50634736 0.25
0.5053709 0.75
0.5068355 0.75
0.50634736 0.5
0.5053709 0.75
0.5068355 0.75
0.5073237 0.25
0.5073237 0.5
0.50634736 0.25
0.755637 0.5
0.50488263 0.75
0.5058591 0.5
0.5053709 0.5
0.5058591 0.25
0.755637 0.75
0.50488263 0.5
0.5058591 0.5
0.755637 0.25
0.50488263 0.5
0.5058591 0.5
0.755637 0.25
0.50488263 0.5
0.5068355 0.75
0.50634736 0.25
0.5053709 0.75
0.5058591 0.25
0.5053709 0.25
0.5058591 0.25
0.755637 0.75
0.50488263 0.5
0.5058591 0.5
0.5053709 0.5
0.5068355 0.75
0.5073237 0.25
0.5073237 0.5
0.5073237 0.5
0.5073237 0.5
0.50634736 0.25
0.5053709 0.25
0.5058591 0.25
0.5053709 0.25
0.5058591 0.25
0.755637 0.75
0.50488263 0

In [28]:
markov = {'0': 0.25, '1': 0.75, '00': 0.5, '01': 0.5, '10': 0.5, '11': 0.75, '000': 0.25, '001': 0.75, '010': 0.25, '011': 0.25, '100': 0.5, '101': 0.75, '110': 0.75, '111': 0.75}
k = markov.keys()
for c in k:
    start_ids = encode(c)
    x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
    logits, _ = model(x)
    logits = logits[:, -1, :]
    probs = F.softmax(logits, dim=-1)
    print(c, (probs[0][1]/(probs[0][1] + probs[0][2])).item(), markov[c])

0 0.5065558552742004 0.25
1 0.5182334184646606 0.75
00 0.24503326416015625 0.5
01 0.7552562355995178 0.5
10 0.2450280636548996 0.5
11 0.7550386786460876 0.75
000 0.5074297785758972 0.25
001 0.5066708326339722 0.75
010 0.5055210590362549 0.25
011 0.7554191946983337 0.25
100 0.5072233080863953 0.5
101 0.5060727596282959 0.75
110 0.5054183602333069 0.75
111 0.7552872896194458 0.75


In [37]:
f = open('data/rare_history/data.txt')
s = ''.join(f.readlines())
for c in k:
    if len(c) == 3:
        print(f"oCCURENCES OF {c}: {s.count(c)}")
        print(f"PERCENT OF 0 after: {s.count(c + '0') / (s.count(c + '0') + s.count(c + '1'))}")
        print(markov[c])

oCCURENCES OF 000: 601587
PERCENT OF 0 after: 0.20098389554308985
oCCURENCES OF 001: 1176250
PERCENT OF 0 after: 0.7330084768665734
oCCURENCES OF 010: 1507572
PERCENT OF 0 after: 0.3217461757528418
oCCURENCES OF 011: 792536
PERCENT OF 0 after: 0.24727664524153498
oCCURENCES OF 100: 1183421
PERCENT OF 0 after: 0.5221423462630254
oCCURENCES OF 101: 1256676
PERCENT OF 0 after: 0.6623683449495549
oCCURENCES OF 110: 788965
PERCENT OF 0 after: 0.7530659173844361
oCCURENCES OF 111: 600402
PERCENT OF 0 after: 0.7986375130231685
