In [1]:
continuous_series_names = [
                           'brownian_motion', 
                           # 'geometric_brownian_motion',
                           # 'noisy_logistic_map',
                           # 'logistic_map',
                           # 'lorenz_system',
                           # 'uncorrelated_gaussian',
                           # 'uncorrelated_uniform'
                           ]
markov_chain_names = ['markov_chain']

import numpy as np

### Set up directory
import sys
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3,4,5"
from pathlib import Path
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

from tqdm import tqdm
import pickle
import torch
from llama import get_model_and_tokenizer
from ICL import MultiResolutionPDF, recursive_refiner, trim_kv_cache, recursive_refiner_preprompt, recursive_refiner_llama

# Check if directory exists, if not create it
save_path = Path(parent_dir) / 'processed_series_v2'
if not os.path.exists(save_path):
    os.makedirs(save_path)
    
# Define the directory where the generated series are stored
generated_series_dir = Path(parent_dir) / 'generated_series'

In [2]:
def calculate_Markov(full_series, llama_size = '13b'):
    '''
     This function calculates the multi-resolution probability density function (PDF) for a given series.

     Parameters:
     full_series (str): The series for which the PDF is to be calculated.
     llama_size (str, optional): The size of the llama model. Defaults to '13b'.

     Returns:

    '''
    model, tokenizer = get_model_and_tokenizer(llama_size)
    states = sorted(set(full_series))
    good_tokens = [tokenizer.convert_tokens_to_ids(state) for state in states]
    batch = tokenizer(
        [full_series], 
        return_tensors="pt",
        add_special_tokens=True,        
    )
    torch.cuda.empty_cache()
    with torch.no_grad():
        out = model(batch['input_ids'].cpu())
    logit_mat = out['logits']
    logit_mat_good = logit_mat[:,:,good_tokens].cpu()

    return logit_mat_good

def calculate_multiPDF(
    full_series, prec, mode = 'neighbor', refine_depth = 1, llama_size = '13b', temperature=1.0
):
    '''
     This function calculates the multi-resolution probability density function (PDF) for a given series.

     Parameters:
     full_series (str): The series for which the PDF is to be calculated.
     prec (int): The precision of the PDF.
     mode (str, optional): The mode of calculation. Defaults to 'neighbor'.
     refine_depth (int, optional): The depth of refinement for the PDF. Defaults to 1.
     llama_size (str, optional): The size of the llama model. Defaults to '13b'.

     Returns:
     list: A list of PDFs for the series.
    '''
    # if llama_size != '13b':
    #     assert False, "Llama size must be '13b'"
    good_tokens_str = list("0123456789")
    print(f"good_tokens_str: {good_tokens_str}")
    good_tokens = [tokenizer.convert_tokens_to_ids(token) for token in good_tokens_str]
    print(f"good_tokens: {good_tokens}")
    assert refine_depth < prec, "Refine depth must be less than precision"
    refine_depth = refine_depth - prec
    curr = -prec
    batch = tokenizer(
        [full_series], 
        return_tensors="pt",
        add_special_tokens=True        
    )
    print(f"batch['input_ids']: shape | {batch['input_ids'].shape}, sample | {batch['input_ids'][0,:10]}")
    torch.cuda.empty_cache()
    with torch.no_grad():
        out = model(batch['input_ids'].cuda(), use_cache=True)
    print(f"out: {list(out.keys())}")
    logit_mat = out['logits']
    print(f"logit_mat: shape | {logit_mat.shape}, sample | {logit_mat[:10]}")
    kv_cache_main = out['past_key_values']
    logit_mat_good = logit_mat[:,:,good_tokens].clone()
    print(f"logit_mat_good: shape | {logit_mat_good.shape}, sample | {logit_mat_good[:10]}")
    probs = torch.nn.functional.softmax(logit_mat_good[:,1:,:], dim=-1)
    
    PDF_list = []
    comma_locations = np.sort(np.where(np.array(list(full_series)) == ',')[0])
    
    print(f"len coma locations: {comma_locations.shape} | sample: {comma_locations[:10]}")
    print(f"probs: {probs.shape}, type: {type(probs)}")
    # start_loop_from = 1 if use_instruct else 0
    for i in tqdm(range(len(comma_locations))):
        PDF = MultiResolutionPDF()
        # slice out the number before ith comma
        if i == 0:
            start_idx = 0
        else:
            start_idx = comma_locations[i-1]+1
        end_idx = comma_locations[i]
        # print(f"start_idx:end_idx {start_idx}:{end_idx}")
        # if end_idx <= probs.shape[1]:
        num_slice = full_series[start_idx:end_idx]
        prob_slice = probs[0,start_idx:end_idx].cpu().numpy()
        ### Load hierarchical PDF 
        print(f"prob_slice: {prob_slice.shape}, type: {type(prob_slice)}, sample: {prob_slice[:10]}")
        print(f"num_slice: {num_slice}, type: {type(num_slice)}")
        PDF.load_from_num_prob(num_slice, prob_slice)

        # raise ValueError('test')
        
        ### Refine hierarchical PDF
        seq = full_series[:end_idx]
        # cache and full_series are shifted from beginning, not end
        end_idx_neg = end_idx - len(full_series)
        ### kv cache contains seq[0:-1]
        kv_cache = trim_kv_cache(kv_cache_main, end_idx_neg-1)
        recursive_refiner_llama(
            PDF, seq, curr=curr, main=True, refine_depth=refine_depth, mode=mode, 
            kv_cache=kv_cache, model=model, tokenizer=tokenizer, good_tokens=good_tokens,
            temperature=temperature
        )

        PDF_list += [PDF]

        if i==10:
            print(f"start_idx: {start_idx}")
            print(f"end_idx: {end_idx}")
            print(f"num_slice: {num_slice}")
            print(f"prob_slice: {prob_slice}")
            print(f"PDF_list: shape | {len(PDF_list)}, sample | {PDF_list[:10]}")
    
    # release memory
    del logit_mat, kv_cache_main
    return PDF_list

In [3]:
model, tokenizer = get_model_and_tokenizer('7b')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
tokenizer('999')['input_ids']

[128000, 5500]

In [4]:
# Initialize dictionaries to store the data for continuous series and Markov chains
continuous_series_task = {}
markov_chain_task = {}

# Loop through each file in the directory
for file in generated_series_dir.iterdir():
    # Check if a series is already processed
    # if not (save_path / file.name).exists():\
    # Extract the series name from the file name
    series_name = file.stem.rsplit('_', 1)[0]
    # If the series is a continuous series, load the data into the continuous_series_data dictionary
    if series_name in continuous_series_names:
        continuous_series_task[file.name] = pickle.load(file.open('rb'))
    # If the series is a Markov chain, load the data into the markov_chain_data dictionary
    elif series_name in markov_chain_names:
        markov_chain_task[file.name] = pickle.load(file.open('rb'))
    # If the series name is not recognized, raise an exception
    # else:
    #     raise Exception(f"Unrecognized series name: {series_name}")

In [5]:
print(continuous_series_task.keys())
print(markov_chain_task.keys())

dict_keys(['brownian_motion_4.pkl', 'brownian_motion_14.pkl', 'brownian_motion_13.pkl', 'brownian_motion_17.pkl', 'brownian_motion_7.pkl', 'brownian_motion_11.pkl', 'brownian_motion_10.pkl', 'brownian_motion_6.pkl', 'brownian_motion_2.pkl', 'brownian_motion_5.pkl', 'brownian_motion_9.pkl', 'brownian_motion_16.pkl', 'brownian_motion_15.pkl', 'brownian_motion_3.pkl', 'brownian_motion_1.pkl', 'brownian_motion_18.pkl', 'brownian_motion_8.pkl', 'brownian_motion_19.pkl', 'brownian_motion_0.pkl', 'brownian_motion_12.pkl'])
dict_keys(['markov_chain_8.pkl', 'markov_chain_7.pkl', 'markov_chain_4.pkl', 'markov_chain_3.pkl', 'markov_chain_10.pkl', 'markov_chain_0.pkl', 'markov_chain_14.pkl', 'markov_chain_5.pkl', 'markov_chain_11.pkl', 'markov_chain_6.pkl', 'markov_chain_12.pkl', 'markov_chain_13.pkl', 'markov_chain_9.pkl', 'markov_chain_17.pkl', 'markov_chain_15.pkl', 'markov_chain_1.pkl', 'markov_chain_16.pkl', 'markov_chain_2.pkl'])


### Analyze Multi Digit series

In [6]:
tokenizer(' 2 4 1')['input_ids']

[128000, 220, 17, 220, 19, 220, 16]

In [9]:
number_of_tokens_original = None
for series_name, series_dict in sorted(continuous_series_task.items()):
    print("Processing ", series_name)
    if 'brownian_motion' in series_name:
        full_series = series_dict['full_series']
        # add spaces in the series
        # print(type(full_series))
        # str_fill = [' ' + str(char) if str(char) != ',' else str(char) for char in full_series]
        # full_series = ""
        # for char in str_fill:
        #     full_series += char
        print(f"full_series: {full_series[:10]}")
        prec = series_dict['prec']
        refine_depth = series_dict['refine_depth']
        llama_size = series_dict['llama_size']
        mode = series_dict['mode']
        number_of_tokens_original = len(tokenizer(full_series)['input_ids'])
        print(f"number_of_tokens_original: {number_of_tokens_original}")
        print(f"comma token: {tokenizer(',')['input_ids']}")
        PDF_list = calculate_multiPDF(
            full_series, prec, mode = mode, refine_depth = refine_depth, llama_size = llama_size, temperature=2.5
        )
        series_dict['PDF_list'] = PDF_list
        save_name = os.path.join(save_path, f"{series_name.split('.')[0]}_llama3_temperature.pkl")
        # save_name = os.path.join(save_path, series_name)
        with open(save_name, 'wb') as f:
            pickle.dump(series_dict, f)

Processing  brownian_motion_0.pkl
full_series: 214,223,21
number_of_tokens_original: 2001
comma token: [128000, 11]
good_tokens_str: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
good_tokens: [15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
batch['input_ids']: shape | torch.Size([1, 2001]), sample | tensor([128000,  11584,     11,  12533,     11,  13460,     11,  15666,     11,
         15966])
out: ['logits', 'past_key_values']
logit_mat: shape | torch.Size([1, 2001, 128256]), sample | tensor([[[  6.8789,   8.8047,  12.9609,  ...,  -4.4453,  -4.4453,  -4.4453],
         [  2.9844,   2.9414,   2.4766,  ..., -10.4453, -10.4453, -10.4453],
         [ -4.3281,  -8.3203,  -3.3828,  ...,  -9.1172,  -9.1172,  -9.1172],
         ...,
         [  7.8242,   7.2891,   8.2188,  ...,  -5.4727,  -5.4727,  -5.4727],
         [  6.1836,   8.6875,   7.1055,  ...,  -3.5059,  -3.5059,  -3.5059],
         [  8.0547,   7.3789,   8.3359,  ...,  -5.5000,  -5.5000,  -5.5000]]],
       device='cuda:0')
logit_mat

  0%|                                                                                                                  | 0/1000 [00:00<?, ?it/s]

prob_slice: (3, 10), type: <class 'numpy.ndarray'>, sample: [[0.15000482 0.12931223 0.09241529 0.08414508 0.1022944  0.11959427
  0.071413   0.08284052 0.09314012 0.07484019]
 [0.06728137 0.0916042  0.12278635 0.07865989 0.08505161 0.21803695
  0.08340657 0.08472003 0.08472003 0.08373301]
 [0.07188363 0.20239165 0.07956792 0.08145482 0.0735883  0.08773029
  0.06994469 0.12420376 0.08503111 0.12420376]]
num_slice: 214, type: <class 'str'>





ValueError: test

### Analyze Markov Series

In [22]:
for series_name, series_dict in sorted(markov_chain_task.items()):
    print("Processing ", series_name)
    full_series = series_dict['full_series']
    llama_size = series_dict['llama_size']
    logit_mat_good = calculate_Markov(full_series, llama_size = llama_size)    
    series_dict['logit_mat_good'] = logit_mat_good
    save_name = os.path.join(save_path, series_name)
    with open(save_name, 'wb') as f:
        pickle.dump(series_dict, f)
    break

Processing  markov_chain_0.pkl


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.89s/it]
