In [3]:
%load_ext autoreload
%autoreload 2


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

torch.random.manual_seed(0)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct", 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True, 
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")

messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])


  from .autonotebook import tqdm as notebook_tqdm
2025-02-12 11:31:35.253678: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739359895.262331   16123 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739359895.266076   16123 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
import json

# Load JSON data from a file
with open('dataset_updating.json', 'r') as file:
    data = json.load(file)

# Print the loaded data
print(data)
data[0]

[{'Prime Minister': [{'timestamp': 'T1', 'attributes': {'Name': 'John Doe', 'Term Start': '2010', 'Popularity': '90%'}, 'qa': {'question': 'Who is the prime minister?', 'answer': 'John Doe'}}, {'timestamp': 'T2', 'attributes': {'Popularity': '88%'}, 'qa': {'question': 'Who is the prime minister and what is their popularity?', 'answer': 'John Doe with a popularity of 88%.'}}]}, {'Capital of Spain': [{'timestamp': 'T1', 'attributes': {'Capital': 'Lisbon', 'Landmark': 'Eiffel Tower'}, 'qa': {'question': 'What is the capital of Spain?', 'answer': 'Lisbon'}}, {'timestamp': 'T2', 'attributes': {'Landmark': 'Colosseum'}, 'qa': {'question': 'What is the capital of Spain and name a famous landmark?', 'answer': 'The capital is Lisbon and a famous landmark is the Colosseum.'}}]}, {'CEO of Apple': [{'timestamp': 'T1', 'attributes': {'Name': 'Alice Smith', 'Headquarters': 'Paris'}, 'qa': {'question': 'Who is the CEO of Apple?', 'answer': 'Alice Smith'}}, {'timestamp': 'T2', 'attributes': {'Headquar

{'Prime Minister': [{'timestamp': 'T1',
   'attributes': {'Name': 'John Doe',
    'Term Start': '2010',
    'Popularity': '90%'},
   'qa': {'question': 'Who is the prime minister?', 'answer': 'John Doe'}},
  {'timestamp': 'T2',
   'attributes': {'Popularity': '88%'},
   'qa': {'question': 'Who is the prime minister and what is their popularity?',
    'answer': 'John Doe with a popularity of 88%.'}}]}

In [19]:
def transform_dataset(data):
    # List 
    new_data = []
    # Loop through the data
    for item in data:
        # Dictionary
        new_item = {}
        instruction = "Save the data about the following topic, " + list(item.keys())[0] + ": "
        timesteps = [str(val['attributes']) for val in item[list(item.keys())[0]]]
        questions = [val['qa']['question'] for val in item[list(item.keys())[0]]]
        answers = [val['qa']['answer'] for val in item[list(item.keys())[0]]]
        new_item['instruction'] = instruction
        new_item['timesteps'] = timesteps
        new_item['Q'] = questions
        new_item['A'] = answers
        new_data.append(new_item)
    return new_data

# Transform the dataset
new_data = transform_dataset(data)

In [20]:
new_data[0]

{'instruction': 'Save the data about the following topic, Prime Minister: ',
 'timesteps': ["{'Name': 'John Doe', 'Term Start': '2010', 'Popularity': '90%'}",
  "{'Popularity': '88%'}"],
 'Q': ['Who is the prime minister?',
  'Who is the prime minister and what is their popularity?'],
 'A': ['John Doe', 'John Doe with a popularity of 88%.']}

In [21]:
from datasets import Dataset
import pandas as pd

# Create a Hugging Face dataset from the new_data list
hf_dataset = Dataset.from_pandas(pd.DataFrame(new_data))

# Print the dataset
print(hf_dataset)

Dataset({
    features: ['instruction', 'timesteps', 'Q', 'A'],
    num_rows: 4563
})


In [22]:
hf_dataset[0]

{'instruction': 'Save the data about the following topic, Prime Minister: ',
 'timesteps': ["{'Name': 'John Doe', 'Term Start': '2010', 'Popularity': '90%'}",
  "{'Popularity': '88%'}"],
 'Q': ['Who is the prime minister?',
  'Who is the prime minister and what is their popularity?'],
 'A': ['John Doe', 'John Doe with a popularity of 88%.']}

In [23]:
hf_dataset.save_to_disk("mem_dict_update")

Saving the dataset (1/1 shards): 100%|██████████| 4563/4563 [00:00<00:00, 776288.19 examples/s]


In [24]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch

# Initialize the tokenizer
model_id = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

new_tokens = ["[MEMORY]"]
tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})

# Check if the tokens are in the vocabulary
for token in new_tokens:
    token_id = tokenizer.convert_tokens_to_ids(token)
    print(f"Token: {token}, Token ID: {token_id}")

def collate_fn(batch, num_memory_slots=4):
    print(batch)
    # Extract the relevant fields from the batch
    instructions = [item for item in batch['instruction']]
    time_steps = [item for item in batch['timesteps']]
    questions = [item for item in batch['Q']]
    answers = [item for item in batch['A']]

    full_batch_dict = {}
    num_timesteps = len(time_steps[0])
    for t in range(num_timesteps):
        # Tokenize the instructions, time steps, questions, and answers
        messages = [[{"role": "user", "content": instruction + time_step[t]},
                    {"role": "assistant", "content": "[MEMORY]"*num_memory_slots}] for instruction, time_step in zip(instructions, time_steps)]
        messages = [tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False) for message in messages]

        qa = [[{"role": "user", "content": question[t]}, {"role": "assistant", "content": answer[t]}] 
                    for question, answer in zip(questions, answers)]
        qa = [tokenizer.apply_chat_template(qa_pair, tokenize=False, add_generation_prompt=False) for qa_pair in qa]

        mem_tokenized = tokenizer(messages, padding=True, truncation=True, max_length=512)
        qa_tokenized = tokenizer(qa, padding=True, truncation=True, max_length=512)
        labels = qa_tokenized["input_ids"].copy()
        for i, label in enumerate(labels):
            assistant_idx = label.index(32001)
            labels[i] = torch.tensor([-100 if idx <= assistant_idx else label[idx] for idx in range(len(label))])
        labels = torch.stack(labels)

        # Move tokenized output to CUDA
        mem_tokenized = {key: torch.tensor(value).to("cuda") for key, value in mem_tokenized.items()}
        qa_tokenized = {key: torch.tensor(value).to("cuda") for key, value in qa_tokenized.items()}
        labels = labels.to("cuda")

        # string questions and answer
        Q_str = [question[t] for question in questions]
        A_str = [answer[t] for answer in answers]
        
        full_batch_dict[f"T{t}"] = {
            'memory_save': mem_tokenized,
            'QA': qa_tokenized,
            'labels': labels,
            'Q_str': Q_str,
            'A_str': A_str
        }


    # Return the tokenized inputs and labels
    return full_batch_dict

batch_sample = collate_fn(hf_dataset[:2])
print(batch_sample)
# Example usage with a DataLoader

# Create a DataLoader with the collate function
# dataloader = DataLoader(hf_dataset, batch_size=8, collate_fn=collate_fn)

# # Iterate through the DataLoader
# for batch in dataloader:
#     print(batch)
#     break

Token: [MEMORY], Token ID: 32011
{'instruction': ['Save the data about the following topic, Prime Minister: ', 'Save the data about the following topic, Capital of Spain: '], 'timesteps': [["{'Name': 'John Doe', 'Term Start': '2010', 'Popularity': '90%'}", "{'Popularity': '88%'}"], ["{'Capital': 'Lisbon', 'Landmark': 'Eiffel Tower'}", "{'Landmark': 'Colosseum'}"]], 'Q': [['Who is the prime minister?', 'Who is the prime minister and what is their popularity?'], ['What is the capital of Spain?', 'What is the capital of Spain and name a famous landmark?']], 'A': [['John Doe', 'John Doe with a popularity of 88%.'], ['Lisbon', 'The capital is Lisbon and a famous landmark is the Colosseum.']]}
{'T0': {'memory_save': {'input_ids': tensor([[32010, 16913,   278,   848,  1048,   278,  1494, 11261, 29892, 15512,
          7668, 29901, 11117,  1170,  2396,   525, 11639,  1938, 29872,   742,
           525, 14343,  7370,  2396,   525, 29906, 29900, 29896, 29900,   742,
           525, 12310,  1070,

In [25]:
b_idx = 0
print(tokenizer.decode(batch_sample['T0']['memory_save']['input_ids'][b_idx]))
print(tokenizer.decode(batch_sample['T0']['QA']['input_ids'][b_idx]))
print(tokenizer.decode(batch_sample['T1']['memory_save']['input_ids'][b_idx]))
print(tokenizer.decode(batch_sample['T1']['QA']['input_ids'][b_idx]))

<|user|> Save the data about the following topic, Prime Minister: {'Name': 'John Doe', 'Term Start': '2010', 'Popularity': '90%'}<|end|><|assistant|>[MEMORY][MEMORY][MEMORY][MEMORY]<|end|><|endoftext|>
<|user|> Who is the prime minister?<|end|><|assistant|> John Doe<|end|><|endoftext|>
<|endoftext|><|user|> Save the data about the following topic, Prime Minister: {'Popularity': '88%'}<|end|><|assistant|>[MEMORY][MEMORY][MEMORY][MEMORY]<|end|><|endoftext|>
<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|user|> Who is the prime minister and what is their popularity?<|end|><|assistant|> John Doe with a popularity of 88%.<|end|><|endoftext|>


In [16]:
b_idx = 1
print(tokenizer.decode(batch_sample['T0']['memory_save']['input_ids'][b_idx]))
print(tokenizer.decode(batch_sample['T0']['QA']['input_ids'][b_idx]))
print(tokenizer.decode(batch_sample['T1']['memory_save']['input_ids'][b_idx]))
print(tokenizer.decode(batch_sample['T1']['QA']['input_ids'][b_idx]))

<|user|> Save the data about the following topic, Capital of Italy: {'Capital': 'Vienna', 'Population': '2M', 'Known For': 'Opera'}<|end|><|assistant|>[MEMORY][MEMORY][MEMORY][MEMORY]<|end|><|endoftext|>
<|user|> What is the capital of Italy?<|end|><|assistant|> Vienna<|end|><|endoftext|>
<|endoftext|><|endoftext|><|user|> Save the data about the following topic, Capital of Italy: {'Known For': 'Art'}<|end|><|assistant|>[MEMORY][MEMORY][MEMORY][MEMORY]<|end|><|endoftext|>
<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|user|> What is the capital of Italy known for?<|end|><|assistant|> The capital, Vienna, is known for Art and has a population of 2M.<|end|><|endoftext|>


In [11]:
test_str = "qwer [MEMORY][MEMORY]"
tokens = tokenizer.tokenize(test_str)
ids = tokenizer.convert_tokens_to_ids(tokens)
reconstruct_tokens = tokenizer.convert_ids_to_tokens(ids)

print(tokens)
print(ids)
print(reconstruct_tokens)


['▁q', 'wer', '▁', '[MEMORY]', '[MEMORY]']
[3855, 556, 29871, 32011, 32011]
['▁q', 'wer', '▁', '[MEMORY]', '[MEMORY]']


In [156]:
from mem_forward import PhiCompressor

model = PhiCompressor(num_mem=4, device="cuda", tokenizer=tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:22<00:00, 11.35s/it]

phi tokenizer loaded.





In [157]:
batch_input = collate_fn(hf_dataset[:4])
print(batch_input)
print(batch_input['QA']['input_ids'].shape)
print(batch_input['QA']['attention_mask'].shape)
print(batch_input['labels'].shape)

{'instruction': ['Save the data about the following topic, Movie: ', 'Save the data about the following topic, Space Mission: ', 'Save the data about the following topic, Famous Scientist: ', 'Save the data about the following topic, Tech Company: '], 'timesteps': [["{'Title': 'Inception', 'Director': 'Christopher Nolan', 'Release Year': 2010}", "{'Title': 'Inception', 'Director': 'Christopher Nolan', 'Release Year': 2010, 'Award': 'Oscar'}"], ["{'Mission': 'Mars Rover', 'Launch Year': 2020, 'Status': 'Active'}", "{'Mission': 'Mars Rover', 'Launch Year': 2020, 'Status': 'Complete'}"], ["{'Name': 'Marie Curie', 'Field': 'Physics', 'Nobel Prizes': 2}", "{'Name': 'Marie Curie', 'Field': 'Chemistry', 'Nobel Prizes': 2}"], ["{'Name': 'OpenAI', 'Founded': 2015, 'Products': ['GPT-3', 'DALL-E']}", "{'Name': 'OpenAI', 'Founded': 2015, 'Products': ['GPT-3', 'ChatGPT']}"]], 'Q': ['Did Inception win an Oscar?', 'What is the status of the Mars Rover mission?', 'In which field did Marie Curie receiv

In [158]:
a = model(batch_input)

32 2 torch.Size([4, 32, 54, 96])
new_kv torch.Size([4, 32, 4, 96])


In [159]:
a.keys()

odict_keys(['loss', 'logits', 'past_key_values', 'hidden_states', 'attentions'])

In [160]:
len(a['hidden_states']), a['hidden_states'][0].shape

(33, torch.Size([4, 19, 3072]))

In [161]:
a['hidden_states'][5][0][0:6]

tensor([[ 1.2422,  0.2061, -0.8398,  ...,  0.2480, -0.0176,  1.3203],
        [ 1.2500,  0.2207, -0.7969,  ...,  0.2637, -0.0469,  1.3438],
        [ 1.2891,  0.2471, -0.7617,  ...,  0.3047, -0.0684,  1.3906],
        [ 1.2891,  0.2500, -0.7422,  ...,  0.3301, -0.0918,  1.4219],
        [ 1.2812,  0.2197, -0.7578,  ...,  0.3359, -0.0918,  1.4219],
        [ 1.2578,  0.1807, -0.8203,  ...,  0.3125, -0.0625,  1.3594]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<SliceBackward0>)

In [162]:
a.loss

tensor(23.0826, device='cuda:0', grad_fn=<NllLossBackward0>)

In [163]:
embedding_layer = model.model.get_input_embeddings()
print(embedding_layer.weight.shape)

torch.Size([32064, 3072])
