In [3]:
%load_ext autoreload
%autoreload 2


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

torch.random.manual_seed(0)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct", 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True, 
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")

messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])


  from .autonotebook import tqdm as notebook_tqdm
2025-02-12 11:31:35.253678: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739359895.262331   16123 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739359895.266076   16123 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
import json

# Load JSON data from a file
with open('dataset.json', 'r') as file:
    data = json.load(file)

# Print the loaded data
print(data)

[{'President': [{'timestamp': 'T1', 'attributes': {'Name': 'John Doe', 'Age': 55, 'Approval Rating': '75%'}, 'qa': {'question': 'Who is the president?', 'answer': 'John Doe'}}, {'timestamp': 'T2', 'attributes': {'Name': 'Jane Smith', 'Age': 60, 'Approval Rating': '80%'}, 'qa': {'question': 'Who is the president?', 'answer': 'Jane Smith'}}]}, {'Capital of Brazil': [{'timestamp': 'T1', 'attributes': {'Capital': 'Buenos Aires', 'Population': '2.8M'}, 'qa': {'question': 'What is the capital of Brazil?', 'answer': 'Buenos Aires'}}, {'timestamp': 'T2', 'attributes': {'Capital': 'Lima', 'Population': '9M'}, 'qa': {'question': 'What is the capital of Brazil?', 'answer': 'Lima'}}]}, {'CEO of Facebook': [{'timestamp': 'T1', 'attributes': {'Name': 'Larry Page', 'Employees': '50,000'}, 'qa': {'question': 'Who is the CEO of Facebook?', 'answer': 'Larry Page'}}, {'timestamp': 'T2', 'attributes': {'Name': 'Tim Cook', 'Employees': '75,000'}, 'qa': {'question': 'Who is the CEO of Facebook?', 'answer': 

In [5]:
def transform_dataset(data):
    # List 
    new_data = []
    # Loop through the data
    for item in data:
        # Dictionary
        new_item = {}
        instruction = "Save the data about the following topic, " + list(item.keys())[0] + ": "
        timesteps = [str(val['attributes']) for val in item[list(item.keys())[0]]]
        new_item['instruction'] = instruction
        new_item['timesteps'] = timesteps
        new_item['Q'] = item[list(item.keys())[0]][-1]["qa"]['question']
        new_item['A'] = item[list(item.keys())[0]][-1]["qa"]['answer']
        new_data.append(new_item)
    return new_data

# Transform the dataset
new_data = transform_dataset(data)

In [6]:
new_data[0]

{'instruction': 'Save the data about the following topic, President: ',
 'timesteps': ["{'Name': 'John Doe', 'Age': 55, 'Approval Rating': '75%'}",
  "{'Name': 'Jane Smith', 'Age': 60, 'Approval Rating': '80%'}"],
 'Q': 'Who is the president?',
 'A': 'Jane Smith'}

In [7]:
from datasets import Dataset
import pandas as pd

# Create a Hugging Face dataset from the new_data list
hf_dataset = Dataset.from_pandas(pd.DataFrame(new_data))

# Print the dataset
print(hf_dataset)

Dataset({
    features: ['instruction', 'timesteps', 'Q', 'A'],
    num_rows: 1499
})


In [8]:
hf_dataset[0]

{'instruction': 'Save the data about the following topic, President: ',
 'timesteps': ["{'Name': 'John Doe', 'Age': 55, 'Approval Rating': '75%'}",
  "{'Name': 'Jane Smith', 'Age': 60, 'Approval Rating': '80%'}"],
 'Q': 'Who is the president?',
 'A': 'Jane Smith'}

In [9]:
hf_dataset.save_to_disk("mem_dict_false")

Saving the dataset (1/1 shards): 100%|██████████| 1499/1499 [00:00<00:00, 393027.55 examples/s]


In [10]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch

# Initialize the tokenizer
model_id = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

new_tokens = ["[MEMORY]"]
tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})

# Check if the tokens are in the vocabulary
for token in new_tokens:
    token_id = tokenizer.convert_tokens_to_ids(token)
    print(f"Token: {token}, Token ID: {token_id}")

def collate_fn(batch, num_memory_slots=4):
    print(batch)
    # Extract the relevant fields from the batch
    instructions = [item for item in batch['instruction']]
    time_steps = [item for item in batch['timesteps']]
    questions = [item for item in batch['Q']]
    answers = [item for item in batch['A']]

    messages = [[{"role": "user", "content": instruction + time_step[1]},
                {"role": "assistant", "content": "[MEMORY]"*num_memory_slots}] for instruction, time_step in zip(instructions, time_steps)]
    messages = [tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False) for message in messages]
    print(messages)

    qa = [[{"role": "user", "content": question}, {"role": "assistant", "content": answer}] 
                for question, answer in zip(questions, answers)]
    qa = [tokenizer.apply_chat_template(qa_pair, tokenize=False, add_generation_prompt=False) for qa_pair in qa]


    mem_tokenized = tokenizer(messages, padding=True, truncation=True, max_length=512)
    qa_tokenized = tokenizer(qa, padding=True, truncation=True, max_length=512)
    labels = qa_tokenized["input_ids"].copy()
    for i, label in enumerate(labels):
        assistant_idx = label.index(32001)
        labels[i] = torch.tensor([-100 if idx <= assistant_idx else label[idx] for idx in range(len(label))])
    labels = torch.stack(labels)

    # Move tokenized output to CUDA
    mem_tokenized = {key: torch.tensor(value).to("cuda") for key, value in mem_tokenized.items()}
    qa_tokenized = {key: torch.tensor(value).to("cuda") for key, value in qa_tokenized.items()}
    labels = labels.to("cuda")


    # Return the tokenized inputs and labels
    return {
        'memory_save': mem_tokenized,
        'QA': qa_tokenized,
        'labels': labels
    }

collate_fn(hf_dataset[:2])

# Example usage with a DataLoader

# Create a DataLoader with the collate function
# dataloader = DataLoader(hf_dataset, batch_size=8, collate_fn=collate_fn)

# # Iterate through the DataLoader
# for batch in dataloader:
#     print(batch)
#     break

Token: [MEMORY], Token ID: 32011
{'instruction': ['Save the data about the following topic, President: ', 'Save the data about the following topic, Capital of Brazil: '], 'timesteps': [["{'Name': 'John Doe', 'Age': 55, 'Approval Rating': '75%'}", "{'Name': 'Jane Smith', 'Age': 60, 'Approval Rating': '80%'}"], ["{'Capital': 'Buenos Aires', 'Population': '2.8M'}", "{'Capital': 'Lima', 'Population': '9M'}"]], 'Q': ['Who is the president?', 'What is the capital of Brazil?'], 'A': ['Jane Smith', 'Lima']}
["<|user|>\nSave the data about the following topic, President: {'Name': 'Jane Smith', 'Age': 60, 'Approval Rating': '80%'}<|end|>\n<|assistant|>\n[MEMORY][MEMORY][MEMORY][MEMORY]<|end|>\n<|endoftext|>", "<|user|>\nSave the data about the following topic, Capital of Brazil: {'Capital': 'Lima', 'Population': '9M'}<|end|>\n<|assistant|>\n[MEMORY][MEMORY][MEMORY][MEMORY]<|end|>\n<|endoftext|>"]


{'memory_save': {'input_ids': tensor([[32010, 16913,   278,   848,  1048,   278,  1494, 11261, 29892,  7178,
           29901, 11117,  1170,  2396,   525, 29967,  1662,  7075,   742,   525,
           22406,  2396, 29871, 29953, 29900, 29892,   525,  2052,   307,   791,
             390,  1218,  2396,   525, 29947, 29900, 29995, 10827, 32007, 32001,
           32011, 32011, 32011, 32011, 32007, 32000],
          [32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32010,
           16913,   278,   848,  1048,   278,  1494, 11261, 29892, 25343,   310,
           16078, 29901, 11117, 12415,  2410,  2396,   525, 29931,  2946,   742,
             525, 12310,  2785,  2396,   525, 29929, 29924, 10827, 32007, 32001,
           32011, 32011, 32011, 32011, 32007, 32000]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [0, 0, 0

In [11]:
test_str = "qwer [MEMORY][MEMORY]"
tokens = tokenizer.tokenize(test_str)
ids = tokenizer.convert_tokens_to_ids(tokens)
reconstruct_tokens = tokenizer.convert_ids_to_tokens(ids)

print(tokens)
print(ids)
print(reconstruct_tokens)


['▁q', 'wer', '▁', '[MEMORY]', '[MEMORY]']
[3855, 556, 29871, 32011, 32011]
['▁q', 'wer', '▁', '[MEMORY]', '[MEMORY]']


In [156]:
from mem_forward import PhiCompressor

model = PhiCompressor(num_mem=4, device="cuda", tokenizer=tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:22<00:00, 11.35s/it]

phi tokenizer loaded.





In [157]:
batch_input = collate_fn(hf_dataset[:4])
print(batch_input)
print(batch_input['QA']['input_ids'].shape)
print(batch_input['QA']['attention_mask'].shape)
print(batch_input['labels'].shape)

{'instruction': ['Save the data about the following topic, Movie: ', 'Save the data about the following topic, Space Mission: ', 'Save the data about the following topic, Famous Scientist: ', 'Save the data about the following topic, Tech Company: '], 'timesteps': [["{'Title': 'Inception', 'Director': 'Christopher Nolan', 'Release Year': 2010}", "{'Title': 'Inception', 'Director': 'Christopher Nolan', 'Release Year': 2010, 'Award': 'Oscar'}"], ["{'Mission': 'Mars Rover', 'Launch Year': 2020, 'Status': 'Active'}", "{'Mission': 'Mars Rover', 'Launch Year': 2020, 'Status': 'Complete'}"], ["{'Name': 'Marie Curie', 'Field': 'Physics', 'Nobel Prizes': 2}", "{'Name': 'Marie Curie', 'Field': 'Chemistry', 'Nobel Prizes': 2}"], ["{'Name': 'OpenAI', 'Founded': 2015, 'Products': ['GPT-3', 'DALL-E']}", "{'Name': 'OpenAI', 'Founded': 2015, 'Products': ['GPT-3', 'ChatGPT']}"]], 'Q': ['Did Inception win an Oscar?', 'What is the status of the Mars Rover mission?', 'In which field did Marie Curie receiv

In [158]:
a = model(batch_input)

32 2 torch.Size([4, 32, 54, 96])
new_kv torch.Size([4, 32, 4, 96])


In [159]:
a.keys()

odict_keys(['loss', 'logits', 'past_key_values', 'hidden_states', 'attentions'])

In [160]:
len(a['hidden_states']), a['hidden_states'][0].shape

(33, torch.Size([4, 19, 3072]))

In [161]:
a['hidden_states'][5][0][0:6]

tensor([[ 1.2422,  0.2061, -0.8398,  ...,  0.2480, -0.0176,  1.3203],
        [ 1.2500,  0.2207, -0.7969,  ...,  0.2637, -0.0469,  1.3438],
        [ 1.2891,  0.2471, -0.7617,  ...,  0.3047, -0.0684,  1.3906],
        [ 1.2891,  0.2500, -0.7422,  ...,  0.3301, -0.0918,  1.4219],
        [ 1.2812,  0.2197, -0.7578,  ...,  0.3359, -0.0918,  1.4219],
        [ 1.2578,  0.1807, -0.8203,  ...,  0.3125, -0.0625,  1.3594]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<SliceBackward0>)

In [162]:
a.loss

tensor(23.0826, device='cuda:0', grad_fn=<NllLossBackward0>)

In [163]:
embedding_layer = model.model.get_input_embeddings()
print(embedding_layer.weight.shape)

torch.Size([32064, 3072])
