## Setup

In [None]:
# Imports

import torch
from torch.nn import DataParallel
import torch.nn.functional as F
from dotenv import load_dotenv
import wandb
from accelerate import Accelerator, init_empty_weights, load_checkpoint_and_dispatch
from helpers.memory import check_memory

# Load environment variables from the local secrets file
load_dotenv('secrets.env')
# Notebook-level default device; later cells pass it to `.to(device)` when moving tokenized inputs
device = 'cuda'
# Project helper from helpers.memory (presumably reports current memory usage — confirm in the helper)
check_memory()

In [None]:
# Test Multi-GPU Access

def check_gpu_access():
    """
    Verify PyTorch can communicate with each visible GPU by allocating a small tensor on it.

    The loop lives inside a function so its variables stay local. In particular, the
    notebook-level `device` variable is NOT rebound to the last `cuda:{i}` device, which
    would otherwise silently redirect every later `.to(device)` call.

    Returns:
        A dict mapping each GPU index to True (allocation succeeded) or False (it raised).
    """
    gpu_status = {}
    for gpu_index in range(torch.cuda.device_count()):
        gpu_device = torch.device(f"cuda:{gpu_index}")
        try:
            torch.tensor([1.0, 2.0, 3.0], device = gpu_device)
            print(f"GPU {gpu_index}: Computation successful.")
            gpu_status[gpu_index] = True
        except Exception as e:
            # Diagnostic only: report the failure and keep probing the remaining GPUs
            print(f"GPU {gpu_index}: Computation failed. Error: {e}")
            gpu_status[gpu_index] = False
    return gpu_status

check_gpu_access()

# Initialize accelerator
# NOTE(review): `accelerator` is not referenced again in the visible cells — confirm it is used further down
accelerator = Accelerator(
    device_placement = True,
    mixed_precision = 'bf16', # Same precision family as the bfloat16 dtype used when the model is loaded below
    gradient_accumulation_steps = 1, # Temp
    split_batches = False
)

## Test Inference with Base HF Model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer for the OLMoE checkpoint; the flags request no automatic BOS/EOS tokens and right-side padding
tokenizer = AutoTokenizer.from_pretrained('allenai/OLMoE-1B-7B-0924', add_eos_token = False, add_bos_token = False, padding_side = 'right')

# Load the causal LM in bfloat16 from the same checkpoint
model = AutoModelForCausalLM.from_pretrained(
    'allenai/OLMoE-1B-7B-0924',
    device_map = 'auto', # Automatic placement across the available devices
    max_memory = {i: "20GB" for i in range(torch.cuda.device_count())}, # One "20GB" entry per visible GPU index
    torch_dtype = torch.bfloat16,
    trust_remote_code = True # NOTE(review): permits running code shipped in the model repo — confirm this is still required
)

# Project helper from helpers.memory, called again after the model load
check_memory()

In [None]:
# Test with model.generate()

@torch.no_grad()
def eval_model_v1(model, tokenizer, prompt, max_new_tokens = 32):
    """
    Greedy-decode a continuation of `prompt` using `model.generate`.

    Params:
        @model: Causal LM exposing `.generate()` and `.device`.
        @tokenizer: Tokenizer matching `model`; must provide `eos_token_id` and `batch_decode`.
        @prompt: The text to continue.
        @max_new_tokens: Upper bound on the number of generated tokens (default 32).

    Returns:
        The decoded text of the first returned sequence (prompt plus continuation).
    """
    # Move inputs to the model's own device instead of relying on a mutable notebook-level `device`
    tokens = tokenizer(prompt, return_tensors = "pt").to(model.device)
    res = model.generate(
        **tokens,
        max_new_tokens = max_new_tokens,
        do_sample = False, # Greedy decoding
        eos_token_id = [tokenizer.eos_token_id]
    )
    # Show the raw token ids before decoding
    print(res)
    return tokenizer.batch_decode(res)[0]

# Run the generate()-based evaluation on a sample prompt and show the decoded text
print(eval_model_v1(model, tokenizer, 'I am a dog and I like to eat. My favorite food is'))

In [None]:
# Test with token-by-token generation

@torch.no_grad()
def eval_model_v2(model, tokenizer, prompt, max_new_tokens = 1):
    """
    Greedy-decode `prompt` one token at a time by calling the model's forward pass directly.

    Params:
        @model: Causal LM; calling it on a token tensor must return a mapping with a 'logits' entry.
        @tokenizer: Tokenizer matching `model`.
        @prompt: The text to continue (a single sequence; batch size 1 is assumed).
        @max_new_tokens: Number of decoding steps to run (default 1, i.e. only the next token).

    Returns:
        The decoded text of the prompt plus every generated token (a stop token, if hit, is included).
    """
    # Move inputs to the model's own device instead of relying on a mutable notebook-level `device`
    tokens = tokenizer(prompt, return_tensors = 'pt').to(model.device)['input_ids']
    # Resolve stop ids once instead of on every step
    stop_ids = {tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|end|>")}

    for _ in range(max_new_tokens):
        logits = model(tokens)['logits']
        # Greedy pick at the last position. Indexing (rather than squeeze()) keeps this valid for
        # one-token prompts, and argmax on raw logits equals argmax on softmax without the extra rounding.
        output_token = torch.argmax(logits[0, -1, :])
        print(output_token)
        tokens = torch.cat((tokens, output_token.view(1, 1).to(tokens.device)), dim = 1)

        if output_token.item() in stop_ids:
            break

    print(tokens)
    return tokenizer.batch_decode(tokens)[0]

# Run the token-by-token evaluation on a sample prompt and show the decoded text
print(eval_model_v2(model, tokenizer, 'I am a dog and I like to eat. My favorite food is'))

## Reverse Engineer the Class

In [None]:
prompt = 'I am a dog and I like to eat. My favorite food is' # Correct next token output is 'steak'
# Tokenize and move the inputs to the model's own device, so placement does not depend on
# whatever the notebook-level `device` variable currently holds
inputs = tokenizer(prompt, return_tensors = 'pt').to(model.device)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

In [None]:
# Split off LM head: run the decoder stack and the LM head as two explicit steps
with torch.no_grad():
    # Decoder stack only (everything before the LM head)
    decoder_output = model.model(input_ids, attention_mask)['last_hidden_state']
    # LM head applied to the decoder's last hidden state
    output_logits = model.lm_head(decoder_output)

# Argmax token id at every position of the first sequence; decode the one at the last position
output_ids = output_logits[0].argmax(dim = -1)
print(tokenizer.decode(output_ids[-1]))

In [None]:
# Split off LM head
# NOTE(review): this cell repeats the computation of the preceding cell — confirm whether it is an
# intentional starting point for further decomposition or a leftover copy
with torch.no_grad():
    # Everything before the LM head
    decoder_output = model.model(
        input_ids,
        attention_mask
    )['last_hidden_state']
    # The LM head
    output_logits = model.lm_head(decoder_output)


# Argmax over the last axis of the first sequence's logits, then decode the id at the last position
output_ids = torch.argmax(output_logits[0, :, :], dim = 1)
print(tokenizer.decode(output_ids[-1]))

In [None]:
# Display the tokenizer's EOS token id (the id used as a stop token in the evaluation helpers above)
tokenizer.eos_token_id