In [1]:
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.

from typing import Tuple
import os
import sys
import torch
import time
import json
from pathlib import Path
from llama import ModelArgs, Transformer, Tokenizer, LLaMA


def load(
        ckpt_dir: str,
        tokenizer_path: str,
        max_seq_len: int,
        max_batch_size: int,
) -> Tuple["Transformer", "Tokenizer"]:
    """Build a ``Transformer`` and fill its parameters from checkpoint shards.

    Args:
        ckpt_dir: Directory containing ``params.json`` and one or more
            ``*.pth`` checkpoint shards.
        tokenizer_path: Path passed to ``Tokenizer(model_path=...)``.
        max_seq_len: Forwarded to ``ModelArgs``.
        max_batch_size: Forwarded to ``ModelArgs``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is moved to the CPU before
        being returned.

    Raises:
        FileNotFoundError: If ``ckpt_dir`` contains no ``*.pth`` file (or if
            ``params.json`` cannot be opened).
        KeyError: If a model parameter's module name is not listed in the
            shard-layout table below, or is missing from a shard.
    """
    print("Creating model...")
    start_time = time.time()

    ckpt_root = Path(ckpt_dir)
    # Sorted so that shard index ``i`` below follows the file-name order.
    checkpoints = sorted(ckpt_root.glob("*.pth"))
    if not checkpoints:
        # Without this guard the loop below never runs and the caller would
        # silently receive a model whose weights were never loaded.
        raise FileNotFoundError(f"No '*.pth' checkpoint shards found in {ckpt_root}")

    with open(ckpt_root / "params.json", "r") as f:
        params = json.load(f)

    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
    )

    tokenizer = Tokenizer(model_path=tokenizer_path)
    # The vocabulary size comes from the tokenizer, not from params.json.
    model_args.vocab_size = tokenizer.n_words

#     torch.set_default_tensor_type(torch.cuda.HalfTensor)
    # With the line above disabled, the model is created using whatever
    # default tensor type is currently active.
    model = Transformer(model_args)
    torch.set_default_tensor_type(torch.FloatTensor)

    # Original copyright by tloen
    # https://github.com/tloen/llama-int8/blob/main/example.py
    # Maps a module's short name to the dimension along which its weight is
    # split across shards: 0 = first dim, -1 = last dim, None = not split
    # (taken from the first shard only).
    key_to_dim = {
        "w1": 0,
        "w2": -1,
        "w3": 0,
        "wo": -1,
        "wq": 0,
        "wk": 0,
        "wv": 0,
        "output": 0,
        "tok_embeddings": -1,
        "ffn_norm": None,
        "attention_norm": None,
        "norm": None,
        "rope": None,
    }

    for i, ckpt in enumerate(checkpoints):
        print(f"Loading checkpoint {i}")
        # NOTE(review): torch.load unpickles the file; only use it on
        # checkpoints from a trusted source.
        checkpoint = torch.load(ckpt, map_location="cpu")
        for parameter_name, parameter in model.named_parameters():
            # e.g. "layers.0.attention.wq.weight" -> "wq"
            short_name = parameter_name.split(".")[-2]
            shard_dim = key_to_dim[short_name]
            # pop() removes the entry from the shard dict as soon as it is
            # consumed, mirroring the original per-parameter ``del``.
            shard = checkpoint.pop(parameter_name)
            if shard_dim is None:
                # Unsplit parameters are only read from the first shard.
                if i == 0:
                    parameter.data = shard
            elif shard_dim == 0:
                size = shard.size(0)
                parameter.data[size * i: size * (i + 1), :] = shard
            elif shard_dim == -1:
                size = shard.size(-1)
                parameter.data[:, size * i: size * (i + 1)] = shard
        del checkpoint

    model = model.to("cpu")

    print(f"Loaded model in {time.time() - start_time:.2f} seconds")
    return model, tokenizer

In [2]:
# Checkpoint/tokenizer locations plus the sampling settings used further down.
ckpt_dir = "/datasets/llama/7B"
tokenizer_path = "/datasets/llama/7B/tokenizer.model"
temperature, top_p = 0.8, 0.95
max_seq_len = 512  # up to 2048
max_batch_size = 32

# Build the model and tokenizer from the files configured above.
model, tokenizer = load(
    ckpt_dir=ckpt_dir,
    tokenizer_path=tokenizer_path,
    max_seq_len=max_seq_len,
    max_batch_size=max_batch_size,
)

Creating model...
Loading checkpoint 0
Loaded model in 52.47 seconds


In [3]:
import torch.nn as nn
import memory_utils

# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the
# supported replacement. Reloading re-executes memory_utils so that edits to
# the module are picked up without restarting the kernel.
import importlib
memory_utils = importlib.reload(memory_utils)

class FlexSequential(nn.Sequential):
    """``nn.Sequential`` variant whose stages may take and return several values.

    When the value being carried is a plain ``tuple`` it is unpacked into
    positional arguments for the next stage; anything else is passed as a
    single argument. The value produced by the last stage is returned as-is
    (with no stages, the input tuple itself is returned).
    """

    def forward(self, *inputs):
        carried = inputs
        for stage in self._modules.values():
            # Exact-type check: only a plain tuple is unpacked.
            carried = stage(*carried) if type(carried) is tuple else stage(carried)
        return carried

class LayerToDevice(nn.Module):
    """Pass-through module that moves a wrapped layer to ``device`` when called.

    The call arguments are not modified: a single argument is returned bare,
    any other number of arguments is returned as the tuple it arrived in.
    """

    def __init__(self, device, layer):
        super().__init__()
        self.layer = layer
        self.D = device

    def forward(self, *args):
        # Side effect only: relocate the wrapped layer before it is used.
        self.layer.to(self.D)
        return args[0] if len(args) == 1 else args
    
class LayerOffDevice(nn.Module):
    """Pass-through module that moves a wrapped layer to ``memory_utils.cpu_device``.

    ``device`` is stored as ``self.D`` but is not read by ``forward``. The
    call arguments are not modified: a single argument is returned bare, any
    other number of arguments is returned as the tuple it arrived in.
    """

    def __init__(self, device, layer):
        super().__init__()
        self.layer = layer
        self.D = device

    def forward(self, *args):
        # Relocate the wrapped layer and keep the returned module on self.
        self.layer = self.layer.to(memory_utils.cpu_device)
        return args[0] if len(args) == 1 else args

def move_layer(layer, device):
    """Wrap ``layer`` between a ``LayerToDevice`` and a ``LayerOffDevice`` stage.

    Returns a ``FlexSequential`` of three stages: move ``layer`` to
    ``device``, run ``layer``, then run the ``LayerOffDevice`` stage.
    """
    return FlexSequential(
        LayerToDevice(device, layer),
        layer,
        LayerOffDevice(device, layer),
    )

  after removing the cwd from sys.path.


In [4]:
class DataToCpu(nn.Module):
    """Pass-through module that moves tensor arguments to ``memory_utils.cpu_device``.

    Arguments that are not tensors (for example ints or ``None``) are
    forwarded unchanged. A single argument is returned bare; any other number
    of arguments is returned as a tuple.
    """

    def forward(self, *args):
        # isinstance rather than an exact type comparison, so that
        # torch.Tensor subclasses such as nn.Parameter are moved as well.
        moved = tuple(
            arg.to(memory_utils.cpu_device) if isinstance(arg, torch.Tensor) else arg
            for arg in args
        )
        if len(moved) == 1:
            return moved[0]
        return moved
    
class DataToGpu(nn.Module):
    """Pass-through module that moves tensor arguments to ``memory_utils.gpu_device``.

    Arguments that are not tensors (for example ints or ``None``) are
    forwarded unchanged. A single argument is returned bare; any other number
    of arguments is returned as a tuple.
    """

    def forward(self, *args):
        # isinstance rather than an exact type comparison, so that
        # torch.Tensor subclasses such as nn.Parameter are moved as well.
        moved = tuple(
            arg.to(memory_utils.gpu_device) if isinstance(arg, torch.Tensor) else arg
            for arg in args
        )
        if len(moved) == 1:
            return moved[0]
        return moved

In [5]:
def layer_size(layer):
    """Return the total number of elements across all parameters of ``layer``."""
    total = 0
    for param in layer.parameters():
        total += param.numel()
    return total

def return_to_cpu_layer(layer):
    """Return a ``FlexSequential`` that runs ``DataToCpu`` on the inputs, then ``layer``."""
    return FlexSequential(DataToCpu(), layer)

def return_to_gpu_layer(layer):
    """Return a ``FlexSequential`` that runs ``DataToGpu`` on the inputs, then ``layer``."""
    return FlexSequential(DataToGpu(), layer)

# Each entry is unpacked below as (parent module, attribute name, child module).
base_children = memory_utils.get_base_children(model, max_layer_size=1e9)

# Budget on the running total of parameter elements (as counted by layer_size)
# that may be placed on memory_utils.gpu_device.
capacity = 2e9
tot = 0

for par, name, module in base_children:
    tot += layer_size(module)
    if tot <= capacity:
        # Still within budget: move the weights and route inputs to the GPU device.
        module.to(memory_utils.gpu_device)
        wrapped = return_to_gpu_layer(module)
    else:
        # Over budget: leave the module where it is and route inputs to the CPU device.
        wrapped = return_to_cpu_layer(module)
    # Replace the child on its parent with the wrapped version.
    setattr(par, name, wrapped)

In [6]:
# Display the module tree to see how each child was wrapped by the cell above.
model

Transformer(
  (tok_embeddings): FlexSequential(
    (0): DataToGpu()
    (1): Embedding(32000, 4096)
  )
  (layers): ModuleList(
    (0): FlexSequential(
      (0): DataToGpu()
      (1): TransformerBlock(
        (attention): Attention(
          (wq): Linear(in_features=4096, out_features=4096, bias=False)
          (wk): Linear(in_features=4096, out_features=4096, bias=False)
          (wv): Linear(in_features=4096, out_features=4096, bias=False)
          (wo): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (feed_forward): FeedForward(
          (w1): Linear(in_features=4096, out_features=11008, bias=False)
          (w2): Linear(in_features=11008, out_features=4096, bias=False)
          (w3): Linear(in_features=4096, out_features=11008, bias=False)
        )
        (attention_norm): RMSNorm()
        (ffn_norm): RMSNorm()
      )
    )
    (1): FlexSequential(
      (0): DataToGpu()
      (1): TransformerBlock(
        (attention): Attention(
        

In [7]:
# model = model.eval()
# base_children = memory_utils.get_base_children(model, max_layer_size = 1e9)

# for par, name, module in base_children:
#     setattr(par, name, move_layer(module, memory_utils.gpu_device))

In [8]:
# Device of the first parameter found under model.layers[0].
next(model.layers[0].parameters()).device

device(type='cuda', index=0)

In [10]:
# Put the model in evaluation mode (nn.Module.eval() works in place and
# returns the same module), then hand it to the LLaMA generation wrapper.
model.eval()
generator = LLaMA(model, tokenizer)

# Single multi-paragraph prompt passed to generator.generate below.
prompts = [
    """
    The standard models have their weights (the connections between the NN) stored as floating points. This makes the models very large and difficult to store in either system or GPU RAM. Also, CPU’s are just not good at doing floating point math compared to GPU’s. Though, there are ways to improve your performance on CPU, namely by understanding how different converted models work.

This is where quantized models come into play. There’s 8-bit quantized models that use methods like zero-point quantization to change the model from floating point weights to 8-bit integers. This can run on a wider array of hardware, especially 7 billion or 13 billion parameter models. Though, there’s ways to even further reduce the hardware needs.

We also have GPTQ 4-bit quantizing (there are also 3 and 2-bit methods, but I’m not familiar with them, personally). This will let you easily run the 13 billion parameter model on consumer hardware. But what kind of hardware? CPU or GPU?

This is where GGML comes in. If you want to use a CPU, you would want to run a GGML optimized version, this will let you leverage a CPU and system RAM. I’ve seen some people saying 1 or 2 tokens per second, I imagine they are NOT running GGML versions. If you plan to run this on a GPU, you would want to use a standard GPTQ 4-bit quantized model.

I hope this helps demystify a bit of what different configurations do for different hardware. 
    """
]

# Generate up to 256 new tokens per prompt with the sampling settings
# configured earlier in the notebook.
results = generator.generate(
    prompts,
    max_gen_len=256,
    temperature=temperature,
    top_p=top_p,
)

# Print each result followed by a separator line.
separator = "\n==================================\n"
for result in results:
    print(result)
    print(separator)


  0%|                                                                                                                                      | 0/163 [00:00<?, ?it/s][A

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [None]:
# Scratch check: wrap an empty nn.Sequential with move_layer, targeting the CPU.
base = nn.Sequential()
base2 = move_layer(base, torch.device('cpu'))
#ltd = LayerToDevice(torch.device('cpu'), nn.Sequential())

In [None]:
# Display `base` (the empty nn.Sequential that was passed to move_layer).
base

In [None]:
# Display the model's current module tree.
model

In [None]:
# NOTE(review): after the wrapping cell, model.layers[0] prints as a
# FlexSequential with children "0" and "1"; nothing visible in this notebook
# defines a `.layer` attribute on it — confirm this access is still valid.
type(model.layers[0].layer) == tuple

In [None]:
# Device of the first parameter yielded by model.parameters().
next(model.parameters()).device

In [None]:
# GPU memory currently occupied by tensors on the current CUDA device, in bytes.
torch.cuda.memory_allocated()

In [None]:
# Moves the entire model to cuda:0 and binds the returned module to `a`.
# NOTE(review): nn.Module.to() works in place, so this also moves any layers
# that the capacity loop left off the GPU — confirm that is intended.
a = model.to("cuda:0")

In [None]:
# Show the Python type of a freshly constructed, empty torch.HalfTensor.
type(torch.HalfTensor())