In [1]:
!git lfs install
import tqdm as notebook_tqdm

Updated git hooks.
Git LFS initialized.


In [3]:
import os
import random

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Nov 25 15:33:28 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:03:00.0 Off |                  Off |
| N/A   33C    P0    51W / 400W |      0MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Model Loading

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [55]:
def set_env():
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    
def set_seed(seed, deterministic=True):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = deterministic
        torch.backends.cudnn.benchmark = not deterministic
        # torch.use_deterministic_algorithms(deterministic)

In [61]:
rng_seed: int = 42

set_env()
set_seed(rng_seed, deterministic=True)

In [65]:
# Verify deterministic pseudo-random number generator
torch.rand(1,3)

tensor([[0.1332, 0.9346, 0.5936]])

In [7]:
"""
# Sharded model weights
model_name = "incoder-6B"
model_id=f"facebook/{model_name}"

model = AutoModelForCausalLM.from_pretrained("facebook/incoder-6B", revision="sharded", low_cpu_mem_usage=True)
#model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
"""

'\n# Sharded model weights\nmodel_name = "incoder-6B"\nmodel_id=f"facebook/{model_name}"\n\nmodel = AutoModelForCausalLM.from_pretrained("facebook/incoder-6B", revision="sharded", low_cpu_mem_usage=True)\n#model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True)\ntokenizer = AutoTokenizer.from_pretrained(model_id)\n'

In [8]:
dir(torch.cuda)

['Any',
 'BFloat16Storage',
 'BFloat16Tensor',
 'BoolStorage',
 'BoolTensor',
 'ByteStorage',
 'ByteTensor',
 'CUDAGraph',
 'CharStorage',
 'CharTensor',
 'ComplexDoubleStorage',
 'ComplexFloatStorage',
 'CudaError',
 'DeferredCudaCallError',
 'Device',
 'DoubleStorage',
 'DoubleTensor',
 'Event',
 'ExternalStream',
 'FloatStorage',
 'FloatTensor',
 'HalfStorage',
 'HalfTensor',
 'IntStorage',
 'IntTensor',
 'List',
 'LongStorage',
 'LongTensor',
 'Optional',
 'OutOfMemoryError',
 'Set',
 'ShortStorage',
 'ShortTensor',
 'Stream',
 'StreamContext',
 'Tuple',
 'Union',
 '_CudaBase',
 '_CudaDeviceProperties',
 '_LazySeedTracker',
 '__all__',
 '__annotations__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_check_capability',
 '_check_cubins',
 '_cudart',
 '_device',
 '_device_count_nvml',
 '_device_t',
 '_dummy_type',
 '_get_device_index',
 '_initialization_lock',
 '_initialized',
 '_is_compiled',
 '_is_i

In [66]:
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-2B-mono")

In [67]:
type(tokenizer)

transformers.models.codegen.tokenization_codegen_fast.CodeGenTokenizerFast

In [71]:
model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-2B-mono", torch_dtype=torch.bfloat16).cuda()
model.bfloat16()

CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 2560)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): CodeGenBlock(
        (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=2560, out_features=7680, bias=False)
          (out_proj): Linear(in_features=2560, out_features=2560, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear(in_features=2560, out_features=10240, bias=True)
          (fc_out): Linear(in_features=10240, out_features=2560, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): CodeGenBlock(
        (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dro

In [73]:
# Model memory footprint (GB)
model.get_memory_footprint() / 10**9
# Flush GPU memory and delete the model
# del model
# torch.cuda.empty_cache()

5.692930048

# CodeGen Tokenizer caveats

In [None]:
# Custom Tokenizer for CodeGen
"""
Construct a CodeGen tokenizer. Based on byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:

>>> tokenizer("Hello world")['input_ids']
[15496, 995]
>>> tokenizer(" Hello world")['input_ids']
[18435, 995]
"""
def include_whitespace(t, n_min=2, n_max=20, as_special_tokens=False):
    t.add_tokens([' ' * n for n in reversed(range(n_min, n_max))], special_tokens=as_special_tokens)
    return t


def include_tabs(t, n_min=2, n_max=20, as_special_tokens=False):
    t.add_tokens(['\t' * n for n in reversed(range(n_min, n_max))], special_tokens=as_special_tokens)
    return t


def create_custom_gpt2_tokenizer():
    t = tokenizer
    t = include_whitespace(t=t, n_min=2, n_max=32, as_special_tokens=False)
    t = include_tabs(t=t, n_min=2, n_max=10, as_special_tokens=False)
    return t

## Sample

## Sampling with HuggingFace

In [88]:
# ToDo: Tokenizers Fast -> Rust implementation performance boost
context: str = "from typing import List, Tuple\ndef rolling_max(numbers: List[int]) -> List[int]:\n    \"\"\" From a given list of integers, generate a list of rolling maximum element found until given moment\n    in the sequence. \"\"\"\n"
inputs = tokenizer(context, return_tensors="pt").to(0)
print(inputs)
pad_token_id: int = 50256
sample = model.generate(**inputs, max_length=128)
"""
CodeGenTokenizer

args:
    token_ids
    skip_special_tokens
    clean_up_tokenization_spaces
    truncate_before_pattern
"""
print(tokenizer.decode.__code__.co_varnames)
print("HumanEval Problem Statement")
print(context)
print("HumanEval Problem Solution")
print(tokenizer.decode(sample[0], truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'input_ids': tensor([[ 6738, 19720,  1330,  7343,    11,   309, 29291,   198,  4299, 10708,
            62,  9806,     7,    77, 17024,    25,  7343,    58,   600, 12962,
          4613,  7343,    58,   600,  5974,   198, 50284, 37811,  3574,   257,
          1813,  1351,   286, 37014,    11,  7716,   257,  1351,   286, 10708,
          5415,  5002,  1043,  1566,  1813,  2589,   198, 50284,   259,   262,
          8379,    13, 37227,   198]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1]], device='cuda:0')}
('self', 'token_ids', 'skip_special_tokens', 'clean_up_tokenization_spaces', 'truncate_before_pattern', 'kwargs', 'decoded_text')
HumanEval Problem Statement
from typing import List, Tuple
def rolling_max(numbers: List[int]) -> List[int]:
    """ From a given list of integers, generate a list of rolling ma

## Manual Sampling

In [None]:
import re

def truncate(completion):

    def find_re(string, pattern, start_pos):
        m = pattern.search(string, start_pos)
        return m.start() if m else -1

    terminals = [
        re.compile(r, re.MULTILINE)
        for r in
        [
            '^#',
            re.escape('<|endoftext|>'),
            "^'''",
            '^"""',
            '\n\n\n'
        ]
    ]

    prints = list(re.finditer('^print', completion, re.MULTILINE))
    if len(prints) > 1:
        completion = completion[:prints[1].start()]

    defs = list(re.finditer('^def', completion, re.MULTILINE))
    if len(defs) > 1:
        completion = completion[:defs[1].start()]

    start_pos = 0

    terminals_pos = [pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1]
    if len(terminals_pos) > 0:
        return completion[:min(terminals_pos)]
    else:
        return completion

In [None]:
def sample(
    device,
    model,
    tokenizer,
    context: str,
    pad_token_id: int,
    num_return_sequences: int,
    temp: float,
    top_p: float,
    max_length_sample: int,
    max_length: int
):

    input_ids = tokenizer(
        context,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    ).input_ids

    input_ids_len = input_ids.shape[1]
    assert input_ids_len < max_length

    with torch.no_grad():
        input_ids = input_ids.to(device)
        tokens = model.generate(
            input_ids,
            do_sample=True,
            num_return_sequences=num_return_sequences,
            temperature=temp,
            max_length=input_ids_len + max_length_sample,
            top_p=top_p,
            pad_token_id=pad_token_id,
            use_cache=True,
        )
        text = tokenizer.batch_decode(tokens[:, input_ids_len:, ...])

    return text

In [None]:
"cuda:0"
torch.cuda.current_device()

0

In [None]:
# (1) params
device: str = "cuda:0"
rng_seed: int = 42
top_p: float = 0.95
temp: float = 0.01
max_length_sample: int = 128
batch_size: int = 1
pad_token_id: int = 50256
context: str = "def helloworld():"

# (2) preamble
device = torch.device(device)

In [None]:
"""
torch.manual_seed(42)
torch.rand(1,3)
"""

tensor([[0.8823, 0.9150, 0.3829]])

In [None]:
# (3) tokenizer
# For models fine-tuned on code
tokenizer = create_custom_gpt2_tokenizer()
tokenizer.padding_side = 'left'
tokenizer.pad_token = pad_token_id
# (4) sample
completion = sample(device=device, model=model, tokenizer=tokenizer, context=context
                    , pad_token_id=pad_token_id, num_return_sequences=batch_size, temp=temp
                    , top_p=top_p, max_length_sample=max_length_sample,  max_length=2048)[0]
truncation = truncate(completion)

print("Completion")
print('=' * 100)
print(completion)
print("Context + Truncation")
print('=' * 100)
print(context + truncation)

Completion

    print("Hello World")

helloworld()

def hello(name):
    print("Hello", name)

hello("John")

def hello(name, age):
    print("Hello", name, "you are", age)

hello("John", 25)

def hello(name, age):
    print("Hello", name, "you are", age)

hello("John", 25)

def hello(name, age):
    print("Hello", name, "you are", age)

hello("John", 25)

Context + Truncation
def helloworld():
    print("Hello World")

helloworld()

def hello(name):
    print("Hello", name)

hello("John")




In [None]:
curated_completion: str = ""
    
for line in completion.split("\n"):
    if "def" in line and "def helloworld():" not in line:
        break
    else:
        print(line)


    print("Hello World")

helloworld()



In [None]:
torch.cuda.get_rng_state()

tensor([255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 2