IMPORT

In [1]:
import os
import subprocess
import time
import copy

import gc
import pickle

from transformers import AutoTokenizer#, GPTNeoForCausalLM,
import torch

TOKENIZER

In [2]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [3]:
#@title Sampling settings
#@markdown You can modify sampling settings here. Don't forget to run the cell again after changing. The number of generated tokens is subtracted from the context window size, don't set it high.
top_k = 60 #@param {type:"number"}
top_p = 0.7 #@param {type:"number"}
temperature = 1#@param {type:"number"}
number_generated_tokens =  48#@param {type:"integer"}
repetition_penalty = 3.01 #@param {type:"number"}
repetition_penalty_range = 1000 #@param {type:"number"}
repetition_penalty_slope = 1 #@param {type:"number"}
#@markdown Temperatures seem to give results different from those in AID, so play around with it. Even 0.5 can give good results.

In [4]:
basic_prompt = "test " #@param {type:"string"}
ids = tokenizer(basic_prompt*10000, return_tensors="pt",truncation=True).input_ids
n_ids = ids.shape[1]
if n_ids < 1:
  n_ids = 1
  ids = torch.tensor([[tokenizer.eos_token_id]])
max_length = n_ids + number_generated_tokens
modelinputids = ids.long().to("cuda")

LOAD MODEL

In [5]:
#CODE TO SAVE MODEL:
"""
from transformers import GPTNeoForCausalLM
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B").half()
with open('gptneo.pkl', 'wb') as f:
    pickle.dump(model, f)
"""

'\nfrom transformers import GPTNeoForCausalLM\nmodel = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B").half()\nwith open(\'gptneo.pkl\', \'wb\') as f:\n    pickle.dump(model, f)\n'

In [6]:
print("1",gc.collect())
#Pickle is just a saved gpt-neo 2.7B model (GPTNeoForCausalLM) , Used here because it has lower peak ram usage
with open('gptneo.pkl', 'rb') as f:
    model = pickle.load(f)
    
print(model.eval().half().to("cpu"))

1 40
GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 2560)
    (wpe): Embedding(2048, 2560)
    (drop): Dropout(p=0, inplace=False)
    (h): ModuleList(
      (0): GPTNeoBlock(
        (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0, inplace=False)
            (resid_dropout): Dropout(p=0, inplace=False)
            (k_proj): Linear(in_features=2560, out_features=2560, bias=False)
            (v_proj): Linear(in_features=2560, out_features=2560, bias=False)
            (q_proj): Linear(in_features=2560, out_features=2560, bias=False)
            (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
          )
        )
        (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=2560, out_features=10240, bias=True)
          (c_proj): Linear(

# Modify the forward function of GPTNeoModel
Set the number_of_parts in new_forward. Recommend 4 for 8gb vram and 32 for 6gb vram. (4 may work on 6gb vram and 2 may work on 8 gb vram depending on os/cuda/etc.) <br> Speed difference between number of parts seems extremely minimal, indicating that ram->vram transfer is pretty much the only bottleneck of this process, and the gpu compute is almost instant in comparison.

In [7]:
number_of_parts = 32

In [8]:
from transformers import GPTNeoForCausalLM,GPTNeoModel
from transformers.modeling_outputs import BaseModelOutputWithPast

In [9]:
def new_forward(
    self,
    input_ids=None,
    past_key_values=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    global number_of_parts
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        batch_size = input_ids.shape[0]
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
        batch_size = inputs_embeds.shape[0]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    if token_type_ids is not None:
        token_type_ids = token_type_ids.view(-1, input_shape[-1])
    if position_ids is not None:
        position_ids = position_ids.view(-1, input_shape[-1])

    if past_key_values is None:
        past_length = 0
        past_key_values = tuple([None] * len(self.h))
    else:
        past_length = past_key_values[0][0].size(-2)
    if position_ids is None:
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
        position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

    # Attention mask.
    if attention_mask is not None:
        assert batch_size > 0, "batch_size has to be defined and > 0"
        global_attention_mask = attention_mask.view(batch_size, -1)
        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        global_attention_mask = global_attention_mask[:, None, None, :]

        # Since global_attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        global_attention_mask = global_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        global_attention_mask = (1.0 - global_attention_mask) * -10000.0
    else:
        global_attention_mask = None

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x num_headss x N x N
    # head_mask has shape n_layer x batch x num_headss x N x N
    head_mask = self.get_head_mask(head_mask, self.config.num_layers)

    if inputs_embeds is None:
        inputs_embeds = self.wte(input_ids)
    position_embeds = self.wpe(position_ids)
    hidden_states = inputs_embeds + position_embeds

    if token_type_ids is not None:
        token_type_embeds = self.wte(token_type_ids)
        hidden_states = hidden_states + token_type_embeds

    hidden_states = self.drop(hidden_states)

    output_shape = input_shape + (hidden_states.size(-1),)

    presents = () if use_cache else None
    all_self_attentions = () if output_attentions else None
    all_hidden_states = () if output_hidden_states else None

    for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
        if number_of_parts == 2:
            if i == 0:
                cudastreams = {}
                for j in range(0,16):
                    cudastreams[j] = torch.cuda.Stream()
                    for param1,param2 in zip(self.h[j].parameters(),self.h[j+16].parameters()):
                        param1.data = param2.data
                        
                for j in range(0,16):
                    with torch.cuda.stream(cudastreams[j]):
                        for param1,param2 in zip(self.h[j].parameters(),self.extrastorage[j].parameters()):
                            param1.data.copy_(param2.data, non_blocking=True)
                        self.h[j].to("cuda", non_blocking=True)
                        
                torch.cuda.synchronize()
                del cudastreams
                
            if i == 16:
                cudastreams = {}
                for j in range(16,32):
                    cudastreams[j] = torch.cuda.Stream()
                    for param1,param2 in zip(self.h[j].parameters(),self.h[j-16].parameters()):
                        param1.data = param2.data
                        
                for j in range(16,32):  
                    with torch.cuda.stream(cudastreams[j]):
                        for param1,param2 in zip(self.h[j].parameters(),self.extrastorage[j].parameters()):
                            param1.data.copy_(param2.data, non_blocking=True)
                            pass
                        self.h[j].to("cuda", non_blocking=True)
                torch.cuda.synchronize()
                del cudastreams
                
        if number_of_parts == 4:
            if i == 0:
                cudastreams = {}
                for j in range(0,8):
                    cudastreams[j] = torch.cuda.Stream()
                    for param1,param2 in zip(self.h[j].parameters(),self.h[j+24].parameters()):
                        param1.data = param2.data
                for j in range(0,8):
                    with torch.cuda.stream(cudastreams[j]):
                        for param1,param2 in zip(self.h[j].parameters(),self.extrastorage[j].parameters()):
                            param1.data.copy_(param2.data, non_blocking=True)
                        self.h[j].to("cuda", non_blocking=True)
                torch.cuda.synchronize()
                del cudastreams
                
            if i == 8:
                cudastreams = {}
                for j in range(8,16):
                    cudastreams[j] = torch.cuda.Stream()
                    for param1,param2 in zip(self.h[j].parameters(),self.h[j-8].parameters()):
                        param1.data = param2.data
                for j in range(8,16):
                    with torch.cuda.stream(cudastreams[j]):
                        for param1,param2 in zip(self.h[j].parameters(),self.extrastorage[j].parameters()):
                            param1.data.copy_(param2.data, non_blocking=True)
                        self.h[j].to("cuda", non_blocking=True)
                torch.cuda.synchronize()
                del cudastreams
                    
            if i == 16:
                cudastreams = {}
                for j in range(16,24):
                    cudastreams[j] = torch.cuda.Stream()
                    for param1,param2 in zip(self.h[j].parameters(),self.h[j-8].parameters()):
                        param1.data = param2.data
                for j in range(16,24):
                    with torch.cuda.stream(cudastreams[j]):
                        for param1,param2 in zip(self.h[j].parameters(),self.extrastorage[j].parameters()):
                            param1.data.copy_(param2.data, non_blocking=True)
                        model.transformer.h[j].to("cuda", non_blocking=True)
                torch.cuda.synchronize()
                del cudastreams
                
            if i == 24:
                cudastreams = {}
                for j in range(24,32):
                    cudastreams[j] = torch.cuda.Stream()
                    for param1,param2 in zip(self.h[j].parameters(),self.h[j-8].parameters()):
                        param1.data = param2.data
                for j in range(24,32):
                    with torch.cuda.stream(cudastreams[j]):
                        for param1,param2 in zip(self.h[j].parameters(),self.extrastorage[j].parameters()):
                            param1.data.copy_(param2.data, non_blocking=True)
                        self.h[j].to("cuda", non_blocking=True)
                torch.cuda.synchronize()
                del cudastreams
                
        if number_of_parts == 32:
            
            if i == 0:
                for param1,param2 in zip(self.h[i].parameters(),self.h[31].parameters()):
                    param1.data = param2.data
                    
                for param1,param2 in zip(self.h[0].parameters(),self.extrastorage[0].parameters()):
                    param1.data = param2.data.to("cuda", non_blocking=True)
                self.h[0].to("cuda", non_blocking=True)
                    
                    
            if i >= 1:
                for param1,param2 in zip(self.h[i].parameters(),self.h[i-1].parameters()):
                    param1.data = param2.data
                    
                for param1,param2 in zip(self.h[i].parameters(),self.extrastorage[i].parameters()):
                    param1.data.copy_(param2.data, non_blocking=True)
                self.h[i].to("cuda", non_blocking=True)
        
        attn_type = self.config.attention_layers[i]
        attn_mask = global_attention_mask if attn_type == "global" else attention_mask

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if getattr(self.config, "gradient_checkpointing", False) and self.training:

            if use_cache:
                logger.warning(
                    "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
                    "`use_cache=False`..."
                )
                use_cache = False

            def create_custom_forward(module):
                def custom_forward(*inputs):
                    # None for past_key_value
                    return module(*inputs, use_cache, output_attentions)

                return custom_forward

            outputs = torch.utils.checkpoint.checkpoint(
                create_custom_forward(block),
                hidden_states,
                None,
                attn_mask,
                head_mask[i],
            )
        else:
            outputs = block(
                hidden_states,
                layer_past=layer_past,
                attention_mask=attn_mask,
                head_mask=head_mask[i],
                use_cache=use_cache,
                output_attentions=output_attentions,
            )

        hidden_states = outputs[0]
        if use_cache is True:
            presents = presents + (outputs[1],)

        if output_attentions:
            all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

    hidden_states = self.ln_f(hidden_states)

    hidden_states = hidden_states.view(*output_shape)
    # Add last hidden state
    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    if not return_dict:
        return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=presents,
        hidden_states=all_hidden_states,
        attentions=all_self_attentions,
    )

In [10]:
print(GPTNeoModel.forward)
print(new_forward)
GPTNeoModel.forward = new_forward

<function GPTNeoModel.forward at 0x000002198FE07708>
<function new_forward at 0x0000021989774318>


In [11]:
print(GPTNeoModel.forward)

<function new_forward at 0x0000021989774318>


# Prepare model for ram-vram swaps

In [12]:
model.eval().to("cpu")
model.transformer.wte.to("cuda")
model.transformer.wpe.to("cuda")
model.transformer.ln_f.to("cuda")
model.lm_head.to("cuda")
torch.cuda.empty_cache()

In [13]:
for param in model.transformer.wte.parameters():
    param.requires_grad = False
for param in model.transformer.wpe.parameters():
    param.requires_grad = False
for i in range(32):
    for param in model.transformer.h[i].parameters():
        param.requires_grad = False
for param in model.transformer.ln_f.parameters():
    param.requires_grad = False
for param in model.lm_head.parameters():
    param.requires_grad = False

# extra storage for model.transformer.h (will use extra 5gb ram temporarily)

In [14]:
setattr(model.transformer,"extrastorage",None)

In [15]:
model.transformer.extrastorage = copy.deepcopy(model.transformer.h)

In [16]:
smalltensor = torch.tensor(0).to("cuda")
for j in range(32):
    for param1 in model.transformer.h[j].parameters():
        param1.data = smalltensor
gc.collect()
torch.cuda.empty_cache()

In [17]:
model.transformer.extrastorage.to("cpu")
for i in range(32):
    for param in model.transformer.extrastorage[i].parameters():
        param.requires_grad = False
        param.data.pin_memory()
gc.collect()
torch.cuda.empty_cache()

In [18]:
if number_of_parts == 2:
    for j in range(16,32):
        for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.extrastorage[j].parameters()):
            param1.data = param2.data.to("cuda", non_blocking=True)
        model.transformer.h[j].to("cuda", non_blocking=True)  
    print("number_of_parts = 4" )
    
if number_of_parts == 4:
    for j in range(24,32):
        for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.extrastorage[j].parameters()):
            param1.data = param2.data.to("cuda", non_blocking=True)
        model.transformer.h[j].to("cuda", non_blocking=True)  
    print("number_of_parts = 4" )
    
if number_of_parts == 32:
    for param1,param2 in zip(model.transformer.h[31].parameters(),model.transformer.extrastorage[31].parameters()):
        param1.data = param2.data.to("cuda", non_blocking=True)
    model.transformer.h[31].to("cuda", non_blocking=True)  
    print("number_of_parts = 32" )

number_of_parts = 32


RUN MODEL

In [19]:
with torch.no_grad():
    start_time = time.time()
    gc.collect()
    basic_output = model.generate(
        ids.long().to("cuda"),
        do_sample=True,
        num_beams=2,
        min_length=max_length,
        max_length=max_length,
        temperature=temperature,
        top_k = top_k,
        top_p = top_p,
        repetition_penalty = repetition_penalty,
        repetition_penalty_range = repetition_penalty_range,
        repetition_penalty_slope = repetition_penalty_slope,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences = 2
    ).long()

    torch.cuda.empty_cache()
    gc.collect()
    print(tokenizer.decode(basic_output[0]))

    print(time.time()  - start_time)

test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test 

# Old Debug VRAM Bandwidth test (2 parts)

In [20]:
smalltensor = torch.tensor(0).to("cuda")
for j in range(0,32):
    for param1 in model.transformer.h[j].parameters():
        param1.data = smalltensor
torch.cuda.empty_cache()
gc.collect()
start_time = time.time()
#0.8gb vram when no model, 5.7gb with full model. 1 loop = 4.9gb transfer to gpu
for i in range(50): 
    smalltensor = torch.tensor(0).to("cuda")
    for j in range(0,16):
        for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.extrastorage[j].parameters()):
            param1.data = param2.data.to("cuda", non_blocking=True)
        model.transformer.h[j].to("cuda", non_blocking=True)
    for j in range(0,16):
        for param1 in model.transformer.h[j].parameters():
            param1.data = smalltensor
    #gc.collect()
    torch.cuda.empty_cache()
    for j in range(16,32):
        for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.extrastorage[j].parameters()):
            param1.data = param2.data.to("cuda", non_blocking=True)
        model.transformer.h[j].to("cuda", non_blocking=True)
    for j in range(16,32):
        for param1 in model.transformer.h[j].parameters():
            param1.data = smalltensor
    torch.cuda.empty_cache()
print(time.time()  - start_time)

62.58732867240906


# REUSE MEMORY Bandwidth test 32 parts

In [21]:
smalltensor = torch.tensor(0).to("cuda")
for j in range(0,32):
    for param1 in model.transformer.h[j].parameters():
        param1.data = smalltensor
for param1,param2 in zip(model.transformer.h[31].parameters(),model.transformer.extrastorage[31].parameters()):
    param1.data = param2.data.to("cuda", non_blocking=True)
model.transformer.h[31].to("cuda", non_blocking=True)  

torch.cuda.empty_cache()
gc.collect()
start_time = time.time()
#0.8gb vram when no model, 5.7gb with full model. 1 loop = 4.9gb transfer to gpu
for i in range(50):
    for current_number in range(32):
        if current_number == 0:
            for param1,param2 in zip(model.transformer.h[0].parameters(),model.transformer.h[31].parameters()):
                param1.data = param2.data
            for param1,param2 in zip(model.transformer.h[current_number].parameters(),model.transformer.extrastorage[current_number].parameters()):
                param1.data.copy_(param2.data, non_blocking=True)
            model.transformer.h[0].to("cuda", non_blocking=True)
            
        if current_number >= 1:
            for param1,param2 in zip(model.transformer.h[current_number].parameters(),model.transformer.h[current_number-1].parameters()):
                param1.data = param2.data
            for param1,param2 in zip(model.transformer.h[current_number].parameters(),model.transformer.extrastorage[current_number].parameters()):
                param1.data.copy_(param2.data, non_blocking=True)
            model.transformer.h[current_number].to("cuda", non_blocking=True)
        
print(time.time()  - start_time)

45.50335764884949


# REUSE MEMORY 4 PART CUDA

In [22]:
smalltensor = torch.tensor(0).to("cuda")
for j in range(0,32):
    for param1 in model.transformer.h[j].parameters():
        param1.data = smalltensor

for j in range(24,32):
    for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.extrastorage[j].parameters()):
        param1.data = param2.data.to("cuda", non_blocking=True)
    model.transformer.h[j].to("cuda", non_blocking=True)
    
start_time = time.time()
#0.8gb vram when no model, 5.7gb with full model. 1 loop = 4.9gb transfer to gpu and delete
for i in range(50):
    #PART 1
    #SET STUFF
    for j in range(0,8):
        for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.h[j+24].parameters()):
            param1.data = param2.data
    #CUDASTREAMS
    cudastreams = {}
    for j in range(0,8):
        cudastreams[j] = torch.cuda.Stream()
    for j in range(0,8):
        with torch.cuda.stream(cudastreams[j]):
            for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.extrastorage[j].parameters()):
                param1.data.copy_(param2.data, non_blocking=True)
                pass
            model.transformer.h[j].to("cuda", non_blocking=True)
    torch.cuda.synchronize()
    del cudastreams
    
    #PART 2
    #SET STUFF
    for j in range(8,16):
        for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.h[j-8].parameters()):
            param1.data = param2.data
    #CUDASTREAMS
    cudastreams = {}
    for j in range(8,16):
        cudastreams[j] = torch.cuda.Stream()
    for j in range(8,16):
        with torch.cuda.stream(cudastreams[j]):
            for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.extrastorage[j].parameters()):
                param1.data.copy_(param2.data, non_blocking=True)
                pass
            model.transformer.h[j].to("cuda", non_blocking=True)
    torch.cuda.synchronize()
    del cudastreams
    
    #PART 3
    #SET STUFF
    for j in range(16,24):
        for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.h[j-8].parameters()):
            param1.data = param2.data
    #CUDASTREAMS
    cudastreams = {}
    for j in range(16,24):
        cudastreams[j] = torch.cuda.Stream()
    for j in range(16,24):
        with torch.cuda.stream(cudastreams[j]):
            for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.extrastorage[j].parameters()):
                param1.data.copy_(param2.data, non_blocking=True)
                pass
            model.transformer.h[j].to("cuda", non_blocking=True)
    torch.cuda.synchronize()
    del cudastreams
    
    #PART 4
    #SET STUFF
    for j in range(24,32):
        for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.h[j-8].parameters()):
            param1.data = param2.data
    #CUDASTREAMS
    cudastreams = {}
    for j in range(24,32):
        cudastreams[j] = torch.cuda.Stream()
    for j in range(24,32):
        with torch.cuda.stream(cudastreams[j]):
            for param1,param2 in zip(model.transformer.h[j].parameters(),model.transformer.extrastorage[j].parameters()):
                param1.data.copy_(param2.data, non_blocking=True)
                pass
            model.transformer.h[j].to("cuda", non_blocking=True)  
    torch.cuda.synchronize()
    del cudastreams
    
print(time.time()  - start_time)

46.776641607284546
