# Platypus2-70B 


In [1]:
import gc
import logging
from time import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import ctypes
from functools import partial

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# For RAG
import torch

# For Platypus2
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights
from accelerate.utils.modeling import set_module_tensor_to_device
from safetensors.torch import load_file

def clean_memory():
    """Release as much host and GPU memory as possible.

    Runs the Python garbage collector, asks glibc to hand freed heap pages
    back to the OS, and empties the PyTorch CUDA caching allocator.

    The ``malloc_trim`` step is best-effort: on platforms where
    ``libc.so.6`` cannot be loaded or does not expose ``malloc_trim``
    (non-glibc systems) it is skipped instead of raising, so the other two
    steps still run.
    """
    gc.collect()
    try:
        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except (OSError, AttributeError):
        # OSError: library not found; AttributeError: symbol missing.
        pass
    torch.cuda.empty_cache()
    



## 2: Run Platypus2-70B

To run such a large model on a single T4 GPU, we run it layer by layer and sample by sample.

In [2]:
# Create symlinks from kaggle datasets to fake cached model
#
# The checkpoint is spread over two Kaggle datasets; every file from both
# parts is linked into a single directory that is then used as the
# checkpoint path.

checkpoint_path = Path("/root/.cache/")
checkpoint_path.mkdir(exist_ok=True, parents=True)

for part in [1, 2]:
    source_dir = Path(f'/kaggle/input/platypus2-70b-instruct-part{part}')
    for path in source_dir.glob('*'):
        try:
            (checkpoint_path / path.name).symlink_to(path)
        except FileExistsError:
            # The link is already there (e.g. the cell was re-run); any
            # other failure (permissions, read-only FS, ...) should surface.
            pass

In [3]:
# Class for sharded llama
# Side length of the square attention mask and of the position-id range built in
# ShardedLlama.__call__; also used as the tokenizer truncation length in get_tokens.
MAX_LENGTH = 4096

class ShardedLlama:
    """Run a causal LM one layer at a time so that only one layer's weights are loaded at once.

    The model skeleton is created without weights; each layer's weights are read from
    ``<checkpoint_path>/<layer_name>.safetensors`` right before the layer is executed and
    the layer is moved back to the 'meta' device afterwards.
    """
    
    def __init__(self, checkpoint_path, device='cuda:0', dtype=torch.float16):
        """
        Sharded version of LlamaForCausalLM : the model is split into layer shards to reduce GPU memory usage.
        During the forward pass, the inputs are processed layer by layer, and the GPU memory is freed after each layer.
        To avoid loading the layers multiple times, we could save all the intermediate activations in RAM, but
        as Kaggle accelerators have more GPU memory than CPU, we simply batch the inputs and keep them on the GPU.

        Parameters
        ----------
        checkpoint_path : str or Path
            path to the checkpoint
        device : str, optional
            device, by default 'cuda:0'
        dtype : torch.dtype, optional
            dtype, by default torch.float16
        """
        
        # Save parameters
        self.checkpoint_path = Path(checkpoint_path)
        self.device = device 
        self.dtype = dtype

        # Create model
        self.config = AutoConfig.from_pretrained(self.checkpoint_path)
        # For flash attention when Turing architecture will be supported : https://github.com/Dao-AILab/flash-attention/issues/542
        # self.config.auto_map = {"AutoModelForCausalLM" : "togethercomputer/LLaMA-2-7B-32K--modeling_flash_llama.LlamaForCausalLM"} 
        
        # The tokenizer has no dedicated pad token configured here: EOS is reused as padding,
        # and padding is appended on the right.
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = 'right'
        self.init_model()        
        # Names in execution order; each one doubles as the stem of the .safetensors file read by load_layer
        self.layer_names = ['model.embed_tokens'] + [f'model.layers.{i}' for i in range(len(self.model.model.layers))] + ['model.norm', 'lm_head']
        
    def init_model(self):
        """Build a weight-less model from the config and (re)create ``self.layers``.

        Only the buffers are placed on ``self.device``; parameters are set later,
        layer by layer, by ``load_layer``.
        """
                
        # Load meta model (no memory used)
        with init_empty_weights():
            self.model = AutoModelForCausalLM.from_config(self.config, trust_remote_code=True)
            self.model.tie_weights()
            
        # Same order as self.layer_names: embedding, decoder layers, final norm, LM head
        self.layers = [self.model.model.embed_tokens] + list(self.model.model.layers) + [self.model.model.norm, self.model.lm_head]
            
        # Move buffers to device (not that much GPU memory used)
        for buffer_name, buffer in self.model.named_buffers():
            set_module_tensor_to_device(self.model, buffer_name, self.device, value=buffer, dtype=self.dtype)
       
    def load_layer(self, layer_name):
        """Load the weights stored in ``<checkpoint_path>/<layer_name>.safetensors`` into the model.

        The tensors are read directly onto ``self.device`` and assigned, cast to
        ``self.dtype``, under the parameter names found in the file.
        """
        state_dict = load_file(self.checkpoint_path / (layer_name + '.safetensors'), device=self.device)
        for param_name, param in state_dict.items():
            assert param.dtype != torch.int8, 'int8 not supported (need to add fp16_statistics)'
            set_module_tensor_to_device(self.model, param_name, self.device, value=param, dtype=self.dtype)
        
    def __call__(self, inputs, output_token):
        """Run every sample of ``inputs`` through all layers and return one score array per sample.

        Parameters
        ----------
        inputs : sequence of (prefix, suffix) tensor pairs
            token ids; prefix.shape[0] is expected to be 1 and suffix.shape[0] to equal
            ``n_suffixes`` (hard-coded to 1 below)
        output_token : int
            index into the last dimension of the ``lm_head`` output that is returned as score

        Returns
        -------
        list of numpy.ndarray
            for each sample, ``lm_head`` output at ``output_token`` for each suffix row
        """
        # inputs = [(prefix, suffix), ...] with prefix.shape[0] = 1 and suffix.shape[0] = n_suffixes
        
        # Reboot the model to make sure buffers are loaded and memory is clean
        del self.model
        clean_memory()
        self.init_model()
        
        # Send batch to device
        batch = [(prefix.to(self.device), suffix.to(self.device)) for prefix, suffix in inputs]
        # NOTE(review): debug output - dumps every token tensor of the batch to stdout
        print(batch)
        # Number of suffix rows per sample. The multi-suffix variant of this line was
        # `n_suffixes = len(batch[0][1])`; it is currently pinned to a single suffix.
        n_suffixes = 1

        # Per suffix row: (number of non-pad tokens) - 1, i.e. with right padding the position
        # of the last non-pad token. Pad and EOS share the same id (see __init__).
        suffix_eos = [(suffix != self.tokenizer.pad_token_id).sum(1) - 1 for _, suffix in inputs]

        # Create attention mask for the largest input, and position ids to use KV cache
        # (dtype-min strictly above the diagonal, 0 elsewhere; sliced per sample below)
        attention_mask = torch.finfo(self.dtype).min * torch.ones(MAX_LENGTH, MAX_LENGTH)
        attention_mask = attention_mask.triu(diagonal=1)[None, None, ...]
        attention_mask = attention_mask.to(self.device)
        position_ids = torch.arange(MAX_LENGTH, dtype=torch.long, device=self.device)[None, :]

        with ThreadPoolExecutor() as executor, torch.inference_mode():

            # Load first layer
            future = executor.submit(self.load_layer, 'model.embed_tokens')

            for i, (layer_name, layer) in enumerate(zip(self.layer_names, self.layers)):

                # Wait for previous layer to be loaded and load next layer
                # (the next layer is fetched in the background while the current one runs)
                start = time()
                future.result()
                if (i + 1) < len(self.layer_names):
                    future = executor.submit(self.load_layer, self.layer_names[i + 1])
                load_time = time() - start

                # Run layer
                # batch[j] is overwritten in place: token ids -> embeddings -> hidden states -> lm_head output
                for j, (prefix, suffix) in enumerate(batch):
                    if layer_name == 'model.embed_tokens':
                        batch[j] = (layer(prefix), layer(suffix))
                    elif layer_name == 'model.norm':
                        # Only keep the last hidden state at this point
                        # (the one at position suffix_eos[j] of each suffix row; the prefix is dropped)
                        batch[j] = (None, layer(suffix[torch.arange(n_suffixes), suffix_eos[j]][:, None]))
                    elif layer_name == 'lm_head':
                        batch[j] = (None, layer(suffix))
                    else:
                        # Run prefix
                        len_p, len_s = prefix.shape[1], suffix.shape[1]
                        new_prefix, (k_cache, v_cache) = layer(prefix, use_cache=True, attention_mask=attention_mask[:, :, -len_p:, -len_p:])
                        
                        # Run suffix
                        # Suffix positions continue after the prefix; the mask slice covers the
                        # len_p cached prefix columns followed by the len_s suffix columns, and the
                        # prefix KV cache is repeated once per suffix row.
                        pos = position_ids[:, len_p:len_p + len_s].repeat(n_suffixes, 1)
                        attn = attention_mask[:, :, -len_s:, -len_p - len_s:].repeat(n_suffixes, 1, 1, 1)
                        kv_cache = (k_cache.repeat(n_suffixes, 1, 1, 1), v_cache.repeat(n_suffixes, 1, 1, 1))
                        new_suffix = layer(suffix, past_key_value=kv_cache, position_ids=pos, attention_mask=attn)[0]
                        batch[j] = (new_prefix, new_suffix)

                # Remove previous layer from memory (including buffers)
                layer.to('meta')
                print(f'device {self.device}, {layer_name}, load time : {load_time:.1f}, run time: {time() - start - load_time:.1f}s')

        # Get scores
        # (after model.norm only one position per suffix row is left, hence index 0)
        batch = [suffix[:, 0, output_token].detach().cpu().numpy() for _, suffix in batch]
        
        return batch

In [4]:
# Run model on the 2 GPUs
# Number of chunks the inputs of one device are split into (see np.array_split in run_model)
N_BATCHES = 4
# NOTE(review): not referenced anywhere in the visible code - confirm whether it is still needed
MAX_CONTEXT = 2500

def get_tokens(row, tokenizer):
    """Turn one dataframe row into the (prefix, suffix) token-id tensors fed to the model.

    The prefix is the instruction prompt built around ``row['text']``; the
    suffix is the fixed proposed answer followed by the response header.

    Parameters
    ----------
    row : mapping with a 'text' entry
        the sentence to analyse
    tokenizer : callable
        tokenizer returning a mapping with an 'input_ids' tensor

    Returns
    -------
    tuple
        (prefix ids, suffix ids with the first token column removed)
    """
    template = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input_prefix}"
    task_description = "Your task is to analyze the sentence and it's meaning. If the sentiment is postive, respond postive, if it is not postive respond negative."
    sentence_block = f"Sentence: {row['text']}\nProposed answer: "

    # Options shared by both tokenizer calls
    common = dict(return_tensors="pt", return_attention_mask=False, truncation=True, max_length=MAX_LENGTH)

    prefix = tokenizer(
        template.format(instruction=task_description, input_prefix=sentence_block),
        **common,
    )['input_ids']

    # The suffix continues the prefix, so its leading token (presumably the BOS
    # the tokenizer prepends) is sliced off.
    suffix_ids = tokenizer(["postive\n\n### Response:\n"], padding=True, **common)['input_ids']
    return prefix, suffix_ids[:, 1:]

def run_model(device, df):
    """Score every row of ``df`` with a ShardedLlama living on GPU ``device``.

    Parameters
    ----------
    device : int
        CUDA device index, used as ``cuda:<device>``
    df : pandas.DataFrame
        rows to score; each row is tokenized with ``get_tokens``

    Returns
    -------
    list
        concatenation, in row order, of the per-batch outputs of the model
    """
    model = ShardedLlama(checkpoint_path, device=f'cuda:{device}')
    f = partial(get_tokens, tokenizer=model.tokenizer)
    inputs = df.apply(f, axis=1).values
    outputs = []
    for batch in np.array_split(inputs, N_BATCHES):
        # array_split yields empty chunks when there are fewer rows than
        # N_BATCHES. Calling the model on one would reload every layer from
        # disk just to return an empty list, so skip it.
        if len(batch) == 0:
            continue
        outputs += model(batch, output_token=4874)
    return outputs



In [5]:
# Sentences to score, one dataframe row each (column 'text')
sentence = ['I hate any one who can hurt you', 'I hate you']

df = pd.DataFrame({'text': sentence})

In [6]:
df

Unnamed: 0,text
0,I hate any one who can hurt you
1,I hate you


In [7]:
# Run model
# One worker thread per GPU: the dataframe is cut in two halves, half k goes to
# cuda:k, and the per-device result lists are flattened back in row order.

with ThreadPoolExecutor() as executor:
    outputs = [
        sample_scores
        for device_outputs in executor.map(run_model, [0, 1], np.array_split(df, 2))
        for sample_scores in device_outputs
    ]

[(tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29892,
          3300,  2859,   411,   385,  1881,   393,  8128,  4340,  3030, 29889,
         14350,   263,  2933,   393,  7128,  2486,  1614,  2167,   278,  2009,
         29889,    13,    13,  2277, 29937,  2799,  4080, 29901,    13, 10858,
          3414,   338,   304, 27599,   278, 10541,   322,   372, 29915, 29879,
          6593, 29889,   960,   278, 19688,   338,  1400,   573, 29892, 10049,
          1400,   573, 29892,   565,   372,   338,   451,  1400,   573, 10049,
          8178, 29889,    13,    13,  2277, 29937, 10567, 29901,    13, 29903,
           296,   663, 29901,   306, 26277,   738,   697,  1058,   508, 21682,
           366,    13,  1184,  4752,  1234, 29901, 29871]], device='cuda:0'), tensor([[ 1400,   573,    13,    13,  2277, 29937, 13291, 29901,    13]],
       device='cuda:0'))]
[(tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29892,
          3300,  2859,   411

In [8]:
outputs

[array([1.757], dtype=float16), array([0.9873], dtype=float16)]

In [9]:
# Save results
# Each row gets its score indices sorted from highest to lowest score, written
# as space-separated letters (index 0 -> 'A', 1 -> 'B', ...).
n = len(df)
for row_idx, row_scores in enumerate(outputs):
    ranking = np.argsort(row_scores)[::-1]
    df.loc[row_idx, 'prediction'] = ' '.join('ABCDE'[k] for k in ranking)
df[['prediction']].to_csv('submission.csv')

In [10]:
# Display performances if train set is used

if 'answer' in df.columns:

    # 'prediction' is a space-separated string, so the ranked letters sit at
    # character offsets 0, 2 and 4.
    for i in range(n):
        ranked = df.loc[i, 'prediction']
        for rank, offset in ((1, 0), (2, 2), (3, 4)):
            df.loc[i, f'top_{rank}'] = ranked[offset]

    # Number of rows whose answer matches the 1st / 2nd / 3rd ranked letter
    hits = [(df[f'top_{k}'] == df['answer']).sum() for k in (1, 2, 3)]
    first_hits, second_hits, third_hits = hits
    print(f'top1 : {first_hits}/{n}, top2 : {second_hits}/{n}, top3 : {third_hits}/{n} (total={sum(hits)} / {n})')
    print(f'Accuracy: {100*first_hits/n:.1f}%, map3: {100*(first_hits + second_hits*1/2 + third_hits*1/3).sum()/n:.1f}%')