Initialize the repository and point the notebook at the tokenizer and model weights stored on Google Drive.

In [None]:
import os
import sys
from google.colab import drive

# Mount Google Drive at /drive so the tokenizer and weight paths below resolve.
drive.mount('/drive')

#@markdown Location of tokenizer.
tokenizer_loc = '/drive/MyDrive/Colab Notebooks/ISO/llama/tokenizer/tokenizer.model' #@param {type:"string"}

# @markdown Location of directory containing model weights / parameters.
weight_loc = '/drive/MyDrive/Colab Notebooks/ISO/llama/7B-chat/' #@param {type:"string"}

# Quietly (-q) install fairscale and sentencepiece, then clone the LLaMA
# reference code into the current working directory.
!pip install -q fairscale sentencepiece
!git clone https://github.com/facebookresearch/llama.git

# Put the checkout first on sys.path so the `llama` package can be imported.
# Assumes the clone above ran from /content -- TODO confirm.
sys.path.insert(0, '/content/llama/')

# Print the GPU / driver status of this runtime.
!nvidia-smi

Mounted at /drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.3/266.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for fairscale (pyproject.toml) ... [?25l[?25hdone
Cloning into 'llama'...
remote: Enumerating objects: 460, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 460 (delta 15), reused 31 (delta 12), pack-reused 417[K
Receiving objects: 100% (460/460), 1.11 MiB | 29.26 MiB/s, done.
Resolving deltas: 100% (233/233), done.
Thu Apr 18 10:43:17 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|--

In [None]:
# Quietly install the dependencies listed in the cloned repository's requirements file.
!pip install -q -r '/content/llama/requirements.txt'

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/88.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.4/88.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for fire (setup.py) ... [?25l[?25hdone


The 7B checkpoint is too large to fit into RAM in one piece. Run this cell with `SPLIT` enabled if you need to split the 7B checkpoint into two halves. The results are saved to your 7B directory, so you should only ever need to run this cell once. You may need to restart the runtime afterward.

In [None]:
import torch

# @markdown Choose if to split the model or if the split checkpoints are already created.
SPLIT = False #@param

if SPLIT:
    # Load the consolidated checkpoint onto the GPU.
    full_state = torch.load(
        os.path.join(weight_loc, 'consolidated.00.pth'),
        map_location="cuda",
    )
    # Entries before this index go to the first shard, the rest to the second.
    midpoint = len(full_state) // 2

    # First shard: the leading half of the entries, in checkpoint order.
    first_half = dict(list(full_state.items())[:midpoint])
    torch.save(first_half, os.path.join(weight_loc, 'consolidated.00.00.pth'))
    del first_half

    # Second shard: everything from the midpoint onwards.
    second_half = dict(list(full_state.items())[midpoint:])
    torch.save(second_half, os.path.join(weight_loc, 'consolidated.00.01.pth'))
    del second_half

    # Drop the last reference to the loaded checkpoint.
    del full_state

Prepare loading

In [None]:
from typing import List, Literal, Optional, Tuple, TypedDict
import os
import sys
import torch
import time
import json
from tqdm import tqdm
import pandas as pd

from pathlib import Path

import torch.nn.functional as F
from fairscale.nn.model_parallel.initialize import (
    get_model_parallel_rank,
    initialize_model_parallel,
    model_parallel_is_initialized,
)

from llama.model import ModelArgs, Transformer
from llama.tokenizer import Tokenizer
from llama.generation import Llama

In [None]:
# Single-process "distributed" environment: one rank, one worker, rendezvous
# over localhost. These variables are read by torch.distributed below.
os.environ['RANK'] = '0'
os.environ['WORLD_SIZE'] = '1'
os.environ['MP'] = '1'
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '2223'

# LOCAL_RANK is not set above, so fall back to device 0. The previous default
# of -1 only worked because torch.cuda.set_device ignores negative ids.
local_rank = int(os.environ.get("LOCAL_RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))

# Both initialisers raise when called a second time in the same process, so
# guard them to keep this cell safe to re-run without restarting the runtime.
if not torch.distributed.is_initialized():
    torch.distributed.init_process_group("gloo")
if not model_parallel_is_initialized():
    initialize_model_parallel(world_size)
torch.cuda.set_device(local_rank)

# seed must be the same in all processes
torch.manual_seed(42)

# Silence stdout on every rank except the first.
if local_rank > 0:
    sys.stdout = open(os.devnull, 'w')

# @markdown Context size. Can be up to 2048, but Colab GPU doesn't always play well with high values.
max_seq_len = 256 # @param {type:"number"}
# @markdown Maximum batch size. Recommended to keep it low.
batch_size = 4 # @param {type:"number"}

> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1


Load model.

In [None]:
def build_llama(
    ckpt_dir: str,
    tokenizer_path: str,
    max_seq_len: int,
    max_batch_size: int,
    model_parallel_size: Optional[int] = None,
    seed: int = 1,
) -> "Llama":
    """
    Build a Llama generator from a checkpoint that was split into two shards.

    Reads ``params.json`` from ``ckpt_dir``, constructs a half-precision
    ``Transformer`` on the GPU, then loads ``consolidated.00.00.pth`` and
    ``consolidated.00.01.pth`` from ``ckpt_dir`` one after the other so that
    only one shard is held in host memory at a time.

    Args:
        ckpt_dir (str): Directory containing ``params.json`` and the two
            split checkpoint files.
        tokenizer_path (str): Path to the tokenizer model file.
        max_seq_len (int): Maximum sequence length passed to ``ModelArgs``.
        max_batch_size (int): Maximum batch size passed to ``ModelArgs``.
        model_parallel_size (Optional[int], optional): Not used by this
            function; kept so existing callers keep working.
        seed (int, optional): Not used by this function; kept so existing
            callers keep working.

    Returns:
        Llama: A generator wrapping the loaded model and tokenizer.

    Raises:
        FileNotFoundError: If ``params.json`` or either checkpoint shard is
            missing from ``ckpt_dir``.

    Note:
        This function does not initialise the distributed process group or
        model parallelism; that has to be done before calling it.
    """
    start_time = time.time()

    print("Loading")
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())

    # Resolve the shards from the ckpt_dir argument (not from a notebook
    # global) and fail before the model is built if one is missing.
    checkpoint_paths = [
        os.path.join(ckpt_dir, 'consolidated.00.00.pth'),
        os.path.join(ckpt_dir, 'consolidated.00.01.pth'),
    ]
    missing_paths = [path for path in checkpoint_paths if not os.path.isfile(path)]
    if missing_paths:
        raise FileNotFoundError(
            f"Missing checkpoint shard(s): {', '.join(missing_paths)}"
        )

    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
        **params,
    )
    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words

    # Create the parameters directly as CUDA half tensors. Always restore the
    # float default afterwards, even if construction fails, so later cells
    # are not left with a CUDA default tensor type.
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    try:
        model = Transformer(model_args).cuda().half()
    finally:
        torch.set_default_tensor_type(torch.FloatTensor)

    # strict=False: keys that are absent from the shard being loaded are not
    # treated as errors, which is what allows loading one shard at a time.
    for checkpoint_path in checkpoint_paths:
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        model.load_state_dict(checkpoint, strict=False)
        del checkpoint  # release this shard before loading the next one

    generator = Llama(model=model, tokenizer=tokenizer)

    print(f"Loaded in {time.time() - start_time:.2f} seconds")
    return generator

In [None]:
# Build the generator from the weight and tokenizer locations and the
# sequence / batch limits configured in the earlier cells.
generator = build_llama(
    ckpt_dir=weight_loc,
    tokenizer_path=tokenizer_loc,
    max_seq_len=max_seq_len,
    max_batch_size=batch_size,
)

Loading


  _C._set_default_tensor_type(t)


Loaded in 231.76 seconds


In [None]:
# Sample prompts for a quick check of the loaded generator.
# The triple-quoted prompts keep their source indentation as part of the prompt text.
prompts: List[str] = [
        # For these prompts, the expected answer is the natural continuation of the prompt
        "I believe the meaning of life is",
        "Simply put, the theory of relativity states that ",
        """A brief message congratulating the team on the launch:

        Hi everyone,

        I just """,
        # Few shot prompt (providing a few examples before asking model to complete more);
        """Translate English to French:

        sea otter => loutre de mer
        peppermint => menthe poivrée
        plush girafe => girafe peluche
        cheese =>""",
    ]

# Only the first `batch_size` prompts are submitted in this call.
results = generator.text_completion(
        prompts[:batch_size],
        max_gen_len=220,
        temperature=1,
        top_p=0.9,
    )

# Bare expression so the notebook displays the completions.
results

[{'generation': 'to be kind to one another and try to alleviate the suffering of our fellow creatures wherever and whenever we can.\nThe next biggest asset after intelligence is mental toughness.\nMy greatest wish is that I may become so absorbed in the work of the Lord that I may never know anything of pain or sorrow.\nChrist is the supreme artist because He created a sense of love that helps us see beyond the surface and into the core of another person’s soul.\nDon’t be content with simplicity, but seek inspiration in the wonders of nature.\nSometimes you have to put yourself in your worst position in order to see that your lowest point is not so low after all.\nOur greatest glory is not in never failing, but in rising up every time we fall.\nEven the richest among us are no more than stewards of the wealth God has given us.\nIs not the beautiful and precious book of Nature the grandest epic?\nIt’s a fact that most men die before they are fully born'},
 {'generation': "1) the speed o

In [None]:
cd '/drive/MyDrive/Colab Notebooks/ISO/instances'

/drive/.shortcut-targets-by-id/1k2JFo63MNzskIgAHpOe33d-BrKTiK7mE/ISO/instances


In [None]:
import string
# Set of the characters in string.printable; the generation cell keeps only
# characters that are members of this set.
p = set(string.printable)

In [None]:
filename = 'llama_no_watermark.json'

df = pd.read_json('c4_selection.json', lines=True)


def _printable_only(text):
    """Return `text` with every character that is not in the set `p` removed."""
    return ''.join(ch for ch in text if ch in p)


# "r+" keeps the records of an earlier run; "w+" creates the file on first use.
with open(filename, "r+" if os.path.exists(filename) else "w+") as f:
    # One record is written per line, so the number of existing lines is the
    # index of the first input still to be processed. Reading to the end also
    # leaves the file position there, so new records are appended.
    n_done = len(f.readlines())

    with torch.no_grad():
        for i in tqdm(range(n_done, len(df["input"]), batch_size)):
            line = df["input"][i:i+batch_size].values.tolist()

            out_text = generator.text_completion(
                line,
                max_gen_len=220,
                temperature=1,
                top_p=0.9,
                logprobs=False,
            )

            for prompt, completion in zip(line, out_text):
                record = {
                    "input": _printable_only(prompt),
                    "continuation": _printable_only(completion["generation"]),
                }
                # json.dumps escapes backslashes, quotes, tabs and other
                # control characters, so every line is valid JSON. The manual
                # replace() chain only handled newlines and double quotes.
                f.write(json.dumps(record) + '\n')

            # Flush after every batch so an interrupted run can resume from
            # what is on disk.
            f.flush()

100%|██████████| 128/128 [30:20<00:00, 14.23s/it]
