Initialize the repository and point the notebook at the tokenizer and model weights stored on Google Drive.

In [1]:
import os
import sys
from google.colab import drive

# Mount google drive.
drive.mount('/drive')

#@markdown Location of tokenizer.
tokenizer_loc = '/drive/MyDrive/Colab Notebooks/ISO/llama/tokenizer/tokenizer.model' #@param {type:"string"}

# @markdown Location of directory containing model weights / parameters.
weight_loc = '/drive/MyDrive/Colab Notebooks/ISO/llama/7B-chat/' #@param {type:"string"}

!pip install -q fairscale sentencepiece
!git clone https://github.com/facebookresearch/llama.git

sys.path.insert(0, '/content/llama/')

!nvidia-smi

Mounted at /drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.3/266.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for fairscale (pyproject.toml) ... [?25l[?25hdone
Cloning into 'llama'...
remote: Enumerating objects: 460, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 460 (delta 15), reused 31 (delta 12), pack-reused 417[K
Receiving objects: 100% (460/460), 1.11 MiB | 10.87 MiB/s, done.
Resolving deltas: 100% (233/233), done.
Wed Apr 24 19:26:08 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|--

In [2]:
!pip install -q -r '/content/llama/requirements.txt'

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/88.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m81.9/88.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.4/88.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for fire (setup.py) ... [?25l[?25hdone


The 7B checkpoint is too large to fit into RAM. Run this cell if you need to split the 7B checkpoint into two halves. The halves are saved to your 7B directory, so you should only ever need to run this cell once. You may need to restart the runtime afterward.

In [3]:
import torch

# @markdown Choose if to split the model or if the split checkpoints are already created.
SPLIT = False #@param

if SPLIT:
    # Load the consolidated checkpoint onto the GPU, then write its entries
    # back out as two files holding the first and second half of the keys.
    full_state = torch.load(
        os.path.join(weight_loc, 'consolidated.00.pth'),
        map_location="cuda",
    )
    entries = list(full_state.items())
    midpoint = len(entries) // 2

    first_half = dict(entries[:midpoint])
    torch.save(first_half, os.path.join(weight_loc, 'consolidated.00.00.pth'))
    del first_half

    second_half = dict(entries[midpoint:])
    torch.save(second_half, os.path.join(weight_loc, 'consolidated.00.01.pth'))
    del second_half

    # Drop every remaining reference to the loaded tensors.
    del entries, midpoint
    del full_state

Prepare loading

In [4]:
from typing import List, Literal, Optional, Tuple, TypedDict
import os
import sys
import torch
import time
import json
from tqdm import tqdm
import pandas as pd

from pathlib import Path

import torch.nn.functional as F
from fairscale.nn.model_parallel.initialize import (
    get_model_parallel_rank,
    initialize_model_parallel,
    model_parallel_is_initialized,
)

from llama.model import ModelArgs, Transformer
from llama.tokenizer import Tokenizer
from llama.generation import Llama, Dialog

In [5]:
# Describe a single-process "cluster" through environment variables so the
# distributed / model-parallel initialisation below can run in one runtime.
os.environ['RANK'] = '0'
os.environ['WORLD_SIZE'] = '1'
os.environ['MP'] = '1'
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '2223'

# LOCAL_RANK is not set above, so its default is what normally applies here.
# Default to device 0 instead of -1 so torch.cuda.set_device receives a real
# device index rather than relying on negative indices being ignored.
local_rank = int(os.environ.get("LOCAL_RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))

# init_process_group raises when called twice in the same process; guard both
# initialisations so this cell can be re-run without restarting the runtime.
if not torch.distributed.is_initialized():
    torch.distributed.init_process_group("gloo")
if not model_parallel_is_initialized():
    initialize_model_parallel(world_size)
torch.cuda.set_device(local_rank)

# seed must be the same in all processes
torch.manual_seed(42)

# Only rank 0 keeps its stdout; any other rank is silenced.
if local_rank > 0:
    sys.stdout = open(os.devnull, 'w')

# @markdown Context size. Can be up to 2048, but Colab GPU doesn't always play well with high values.
max_seq_len = 1024 # @param {type:"number"}
# @markdown Maximum batch size. Recommended to keep it low.
batch_size = 2 # @param {type:"number"}

> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1


Load model.

In [6]:
def build_llama(
    ckpt_dir: str,
    tokenizer_path: str,
    max_seq_len: int,
    max_batch_size: int,
    model_parallel_size: Optional[int] = None,
    seed: int = 1,
) -> "Llama":
    """
    Build a Llama generator from a checkpoint that was split into two shards.

    The model is created on the GPU in half precision and then populated from
    ``consolidated.00.00.pth`` and ``consolidated.00.01.pth`` inside
    ``ckpt_dir``, one shard at a time, each loaded via the CPU.

    Args:
        ckpt_dir (str): Directory containing ``params.json`` and the two
            checkpoint shards.
        tokenizer_path (str): Path to the tokenizer file.
        max_seq_len (int): Maximum sequence length passed to ``ModelArgs``.
        max_batch_size (int): Maximum batch size passed to ``ModelArgs``.
        model_parallel_size (Optional[int], optional): Accepted for signature
            compatibility; not used by this function.
        seed (int, optional): Accepted for signature compatibility; not used by
            this function (no seeding happens here).

    Returns:
        Llama: A ``Llama`` wrapper around the loaded model and tokenizer.

    Raises:
        FileNotFoundError: If ``params.json`` or one of the two checkpoint
            shards is missing from ``ckpt_dir``.

    Note:
        This function does not initialize the distributed process group or
        model parallelism; that has to be done before calling it.
    """
    start_time = time.time()

    print("Loading")
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.load(f)

    # Resolve shards relative to the ckpt_dir argument (not a notebook global)
    # and fail fast, before any GPU memory is allocated, if one is missing.
    checkpoint_paths = [
        os.path.join(ckpt_dir, shard_name)
        for shard_name in ('consolidated.00.00.pth', 'consolidated.00.01.pth')
    ]
    absent = [path for path in checkpoint_paths if not os.path.isfile(path)]
    if absent:
        raise FileNotFoundError(f"Missing checkpoint shard(s): {absent}")

    model_args: ModelArgs = ModelArgs(max_seq_len=max_seq_len,
                                      max_batch_size=max_batch_size,
                                      **params)
    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words

    # Create the parameters directly as CUDA half tensors, and always restore
    # the default tensor type, even if construction fails (e.g. out of memory).
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    try:
        model = Transformer(model_args).cuda().half()
    finally:
        torch.set_default_tensor_type(torch.FloatTensor)

    # Each shard only holds part of the keys, hence strict=False. Track which
    # model entries were never supplied so that gaps do not go unnoticed.
    not_loaded = set(model.state_dict().keys())
    for checkpoint_path in checkpoint_paths:
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        model.load_state_dict(checkpoint, strict=False)
        not_loaded.difference_update(checkpoint.keys())
        del checkpoint

    if not_loaded:
        print(f"Warning: {len(not_loaded)} model entries were not found in any "
              f"checkpoint shard, e.g. {sorted(not_loaded)[:5]}")

    generator = Llama(model=model, tokenizer=tokenizer)

    print(f"Loaded in {time.time() - start_time:.2f} seconds")
    return generator

In [7]:
# Build the generator from the Drive locations and the limits configured above.
generator = build_llama(
    ckpt_dir=weight_loc,
    tokenizer_path=tokenizer_loc,
    max_seq_len=max_seq_len,
    max_batch_size=batch_size,
)

Loading


  _C._set_default_tensor_type(t)


Loaded in 253.66 seconds


In [8]:
cd '/drive/MyDrive/Colab Notebooks/ISO/instances'

/drive/.shortcut-targets-by-id/1k2JFo63MNzskIgAHpOe33d-BrKTiK7mE/ISO/instances


In [9]:
import string
# Characters that are allowed to survive into the written records.
p = set(string.printable)

# Instruction given to the chat model as the system message. The wording is
# kept exactly as-is because it is part of what the model is conditioned on.
system_prompt = (
    "You are a paraphraser. "
    "You are given an input passage 'INPUT'. "
    "You should paraphrase 'INPUT' to print 'OUTPUT'. "
    "'OUTPUT' shoud be diverse and different as much as possible from 'INPUT' "
    "and should not copy any part verbatim from 'INPUT'. "
    "'OUTPUT' should preserve the meaning and content of 'INPUT' "
    "while maintaining text quality and grammar. "
    "'OUTPUT' should not be much longer than 'INPUT'. "
    "You should print 'OUTPUT' and nothing else so that its easy for me to parse."
)

def build_dialog(text: str) -> Dialog:
    """Return the two-message dialog (system prompt, then user passage) for `text`.

    The user message is `text` prefixed with the literal "INPUT: ".
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": "INPUT: " + text}
    return [system_message, user_message]

def _printable_only(text):
    """Return `text` with every character outside the module-level set `p` removed."""
    return ''.join(ch for ch in text if ch in p)


def _strip_output_marker(content):
    """Remove a leading " OUTPUT:" / "OUTPUT:" marker from `content` if present.

    Text that does not start with the marker is returned unchanged, so nothing
    is cut off when the model omits the marker.
    """
    for marker in (" OUTPUT:", "OUTPUT:"):
        if content.startswith(marker):
            return content[len(marker):]
    return content


def build_paraphrase(original_filename, paraphrase_filename,
                     max_gen_len=1024, temperature=1, top_p=0.9):
    """Paraphrase every record of a JSON-lines file and append the results to another.

    Each record of `original_filename` must provide the columns "input" and
    "continuation". Their concatenation is sent to the global `generator` in
    batches of the global `batch_size`, and one JSON object per record with the
    keys "input", "continuation" and "paraphrase" is written to
    `paraphrase_filename`.

    The function resumes: the number of lines already present in
    `paraphrase_filename` is used as the index of the first record to process,
    and new records are written after the existing ones.

    Args:
        original_filename: Path of the JSON-lines file to read.
        paraphrase_filename: Path of the JSON-lines file to create or extend.
        max_gen_len: Forwarded to `generator.chat_completion`.
        temperature: Forwarded to `generator.chat_completion`.
        top_p: Forwarded to `generator.chat_completion`.
    """
    df = pd.read_json(original_filename, lines=True)
    inputs = df["input"].tolist()
    continuations = df["continuation"].tolist()

    mode = "r+" if os.path.exists(paraphrase_filename) else "w+"
    with open(paraphrase_filename, mode) as f:
        # Reading to the end both counts the finished records and leaves the
        # file position at EOF, so the writes below append.
        n_done = len(f.readlines())

        with torch.no_grad():
            for start in tqdm(range(n_done, len(inputs), batch_size)):
                batch_inputs = inputs[start:start + batch_size]
                batch_continuations = continuations[start:start + batch_size]
                prompts = [
                    build_dialog(text + continuation)
                    for text, continuation in zip(batch_inputs, batch_continuations)
                ]

                completions = generator.chat_completion(
                    prompts,
                    max_gen_len=max_gen_len,
                    temperature=temperature,
                    top_p=top_p,
                )

                for text, continuation, completion in zip(
                        batch_inputs, batch_continuations, completions):
                    content = completion["generation"]["content"]
                    record = {
                        "input": _printable_only(text),
                        "continuation": _printable_only(continuation),
                        "paraphrase": _printable_only(_strip_output_marker(content)),
                    }
                    # json.dumps escapes backslashes and control characters as
                    # well as quotes and newlines, so every line is valid JSON.
                    f.write(json.dumps(record) + "\n")

                # Persist each finished batch so an interrupted run can resume
                # from the records that were actually written.
                f.flush()

In [None]:
build_paraphrase(
    original_filename="kirchenbauer_no_attack.json",
    paraphrase_filename="kirchenbauer_paraphrase.json",
)

100%|██████████| 117/117 [51:39<00:00, 26.49s/it]


In [None]:
build_paraphrase(
    original_filename="kuditipudi_no_attack.json",
    paraphrase_filename="kuditipudi_paraphrase.json",
)

100%|██████████| 256/256 [1:46:17<00:00, 24.91s/it]


In [10]:
build_paraphrase(
    original_filename="wang_no_attack.json",
    paraphrase_filename="wang_paraphrase.json",
)

100%|██████████| 219/219 [1:43:52<00:00, 28.46s/it]


In [None]:
build_paraphrase(
    original_filename="opt_yang_no_attack.json",
    paraphrase_filename="opt_yang_paraphrase.json",
)

100%|██████████| 116/116 [52:55<00:00, 27.38s/it]


In [None]:
build_paraphrase(
    original_filename="llama_yang_no_attack.json",
    paraphrase_filename="llama_yang_paraphrase.json",
)

100%|██████████| 32/32 [11:13<00:00, 21.06s/it]


In [None]:
build_paraphrase(
    original_filename="zhao_no_attack.json",
    paraphrase_filename="zhao_paraphrase.json",
)

100%|██████████| 126/126 [1:01:53<00:00, 29.47s/it]


In [None]:
# Ad-hoc single-dialog completion with a shorter generation limit.
# NOTE(review): no cell visible in this notebook assigns a global named `input`
# (inside build_paraphrase it is only a local variable), so on a fresh kernel
# this name presumably resolves to Python's built-in `input` function rather
# than a dialog. Confirm, and pass an explicit dialog (e.g. the result of
# build_dialog on a sample passage) before re-running this cell.
results = generator.chat_completion(
    [input],  # type: ignore
    max_gen_len=300,
    temperature=1,
    top_p=0.9,
)

In [None]:
# Display the raw list returned by chat_completion (one entry per dialog).
results

[{'generation': {'role': 'assistant',
   'content': " OUTPUT: When dealing with questions that lack clarity or accuracy, it's essential to provide thorough explanations rather than offering inaccurate responses. By doing so, you can help the asker understand the flaws in their question and find relevant information. If you're unsure about the answer, it's better to remain silent rather than providing false information that could potentially confuse or mislead. By maintaining the integrity of the question and the answer, you can build trust and ensure a more meaningful exchange."}}]