In [2]:
import os
import json
import random
import re
import string
import tqdm
import argparse
import numpy as np
import pandas as pd
from multiprocessing import Pool
#from functools import partial
from rouge_score import rouge_scorer
#from gpt3_api import make_requests as make_gpt3_requests
from transformers import AutoTokenizer
import transformers
import torch

random.seed(42)


  from .autonotebook import tqdm as notebook_tqdm


{'batch_dir': 'data/gpt3_generations/', 'seed_tasks_path': 'data/galaxy-qa.jsonl', 'num_instructions_to_generate': 1, 'num_prompt_instructions': 2, 'request_batch_size': 1, 'use_clf_seed_tasks_only': False}


In [5]:
import os
import sys


from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

#from utils.callbacks import Iteratorize, Stream
#from utils.prompter import Prompter

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "CPU"

'''def main(
    load_8bit: bool = False,
    base_model: str = "",
    lora_weights: str = "tloen/alpaca-lora-7b",
    prompt_template: str = "",  # The prompt template to use, will default to alpaca.
):'''

load_8bit = True
base_model = "decapoda-research/llama-7b-hf"
lora_weights = "tloen/alpaca-lora-7b"


assert (
    base_model
), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"


tokenizer = LlamaTokenizer.from_pretrained(base_model)

if device == "cuda":
    model = LlamaForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        torch_dtype=torch.float16,
    )
elif device == "mps":
    model = LlamaForCausalLM.from_pretrained(
        base_model,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
else:
    model = LlamaForCausalLM.from_pretrained(
        base_model, device_map={"": device}, low_cpu_mem_usage=True
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        device_map={"": device},
    )

# unwind broken decapoda-research config
model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
model.config.bos_token_id = 1
model.config.eos_token_id = 2

if not load_8bit:
    model.half()  # seems to fix bugs for some users.

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 33/33 [00:27<00:00,  1.19it/s]


In [None]:
args = {"batch_dir": "data/gpt3_generations/",
       "seed_tasks_path": "data/galaxy-qa.jsonl",
       "num_instructions_to_generate": 3,
        "num_prompt_instructions": 2,
        "request_batch_size": 1,
        "use_clf_seed_tasks_only": False
       }

print(args)

In [35]:
"""
A dedicated helper to manage templates and prompt building.
"""

import json
import os.path as osp
from typing import Union


class Prompter(object):
    __slots__ = ("template", "_verbose")

    def __init__(self, template_name: str = "", verbose: bool = False):
        self._verbose = verbose
        if not template_name:
            # Enforce the default here, so the constructor can be called with '' and will not break.
            template_name = "alpaca"
        file_name = osp.join("templates", f"{template_name}.json")
        if not osp.exists(file_name):
            raise ValueError(f"Can't read {file_name}")
        with open(file_name) as fp:
            self.template = json.load(fp)
        if self._verbose:
            print(
                f"Using prompt template {template_name}: {self.template['description']}"
            )

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(
                instruction=instruction
            )
        if label:
            res = f"{res}{label}"
        if self._verbose:
            print(res)
        return res

    def get_response(self, output: str) -> str:
        return output #output.split(self.template["response_split"])[1].strip()

prompt_template = "alpaca_1"
prompter = Prompter(prompt_template)

In [36]:
def encode_prompt(prompt_instructions, classification=False):
    """Encode multiple prompt instructions into a single string."""
    if classification:
        prompt = "Come up with a series of classification tasks. Try to specify the possible output labels when possible.\n"
    else:
        #prompt = "Come up with a series of tasks:\n"
        prompt = ""
    for idx, instruction in enumerate(prompt_instructions):
        instruction = re.sub(r"\s+", " ", instruction).strip().rstrip(":")
        prompt += f"{idx+1}. {instruction}\n"
    prompt += f"{len(prompt_instructions) + 1}."
    return prompt


def sample_machine_instructions(machine_instructions, similarities, n):
    """Sample n machine instructions from a list of machine instructions."""
    return random.sample(machine_instructions, min(n, len(machine_instructions)))


def find_word_in_string(w, s):
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search(s)



In [37]:
def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    stream_output=False
):
    prompt = prompter.generate_prompt(instruction, input)
    print(prompt)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
    )

    generate_params = {
        "input_ids": input_ids,
        "generation_config": generation_config,
        "return_dict_in_generate": True,
        "output_scores": True,
        "max_new_tokens": max_new_tokens,
    }

   # print(prompt, input_ids)

    # Without streaming
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return prompter.get_response(output)
    #return dir(generation_output)
    #print(generation_output)
    #s = generation_output.sequences[0]
    #output = tokenizer.decode(generation_output)
    #print(s)
    #yield output #prompter.get_response(output)

In [38]:
seed_tasks = [json.loads(l) for l in open(args["seed_tasks_path"], "r")]
if args["use_clf_seed_tasks_only"]:
    seed_tasks = [t for t in seed_tasks if t["is_classification"]]
seed_instructions = [t["instruction"] for t in seed_tasks]
print(f"Loaded {len(seed_instructions)} human-written seed instructions")

os.makedirs(args["batch_dir"], exist_ok=True)

Loaded 8 human-written seed instructions


In [39]:
# similarities = {}
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)
machine_instructions = []
# now let's generate new instructions!
progress_bar = tqdm.tqdm(total=args["num_instructions_to_generate"])
if machine_instructions:
    progress_bar.update(len(machine_instructions))

with open(os.path.join(args["batch_dir"], "machine_generated_instructions.jsonl"), "a") as fout:
    #while len(machine_instructions) < args["num_instructions_to_generate"]:
    batch_inputs = []
    for _ in range(args["request_batch_size"]):
        # sample machine instructions from the pool
        prompt_instructions = sample_machine_instructions(
            machine_instructions, 
            similarities=None,
            n=2)
        #print("prompt_instructions")
        #print(prompt_instructions)
        #print()
        # sample human instructions from the pool
        prompt_instructions += random.sample(seed_instructions, args["num_prompt_instructions"] - len(prompt_instructions))
        random.shuffle(prompt_instructions)
        prompt = encode_prompt(prompt_instructions, classification=args["use_clf_seed_tasks_only"])
        #print("prompt")
        #print(prompt)
        #print()
        #batch_inputs.append(prompt)
        #print(batch_inputs)
        print("Response:", evaluate(prompt))


  0%|                                                                                                                                                                                | 0/1 [02:17<?, ?it/s][A


Extend the series of questions mentioned in instructions:

### Instruction:
1. Why do we need to barcode a read transcript too? Isn’t mapping it against the reference genome enough?
2. Does the distribution of target identifications differ from the decoy distribution?
3.

### Response:

Response: <unk>Extend the series of questions mentioned in instructions:

### Instruction:
1. Why do we need to barcode a read transcript too? Isn’t mapping it against the reference genome enough?
2. Does the distribution of target identifications differ from the decoy distribution?
3.

### Response:
### Instruction:
1. Barcoding a read transcript is necessary because it provides a unique identifier for each read transcript. This identifier can be used to link the read transcript to other data such as gene expression data.
2. No, the distribution of target identifications does not differ from the decoy distribution.
3. No, the distribution of target identifications does not differ from the decoy distrib

In [70]:
# testing code for readme
#for instruction in [
#    "Tell me about alpacas.",
#    "Tell me about the president of Mexico in 2019.",
    #"Tell me about the king of France in 2019.",
    #"List all Canadian provinces in alphabetical order.",
    #"Write a Python program that prints the first 10 Fibonacci numbers.",
    #"Write a program that prints the numbers from 1 to 100. But for multiples of three print 'Fizz' instead of the number and for the multiples of five print 'Buzz'. For numbers which are multiples of both three and five print 'FizzBuzz'.",  # noqa: E501
    #"Tell me five words that rhyme with 'shock'.",
    #"Translate the sentence 'I have no mouth but I must scream' into Spanish.",
    #"Count up from 1 to 500.",
#]:
#    print("Instruction:", instruction)
#    print("Response:", evaluate(instruction))
#    print()

In [19]:
'''seed_tasks = [json.loads(l) for l in open(args["seed_tasks_path"], "r")]
if args["use_clf_seed_tasks_only"]:
    seed_tasks = [t for t in seed_tasks if t["is_classification"]]
seed_instructions = [t["instruction"] for t in seed_tasks]
print(f"Loaded {len(seed_instructions)} human-written seed instructions")

os.makedirs(args["batch_dir"], exist_ok=True)
request_idx = 0
# load the LM-generated instructions
machine_instructions = []
if os.path.exists(os.path.join(args["batch_dir"], "machine_generated_instructions.jsonl")):
    with open(os.path.join(args["batch_dir"], "machine_generated_instructions.jsonl"), "r") as fin:
        for line in fin:
            instruction_info = json.loads(line)
            machine_instructions.append(instruction_info["instruction"])
            request_idx = instruction_info["request_idx"] + 1
    print(f"Loaded {len(machine_instructions)} machine-generated instructions")

# similarities = {}
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

# now let's generate new instructions!
progress_bar = tqdm.tqdm(total=args["num_instructions_to_generate"])
if machine_instructions:
    progress_bar.update(len(machine_instructions))

with open(os.path.join(args["batch_dir"], "machine_generated_instructions.jsonl"), "a") as fout:
    #while len(machine_instructions) < args["num_instructions_to_generate"]:
    batch_inputs = []
    for _ in range(args["request_batch_size"]):
        # sample machine instructions from the pool
        prompt_instructions = sample_machine_instructions(
            machine_instructions, 
            similarities=None,
            n=2)
        print("prompt_instructions")
        print(prompt_instructions)
        print()
        # sample human instructions from the pool
        prompt_instructions += random.sample(seed_instructions, args["num_prompt_instructions"] - len(prompt_instructions))
        random.shuffle(prompt_instructions)
        prompt = encode_prompt(prompt_instructions, classification=args["use_clf_seed_tasks_only"])
        print("prompt")
        print(prompt)
        print()
        batch_inputs.append(prompt)

        sequences = pipeline(
            #'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
            batch_inputs,
            do_sample=True,
            top_k=10,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            max_length=200,
        )
        print("Responses:")
        print(sequences)
        break'''

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 