This notebook generates continuations for the prompts stored in data/toxic.jsonl and writes them to generations/<model>_generations.jsonl

After running this notebook, run score.py

In [None]:
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes

In [None]:
import json
import os
import time
from typing import Dict, Optional, List
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, pipeline
from googleapiclient import discovery
from googleapiclient.errors import HttpError
from huggingface_hub import login

In [None]:
# Authenticate with the Hugging Face Hub. The token is taken from the
# HF_TOKEN environment variable so that no credential is stored in the
# notebook itself. When the variable is unset, `login` receives None and,
# per the huggingface_hub documentation, asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

# Short name of the model to run; it selects the checkpoint below and is
# also used to name the output file of the generation cell.
model = 'mistral'     # 'bloom' / 'llama' / 'mistral'

# Map the short name to its Hugging Face Hub repository id. An unknown
# short name raises KeyError here.
model_id = {
    'bloom': "bigscience/bloom-7b1",
    'llama': "meta-llama/Meta-Llama-3-8B",
    'mistral': "mistralai/Mistral-7B-v0.1"
}[model]

# Ask bitsandbytes to load the weights in 4-bit precision.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

In [None]:
# Load the selected checkpoint as a causal language model, applying the
# quantization settings defined above and passing device_map="auto".
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)

In [None]:
# Load the tokenizer published under the same model id, then wrap the
# quantized model and tokenizer in a text-generation pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_id)
generator = pipeline(
    task="text-generation",
    model=model_4bit,
    tokenizer=tokenizer,
)

In [None]:
# Output: one JSON object per line, holding the original prompt record and
# whatever the pipeline returned for it.
write_to = f"generations/{model}_generations.jsonl"
# Input: JSONL file in which every record has a prompt under ["prompt"]["text"].
load_dir = "data/toxic.jsonl"

# Make sure the output directory exists; open(..., "w") does not create it.
os.makedirs(os.path.dirname(write_to), exist_ok=True)

# The input is opened first so that a missing prompt file fails before the
# output file is truncated. Both handles are closed by the `with` statement.
with open(load_dir, encoding="utf-8") as src, \
        open(write_to, "w", encoding="utf-8") as f:
    # Stream the prompts line by line instead of reading the whole file.
    for line in src:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would reject.
        if not line.strip():
            continue

        message = json.loads(line)
        prompt = message["prompt"]["text"]

        # Generate continuation of prompt
        res = generator(prompt, max_new_tokens=64)
        x = {"prompt": message, "generated": res}
        f.write(json.dumps(x) + "\n")