In [1]:
# external
import datasets as ds
from transformers import pipeline
import importlib
import os
import torch
import transformers
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from functools import partial
import numpy as np
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    set_seed,
    TextGenerationPipeline,
    GenerationConfig
)
import tqdm
from functools import partial

# internal
from redditqa.data.smart_filter import question_grading_map, question_filter, answer_grading_map, answer_filter
from redditqa.data import qa_generation
from redditqa.data.util import mask_links


  from .autonotebook import tqdm as notebook_tqdm
2023-12-17 12:44:05.576685: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-17 12:44:05.576711: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-17 12:44:05.577774: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-17 12:44:05.582607: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
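# One-off snippet (kept commented out) that built the 100-example test set saved at the end of this cell.
# set_seed(42)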
# # Load the dataset
# # dataset_dict = load_reddit_dataset(pairs=False)
# dataset = ds.load_from_disk("/scratch1/redditqa/cached_datasets/AskHistorians_question_filtered.jsonl")
# question_filter_func = partial(question_filter, accepted_token_str=["y", "yes"])
# dataset = dataset.filter(question_filter_func)

# test_data = dataset.train_test_split(test_size=0.1)["test"]
# test_data_100 = test_data.select(list(range(100)))
# test_data_100.save_to_disk("/scratch1/redditqa/cached_datasets/AskHistorians_test_set_100")

In [3]:
# Read the cached AskHistorians test split from disk and show its summary.
test_set_path = "/scratch1/redditqa/cached_datasets/AskHistorians_test_set_100"
dataset = ds.load_from_disk(test_set_path)
dataset  # bare last expression -> rendered as the cell output

Dataset({
    features: ['question_created_utc', 'question_retrieved_on', 'question_deleted', 'question_title', 'question_selftext', 'question_score', 'question_char_length', 'question_selftext_char_length', 'answers', 'graded_output'],
    num_rows: 100
})

In [4]:
# Model label -> checkpoint location (a model id or a local directory).
# The labels double as the names under which generated answers are stored,
# and the insertion order is the order in which the models are processed.
model_checkpoints = dict([
    ('Zephyr-7B-beta', 'HuggingFaceH4/zephyr-7b-beta'),
    ('Zephyr-History-7600', '/scratch1/redditqa/ws23/zephyr_dpo_filtered_dataset/checkpoint-7600_merged'),
])

# Directory passed as `cache_dir` when models and tokenizers are loaded.
cache_dir = "/scratch1/ssawicki/cache"

In [5]:
# Prompt layout sent to the model; "%question" is replaced by the question title.
template = "<|ELIF|> Question: %question\nAnswer: "


def generate_answers(ds_item, model, tokenizer, model_name):
    """Sample one answer for a single dataset row and store it under ``model_name``.

    Args:
        ds_item: Mapping for one example; must provide ``'question_title'``.
        model: Model handed to ``TextGenerationPipeline``.
        tokenizer: Tokenizer handed to ``TextGenerationPipeline``; its
            ``pad_token_id`` is also used as the generation pad token.
        model_name: Key under which the generated text is written into
            ``ds_item``.

    Returns:
        The same ``ds_item`` object, with ``ds_item[model_name]`` set to the
        generated text of the first pipeline result.
    """
    generation_config = GenerationConfig(
        # Integer 0 (previously the float 0.0): the parameter is documented as
        # an int, and 0 is intended to switch top-k filtering off.
        top_k=0,
        top_p=1.0,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        # NOTE(review): per the transformers docs `max_length` counts prompt
        # tokens as well as generated ones; consider `max_new_tokens` if only
        # the answer length should be capped.
        max_length=1024,
    )

    # Named `text_generator` so the `pipeline` factory imported from
    # transformers is not shadowed inside this function.
    text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)

    prompt = template.replace('%question', ds_item['question_title'])
    # return_full_text=False: ask the pipeline for the continuation only,
    # without the prompt echoed back.
    result = text_generator(prompt, generation_config=generation_config, return_full_text=False)
    ds_item[model_name] = result[0]['generated_text']
    return ds_item

In [6]:
import gc  # imported here because only this cell's between-model cleanup needs it

# Seed applied before each model's sampling run so every run starts from the
# same RNG state (generation uses do_sample=True).
GENERATION_SEED = 42

# 4-bit NF4 quantization settings; identical for every checkpoint, so built once.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

for model_name, model_path in model_checkpoints.items():
    # Drop every reference to the previous checkpoint *before* loading the next
    # one; otherwise the old model stays alive (via `model` and the partial)
    # while the new one is being loaded.
    model = tokenizer = answer_generator = None
    gc.collect()
    torch.cuda.empty_cache()

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        low_cpu_mem_usage=True,
        quantization_config=bnb_config,
        cache_dir=cache_dir,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir=cache_dir)
    # Reuse the EOS token as the padding token on both tokenizer and model config.
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model.config.pad_token_id = model.config.eos_token_id
    model = model.eval()

    # Reseed per model so a model's samples do not depend on which models ran
    # before it.
    set_seed(GENERATION_SEED)

    # Bind the per-model arguments; `map` then calls the function once per row
    # and the returned rows carry a new `model_name` entry.
    answer_generator = partial(generate_answers, model=model, tokenizer=tokenizer, model_name=model_name)
    dataset = dataset.map(answer_generator)

Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.54it/s]
Map: 100%|██████████| 100/100 [40:32<00:00, 24.32s/ examples]
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]
Map: 100%|██████████| 100/100 [25:06<00:00, 15.06s/ examples]


In [7]:
# Write the dataset, including the generated-answer entries added per model, to disk.
generated_set_path = "/scratch1/redditqa/cached_datasets/AskHistorians_test_set_100_model_generated"
dataset.save_to_disk(generated_set_path)

Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 20975.72 examples/s]
