In [14]:
from datasets import load_dataset, Dataset

In [None]:
# Save Dataset

In [15]:
# Load the `gretelai/synthetic_text_to_sql` dataset; the result is indexed by
# split name (e.g. "train") further below.
ds = load_dataset(path="gretelai/synthetic_text_to_sql")

In [None]:
# Keep only the first 1000 rows of the "train" split and export them as CSV.
# NOTE(review): `ds` is rebound from the full dataset to the subset here, so
# re-running this cell without first re-running the load cell will presumably
# fail on `ds["train"]` — confirm before relying on re-execution.
subset_indices = range(1000)
ds = ds["train"].select(subset_indices)
ds.to_csv("gretel_data_1k.csv")

In [1]:
# Load Finetuned Model

In [1]:
import csv
import yaml
from typing import Tuple
import re
from functools import partial

from datasets import load_dataset
from transformers import AutoTokenizer, BitsAndBytesConfig, GenerationConfig
from peft import AutoPeftModelForCausalLM
import torch

from llmtune.pydantic_models.config_model import Config

In [2]:
# Identifier of the experiment whose artifacts are read from ./experiment/<hash>/.
EXPERIMENT_HASH = "9pQFRZ"
config_path = f"./experiment/{EXPERIMENT_HASH}/config/config.yml"

# Parse the YAML file first, then build the `Config` object from the parsed
# mapping once the file handle has been released.
with open(config_path, "r") as config_file:
    raw_config = yaml.safe_load(config_file)
config = Config(**raw_config)

In [3]:
# Directory the fine-tuned weights and the tokenizer are loaded from.
weights_path = f"./experiment/{EXPERIMENT_HASH}/weights/"

# Load the PEFT checkpoint in bfloat16 with flash-attention-2; device placement
# is delegated to `device_map="auto"`.
model = AutoPeftModelForCausalLM.from_pretrained(
    weights_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)
# Merge the adapter into the underlying model and continue with the merged model.
model = model.merge_and_unload()

# `device_map` is a model-loading argument; a tokenizer holds no weights to
# place on a device, so it is intentionally not passed here.
tok = AutoTokenizer.from_pretrained(weights_path)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [4]:
def infer(prompt: str, model, tok: AutoTokenizer) -> Tuple[str, float]:
    """
    Generate a completion for ``prompt`` and score it.

    Args:
        prompt: Text to condition on; it is encoded as a single sequence.
        model: Model exposing ``generate`` (here: the merged fine-tuned model).
        tok: Tokenizer used to encode the prompt and decode the output.

    Returns:
        A ``(text, probability)`` tuple. ``text`` is the decoded newly
        generated tokens (prompt removed, special tokens skipped).
        ``probability`` is the product of the per-step probabilities of the
        emitted tokens, derived from the scores returned by ``generate``.
        NOTE(review): with sampling enabled these are presumably the
        post-temperature scores, i.e. the probability under the sampling
        distribution rather than the raw model distribution — confirm.
    """
    # Place the prompt on the model's device; fall back to CUDA (the previous
    # hard-coded behaviour) when the model object exposes no `.device`.
    device = getattr(model, "device", "cuda")
    input_ids = tok.encode(prompt, return_tensors="pt", truncation=True).to(device)

    # Tokenizers without a pad token would pass `None` here; reuse EOS explicitly.
    pad_token_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id
    gen_config = GenerationConfig(
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.2,
        pad_token_id=pad_token_id,
        return_dict_in_generate=True,
        output_scores=True,
    )

    gen_results = model.generate(input_ids, generation_config=gen_config)
    # Single-sequence batch: drop the batch axis, then cut off the echoed prompt.
    gen_tok_ids = gen_results.sequences.squeeze(0)[input_ids.shape[1]:]

    seq = tok.decode(gen_tok_ids, skip_special_tokens=True)

    # One score tensor per generated step; stacking and dropping the batch axis
    # gives one row of vocabulary scores per generated token.
    step_scores = torch.stack(gen_results.scores).squeeze(1)
    # log_softmax is the numerically stable equivalent of log(softmax(x)).
    log_probs = torch.log_softmax(step_scores, dim=1)
    # Select the log-probability of the token actually emitted at each step.
    token_log_probs = log_probs.gather(dim=1, index=gen_tok_ids.view(-1, 1)).squeeze(1)
    # Sum in log-space, then exponentiate to obtain the sequence probability.
    seq_score = torch.exp(token_log_probs.sum()).item()

    return seq, seq_score

In [5]:
# Read the prompts file: the first row is treated as a header and skipped,
# every remaining row is kept as a tuple.
with open("prompts.csv", mode='r', newline='', encoding="utf-8") as file:
    reader = csv.reader(file)
    headers = next(reader, None)  # None instead of StopIteration when the file is empty
    data = [tuple(row) for row in reader]

# Maps the row index (as a string) to
# "<generated text>\t----- bird -----\t<column 0 of that row>".
processed_preds = {}
for idx, row in enumerate(data):
    # Column 1 is the prompt; column 0 is appended after the separator
    # (presumably the BIRD database id — confirm against prompts.csv).
    tag, prompt = row[0], row[1]
    out, _seq_prob = infer(prompt, model, tok)  # the probability is not used here
    out = out.strip()
    out += f"\t----- bird -----\t{tag}"
    processed_preds[str(idx)] = out
    print(f"Done {idx+1}/{len(data)}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Done 1/500
Done 2/500
Done 3/500
Done 4/500
Done 5/500
Done 6/500
Done 7/500
Done 8/500
Done 9/500
Done 10/500
Done 11/500
Done 12/500
Done 13/500
Done 14/500
Done 15/500
Done 16/500
Done 17/500
Done 18/500
Done 19/500
Done 20/500
Done 21/500
Done 22/500
Done 23/500
Done 24/500
Done 25/500
Done 26/500
Done 27/500
Done 28/500
Done 29/500
Done 30/500
Done 31/500
Done 32/500
Done 33/500
Done 34/500
Done 35/500
Done 36/500
Done 37/500
Done 38/500
Done 39/500
Done 40/500
Done 41/500
Done 42/500
Done 43/500
Done 44/500
Done 45/500
Done 46/500
Done 47/500
Done 48/500
Done 49/500
Done 50/500
Done 51/500
Done 52/500
Done 53/500
Done 54/500
Done 55/500
Done 56/500
Done 57/500
Done 58/500
Done 59/500
Done 60/500
Done 61/500
Done 62/500
Done 63/500
Done 64/500
Done 65/500
Done 66/500
Done 67/500
Done 68/500
Done 69/500
Done 70/500
Done 71/500
Done 72/500
Done 73/500
Done 74/500
Done 75/500
Done 76/500
Done 77/500
Done 78/500
Done 79/500
Done 80/500
Done 81/500
Done 82/500
Done 83/500
Done 84/500
D

In [6]:
import json

# Serialize the index -> prediction mapping with 4-space indentation and write
# it to the predictions file in one go.
predictions_json = json.dumps(processed_preds, indent=4)
with open("predict_mini_dev_mistral-gretel-2_SQLite.json", 'w') as json_file:
    json_file.write(predictions_json)