In [1]:
import os
import json
import pandas as pd
from typing import Any
import random

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format
from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments
import datasets 

def dump_dataset():
    """Export the sql-create-context dataset as chat-style JSON files.

    Loads the ``train`` split of ``b-mc2/sql-create-context``, turns every
    row into a three-message conversation (system / user / assistant),
    splits the result 80/20 and writes the two parts to
    ``train_dataset.json`` and ``test_dataset.json`` (relative paths).
    """
    # The continuation lines of this template carry the function's
    # indentation, so the rendered prompt contains it as well.
    prompt_template = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
    SCHEMA:
    {schema}"""

    def to_messages(sample):
        # One dataset row -> one OpenAI-style "messages" record.
        system_turn = {"role": "system", "content": prompt_template.format(schema=sample["context"])}
        user_turn = {"role": "user", "content": sample["question"]}
        assistant_turn = {"role": "assistant", "content": sample["answer"]}
        return {"messages": [system_turn, user_turn, assistant_turn]}

    # Pull the raw rows from the hub (no subsampling is applied here).
    raw = datasets.load_dataset("b-mc2/sql-create-context", split="train")

    # Replace the original columns with the single "messages" column.
    conversations = raw.map(to_messages, remove_columns=raw.features, batched=False)

    # 2500/12500 == 0.2, i.e. 20% of whatever was loaded becomes the test set.
    splits = conversations.train_test_split(test_size=2500/12500)

    # Persist both parts next to the notebook.
    train_part = splits["train"]
    test_part = splits["test"]
    train_part.to_json("train_dataset.json", orient="records")
    test_part.to_json("test_dataset.json", orient="records")


def load_dataset(path: str) -> list[dict[str, Any]]:
    """Load raw rows from a JSON file and convert each one to prompt form.

    Args:
        path: Path to a JSON file that is expected to hold one array of
            row objects.

    Returns:
        One dict per row, as produced by ``create_conversation_as_prompt``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(path, 'r', encoding='utf-8') as f_:
        rows = json.load(f_)

    return [create_conversation_as_prompt(row) for row in rows]

def create_conversation(row) -> dict[str, list[dict[str, str]]]:
    """Convert one dataset row into a chat-style ``messages`` record.

    Args:
        row: Mapping with the keys ``context`` (inserted into the system
            prompt as the schema), ``question`` and ``answer``.

    Returns:
        ``{"messages": [...]}`` holding a system, a user and an assistant
        message, in that order.
    """
    system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""
    return {
        "messages": [
            {"role": "system", "content": system_message.format(schema=row['context'])},
            {"role": "user", "content": row['question']},
            {"role": "assistant", "content": row['answer']},
        ]
    }

def create_conversation_as_prompt(row) -> dict[str, str]:
    """Render one dataset row as plain-text prompt/response strings.

    Reads ``row['context']``, ``row['question']`` and ``row['answer']``.

    Returns:
        A dict with ``prompt`` (instructions, schema and user question),
        ``response`` (the answer under an ``Assistant:`` header) and
        ``text`` (prompt and response joined by a newline).
    """
    instructions = (
        "You are an text to SQL query translator. Users will ask you questions "
        "in English and you will generate a SQL query based on the provided SCHEMA."
    )
    prompt_lines = [
        instructions,
        "SCHEMA:",
        f"{row['context']}",
        "User:",
        f"{row['question']}",
    ]
    query_prompt = "\n".join(prompt_lines)
    # The header line ends with a space before the newline.
    query_answer = f"Assistant: \n{row['answer']}"
    return {
        "prompt": query_prompt,
        "response": query_answer,
        "text": f"{query_prompt}\n{query_answer}",
    }


# Fine-tuning script: export the dataset, load a 4-bit quantized TinyLlama,
# attach LoRA adapters and train with TRL's SFTTrainer.

# Only referenced by the commented-out local-file loading path below.
PATH_DATA = '../data/sql-create-context/sql_create_context_v4.json'
MODEL_ID = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

# Writes train_dataset.json / test_dataset.json, which are read back below.
dump_dataset()
# Alternative (disabled): build the training data from the local JSON file.
# data = load_dataset(PATH_DATA)
# take 80% as training
# data_train = data[:int(0.8 * len(data))][:50]
# data_test = data[len(data_train):]

data_train = datasets.load_dataset('json', data_files="train_dataset.json", split='train')

# Disabled: dump the locally built rows to JSON so `datasets` can load them.
# with open('../data/sql-create-context/data_train.json', 'w') as f_:
#     for d in data_train:
#         f_.write(json.dumps(d) + '\n')
# dataset_train = datasets.load_dataset('json', data_files='../data/sql-create-context/data_train.json')
# dataset_train = pd.DataFrame(data_train)
# Train on the first 50 records only.
dataset_train = data_train.select(range(50))
print(len(dataset_train))
# for d in dataset_train:
#     print(d)


# 4-bit NF4 quantization with double quantization; compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    attn_implementation="flash_attention_2", # assumes the GPU supports flash attention, i.e. `torch.cuda.get_device_capability()[0] >= 8`
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.padding_side = 'right' # to prevent warnings

## setup_chat_format is intentionally not called: it would add new chat tokens
## to the tokenizer, and the TinyLlama chat model is presumably already
## usable with its own chat template (TODO confirm).
# model, tokenizer = setup_chat_format(model, tokenizer) 

# LoRA adapter settings; together with the 4-bit base model this is QLoRA.
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

# NOTE(review): output_dir keeps a "code-llama-7b" name although MODEL_ID is
# TinyLlama; the inference cell loads './code-llama-7b-text-to-sql', so the
# two must be renamed together if at all.
# NOTE(review): with lr_scheduler_type="constant" the warmup_ratio may have no
# effect ("constant_with_warmup" may be what was intended) — confirm.
args = TrainingArguments(
    output_dir="code-llama-7b-text-to-sql", # directory to save checkpoints and the final model
    num_train_epochs=10,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=False,                       # do not push the model to the hub
    report_to="tensorboard",                # report metrics to tensorboard
)

max_seq_length = 3072 # max sequence length for model and packing of the dataset
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset_train,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    # dataset_text_field='text',
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
) 

# start training; push_to_hub is False, so results stay in the local output directory
trainer.train()

# save the final model
trainer.save_model()


ModuleNotFoundError: No module named 'pandas'

In [2]:
import sys
import time
import glob
import uvicorn
import json
from typing import Any

import logging
from datetime import datetime
from logging.handlers import TimedRotatingFileHandler
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import langchain
langchain.verbose = True
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import ChatHuggingFace

import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain_huggingface import HuggingFacePipeline
from transformers import BitsAndBytesConfig, AutoModelForCausalLM
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA


# Inference setup: choose which weights to evaluate, load them 4-bit
# quantized and wrap them in a LangChain chat model.
experiment = 'base'
# experiment = 'finetuned'

# Reject unknown experiment names before anything is loaded.
if experiment not in ('base', 'finetuned'):
    raise ValueError(f"Unrecognize experiment {experiment}")

# Both experiments use the same 4-bit NF4 quantization settings.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

if experiment == 'base':
    # Untouched checkpoint, referenced by its model id.
    MODEL_ID = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        device_map='auto',
        quantization_config=quantization_config,
    )
else:
    # Local output directory written by the fine-tuning cell.
    print("Finetuned!")
    model_path = './code-llama-7b-text-to-sql'
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        device_map='auto',
        quantization_config=quantization_config,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)

print("Done!")

model.eval()

# NOTE(review): `model` is already instantiated here, so the
# from_pretrained-style entries in model_kwargs are presumably ignored by
# pipeline() — confirm against the transformers documentation.
generate_text = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    max_new_tokens=1200,
    repetition_penalty=1.1,
    model_kwargs={
        "device_map": "auto",
        "max_length": 1200,
        "temperature": 0.0,
        "torch_dtype": torch.bfloat16,
    },
)

# Expose the pipeline to LangChain, first as a plain LLM, then as a chat model.
llm = HuggingFacePipeline(pipeline=generate_text)
llm_chat = ChatHuggingFace(llm=llm)

def convert_to_prompt(d: dict[str, Any]):
    """Build a chat prompt from every message of *d* except the last one.

    Args:
        d: Record with a ``messages`` list whose items have ``role`` and
            ``content`` keys.

    Returns:
        A ``ChatPromptTemplate`` built from ``(role, content)`` pairs.
    """
    # The final message is left out; in records shaped like those built by
    # create_conversation it is the assistant answer.
    # NOTE(review): content is presumably treated as a template, so literal
    # "{" / "}" in a message may be read as variables — confirm.
    turns = [(message['role'], message['content']) for message in d['messages'][:-1]]
    return ChatPromptTemplate(turns)

def print_data(d: dict[str, Any]):
    """Print each message of *d* on its own line as ``role: content``."""
    for message in d['messages']:
        role = message['role']
        content = message['content']
        print(f"{role}: {content}")

# Read the exported training records: one JSON object per line.
# The encoding is explicit so decoding does not depend on the platform
# default, and blank lines are skipped instead of raising JSONDecodeError.
with open('train_dataset.json', 'r', encoding='utf-8') as f_:
    data = [json.loads(line) for line in f_ if line.strip()]

# Show one training conversation, including its reference answer.
sample_idx = 15
print("Training: ")
print_data(data[sample_idx])

# Ask the model to answer the same conversation without the final message.
prompt = convert_to_prompt(data[sample_idx])
# Variant kept for reference:
# result = (prompt | llm_chat.bind(skip_prompt=True)).invoke({})
result = (prompt | llm_chat).invoke({})
print("\n\n")
print("LLM: ")
# Print `.content` when the result has that attribute, else the result itself.
print(getattr(result, 'content', result))


  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'langchain_huggingface'