In [3]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import transformers
import textwrap
from transformers import LlamaTokenizer, LlamaForCausalLM
import sys
from typing import List

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
)

import fire
import torch
from datasets import load_dataset
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from pylab import rcParams

from transformers.generation.utils import GreedySearchDecoderOnlyOutput
from peft import PeftModel
from transformers import GenerationConfig

import json

%matplotlib inline
sns.set(rc={'figure.figsize':(10, 7)})
sns.set(rc={'figure.dpi':100})
sns.set(style='white', palette='muted', font_scale=1.2)

#DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#torch.cuda.set_device(1)

## load the model

In [5]:
BASE_MODEL = "model/llama2-7B-hf"

In [6]:
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True,
    torch_dtype=torch.float16
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# # continue fine-tuning
# BASE_MODEL = "model/llama2-7B-hf"
# LORA_WEIGHTS = "model/llama2_epoch12_lr3e-5_openhour/checkpoint-2052"

# model = LlamaForCausalLM.from_pretrained(
#     BASE_MODEL,
#     load_in_8bit=True,
#     torch_dtype=torch.float16,
#     device_map={'': 0},
#     local_files_only=True,
# )

# model = PeftModel.from_pretrained(model, LORA_WEIGHTS, torch_dtype=torch.float16, device_map={'': 0})

In [8]:
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left"

In [9]:
data = load_dataset("json", data_files="../data/alpaca-api-enrichment-openhour-dataset.json")
data["train"]

Found cached dataset json (/home/bangzhao/.cache/huggingface/datasets/json/default-3c96d78c638832ba/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 3632
})

In [10]:
data['train'][0]

{'instruction': 'Your task is to predict the open hour of the venue based on its name, category, description, average price, customer reviews and other features. ',
 'input': 'Venue Name: Verrazzano-Narrows Bridge.\nVenue Category: Bridge.\nVenue Short Description: The Verrazzano-Narrows Bridge is a double-decked suspension bridge in the U.S. state of New York that connects the New York City boroughs of Staten Island and Brooklyn..\nThe Customer Reviews:\n 1. Avoid the Verrazano Bridge during storms and heavy gusts of wind.\n 2. named for the italian explorer giovanni da verrazzano, though for about half a century it was officially spelled “verrazano”\n 3. Best views ever!\n 4. Bottom level usually less traffic but beautiful views.\n 5. Beautiful streamlined bridge. Love seeing it from a distance. Watched it being built and drove over it the day it opened to the public..',
 'output': 'Open Daily 12:00 AM-12:00 AM'}

### Generate Prompt

In [11]:
def generate_prompt(data_point):
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:
{data_point["output"]}"""


def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

In [12]:
CUTOFF_LEN=2048

In [13]:
train_val = data["train"].train_test_split(
    test_size=0.25, shuffle=True, seed=42
)
train_data = (
    train_val["train"].map(generate_and_tokenize_prompt)
)
val_data = (
    train_val["test"].map(generate_and_tokenize_prompt)
)

Loading cached split indices for dataset at /home/bangzhao/.cache/huggingface/datasets/json/default-3c96d78c638832ba/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-9fd891e5cfca29b9.arrow and /home/bangzhao/.cache/huggingface/datasets/json/default-3c96d78c638832ba/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-691a4ece1590b43b.arrow
Loading cached processed dataset at /home/bangzhao/.cache/huggingface/datasets/json/default-3c96d78c638832ba/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-da59bc0f4127ee52.arrow
Loading cached processed dataset at /home/bangzhao/.cache/huggingface/datasets/json/default-3c96d78c638832ba/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-e4f2bdb224b306d2.arrow


In [14]:
train_val

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 2724
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 908
    })
})

## training

In [12]:
LORA_R = 8 # attention head
LORA_ALPHA = 16 # alpha scaling
LORA_DROPOUT= 0.05
LORA_TARGET_MODULES = [
    "q_proj",
    "v_proj",
]

# BATCH_SIZE = 128
MICRO_BATCH_SIZE = 16
LEARNING_RATE = 7e-5
OUTPUT_DIR = "model/open_hour_lr7e-5"

In [13]:
model = prepare_model_for_int8_training(model)
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
model.print_trainable_parameters()



trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


In [14]:
training_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    per_device_eval_batch_size=MICRO_BATCH_SIZE,
    num_train_epochs=10,
    learning_rate=LEARNING_RATE,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    do_eval=True,
    fp16=True,
    optim="adamw_torch",
    output_dir=OUTPUT_DIR,
    load_best_model_at_end=True,
    logging_steps = int(len(train_val['train']) / MICRO_BATCH_SIZE)
)

In [15]:
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

In [16]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_arguments,
    data_collator=data_collator
)
model.config.use_cache = False
# old_state_dict = model.state_dict
# model.state_dict = (
#     lambda self, *_, **__: get_peft_model_state_dict(
#         self, old_state_dict()
#     )
# ).__get__(model, type(model))

model = torch.compile(model)

trainer.train(resume_from_checkpoint=False)
model.save_pretrained(OUTPUT_DIR)



Epoch,Training Loss,Validation Loss
1,1.2586,1.004652
2,0.9765,0.951041
3,0.9256,0.91782
4,0.8976,0.902191
5,0.8866,0.892858
6,0.8732,0.885319
7,0.8615,0.880928
8,0.8519,0.878467
9,0.8518,0.876848
10,0.8456,0.876355




In [16]:
from huggingface_hub import notebook_login
 
notebook_login()
 
model.push_to_hub("bangzhao/llama2_lora_api_enrichment_open_hour_predictor", use_auth_token=True)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

adapter_model.bin:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/bangzhao/llama2_lora_api_enrichment_open_hour_predictor/commit/5b01996113c9a1227348bd900e9608adc278b3e7', commit_message='Upload model', commit_description='', oid='5b01996113c9a1227348bd900e9608adc278b3e7', pr_url=None, pr_revision=None, pr_num=None)

## Predicting

In [15]:
BASE_MODEL = "model/llama2-7B-hf"
LORA_WEIGHTS = "model/open_hour_lr7e-5"

model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map={'': 0},
    local_files_only=True,
)

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

model = PeftModel.from_pretrained(model, LORA_WEIGHTS, torch_dtype=torch.float16, device_map={'': 0})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
def generate_prompt(data_point):
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:
"""

def generate_response(prompt: str, model: PeftModel) -> GreedySearchDecoderOnlyOutput:
    encoding = tokenizer(prompt, return_tensors="pt")
    input_ids = encoding["input_ids"].to(DEVICE)

    generation_config = GenerationConfig(
        temperature=0.1,
        top_p=0.75,
        repetition_penalty=1.1,
    )
    with torch.inference_mode():
        return model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=128,
        )
    
def format_response(response: GreedySearchDecoderOnlyOutput) -> str:
    decoded_output = tokenizer.decode(response.sequences[0])
    response = decoded_output.split("### Response:")[1].strip()
    return "\n".join(textwrap.wrap(response))

def ask_alpaca(prompt: str, model: PeftModel = model) -> str:
    prompt = generate_prompt(prompt)
    #print(prompt)
    response = generate_response(prompt, model)
    return format_response(response)

In [17]:
DEVICE = 'cuda:0'

In [18]:
ask_alpaca(val_data[8])

'Mon-Thu 11:30 AM-9:00 PM; Fri-Sun 11:30 AM-10:00 PM</s>'

### Load model and Lora weight

In [20]:
predictions = []

for idx, venue in enumerate(val_data):
    
    
    prediction = {}
    prediction['text'] = venue['input']
    prediction['truth'] = venue['output']
    prediction['generated'] = ask_alpaca(venue)
    
    predictions.append(prediction)
    
    if idx % 5 == 0:
        print(str(round(idx/len(val_data) * 100, 2)) + " %")
        filename = f"../result/openhour_predict_name_des_cate_pri_fea_tip_epoch10.json"
        with open(filename, "w") as json_file:
            json.dump(predictions, json_file)

0.0 %
0.55 %
1.1 %
1.65 %
2.2 %
2.75 %
3.3 %
3.85 %
4.41 %
4.96 %
5.51 %
6.06 %
6.61 %
7.16 %
7.71 %
8.26 %
8.81 %
9.36 %
9.91 %
10.46 %
11.01 %
11.56 %
12.11 %
12.67 %
13.22 %
13.77 %
14.32 %
14.87 %
15.42 %
15.97 %
16.52 %
17.07 %
17.62 %
18.17 %
18.72 %
19.27 %
19.82 %
20.37 %
20.93 %
21.48 %
22.03 %
22.58 %
23.13 %
23.68 %
24.23 %
24.78 %
25.33 %
25.88 %
26.43 %
26.98 %
27.53 %
28.08 %
28.63 %
29.19 %
29.74 %
30.29 %
30.84 %
31.39 %
31.94 %
32.49 %
33.04 %
33.59 %
34.14 %
34.69 %
35.24 %
35.79 %
36.34 %
36.89 %
37.44 %
38.0 %
38.55 %
39.1 %
39.65 %
40.2 %
40.75 %
41.3 %
41.85 %
42.4 %
42.95 %
43.5 %
44.05 %
44.6 %
45.15 %
45.7 %
46.26 %
46.81 %
47.36 %
47.91 %
48.46 %
49.01 %
49.56 %
50.11 %
50.66 %
51.21 %
51.76 %
52.31 %
52.86 %
53.41 %
53.96 %
54.52 %
55.07 %
55.62 %
56.17 %
56.72 %
57.27 %
57.82 %
58.37 %
58.92 %
59.47 %
60.02 %
60.57 %
61.12 %
61.67 %
62.22 %
62.78 %
63.33 %
63.88 %
64.43 %
64.98 %
65.53 %
66.08 %
66.63 %
67.18 %
67.73 %
68.28 %
68.83 %
69.38 %
69.93 %
70.48 %

In [21]:
filename = f"../result/openhour_predict_name_des_cate_pri_fea_tip_epoch10.json"
with open(filename, "w") as json_file:
    json.dump(predictions, json_file)