In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import transformers
import textwrap
from transformers import LlamaTokenizer, LlamaForCausalLM
import os
import sys
from typing import List

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
)

import fire
import torch
from datasets import load_dataset
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from pylab import rcParams

from transformers.generation.utils import GreedySearchDecoderOnlyOutput
from peft import PeftModel
from transformers import GenerationConfig

%matplotlib inline
sns.set(rc={'figure.figsize':(10, 7)})
sns.set(rc={'figure.dpi':100})
sns.set(style='white', palette='muted', font_scale=1.2)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

## Load model

In [2]:
BASE_MODEL = "model/llama2-7B-hf"

In [3]:
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# # continue fine-tuning
# BASE_MODEL = "model/llama2-7B-hf"
# LORA_WEIGHTS = "model/llama2_epoch15_lr4e-3/checkpoint-148"

# model = LlamaForCausalLM.from_pretrained(
#     BASE_MODEL,
#     load_in_8bit=True,
#     torch_dtype=torch.float16,
#     device_map={'': 0},
#     local_files_only=True,
# )

# model = PeftModel.from_pretrained(model, LORA_WEIGHTS, torch_dtype=torch.float16, device_map={'': 0})

In [5]:
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left"

In [6]:
data = load_dataset("json", data_files="../data/alpaca-api-enrichment-price-dataset.json")
data["train"]

Found cached dataset json (/home/bangzhao/.cache/huggingface/datasets/json/default-c8c012706a999bea/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 3156
})

In [7]:
data['train'][0]

{'instruction': "Your task is to predict the average consuming price of a venue based on its description, which includes its name and category. The venue's price will fall into one of four categories: Cheap, Moderate, Expensive, and Very Expensive. Remember, the name and category of the venue can be significant indicators of its price. For instance, fast-food chains like 'McDonald' might typically be 'Cheap', while upscale restaurants with names suggesting fine dining might be 'Expensive' or 'Very Expensive.'",
 'input': "Venue Name: Taïm Mobile Falafel & Smoothie Truck.\nVenue Category: Food Truck.\n Venue Short Description: Taim's famous falafel and fresh fruit smoothies roaming the streets of NYC on wheels! Vegan and gluten free options, and don't forget, we're kosher..\nThe Features: lunch, takeout\n The reviews of customers are:\n 1. Taïm\xa0offers a high-end twist on Tel Aviv street food. The falafel sandwich—green falafel, hummus, tahini and Israeli salad, all nestled inside a t

### generate prompt

In [8]:
def generate_prompt(data_point):
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:
{data_point["output"]}"""


def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

In [9]:
CUTOFF_LEN=2048

In [10]:
train_val = data["train"].train_test_split(
    test_size=0.25, shuffle=True, seed=42
)
train_data = (
    train_val["train"].map(generate_and_tokenize_prompt)
)
val_data = (
    train_val["test"].map(generate_and_tokenize_prompt)
)

Loading cached split indices for dataset at /home/bangzhao/.cache/huggingface/datasets/json/default-c8c012706a999bea/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-0cc6d9c8621ec7ba.arrow and /home/bangzhao/.cache/huggingface/datasets/json/default-c8c012706a999bea/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-92003ed07fa4138c.arrow
Loading cached processed dataset at /home/bangzhao/.cache/huggingface/datasets/json/default-c8c012706a999bea/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-f2773ef1429e4811.arrow
Loading cached processed dataset at /home/bangzhao/.cache/huggingface/datasets/json/default-c8c012706a999bea/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-f5be150cca57f1a1.arrow


In [11]:
train_val

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 2367
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 789
    })
})

## training

In [12]:
LORA_R = 8 # attention head
LORA_ALPHA = 16 # alpha scaling
LORA_DROPOUT= 0.05
LORA_TARGET_MODULES = [
    "q_proj",
    "v_proj",
]

# BATCH_SIZE = 128
MICRO_BATCH_SIZE = 16
LEARNING_RATE = 1e-4
OUTPUT_DIR = "model/price_lr1e-4_v2"

In [13]:
model = prepare_model_for_int8_training(model)
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()



trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


In [14]:
training_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    per_device_eval_batch_size=MICRO_BATCH_SIZE,
    num_train_epochs=8,
    learning_rate=LEARNING_RATE,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    do_eval=True,
    fp16=True,
    optim="adamw_torch",
    output_dir=OUTPUT_DIR,
    load_best_model_at_end=True,
    logging_steps = int(len(train_val['train']) / MICRO_BATCH_SIZE)
)

In [16]:
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

In [17]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_arguments,
    data_collator=data_collator
)
model.config.use_cache = False
# old_state_dict = model.state_dict
# model.state_dict = (
#     lambda self, *_, **__: get_peft_model_state_dict(
#         self, old_state_dict()
#     )
# ).__get__(model, type(model))

model = torch.compile(model)

trainer.train(resume_from_checkpoint=True)
model.save_pretrained(OUTPUT_DIR)



Epoch,Training Loss,Validation Loss
6,0.7273,0.746346
7,0.7194,0.745417
8,0.7206,0.744675




In [None]:
from huggingface_hub import notebook_login
 
notebook_login()
 
model.push_to_hub("bangzhao/llama2_lora_api_enrichment_price_predictor", use_auth_token=True)

### Load Model and lora weight

In [3]:
BASE_MODEL = "model/llama2-7B-hf"
LORA_WEIGHTS = "model/price_lr7e-5"

model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map={'': 0},
    local_files_only=True,
)

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

model = PeftModel.from_pretrained(model, LORA_WEIGHTS, torch_dtype=torch.float16, device_map={'': 0})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
def generate_prompt(data_point):
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:
"""

def generate_response(prompt: str, model: PeftModel) -> GreedySearchDecoderOnlyOutput:
    encoding = tokenizer(prompt, return_tensors="pt")
    input_ids = encoding["input_ids"].to(DEVICE)

    generation_config = GenerationConfig(
        temperature=0.1,
        top_p=0.75,
        repetition_penalty=1.1,
    )
    with torch.inference_mode():
        return model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=128,
        )
    
def format_response(response: GreedySearchDecoderOnlyOutput) -> str:
    decoded_output = tokenizer.decode(response.sequences[0])
    response = decoded_output.split("### Response:")[1].strip()
    return "\n".join(textwrap.wrap(response))

def ask_alpaca(prompt: str, model: PeftModel = model) -> str:
    prompt = generate_prompt(prompt)
    #print(prompt)
    response = generate_response(prompt, model)
    return format_response(response)

In [19]:
ask_alpaca(val_data[0])



'Moderate</s>'

In [20]:
val_data

Dataset({
    features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 789
})

In [21]:
val_data['output'][:4]

['Moderate', 'Cheap', 'Cheap', 'Expensive']

In [22]:
predictions = []

for idx, venue in enumerate(val_data):
    
    
    prediction = {}
    prediction['text'] = venue['input']
    prediction['truth'] = venue['output']
    prediction['generated'] = ask_alpaca(venue)
    
    predictions.append(prediction)
    
    if idx % 50 == 0:
        print(str(round(idx/len(val_data) * 100, 2)) + " %")

0.0 %
6.34 %
12.67 %
19.01 %
25.35 %
31.69 %
38.02 %
44.36 %
50.7 %
57.03 %
63.37 %
69.71 %
76.05 %
82.38 %
88.72 %
95.06 %


### make predicted text numerical

In [25]:
for venue in predictions:
    if 'cheap' in venue['truth'].lower():
        venue['truth'] = 1
    elif 'moderate' in venue['truth'].lower():
        venue['truth'] = 2 
    elif 'expensive' in venue['truth'].lower() and ('very' not in venue['truth'].lower()):
        venue['truth'] = 3
    elif 'very expensive' in venue['truth'].lower():
        venue['truth'] = 4

    else:
        venue['truth'] = None
        
    if 'cheap' in venue['generated'].lower():
        venue['generated'] = 1
    elif 'moderate' in venue['generated'].lower():
        venue['generated'] = 2 
    elif 'expensive' in venue['generated'].lower() and ('very' not in venue['generated'].lower()):
        venue['generated'] = 3
    elif 'very expensive' in venue['generated'].lower():
        venue['generated'] = 4
    else:
        venue['generated'] = None

AttributeError: 'int' object has no attribute 'lower'

In [28]:
import json

filename = f"../result/price_predict_name_des_cate_v2_epoch8.json"
with open(filename, "w") as json_file:
    json.dump(predictions, json_file)

In [45]:
predictions_df = pd.DataFrame(predictions)

In [46]:
predictions_df

Unnamed: 0,text,truth,generated
0,Venue Name: Taproom No. 307.\n Venue Category:...,2,2
1,Venue Name: Food Gallery 32.\n Venue Category:...,1,2
2,Venue Name: Bagels on the Square.\n Venue Cate...,1,1
3,Venue Name: The Sunburnt Calf.\n Venue Categor...,3,2
4,Venue Name: Ost Cafe.\n Venue Category: Coffee...,1,2
...,...,...,...
784,Venue Name: Morning Coffee.\n Venue Category: ...,1,2
785,"Venue Name: Nespresso Boutique Bar, SoHo.\n Ve...",2,2
786,Venue Name: Dunkin'.\n Venue Category: Coffee ...,1,1
787,Venue Name: Tropix Bar & Lounge.\n Venue Categ...,2,2


### some basic evaluation

In [49]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [50]:
# Mean Absolute Error (MAE)
mae = mean_absolute_error(predictions_df['truth'], predictions_df['generated'])
print(f"Mean Absolute Error (MAE): {mae}")

# Mean Squared Error (MSE)
mse = mean_squared_error(predictions_df['truth'], predictions_df['generated'])
print(f"Mean Squared Error (MSE): {mse}")

# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Absolute Error (MAE): 0.44740177439797213
Mean Squared Error (MSE): 0.467680608365019
Root Mean Squared Error (RMSE): 0.6838717777222708


In [51]:
dummy1 = [2] * len(predictions)
mae = mean_absolute_error(predictions_df['truth'],dummy1)
print(f"Mean Absolute Error (MAE): {mae}")
mse = mean_squared_error(predictions_df['truth'], dummy1)
print(f"Mean Squared Error (MSE): {mse}")
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Absolute Error (MAE): 0.5361216730038023
Mean Squared Error (MSE): 0.5640050697084917
Root Mean Squared Error (RMSE): 0.7510027095214049


In [52]:
dummy2 = [3] * len(predictions)

mae = mean_absolute_error(predictions_df['truth'],dummy2)
print(f"Mean Absolute Error (MAE): {mae}")
mse = mean_squared_error(predictions_df['truth'], dummy2)
print(f"Mean Squared Error (MSE): {mse}")
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Absolute Error (MAE): 1.2877059569074778
Mean Squared Error (MSE): 2.0836501901140685
Root Mean Squared Error (RMSE): 1.4434854312094973
