In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import json
import pandas as pd
import numpy as np

from transformers import LlamaTokenizer, LlamaForCausalLM
import torch

from transformers.generation.utils import GreedySearchDecoderOnlyOutput
from transformers import GenerationConfig
import textwrap

# Pick the compute device once: CUDA when torch reports it as available,
# otherwise the CPU. The bare name on the last line echoes the choice.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE

'cuda'

In [2]:
# Load the venue prompts (a JSON object keyed by venue id).
# The venue texts contain non-ASCII characters, so decode explicitly as UTF-8
# instead of relying on the platform's default text encoding.
with open('../../data/keywords_without_label.json', 'r', encoding='utf-8') as jsonfile:
    prompt_venue_text = json.load(jsonfile)

In [3]:
#prompt_venue_text

In [4]:
# Path (relative to the notebook's working directory) of the checkpoint to load.
BASE_MODEL = r"model/llama2-7B-hf"

# Keyword options for loading the weights: 8-bit loading with float16 as the
# requested torch dtype. The device_map option is intentionally left disabled.
model_load_options = dict(
    load_in_8bit=True,
    torch_dtype=torch.float16,
    #device_map="auto",
)
model = LlamaForCausalLM.from_pretrained(BASE_MODEL, **model_load_options)

# The tokenizer is read from the same checkpoint directory as the model.
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Pad with token id 0 (unk) so that the padding token differs from the EOS token.
tokenizer.pad_token_id = 0
# Padding is added on the left-hand side of each sequence.
tokenizer.padding_side = "left"

In [6]:
# Few-shot prompt for venue keyword generation: a task instruction, three worked
# examples (each an "#### Input:" section followed by an "#### Response:" section),
# and a final task slot that ends with an open "#### Response:" marker.
# The template therefore contains four "#### Response:" markers; the generation
# helpers in this notebook index the decoded text with split("#### Response:")[4].
# NOTE(review): this is a plain string, not an f-string, so the placeholder line is
# the literal text {"[INPUT]"}. Callers substitute only the "[INPUT]" part, which
# leaves the surrounding {" and "} in the final prompt -- confirm this is intended.
PROMPT_TEMPLATE1 = """
Your task is to generate a series of keywords of the venue based on its name, category, description, average price, customer reviews, etc. The generated keywords should be able to describe every aspect of the venue well.

### Example 1:

#### Input:
Venue Name: Chipotle Mexican Grill. 
Venue Category: Fast Food Restaurant, Mexican Restaurant. 
Venue Short Description: Chipotle is a restaurant that prepares burritos and tacos at reasonable prices. Our Food With Integrity philosophy includes unprocessed, sustainable, nutritious, responsibly raised, and organic foods where possible.. 
Venue price: cheap. 
The Open Hour: Mon-Fri 10:00 AM-10:00 PM; Sat-Sun 10:45 AM-10:00 PM 
The Features: cocktails, brunch, lunch, happy_hour, dinner, music, live_music
The Customer Reviews: 
 1. If you ask for double meat before the rice, sometimes they’ll forget to charge you extra. 
 2. Disrespectful staff. I asked for extra chicken and rice on my burrito. The rice made it in but the chicken was thrown in the bag with no lid. And they rolled their eyes when I asked! Never again! 
 3. Good portions. 
 4. They're pretty speedy despite the insane lunch line!! 5. Great place for a quick chicken steak or pork bowl.

#### Response:
meats, good service, chicken, well, lunch, cocktails, good for working, rice, lines, dinner, great value, burritos, onions, margaritas, dips, salsa, trendy, corn, good for a quick meal, barbacoa, black beans, red chili.

### Example 2:

#### Input:
Venue Name: Tony's Pizza. 
Venue Category: Pizzeria. 
Venue price: cheap. 
The Open Hour: Mon-Thu 10:00 AM-11:00 PM; Fri-Sat 10:00 AM-11:59 PM; Sun 10:30 AM-11:00 PM 
The Features: brunch, lunch, dinner, delivery 
The Customer Reviews: 
 1. What you order isn’t necessarily what you’ll get. Unless perhaps you order cramps and indigestion. 
 2. Love this spot :) cheap and good pizza 
 3. Simpler than Carmine's but just as good if not better. 
 4. Eat your upside down slice fuckin backwards 
 5. If you get pizza on Graham Ave, you're either a Tony's guy or a Carmine's guy. I'm a Carmine's guy..

#### Response:
chicken, pizza, soup, cheese, city, great value, bacon, good for a quick meal, hot sauce, Sicilian, garlic knots, carmine.

### Example 3:

#### Input:
Venue Name: Panera Bread. 
Venue Category: Bakery, Cafe, Coffee, and Tea House, Fast Food Restaurant. 
Venue Short Description: From focusing on quality, clean ingredients to serving our food to you in a warm and welcoming environment, Panera Bread is committed to being an ally to our guests. That means crafting a menu of soups, salads and sandwiches that we are proud to feed our families.... 
Venue price: moderate. 
The Open Hour: Mon-Sat 8:00 AM-8:00 PM 
The Features: full_bar, delivery 
The Customer Reviews: 
 1. Easy off and on the highway 
 2. I love the savory flavor of the French onion soup. My new fave is the cinnamon crunch bagel toasted with cream cheese 
 3. Smoked chicken. Fresh avocado. Melted smoked Gouda. And freshly-baked Black Pepper Focaccia. Just a few good reasons to crave our Chipotle Chicken Avocado Melt. At Panera. Food as it should be. 

#### Response:
restaurants, bar, alcohol, good service, breakfast food, sandwiches, soup, lunch, good for working, cake, bread, dinner, chili, trendy, good for a quick meal, cream cheese, iced green tea.

### Your Task:
#### Input:
{"[INPUT]"}

#### Response:
"""

In [12]:
def generate_response(text_input: str, model):
    """Generate keywords for a single venue description.

    The venue text replaces the ``[INPUT]`` placeholder of ``PROMPT_TEMPLATE1``;
    the prompt is tokenized with the notebook-level ``tokenizer`` and passed to
    ``model.generate`` on ``DEVICE``.

    Args:
        text_input: Venue description inserted into the prompt template.
        model: Causal language model exposing ``generate``.

    Returns:
        The generated keyword text (everything the model produced up to the
        next ``#### Response:`` marker, if any), wrapped with
        ``textwrap.wrap`` and joined with newlines.
    """
    # Apostrophes are stripped from the whole prompt (template text included),
    # exactly as before, so the model sees the same prompt text.
    prompt = PROMPT_TEMPLATE1.replace("[INPUT]", text_input).replace("'", '')
    encoding = tokenizer(prompt, return_tensors="pt")
    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)

    # NOTE(review): temperature/top_p are documented as sampling-only options;
    # do_sample is not enabled here, so decoding is presumably greedy -- confirm
    # against the installed transformers version before relying on them.
    generation_config = GenerationConfig(
        temperature=0.1,
        top_p=0.75,
        repetition_penalty=1.1,
        # An explicitly constructed config does not carry the tokenizer's
        # special-token ids, so state them to let generation stop at EOS.
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    with torch.inference_mode():
        generation = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
            return_dict_in_generate=True,
            max_new_tokens=128,
        )

    # Decode only the newly generated tokens. This avoids counting the
    # "#### Response:" markers of the prompt (which breaks if the venue text
    # itself contains one) and keeps "<s>"/"</s>" out of the keywords.
    prompt_length = input_ids.shape[1]
    new_tokens = generation.sequences[0][prompt_length:]
    completion = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # If the model starts another response section, keep only the first one.
    keywords = completion.split("#### Response:")[0].strip()
    return "\n".join(textwrap.wrap(keywords))

def load_checkpoint(filepath=None):
    """Load previously generated responses from a JSON checkpoint file.

    Args:
        filepath: Path of the JSON file to read, or ``None`` when there is no
            checkpoint to resume from.

    Returns:
        The deserialized JSON content, or an empty dict when ``filepath`` is
        ``None``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Identity check: `== None` can be hijacked by objects overriding __eq__.
    if filepath is None:
        return {}

    # Decode as UTF-8 explicitly rather than with the platform default.
    with open(filepath, "r", encoding="utf-8") as json_file:
        return json.load(json_file)

In [8]:
# Split each venue record into its prompt text and its reference keywords,
# keeping both lists in the iteration order of prompt_venue_text.
venue_records = list(prompt_venue_text.values())
text_inputs = [record['text'] for record in venue_records]
labels = [record['truth'] for record in venue_records]

In [9]:
# Display the first prompt input as a sanity check of the loaded venue text.
text_inputs[0]

"Venue Name: Pearl Art & Craft Supply.\nVenue Category: Arts and Crafts Store.\nThe Customer Reviews:\n 1. A paradise for any kind of artist, or even if you're not one yet. 10% discount if you have a student or teacher ID !\n 2. Take the stairs\n 3. Great place to buy all you need, but the place looks like its going to collapse\n 4. Muy buena selección de cosas y personal experto, un poco en caida pero aun asi de lo mejor de ny\n 5. There's an elevator in the back!."

In [None]:
from transformers import GenerationConfig

def generate_batch_responses(
    text_inputs,
    model,
    tokenizer,
    device,
    batch_size=8,
    prompt_template=None,
    checkpoint_path="data/keywords/keywords_predict_name_des_tip_batch.json",
    checkpoint_every=10,
):
    """Generate keyword responses for many venue descriptions in batches.

    Args:
        text_inputs: Sequence of venue descriptions; each one replaces the
            ``[INPUT]`` placeholder of the prompt template.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompts and decode the output.
            It is called with ``padding=True``, so it needs a pad token.
        device: Device the encoded tensors are moved to before generation.
        batch_size: Number of prompts per ``generate`` call (must be >= 1).
        prompt_template: Template containing ``[INPUT]``. Defaults to the
            notebook-level ``PROMPT_TEMPLATE1``.
        checkpoint_path: JSON file that receives all responses produced so far.
            Pass ``None`` (or an empty string) to disable checkpointing.
        checkpoint_every: Write the checkpoint after every this many batches.
            ``0``/``None`` disables checkpointing.

    Returns:
        A list with one wrapped response string per input, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    template = PROMPT_TEMPLATE1 if prompt_template is None else prompt_template
    responses = []

    for batch_index, start in enumerate(range(0, len(text_inputs), batch_size)):
        batch_texts = text_inputs[start:start + batch_size]
        # Apostrophes are stripped from the whole prompt (template included),
        # exactly as before, so the model sees the same prompt text.
        prompts = [
            template.replace("[INPUT]", text).replace("'", '')
            for text in batch_texts
        ]

        # Tokenize all prompts of the batch together, padded to equal length.
        encoding = tokenizer(prompts, padding=True, return_tensors="pt")
        input_ids = encoding["input_ids"].to(device)
        # The mask tells the model which positions are padding; without it the
        # pad tokens of shorter prompts can be attended to as real context.
        attention_mask = encoding["attention_mask"].to(device)

        # NOTE(review): temperature/top_p are documented as sampling-only
        # options; do_sample is not passed here, so whether they take effect
        # depends on the model's own generation config -- confirm.
        with torch.inference_mode():
            generation = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=128,
                temperature=0.1,
                top_p=0.75,
                repetition_penalty=1.1,
                # Pad finished sequences with the tokenizer's own pad token.
                pad_token_id=tokenizer.pad_token_id,
                return_dict_in_generate=True,
            )

        # Keep only the newly generated tokens and drop special tokens, so
        # that padding and EOS markers never end up inside the keywords.
        new_tokens = generation.sequences[:, input_ids.shape[1]:]
        completions = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
        for completion in completions:
            # If the model starts another response section, keep the first one.
            keywords = completion.split("#### Response:")[0].strip()
            responses.append("\n".join(textwrap.wrap(keywords)))

        # Periodic checkpoint, counted in batches rather than in input offsets.
        if checkpoint_path and checkpoint_every and (batch_index + 1) % checkpoint_every == 0:
            checkpoint_dir = os.path.dirname(checkpoint_path)
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)
            with open(checkpoint_path, "w") as json_file:
                json.dump(responses, json_file)

    return responses

# Generate keywords for every venue prompt.
# Adjust batch_size as needed based on available memory.
generated_formatted_venue_text = generate_batch_responses(
    text_inputs, model, tokenizer, DEVICE, batch_size=8,
)

# Persist the complete list of generated responses as JSON.
filename = "../../result/keywords_predict_name_des_tip_batch.json"
with open(filename, "w") as json_file:
    json_file.write(json.dumps(generated_formatted_venue_text))

