Product pricer using LLMs

In [1]:
import os
import re
import math
import json
import random
from dotenv import load_dotenv
from huggingface_hub import login
import matplotlib.pyplot as plt
import numpy as np
import pickle
from collections import Counter
from openai import OpenAI
import google.generativeai

In [2]:
# Enviroments
load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')

#Logging to HuggingFace
hf_token = os.environ['HUGGINGFACEHUB_API_TOKEN']
login(hf_token,add_to_git_credential=True)

In [3]:
from items import Item

In [4]:
from testing import Tester

In [5]:
openai = OpenAI()

In [6]:
%matplotlib inline

In [9]:
#Loading the data
with open('train.pkl','rb') as file:
    train = pickle.load(file)
with open('test.pkl','rb') as file:
    test = pickle.load(file)

In [11]:
fine_tune_train = train[:50]
fine_tune_validation = train[50:100]

## Step 1

In [12]:
# Creating JSONL jsonline format

def messages_for(item):
    system_messgae = "You estimate prices of items. Reply only with the price, no explanation"
    user_prompt = item.test_prompt().replace("to the nearest dollar","").replace("\n\nPrice is $","")
    return[
        {"role":"system","content":system_messgae},
        {"role":"system","content":user_prompt},
        {"role":"assistant","content":f"Price is ${item.price:.2f}"}
    ]
    

In [13]:
# COnvert itens into a list of json object - jsonl
# Each ros is of this form:
# {"messgae":[{"role":"system","content":"You estimate..."}]}

def make_jsonl(items):
    result = ""
    for item in items:
        messages = messages_for(item)
        message_str = json.dumps(messages)
        result += '{"messgaes": ' + message_str + '}\n'
    return result.strip()

In [16]:
# COnvert items to jsonl and write into afile
def write_jsnol(items, filename):
    with open(filename, "w") as f:
        jsnol = make_jsonl(items)
        f.write(jsnol)

In [32]:
write_jsnol(fine_tune_train, "fine_tune_train.jsonl")
write_jsnol(fine_tune_validation,"fine_tune_validation.jsonl")

In [18]:
with open("fine_tune_train.jsonl", 'rb')as f:
    train_file = openai.files.create(file=f, purpose="fine-tune")

In [20]:
with open("fine_tune_validation.jsonl", "rb") as f:
    validation_file = openai.files.create(file=f, purpose="fine-tune")

## Step 2

In [21]:
wandb_integration = {"type": "wandb", "wandb": {"project": "gpt-pricer"}}

In [23]:
openai.fine_tuning.jobs.create(
    training_file=train_file.id,
    validation_file=validation_file.id,
    model="gpt-4o-mini-2024-07-18",
    seed=42,
    hyperparameters={"n_epochs": 1},
    suffix="pricer"
)

BadRequestError: Error code: 400 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details.', 'type': 'invalid_request_error', 'param': None, 'code': 'exceeded_quota'}}

In [24]:
from google import genai
from google.genai import types
client = genai.Client() # Get the key from the GOOGLE_API_KEY env variable

for model_info in client.models.list():
    print(model_info.name)

models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.5-pro-exp-03-25
models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash-preview-04-17
models/gemini-2.5-flash-preview-04-17-thinking
models/gemini-2.5-pro-preview-05-06
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-

In [27]:
tuning_job = client.tunings.tune(
    base_model='models/models/gemini-1.5-flash-001-tuning',
    training_dataset=train_file,
    config=types.CreateTuningJobConfig(
        epoch_count= 2,
        batch_size=4,
        learning_rate=0.001,
        tuned_model_display_name="test tuned model"
    )
)

ClientError: 400 INVALID_ARGUMENT. {'error': {'code': 400, 'message': '* CreateTunedModelRequest.tuned_model.base_model: Unexpected model name format.\n* CreateTunedModelRequest.tuned_model.tuning_task.training_data: Too few training data.\n', 'status': 'INVALID_ARGUMENT'}}

In [34]:
import json
from google.genai import types
from google.genai.types import TuningDataset

In [35]:
def load_chat_jsonl_to_tuning_dataset(path):
    examples = []
    with open(path, 'r') as f:
        for line in f:
            entry = json.loads(line)
            messages = entry.get("messages", [])

            input_parts = []
            output_text = None

            for m in messages:
                role = m.get("role")
                content = m.get("content", "")

                if role == "assistant":
                    output_text = content  # Assume only one assistant reply
                else:
                    input_parts.append(f"{role}: {content}")

            if input_parts and output_text:
                text_input = "\n".join(input_parts)
                examples.append(TuningExample(text_input=text_input, output=output_text))

    return TuningDataset(examples=examples)

# Usage
training_dataset = load_chat_jsonl_to_tuning_dataset("fine_tune_train.jsonl")


In [36]:
client = genai.Client() # Get the key from the GOOGLE_API_KEY env variable


In [37]:
tuning_job = client.tunings.tune(
    base_model='models/gemini-1.5-flash-001-tuning',
    training_dataset=training_dataset,
    config=types.CreateTuningJobConfig(
        epoch_count= 5,
        batch_size=4,
        learning_rate=0.001,
        tuned_model_display_name="gemini-fine-tuned-pricer"
    )
)

ClientError: 400 INVALID_ARGUMENT. {'error': {'code': 400, 'message': "* CreateTunedModelRequest.tuned_model.tuning_task.training_data: Too few training data.\n* CreateTunedModelRequest.tuned_model.tuning_task.training_data.examples.example_list: required one_of 'example_list' must have one initialized field\n", 'status': 'INVALID_ARGUMENT'}}

In [41]:
print("Number of examples:", len(training_dataset.examples))
for ex in training_dataset.examples[:3]:  # preview a few
    print("INPUT:", ex.text_input)
    print("OUTPUT:", ex.output)


Number of examples: 0


In [40]:
from google.genai.models import CreateTuningJobConfig

tuning_job = client.tunings.tune(
    base_model="models/gemini-1.5-flash-001-tuning",
    training_dataset=training_dataset,
    config=CreateTuningJobConfig(
        epoch_count=5,
        batch_size=4,
        learning_rate=0.001,
        tuned_model_display_name="price-estimation-model"
    )
)


ImportError: cannot import name 'CreateTuningJobConfig' from 'google.genai.models' (C:\Users\Annu\anaconda3\envs\llms\Lib\site-packages\google\genai\models.py)

In [42]:
from google import genai
from google.genai import types
client = genai.Client() # Get the key from the GOOGLE_API_KEY env variable

for model_info in client.models.list():
    print(model_info.name)

models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.5-pro-exp-03-25
models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash-preview-04-17
models/gemini-2.5-flash-preview-04-17-thinking
models/gemini-2.5-pro-preview-05-06
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-

In [44]:
# create tuning model
training_dataset =  [
    ["1", "2"],
    ["3", "4"],
    ["-3", "-2"],
    ["twenty two", "twenty three"],
    ["two hundred", "two hundred one"],
    ["ninety nine", "one hundred"],
    ["8", "9"],
    ["-98", "-97"],
    ["1,000", "1,001"],
    ["10,100,000", "10,100,001"],
    ["thirteen", "fourteen"],
    ["eighty", "eighty one"],
    ["one", "two"],
    ["three", "four"],
    ["seven", "eight"],
]
training_dataset=types.TuningDataset(
        examples=[
            types.TuningExample(
                text_input=i,
                output=o,
            )
            for i,o in training_dataset
        ],
    )
tuning_job = client.tunings.tune(
    base_model='models/gemini-1.5-flash-001-tuning',
    training_dataset=training_dataset,
    config=types.CreateTuningJobConfig(
        epoch_count= 5,
        batch_size=4,
        learning_rate=0.001,
        tuned_model_display_name="test tuned model"
    )
)



In [51]:
response = client.models.generate_content(
    model=tuning_job.tuned_model.model,
    contents='III'
)

AttributeError: 'NoneType' object has no attribute 'model'

In [49]:
tuning_job_name = tuning_job.name  # Save this after submission


In [47]:
completed_job = client.tunings.get_tuning_job(name=tuning_job_name)

# Now get the tuned model
tuned_model_name = completed_job.tuned_model.model


AttributeError: 'Tunings' object has no attribute 'get_tuning_job'