In [23]:
import os
import re
import math
import json
import random
from dotenv import load_dotenv
from huggingface_hub import login
import matplotlib.pyplot as plt
import numpy as np
import pickle
from collections import Counter
from openai import OpenAI
# import google.generativeai as genai
# Load environment configuration (python-dotenv) before reading any keys.
load_dotenv()
# NOTE(review): openai_api_key is not referenced by any visible cell; the
# client is built with a bare OpenAI() call — presumably it picks the key up
# from the environment itself. Confirm before removing.
openai_api_key = os.getenv('OPENAI_API_KEY')
# google_api_key = os.getenv('GOOGLE_API_KEY')
# Passed to huggingface_hub.login() in the next cell.
hf_token = os.getenv('HF_TOKEN')

In [2]:
login(hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
from items import Item
from testing import Tester

In [3]:
openai = OpenAI()

In [5]:
%matplotlib inline

In [10]:
# Load the pre-built train/test collections from local pickle files.
# Presumably these are lists of `Item` objects (they are sliced and passed to
# item.test_prompt() / item.price below) — confirm against the producer.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced locally / come from a trusted source.
with open('train.pkl', 'rb') as file:
    train = pickle.load(file)

with open('test.pkl', 'rb') as file:
    test = pickle.load(file)

In [7]:
# The first 300 training items become the fine-tuning set; the following 50
# (indices 300-349) become the validation set, so the two do not overlap.
fine_tune_train = train[:300]
fine_tune_validation = train[300:350]

In [8]:
def messages_for(item):
    system_message = "You estimate prices of items. Reply only with the price, no explanation"
    user_prompt = item.test_prompt().replace(" to the nearest dollar","").replace("\n\nPrice is $","")
    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": f"Price is ${item.price:.2f}"}
    ]

In [9]:
messages_for(train[100])

[{'role': 'system',
  'content': 'You estimate prices of items. Reply only with the price, no explanation'},
 {'role': 'user',
  'content': "How much does this cost?\n\nRefresh NSF-53 Premium Replacement Refrigerator Water Filter Compatible with LG and Kenmore (2 Pack)\nWhy Refresh? Because quality and value are our top priorities; buy with confidence from Refresh and refresh your water today! Refresh your family's water with a PREMIUM PLUS Refresh brand refrigerator water filter. This filter is certified to the higher NSF-53 to remove chlorine, odor, as well as heavy metals, particulates, cysts, and many volatile organic compounds. Save big by purchasing a Refresh filter at up to 50% less than the manufacturer's OEM part, and save more by purchasing a 2 pack or 3 pack! This REFRESH FILTER IS COMPATIBLE with the following Kenmore models 9890 This REFRESH FILTER is also COMPATIBLE with these LG models LT"},
 {'role': 'assistant', 'content': 'Price is $34.99'}]

In [10]:
def make_jsonl(items):
    """Serialize ``items`` as JSONL text, one chat example per line.

    Each item is converted with ``messages_for`` and wrapped as
    ``{"messages": [...]}``. Lines are separated by a single newline with no
    trailing newline; an empty ``items`` yields an empty string.
    """
    # json.dumps emits the wrapper object as well, so no JSON is assembled by
    # hand, and a single join replaces repeated string concatenation. The
    # produced text is identical to splicing '{"messages": ' + ... + '}'.
    return "\n".join(
        json.dumps({"messages": messages_for(item)}) for item in items
    )

In [11]:
print(make_jsonl(train[:3]))

{"messages": [{"role": "system", "content": "You estimate prices of items. Reply only with the price, no explanation"}, {"role": "user", "content": "How much does this cost?\n\nRefrigerator Door Handle Replacement for Frigidaire Refrigerator - Compatible with Black Door Handle - UpStart Components Brand\nUpStart Components Replacement Refrigerator Door Handle for Frigidaire RefrigeratorPlease note This is an UpStart Components Brand replacement part, NOT an OEM product. All mentions of brand names or model descriptions are made strictly to illustrate compatibility. All brand names and logos are registered trademarks of their respective owners. This product is not affiliated with any brands and is not covered under any warranties offered by the original manufacturers. Any warranties for this product are offered solely by UpStart Components. UpStart Components Replacement Refrigerator Door Handle for Frigidaire Refrigerator Ultra durable, high quality material resists wear and tear over 

In [12]:
def write_jsonl(items, filename):
    """Write the JSONL form of ``items`` (as built by ``make_jsonl``) to ``filename``.

    The file is created, or overwritten if it already exists.
    """
    # Serialize first: opening with "w" truncates the target immediately, so
    # doing this inside the `with` would leave an empty file behind whenever
    # make_jsonl raised.
    jsonl = make_jsonl(items)
    # Explicit encoding keeps the bytes independent of the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(jsonl)

In [13]:
write_jsonl(fine_tune_train, "fine_tune_train.jsonl")

In [14]:
write_jsonl(fine_tune_validation, "fine_tune_validation.jsonl")

In [15]:
with open("fine_tune_train.jsonl", "rb") as f:
    train_file = openai.files.create(file=f, purpose="fine-tune")

In [16]:
train_file

FileObject(id='file-B8bn6VoqCDpuy1pDqsewpz', bytes=291388, created_at=1757499048, filename='fine_tune_train.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)

In [18]:
with open("fine_tune_validation.jsonl", "rb") as f:
    validation_file = openai.files.create(file=f, purpose="fine-tune")

In [19]:
validation_file

FileObject(id='file-3ZqHs73aUnCQGUVbytALRA', bytes=48899, created_at=1757499069, filename='fine_tune_validation.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)

In [21]:
# Launch the fine-tuning job on the two uploaded files (the recorded job
# output reports method type 'supervised').
# NOTE(review): the returned job object is only displayed, not stored; a later
# cell recovers its id through jobs.list(limit=1).
openai.fine_tuning.jobs.create(
    training_file=train_file.id,
    validation_file=validation_file.id,
    model="gpt-4o-mini-2024-07-18",
    seed=42,
    hyperparameters={"n_epochs": 1},
    # integrations = [wandb_integration],
    # The recorded fine-tuned model name shows this suffix as
    # "pricer-bhavin-advant", i.e. with ":" replaced by "-".
    suffix="pricer:bhavin-advant"
)

FineTuningJob(id='ftjob-e6IKEPbdRkRwwsZ76qw0TWpR', created_at=1757499106, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=1), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-oYbtrqUicBStOPsd7eP3rhap', result_files=[], seed=42, status='validating_files', trained_tokens=None, training_file='file-B8bn6VoqCDpuy1pDqsewpz', validation_file='file-3ZqHs73aUnCQGUVbytALRA', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=1))), user_provided_suffix='pricer:bhavin-advant', usage_metrics=None, shared_with_openai=False, eval_id=None)

In [23]:
print(openai.fine_tuning.jobs.list(limit=3))

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-e6IKEPbdRkRwwsZ76qw0TWpR', created_at=1757499106, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=1), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-oYbtrqUicBStOPsd7eP3rhap', result_files=[], seed=42, status='validating_files', trained_tokens=None, training_file='file-B8bn6VoqCDpuy1pDqsewpz', validation_file='file-3ZqHs73aUnCQGUVbytALRA', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=1))), user_provided_suffix='pricer:bhavin-advant', usage_metrics=None, shared_with_openai=False, eval_id=None), FineTuningJob(id='ftjob-uOIxbMtHzFOl5sgazejTFGae', created_at=1753440

In [26]:
# Use the id of the first job returned by jobs.list(limit=1); in the recorded
# output that is the job created above.
# NOTE(review): this would pick up a different job if another one were created
# in the same organization in between — consider keeping the object returned
# by jobs.create instead.
job_id = openai.fine_tuning.jobs.list(limit=1).data[0].id
print(job_id)

ftjob-e6IKEPbdRkRwwsZ76qw0TWpR


In [27]:
openai.fine_tuning.jobs.retrieve(job_id)

FineTuningJob(id='ftjob-e6IKEPbdRkRwwsZ76qw0TWpR', created_at=1757499106, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=1, learning_rate_multiplier=1.8, n_epochs=1), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-oYbtrqUicBStOPsd7eP3rhap', result_files=[], seed=42, status='validating_files', trained_tokens=None, training_file='file-B8bn6VoqCDpuy1pDqsewpz', validation_file='file-3ZqHs73aUnCQGUVbytALRA', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size=1, learning_rate_multiplier=1.8, n_epochs=1))), user_provided_suffix='pricer:bhavin-advant', usage_metrics=None, shared_with_openai=False, eval_id=None)

In [35]:
openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=25).data

[FineTuningJobEvent(id='ftevent-V9GgZKKq1uBHUxbI08HK7jmy', created_at=1757499655, level='info', message='Step 209/300: training loss=1.06', object='fine_tuning.job.event', data={'step': 209, 'train_loss': 1.0601089000701904, 'total_steps': 300, 'train_mean_token_accuracy': 0.75}, type='metrics'),
 FineTuningJobEvent(id='ftevent-SSzjjvUcoC7AyE0w4ijm9nyf', created_at=1757499652, level='info', message='Step 208/300: training loss=1.47', object='fine_tuning.job.event', data={'step': 208, 'train_loss': 1.46714448928833, 'total_steps': 300, 'train_mean_token_accuracy': 0.75}, type='metrics'),
 FineTuningJobEvent(id='ftevent-mLou1hpP71tiPT3xBH3d7xNH', created_at=1757499652, level='info', message='Step 207/300: training loss=1.37', object='fine_tuning.job.event', data={'step': 207, 'train_loss': 1.3689844608306885, 'total_steps': 300, 'train_mean_token_accuracy': 0.75}, type='metrics'),
 FineTuningJobEvent(id='ftevent-BAI16H6Nz17uiUpznYg6HrUM', created_at=1757499652, level='info', message='Ste

In [36]:
import time

# Poll once a minute until the job reaches a terminal state.
# "cancelled" is a terminal status as well: without it, a job cancelled from
# the dashboard or API would keep this loop spinning forever.
while True:
    status = openai.fine_tuning.jobs.retrieve(job_id).status
    print(f"Job status: {status}")
    if status in ("succeeded", "failed", "cancelled"):
        break
    time.sleep(60)

Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: succeeded


In [30]:
import wandb
from wandb.integration.openai.fine_tuning import WandbLogger

wandb.login()
WandbLogger.sync(fine_tune_job_id=job_id, project="gpt-pricer")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:


Abort: 

In [6]:
fine_tuned_model_name = openai.fine_tuning.jobs.retrieve("ftjob-e6IKEPbdRkRwwsZ76qw0TWpR").fine_tuned_model
print(fine_tuned_model_name)

ft:gpt-4o-mini-2024-07-18:advant:pricer-bhavin-advant:CECGxWa0


In [8]:
def messages_for_testing(item):
    system_message = "You estimate prices of items. Reply only with the price, no explanation"
    user_prompt = item.test_prompt().replace(" to the nearest dollar","").replace("\n\nPrice is $","")
    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": "Price is $"}
    ]

In [9]:
messages_for_testing(test[0])

NameError: name 'test' is not defined

In [4]:
openai.fine_tuning.jobs.list()

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-e6IKEPbdRkRwwsZ76qw0TWpR', created_at=1757499106, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:advant:pricer-bhavin-advant:CECGxWa0', finished_at=1757499801, hyperparameters=Hyperparameters(batch_size=1, learning_rate_multiplier=1.8, n_epochs=1), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-oYbtrqUicBStOPsd7eP3rhap', result_files=['file-KyqKuHZRswHyf7gpCYpqq4'], seed=42, status='succeeded', trained_tokens=57932, training_file='file-B8bn6VoqCDpuy1pDqsewpz', validation_file='file-3ZqHs73aUnCQGUVbytALRA', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size=1, learning_rate_multiplier=1.8, n_epochs=1))), user_provided_suffix='pricer:bhavin-advant', usage_metrics=None, shared_with_openai=False, eval_id=Non

In [20]:
def get_price(s):
    """Extract the first number found in ``s`` and return it as a float.

    Dollar signs and commas (thousands separators) are stripped before
    searching. Returns 0.0 when ``s`` contains no number.
    """
    s = s.replace('$', '').replace(',', '')
    # The alternation is grouped so the optional sign applies to integers as
    # well as decimals. Ungrouped, "-5.5" parsed as -5.5 but "-5" parsed as
    # 5.0, because the sign belonged to the first alternative only.
    match = re.search(r"[-+]?(?:\d*\.\d+|\d+)", s)
    # Always return a float, including the no-match fallback.
    return float(match.group()) if match else 0.0

In [None]:
get_price("The price is roughly $99.99 becausecytcyt 89 blah blah")

89.0

In [15]:
def gpt_fine_tuned(item):
    """Ask the fine-tuned model for the price of ``item`` and return it as a number.

    Sends the messages built by ``messages_for_testing`` to the model named by
    ``fine_tuned_model_name`` (fixed seed, at most 7 completion tokens) and
    parses the reply with ``get_price``.
    """
    response = openai.chat.completions.create(
        model=fine_tuned_model_name,
        messages=messages_for_testing(item),
        seed=42,
        max_tokens=7
    )
    # message.content is optional in the API response (it can be None, e.g. on
    # a refusal). Fall back to an empty string so get_price returns its
    # "no number found" value instead of raising AttributeError on None.
    reply = response.choices[0].message.content or ""
    return get_price(reply)

In [16]:
print(test[0].price)
print(gpt_fine_tuned(test[0]))

26.97
29.77


In [45]:
print(test[0].test_prompt())

How much does this cost to the nearest dollar?

Discount Parts Direct Oven Bake Element Heating Element for Frigidaire Kenmore, Replaces
Replaces the following part numbers Works for brands Electrolux, Frigidaire, Gibson, Kelvinator, Westinghouse, and others This part fixes the following symptoms Little to no heat when baking Element will not heat Oven not heating evenly Will Not Start Contact Us If you are not sure if part is correct, ask us in Customer questions & answers section or contact us by visiting the Discount Parts Direct storefront. Business Wholesale We are a small local company from Houston, Texas offer discount parts for retail and wholesale. If you need to purchase parts for business, please contact us for lower rates. Part Number Replaces Works for brand This oven heating element replacement Works for Electrolux, Frigidaire

Price is $


In [18]:
from testing import Tester

In [19]:
Tester.test(gpt_fine_tuned, test[:10])

[92m1: Guess: $29.77 Truth: $26.97 Error: $2.80 SLE: 0.01 Item: Discount Parts Direct 316075104 Oven Bak...[0m
[92m2: Guess: $30.47 Truth: $36.99 Error: $6.52 SLE: 0.04 Item: BestRec CHK100ZW Dryer Vent Heat Keeper ...[0m
[92m3: Guess: $22.99 Truth: $41.33 Error: $18.34 SLE: 0.32 Item: Fuxury 12 Inch Impulse Bag Sealer, Heat ...[0m
[92m4: Guess: $21.95 Truth: $45.02 Error: $23.07 SLE: 0.48 Item: Compatible Dryer Heating Element for Whi...[0m
[92m5: Guess: $22.99 Truth: $13.97 Error: $9.02 SLE: 0.22 Item: 694089 Dryer Blower Wheel by FreeCat,Rep...[0m
[92m6: Guess: $31.99 Truth: $55.65 Error: $23.66 SLE: 0.29 Item: Compatible with Whirlpool WP61005273 Ref...[0m
[92m7: Guess: $15.99 Truth: $20.99 Error: $5.00 SLE: 0.07 Item: Replacement Aluminum Range Hood Filter C...[0m
[92m8: Guess: $66.76 Truth: $30.68 Error: $36.08 SLE: 0.58 Item: Lifetime Appliance Parts UPGRADED 240350...[0m
[92m9: Guess: $43.66 Truth: $57.99 Error: $14.33 SLE: 0.08 Item: OEM LG Refrigerator Door B

IndexError: list index out of range

In [24]:
openai = OpenAI()

In [29]:
# List uploaded files and print each one.
# NOTE(review): limit=2 does not cap this loop — the recorded output shows
# more than two files, so iterating `res` presumably keeps fetching further
# pages. Slice or break explicitly if only two entries are wanted.
res = openai.files.list(limit=2)

for i in res:
    print(i)

FileObject(id='file-AppaTYMEtyMh5DS1Wzme8h', bytes=1617331, created_at=1757531580, filename='G-Paper-2-2015 - PandoraGAN.pdf', object='file', purpose='assistants', status='processed', expires_at=None, status_details=None)
FileObject(id='file-KyqKuHZRswHyf7gpCYpqq4', bytes=8108, created_at=1757500548, filename='step_metrics.csv', object='file', purpose='fine-tune-results', status='processed', expires_at=None, status_details=None)
FileObject(id='file-3ZqHs73aUnCQGUVbytALRA', bytes=48899, created_at=1757499069, filename='fine_tune_validation.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)
FileObject(id='file-B8bn6VoqCDpuy1pDqsewpz', bytes=291388, created_at=1757499048, filename='fine_tune_train.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)
FileObject(id='file-Dcttq54ELn9kLRLVWL6HwM', bytes=48899, created_at=1757498467, filename='fine_tune_validation.jsonl', object='file', purpose=

In [44]:
openai_validation_file = "file-3ZqHs73aUnCQGUVbytALRA"

In [45]:
res = openai.files.content(openai_validation_file)


In [46]:
with open('file_content1.jsonl','wb') as f:
    f.write(res.content)

In [47]:
with open('file_content1.jsonl','r') as f:
    file1 = f.read()
file1



In [49]:
with open('fine_tune_validation.jsonl','r') as f:
    file2 = f.read()
file2



In [50]:
file1 == file2

True

In [41]:
openai_train_file='file-B8bn6VoqCDpuy1pDqsewpz'
res = openai.files.content(openai_train_file)
with open('file_content1.jsonl','wb') as f:
    f.write(res.content)


In [43]:
with open('file_content1.jsonl','r') as f:
    file1 = f.read()
file1
with open('fine_tune_train.jsonl','r') as f:
    file2 = f.read()
file2
file1 == file2

True

In [74]:
# Collect the fine-tuned model name of every job returned by jobs.list().
# NOTE(review): fine_tuned_model is None for jobs that have not finished (see
# the job outputs recorded earlier in this notebook), so `models` may contain
# None entries.
res = openai.fine_tuning.jobs.list()
models = []
for i in res:
    modelname=i.to_dict()['fine_tuned_model']
    print(modelname)
    models.append(modelname)

ft:gpt-4o-mini-2024-07-18:advant:pricer-bhavin-advant:CECGxWa0
ft:gpt-4o-mini-2024-07-18:advant:pricer:BxARcEmQ


In [62]:
# Same loop as the previous cell, but keeps each job's full dict (despite the
# `modelname` variable name).
# NOTE(review): this rebinds `models` to a list of dicts. The datetime cells
# index it by key (models[0]['created_at']), whereas `my_model = models[0]`
# further down expects the list of name strings — so the results depend on
# which of the two cells was executed last.
res = openai.fine_tuning.jobs.list()
models = []
for i in res:
    modelname=i.to_dict()
    print(modelname)
    models.append(modelname)

{'id': 'ftjob-e6IKEPbdRkRwwsZ76qw0TWpR', 'created_at': 1757499106, 'error': {}, 'fine_tuned_model': 'ft:gpt-4o-mini-2024-07-18:advant:pricer-bhavin-advant:CECGxWa0', 'finished_at': 1757499801, 'hyperparameters': {'batch_size': 1, 'learning_rate_multiplier': 1.8, 'n_epochs': 1}, 'model': 'gpt-4o-mini-2024-07-18', 'object': 'fine_tuning.job', 'organization_id': 'org-oYbtrqUicBStOPsd7eP3rhap', 'result_files': ['file-KyqKuHZRswHyf7gpCYpqq4'], 'seed': 42, 'status': 'succeeded', 'trained_tokens': 57932, 'training_file': 'file-B8bn6VoqCDpuy1pDqsewpz', 'validation_file': 'file-3ZqHs73aUnCQGUVbytALRA', 'estimated_finish': None, 'integrations': [], 'metadata': None, 'method': {'type': 'supervised', 'supervised': {'hyperparameters': {'batch_size': 1, 'learning_rate_multiplier': 1.8, 'n_epochs': 1}}}, 'user_provided_suffix': 'pricer:bhavin-advant', 'usage_metrics': None, 'shared_with_openai': False, 'eval_id': None}
{'id': 'ftjob-uOIxbMtHzFOl5sgazejTFGae', 'created_at': 1753440112, 'error': {}, 'f

In [64]:
from datetime import datetime

In [71]:
from datetime import timezone

# Job creation date (UTC) formatted as MM/DD/YY.
# datetime.utcfromtimestamp() is deprecated (Python 3.12+) and returns a naive
# value; build a timezone-aware UTC datetime instead. The formatted string is
# unchanged.
create_time=datetime.fromtimestamp(models[0]['created_at'], tz=timezone.utc).strftime("%D")
create_time

'09/10/25'

In [None]:
from datetime import timezone

# Job completion date (UTC) formatted as MM/DD/YY.
# datetime.utcfromtimestamp() is deprecated (Python 3.12+) and returns a naive
# value; build a timezone-aware UTC datetime instead. The formatted string is
# unchanged.
completion_time=datetime.fromtimestamp(models[0]['finished_at'], tz=timezone.utc).strftime('%D')
completion_time

In [73]:
from datetime import timezone

# Wall-clock duration of the second listed job (finished_at - created_at).
# Uses timezone-aware UTC datetimes instead of the deprecated
# datetime.utcfromtimestamp(); the printed timedelta is unchanged.
create_time=datetime.fromtimestamp(models[1]['created_at'], tz=timezone.utc)
completion_time=datetime.fromtimestamp(models[1]['finished_at'], tz=timezone.utc)
print(completion_time-create_time)

0:18:07


In [None]:
# NOTE: the unfinished scratch expression below (a trailing "." with no
# attribute) is a SyntaxError and stops Restart & Run All at this cell, so it
# is commented out.
# openai.fine_tuning.jobs.

In [77]:
response = openai.files.delete(openai_train_file)
print(response)

FileDeleted(id='file-B8bn6VoqCDpuy1pDqsewpz', deleted=True, object='file')


In [78]:
response = openai.files.delete(openai_validation_file)
print(response)

FileDeleted(id='file-3ZqHs73aUnCQGUVbytALRA', deleted=True, object='file')


In [83]:
my_model = models[0]
my_model

'ft:gpt-4o-mini-2024-07-18:advant:pricer-bhavin-advant:CECGxWa0'

In [84]:
response = openai.models.delete(my_model)
print(response)

PermissionDeniedError: Error code: 403 - {'error': {'message': 'You have insufficient permissions for this operation. Check that you have the correct role in your organization (Owner).', 'type': 'invalid_request_error', 'param': None, 'code': None}}