In [5]:
!pip install -q --upgrade litellm groq

In [2]:
import sys
sys.path.append("..")

In [3]:
from litellm import completion
from dotenv import load_dotenv
import json
from pricer.batch_runner import Batch
from pricer.items import Item
import os
load_dotenv(override=True)

True

## Here we use the full dataset; the small dataset will be used for OpenAI fine-tuning

In [4]:
from huggingface_hub import login

# Never hardcode the token in the notebook: take it from the environment
# (e.g. an HF_TOKEN entry in the .env file read by load_dotenv) and fail
# with a clear message when it is absent.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError("HF_TOKEN is not set; add it to your environment or .env file")
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
from datasets import load_dataset
username = "ujalaarshad17"
dataset_name = f"{username}/items_raw_full"
train, val , test = Item.get_from_hub(dataset_name)

In [6]:
items = train + val + test

In [7]:
len(items)

30000

In [8]:
for index, item in enumerate(items):
    item.id = index

In [14]:
# System prompt sent with every request: asks the model for a concise product
# description in a fixed five-line layout (Title, Category, Brand, Description,
# Details) and tells it not to include part numbers.
SYSTEM_PROMPT = """Create a concise description of a product. Respond only in this format. Do not include part numbers.
Title: Rewritten short precise title
Category: eg Electronics
Brand: Brand name
Description: 1 sentence description
Details: 1 sentence on features"""

In [16]:
print(items[0].description)

Queen Brass Valve Trombone Bb Nickel ["Queen Brass trombones are ideal for beginner to medium level or student musicians. This trombone features a phosphorus copper lead mouth pipe, with 3 smooth action valves. Every trombone is play tested at our factory and re-tested by a professional to ensure that their high-quality standards are met. This is why thousands of instructors have approved these trombones. This trombones package includes a hard shell case, a pair of gloves, a soft cleaning cloth, and a bottle of valve oil. Buy with confidence as it comes with a 14 months warranty against any manufacturer's defects."] {"Is Discontinued By Manufacturer": "No", "Date First Available": "December 11, 2017", "Material Type": "Brass, Nickel", "Instrument Key": "A", "Brand": "Queen Brass", "Material": "Brass, Nickel", "Style": "Modern", "Finish Type": "Polished"}


In [17]:
# Try the prompt on the first item with a single direct completion call.
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": items[0].description},
]
response = completion(
    model="groq/openai/gpt-oss-20b",
    messages=messages,
    reasoning_effort="low",
)

# Show the generated text, then token usage and the reported cost in cents.
print(response.choices[0].message.content)
print()
print(f"Input tokens: {response.usage.prompt_tokens}")
print(f"Output tokens: {response.usage.completion_tokens}")
print(f"Cost: {response._hidden_params['response_cost']*100:.3f} cents")

Title: Queen Brass Nickel Trombone Bb  
Category: Musical Instruments  
Brand: Queen Brass  
Description: A polished brass and nickel trombone designed for beginner to intermediate players.  
Details: Features a phosphorus copper lead mouth pipe, three smooth action valves, a hard shell case, gloves, cleaning cloth, valve oil, and 14‑month warranty.

Input tokens: 320
Output tokens: 88
Cost: 0.008 cents


In [18]:
MODEL = "openai/gpt-oss-20b"

In [21]:
def make_jsonl(item):
    """Serialize one item as a single batch-request line.

    The request body is a chat completion for ``MODEL`` with ``SYSTEM_PROMPT``
    as the system message, the item's description as the user message, and
    low reasoning effort. It is wrapped in an envelope whose ``custom_id`` is
    the item's id as a string.

    Returns:
        The envelope encoded as a JSON string, without a trailing newline.
    """
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": item.description},
    ]
    request = {
        "custom_id": str(item.id),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": MODEL, "messages": chat, "reasoning_effort": "low"},
    }
    return json.dumps(request)

In [22]:
make_jsonl(items[0])

'{"custom_id": "0", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "openai/gpt-oss-20b", "messages": [{"role": "system", "content": "Create a concise description of a product. Respond only in this format. Do not include part numbers.\\nTitle: Rewritten short precise title\\nCategory: eg Electronics\\nBrand: Brand name\\nDescription: 1 sentence description\\nDetails: 1 sentence on features"}, {"role": "user", "content": "Queen Brass Valve Trombone Bb Nickel [\\"Queen Brass trombones are ideal for beginner to medium level or student musicians. This trombone features a phosphorus copper lead mouth pipe, with 3 smooth action valves. Every trombone is play tested at our factory and re-tested by a professional to ensure that their high-quality standards are met. This is why thousands of instructors have approved these trombones. This trombones package includes a hard shell case, a pair of gloves, a soft cleaning cloth, and a bottle of valve oil. Buy with confidence as it 

In [23]:
def make_file(start, end, filename):
    """Write one batch-request line per item to a JSONL file.

    Items are taken from the global ``items`` list at each index in
    ``range(start, end)`` and serialized with ``make_jsonl``.

    Args:
        start: Index of the first item to write.
        end: Index one past the last item to write.
        filename: Destination path; an existing file is overwritten.

    Raises:
        IndexError: If an index in the range is outside ``items``.
    """
    # Create the target directory on demand so a fresh checkout without the
    # output folder does not fail with FileNotFoundError.
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Explicit encoding keeps the file independent of the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        for i in range(start, end):
            f.write(make_jsonl(items[i]))
            f.write("\n")

In [26]:
make_file(0, 1000, "../jsonl/0_1000.jsonl")

In [27]:
import os
from groq import Groq

# Client used for the file and batch calls below; the API key is read from
# the GROQ_API_KEY environment variable (None if it is not set).
groq = Groq(api_key=os.getenv("GROQ_API_KEY"))

In [31]:
with open("../jsonl/0_1000.jsonl", "rb") as f:
    response = groq.files.create(file = f, purpose = "batch")

In [32]:
response

FileCreateResponse(id='file_01kcb92f7rer8senky6vak9hsj', bytes=2557930, created_at=1765610175, filename='0_1000.jsonl', object='file', purpose='batch', size=0, md5='NEj64i01INTzk5WvOIBHPw==', content_type='application/jsonl')

In [33]:
file_id = response.id
file_id

'file_01kcb92f7rer8senky6vak9hsj'

In [34]:
response = groq.batches.create(completion_window="24h", endpoint="/v1/chat/completions", input_file_id=file_id)
response

BatchCreateResponse(id='batch_01kcb93f55e9r92y5qkn2dypbz', completion_window='24h', created_at=1765610208, endpoint='/v1/chat/completions', input_file_id='file_01kcb92f7rer8senky6vak9hsj', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1765696608, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=RequestCounts(completed=0, failed=0, total=0), project_id='project_01jtkerdbyf6t83w03yt2z4y31')

In [35]:
result = groq.batches.retrieve(response.id)
result

BatchRetrieveResponse(id='batch_01kcb93f55e9r92y5qkn2dypbz', completion_window='24h', created_at=1765610208, endpoint='/v1/chat/completions', input_file_id='file_01kcb92f7rer8senky6vak9hsj', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1765610219, error_file_id=None, errors=None, expired_at=None, expires_at=1765696608, failed_at=None, finalizing_at=1765610218, in_progress_at=1765610211, metadata=None, output_file_id='file_01kcb93rsvf8wakwt6hcsjrp64', request_counts=RequestCounts(completed=1000, failed=0, total=1000), project_id='project_01jtkerdbyf6t83w03yt2z4y31')

In [37]:
response = groq.files.content(result.output_file_id)
response.write_to_file("../jsonl/batch_results.jsonl")

In [38]:
import json

In [39]:
# Attach each batch result to its item. custom_id is the item's index as a string.
# The file is read as UTF-8 explicitly because summaries contain non-ASCII
# characters and the platform default encoding may differ.
with open("../jsonl/batch_results.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue
        json_line = json.loads(line)
        # Named item_id so the builtin id() is not shadowed for later cells.
        item_id = int(json_line["custom_id"])
        summary = json_line["response"]["body"]["choices"][0]["message"]["content"]
        items[item_id].summary = summary

In [41]:
print(items[999].summary)

Title: 1/3 HP 208‑230V 1075 RPM Condenser Motor  
Category: Electronics  
Brand: ProTech  
Description: OEM replacement condenser motor for Rheem, Weathering, and Sure Comfort models.  
Details: Direct‑drive, shell screw mounting, 1/3 hp, 208‑230 V AC, 1075 rpm, 1 speed, 1‑year warranty.


In [9]:
len(items)

30000

In [10]:
Batch.create(items, False)

Created 30 batches


In [11]:
Batch.run()

  0%|          | 0/30 [00:00<?, ?it/s]

Submitted 30 batches


In [15]:
Batch.fetch()

  0%|          | 0/30 [00:00<?, ?it/s]

Finished 30 of 30 batches


In [16]:
for index, item in enumerate(items):
    if not item.summary:
        print(index)

In [17]:
print(items[29999].summary)

Title: NC Whisper Momma Atmospheric Forge – Double Burner Open End  
Category: Industrial Tools  
Brand: NC Tool Company  
Description: A lightweight, transportable double‑burner forge that heats and reheats metal quickly with self‑regulating heat up to 2350 °F.  
Details: Features a 3” × 1½” open port at each end, push‑button ignition, inclusive hose/gauge/regulator, and high‑efficiency burners for uniform heat.


In [18]:
for item in items:
    item.description = None
    item.id = None

In [19]:
# Destination dataset repository for the summarised items.
username = "ujalaarshad17"
full = f"{username}/items_transformed_full"

# Split by position: first 24,000 items for training, the next 3,000 for
# validation and the following 3,000 for test.
train, val, test = items[:24000], items[24000:27000], items[27000:30000]

Item.push_to_hub(full, train, val, test)


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            