In [1]:
import sys
import os
# Put the current working directory at the front of sys.path so that
# `src` can be imported. The working directory is treated as the project
# root here, so this only works when the notebook is started from there.
project_root = os.path.abspath(os.getcwd())
already_on_path = project_root in sys.path
if not already_on_path:
    sys.path.insert(0, project_root)

In [2]:
# The data was curated earlier and saved as pickle files; reload those files
# here instead of repeating the curation.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train = _load_pickle('data/train.pkl')
test = _load_pickle('data/test.pkl')

In [3]:
# Quick sanity check on the loaded data: show the price of the first training item.
first_train_item = train[0]
print(first_train_item.price)

11.99


In [7]:
import json

all_items = train + test
print(f"Loaded {len(all_items)} items.")

# Training rows: prompt and price of every item flagged `include`.
train_dataset = [
    {"prompt": item.prompt, "price": item.price}
    for item in train
    if item.include
]

# Test rows: same filter, but the prompt comes from `item.test_prompt()`
# rather than the `prompt` attribute.
test_dataset = [
    {"prompt": item.test_prompt(), "price": item.price}
    for item in test
    if item.include
]

train_output_file = "huggingface_dataset_train.jsonl"
test_output_file = "huggingface_dataset_test.jsonl"

# Write each split as JSON Lines: one JSON object per line.
for output_path, rows in (
    (train_output_file, train_dataset),
    (test_output_file, test_dataset),
):
    print(f"Saving dataset to {output_path}...")
    with open(output_path, 'w') as sink:
        for row in rows:
            sink.write(json.dumps(row))
            sink.write('\n')

print("Dataset created successfully.")

Loaded 27000 items.
Saving dataset to huggingface_dataset_train.jsonl...
Saving dataset to huggingface_dataset_test.jsonl...
Dataset created successfully.


In [15]:
from datasets import load_dataset, DatasetDict
from src.config import HF_USER, PROJECT_NAME
from datetime import datetime
from dotenv import load_dotenv
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the token from the
# environment (values in a .env file override already-set variables).
load_dotenv(override=True)
hf_token = os.getenv('HF_TOKEN')
if not hf_token:
    # Without a token, login() falls back to an interactive prompt, which
    # stalls or breaks an unattended "Run All"; fail early with a clear message.
    raise RuntimeError(
        "HF_TOKEN is not set; add it to your environment or .env file "
        "before pushing to the Hugging Face Hub."
    )
login(hf_token, add_to_git_credential=True)

# Hugging Face configuration.
# RUN_NAME is a timestamp, so every execution of this cell targets a new repo id.
RUN_NAME = f"{datetime.now():%Y-%m-%d_%H.%M.%S}"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
# Despite its name, this id is used below as the *dataset* repository id.
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_RUN_NAME}"

# Load the JSONL files written earlier in the notebook.
# split='train' is used for both files because a single-file load places all
# rows under a split named 'train'; the real split names are assigned below.
train_dataset = load_dataset('json', data_files='huggingface_dataset_train.jsonl', split='train')
test_dataset = load_dataset('json', data_files='huggingface_dataset_test.jsonl', split='train')

# Bundle both splits so they are uploaded as one dataset repository.
dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# Push to the Hub under the user and project name taken from src.config.
dataset_dict.push_to_hub(HUB_MODEL_NAME)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|##########| 9.84MB / 9.84MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|##########|  780kB /  780kB            

CommitInfo(commit_url='https://huggingface.co/datasets/alistermarc/llama3-pricer-2025-08-30_02.01.02/commit/ef450ef950703f97c0bbcfdea9b3a061b17b7372', commit_message='Upload dataset', commit_description='', oid='ef450ef950703f97c0bbcfdea9b3a061b17b7372', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/alistermarc/llama3-pricer-2025-08-30_02.01.02', endpoint='https://huggingface.co', repo_type='dataset', repo_id='alistermarc/llama3-pricer-2025-08-30_02.01.02'), pr_revision=None, pr_num=None)