In [1]:
import os

import datasets
from huggingface_hub import HfApi
import pandas as pd

from dotenv import find_dotenv, load_dotenv
# Resolve the location of the ".env" file, then load its entries so that
# later cells can read them through os.environ. The trailing semicolon
# keeps the call's return value out of the cell output.
env_path = find_dotenv(filename=".env")
load_dotenv(dotenv_path=env_path);

In [2]:
# Dataset repository on the Hugging Face Hub that this notebook works with.
repo_id = "camfruss/bread_proofing"

# The access token is read from the environment; it is None when the
# HUGGING_FACE_TOKEN variable is not set.
token = os.getenv("HUGGING_FACE_TOKEN")
hf_api = HfApi(token=token)

# Manual cleanup calls for the repository, intentionally left disabled.
# Uncomment only when the remote data/ folder or dataset script must go.
# hf_api.delete_folder(path_in_repo="data/", repo_id=repo_id, repo_type="dataset")
# hf_api.delete_file(path_in_repo="dataset_script.py", repo_id=repo_id, repo_type="dataset")

# Show the files currently present in the dataset repository.
hf_api.list_repo_files(repo_id, repo_type="dataset")

['.gitattributes', 'README.md']

In [3]:
# Load the metadata table and expose the "file_name" column as "image",
# which is the column name the feature schema below uses.
df = (
    pd.read_csv("./metadata.csv")
    .rename(columns={"file_name": "image"})
)

In [4]:
# Partition the metadata by the leading component of the image path and
# convert each partition into a list of per-row dicts ("records").
df_train, df_validate, df_test = (
    df[df["image"].str.startswith(prefix)].to_dict("records")
    for prefix in ("train", "validate", "test")
)

In [5]:
# Columns that all share the float16 value type.
proof_columns = ("under_proof", "over_proof", "perfect_proof", "unsure_proof")

# Schema applied to every split: an image column, an int32 upvote count,
# and one float16 column per entry in proof_columns (order preserved).
features = datasets.Features(
    {
        "image": datasets.Image(),
        "upvotes": datasets.Value("int32"),
        **{column: datasets.Value("float16") for column in proof_columns},
    }
)

In [6]:
# Build one Dataset per split from its record list and cast it to the
# shared feature schema. Splits are processed in train / validate / test
# order, matching the order of the progress bars in the cell output.
train_dataset, validate_dataset, test_dataset = (
    datasets.Dataset.from_list(records).cast(features)
    for records in (df_train, df_validate, df_test)
)

# Bundle the splits; note the validation split is keyed as "valid".
splits = {
    "train": train_dataset,
    "valid": validate_dataset,
    "test": test_dataset,
}
dataset = datasets.DatasetDict(splits)

Casting the dataset:   0%|          | 0/3040 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/380 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/381 [00:00<?, ? examples/s]

In [7]:
# Upload every split of `dataset` to the Hub repository.
# Reuse `repo_id` and `token` from the client-setup cell instead of a second
# hard-coded repo string. The token is passed explicitly because it is read
# from HUGGING_FACE_TOKEN, so without it the upload would depend on whatever
# login happens to be cached on this machine. When `token` is None the call
# behaves exactly as before.
dataset.push_to_hub(repo_id, token=token)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/3040 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/380 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/381 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/907 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/camfruss/bread_proofing/commit/ed643ab45f083cc0e3ca92319c84ee6d8d904b82', commit_message='Upload dataset', commit_description='', oid='ed643ab45f083cc0e3ca92319c84ee6d8d904b82', pr_url=None, pr_revision=None, pr_num=None)