In [117]:
import json
import os
from pathlib import Path
from pprint import pprint

import pandas as pd
import polars as pl
from datasets import Dataset
from huggingface_hub import login

## Paths

In [115]:
# Root directory of the locally downloaded Marqo-GS "full 10m" dataset.
# `expanduser()` resolves the leading "~" to the current user's home directory.
marqo_full_dir = Path("~/Datasets/marqo-gs-dataset/marqo_gs_full_10m").expanduser()
if not marqo_full_dir.exists():
    # Fail fast with the exception type that matches the condition: the path is
    # missing (FileNotFoundError), not unexpectedly present (FileExistsError).
    raise FileNotFoundError(f"Missing dataset file on location: {marqo_full_dir}.")

In [16]:
# Collect every entry in the dataset directory and report them by name,
# alphabetically, so the four splits and the corpus files are easy to spot.
dataset_files = [entry for entry in marqo_full_dir.iterdir()]
sorted_file_names = sorted(entry.name for entry in dataset_files)
print(f"Number of files in dataset dir: {len(dataset_files)}")
print("Files in dataset dir:")
pprint(sorted_file_names)

Number of files in dataset dir: 18
Files in dataset dir:
['corpus_1.json',
 'corpus_2.json',
 'query_0_product_id_0.csv',
 'query_0_product_id_0_gt_dev.json',
 'query_0_product_id_0_gt_test.json',
 'query_0_product_id_0_queries.json',
 'query_0_product_id_1.csv',
 'query_0_product_id_1_gt_dev.json',
 'query_0_product_id_1_gt_test.json',
 'query_0_product_id_1_queries.json',
 'query_1_product_id_0.csv',
 'query_1_product_id_0_gt_dev.json',
 'query_1_product_id_0_gt_test.json',
 'query_1_product_id_0_queries.json',
 'query_1_product_id_1.csv',
 'query_1_product_id_1_gt_dev.json',
 'query_1_product_id_1_gt_test.json',
 'query_1_product_id_1_queries.json']


From the dataset's GitHub repository we conclude that the dataset structure is as follows:

> 4 splits: training split with 80% of queries and 50% of documents, novel query split with the other 20% of queries and the same documents as the training split, novel corpus split with the same queries as the training split and unseen documents with the equal size of the training corpus, and zero-shot split with unseen queries and documents. <br><br>
For each dataset such as marqo_gs_full_10m, there are 4 splits as discussed before. <br>
    - **query_0_product_id_0** represents in-domain set, <br>
    - **query_1_product_id_0** represents novel query set, <br>
    - **query_0_product_id_1** represents novel document set, <br>
    - **query_1_product_id_1** represents zero shot set, <br> <br>
For each split, there is a ground truth csv containing triplet information, a set of validation ground truth and a set of test ground truth.

This means that for our model we will be using only the first, "in-domain" set (`query_0_product_id_0`).

🔗 ***Note:** Link to the original dataset [Github Repository](https://github.com/marqo-ai/GCL?tab=readme-ov-file).*

## Dataset Analysis

- `query_0_product_id_0.csv`: 
    - query (str),
    - product_id (int), 
    - image_local (str),
    - position (int),
    - title (str),
    - pair_id (str),
    - score_linear (int),
    - score_reciprocal (float),
    - no_score (int),
    - query_id (int)
- `query_0_product_id_0_queries.json`: Dict[str, List[str]] with **dev** and **test** keys, each holding a list of queries (List[str]).
- `query_0_product_id_0_gt_dev.json`: Dict[str, Dict[str, int]] Queries are the **keys**; each query maps to a dictionary whose **keys** are product IDs and whose **values** are the rankings of those products.
- `corpus_1/2.json`: List[Dict[str, Dict]] Product IDs are the **keys**, and the values are dictionaries whose **keys** and **values** are the same as the columns in `query_0_product_id_0.csv`.

In [100]:
# Load the in-domain split (query_0_product_id_0) and remember its row count so
# the filtering cells below can report how many rows each step removes.
test_path = marqo_full_dir.joinpath("query_0_product_id_0.csv")
test_df = pd.read_csv(filepath_or_buffer=test_path)
num_samples = test_df.shape[0]
print(f"Number of data samples: {num_samples:_}")

Number of data samples: 3_926_764


Remove rows where the title is shorter than the query.

In [101]:
# Keep only rows whose title is at least as long as the query.
# Vectorized `.str.len()` replaces the row-wise `apply(..., axis=1)`, which
# invoked a Python lambda once per row (millions of calls) and would raise a
# TypeError on a missing title/query. With `.str.len()` a missing value yields
# NaN, the comparison evaluates to False, and the row is simply filtered out.
title_lengths = test_df["title"].str.len()
query_lengths = test_df["query"].str.len()
mask = title_lengths >= query_lengths
test_df = test_df[mask]
print(f"Removing {num_samples - mask.sum():_} rows.")
num_samples = len(test_df)

Removing 123_254 rows.


Keep only queries that have at least 3 products.

In [105]:
# For every row, compute how many rows share its query_id, then keep only the
# rows belonging to queries with at least 3 rows (products).
rows_per_query = test_df.groupby("query_id")["query_id"].transform("size")
has_enough_products = rows_per_query >= 3
test_df = test_df.loc[has_enough_products]
print(f"Removing {num_samples - len(test_df):_} rows.")
num_samples = len(test_df)

Removing 2 rows.


In [106]:
# Count the distinct queries that survived the filtering steps above.
unique_query_count = test_df["query_id"].nunique()
print(f"Number of unique queries: {unique_query_count:_}")

Number of unique queries: 78_586


Dropping unnecessary columns.

In [112]:
# Columns that are not kept in the uploaded dataset.
columns_to_drop = ["image_local", "score_reciprocal", "no_score"]
test_df = test_df.drop(columns=columns_to_drop)

In [113]:
# Preview the first five rows of the cleaned frame (rich display as the cell output).
test_df.head(n=5)

Unnamed: 0,query,product_id,position,title,pair_id,score_linear,query_id
0,Earmuffs,11950591053179551937,2,La Carrie Faux Fur Headband with Stretch Women...,Earmuffs-11950591053179551937,99,0
1,Earmuffs,13060356563414168615,3,Ugg Women's Shearling Earmuffs - Black,Earmuffs-13060356563414168615,98,0
2,Earmuffs,6741082963333937131,5,Prettylittlething Women's Cream Soft Faux Fur ...,Earmuffs-6741082963333937131,96,0
3,Earmuffs,8848678524883684053,8,Prettylittlething Women's Camel Soft Faux Fur ...,Earmuffs-8848678524883684053,93,0
4,Earmuffs,11334298244441157208,12,Michael Michael Kors Women's Embellished Faux-...,Earmuffs-11334298244441157208,89,0


## Upload to Hugging Face

In [118]:
# Authenticate against the Hugging Face Hub. The token is read from the
# HF_TOKEN environment variable so it never appears in the notebook itself;
# a missing variable raises KeyError.
login(os.environ["HF_TOKEN"])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/studeni/.cache/huggingface/token
Login successful


In [119]:
# Convert the cleaned frame into a Hugging Face Dataset for the "train" split.
# After the boolean-mask filtering above, `test_df` no longer has a RangeIndex,
# so by default `from_pandas` would store the leftover pandas index as an extra
# `__index_level_0__` column. `preserve_index=False` keeps only the real columns.
dataset = Dataset.from_pandas(df=test_df, split="train", preserve_index=False)

In [124]:
# Target dataset repository on the Hugging Face Hub, in "<namespace>/<name>" form.
hub_namespace, hub_dataset_name = "Studeni", "marqo-gs-dataset"
repo_id = f"{hub_namespace}/{hub_dataset_name}"

In [127]:
# Upload the dataset to the Hub as the "train" split; the returned CommitInfo
# is shown as the cell output.
dataset.push_to_hub(repo_id, split="train")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1902 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1902 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Studeni/marqo-gs-dataset/commit/fbc5d3a867ad3647ca4849428bf0ac2722f7db22', commit_message='Upload dataset', commit_description='', oid='fbc5d3a867ad3647ca4849428bf0ac2722f7db22', pr_url=None, pr_revision=None, pr_num=None)