In [None]:
# default_exp process_bson

# Process BSON

> Script to process BSON data into JPGs. Ideas from [here](https://www.kaggle.com/inversion/processing-bson-files/notebook).

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import pandas as pd
from fastcore.all import *
import io
import bson
from PIL import Image
from multiprocessing import Pool
from typing import List

In [None]:
#export
def save_images(product, save_dir):
    """Write each picture of `product` into `save_dir` as `<_id>_<index>.jpg`.

    Files that are already present on disk are left untouched.
    """
    for idx, entry in enumerate(product["imgs"]):
        target = save_dir/f"{product['_id']}_{idx}.jpg"
        if not target.exists():
            # Decode the raw bytes with PIL and let it write the file.
            raw = io.BytesIO(entry["picture"])
            Image.open(raw).save(target)

In [None]:
#export
def get_mapping(product, columns: List[str]):
    """Return the values stored in `product` under each key of `columns`, in the given order."""
    values = []
    for key in columns:
        values.append(product[key])
    return values

In [None]:
#export
@call_parse
def bson_to_jpeg(
    path: Param("Path to BSON", Path),
):
    """Converts BSON to JPGs and saves product id to category mapping as CSV."""
    path = Path(path)
    # Outputs live next to the input: an `images` directory (the same one for
    # every BSON file in that folder) and a CSV named after the BSON file.
    save_dir = path.parent/"images"
    save_dir.mkdir(exist_ok=True)
    csv_save_path = path.parent/f"{path.stem}.csv"
    # Only a file whose stem is exactly "test" is treated as having no category.
    is_test = path.stem == "test"
    print(f"Converting {path} to JPGs in {save_dir}. Mapping saved in {csv_save_path}.")

    print("Starting call to save images.")
    # Image decoding/writing is fanned out to worker processes. Every result is
    # None, so the iterator is drained instead of being collected into a list.
    with Pool() as pool, path.open("rb") as file:
        save = partial(save_images, save_dir=save_dir)
        for _ in pool.imap(save, bson.decode_file_iter(file), chunksize=10000): pass
    print("Finished saving images.")

    cols = ["_id"]
    if not is_test: cols.append("category_id")
    print("Starting call to gather mapping.")
    # Reading a couple of keys is much cheaper than pickling every decoded
    # product (image bytes included) to a worker, so this pass runs in the
    # current process. Row order follows the order of documents in the file.
    with path.open("rb") as file:
        mappings = [get_mapping(product, columns=cols) for product in bson.decode_file_iter(file)]
    print("Finished gathering mapping.")

    df = pd.DataFrame(mappings, columns=cols)
    df.to_csv(csv_save_path, index=False)
    print(f"Saved CSV to {csv_save_path}.")
    print("Completed successfully.")
    return df

In [None]:
!rm data/train_example.csv

In [None]:
# Example BSON file passed to `bson_to_jpeg` in the next cell.
path = Path("./data/train_example.bson")

In [None]:
# Time the conversion of the example file; the returned DataFrame is shown as the cell output.
%time bson_to_jpeg(path)

Converting data/train_example.bson to JPGs in data/images. Mapping saved in data/train_example.csv.
Starting call to save images.
Finished saving images.
Starting call to gather mapping.
Finished gathering mapping.
Saved CSV to data/train_example.csv.
Completed successfully.
CPU times: user 51 ms, sys: 198 ms, total: 249 ms
Wall time: 238 ms


Unnamed: 0,_id,category_id
0,0,1000010653
1,1,1000010653
2,2,1000004079
3,3,1000004141
4,4,1000015539
...,...,...
77,95,1000010653
78,97,1000010683
79,98,1000010667
80,99,1000014053


In [None]:
#hide
# Run nbdev's notebook-to-module export (prints one "Converted ..." line per notebook).
from nbdev.export import notebook2script; notebook2script()

Converted 00_core.ipynb.
Converted 01_find_duplicates.ipynb.
Converted index.ipynb.
