In [None]:
# default_exp process_bson

# Process BSON

> Script to convert BSON product data into JPG images. Based on ideas from [this Kaggle notebook](https://www.kaggle.com/inversion/processing-bson-files/notebook).

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import pandas as pd
from fastcore.all import *
import io
import bson
from PIL import Image
from multiprocessing import Pool

In [None]:
#export
def save_images(product, save_dir):
    """Saves product's images to disk.

    Each entry of `product["imgs"]` is decoded from its `"picture"` bytes and
    written to `save_dir` as `<_id>_<index>.jpg`. Files that already exist are
    skipped, so an interrupted run can be resumed.

    Every image is first written to a temporary sibling file and then renamed
    into place. A crash or interrupt in the middle of a write therefore never
    leaves a truncated `.jpg` behind that a later run would silently skip.
    """
    for i, img in enumerate(product["imgs"]):
        save_path = save_dir/f"{product['_id']}_{i}.jpg"
        if save_path.exists(): continue
        tmp_path = save_path.with_name(f"{save_path.name}.tmp")
        try:
            with Image.open(io.BytesIO(img["picture"])) as picture:
                # The temporary name has no .jpg extension, so the format must be explicit.
                picture.save(tmp_path, format="JPEG")
            # Rename only after the file is completely written.
            tmp_path.replace(save_path)
        finally:
            # Only still present when saving or renaming failed.
            if tmp_path.exists(): tmp_path.unlink()

In [None]:
#export
def get_mapping(product):
    """Returns the `(_id, category_id)` pair of a decoded product record."""
    product_id = product["_id"]
    category_id = product["category_id"]
    return product_id, category_id

In [None]:
#export
def _save_images_and_get_mapping(product, save_dir, is_test):
    """Saves one product's images and returns its mapping (`None` for test data)."""
    save_images(product, save_dir)
    # Only train data has category_id's, so test products have nothing to map.
    return None if is_test else get_mapping(product)

@call_parse
def bson_to_jpeg(
    path: Param("Path to BSON", Path),
):
    """Converts BSON to JPGs and saves product id to category mapping as CSV.

    Images are written to an `images` directory next to the BSON file. Unless
    the file stem is `test`, a `<stem>.csv` with `_id` and `category_id`
    columns is written next to it and also returned as a DataFrame; for test
    data nothing is written and `None` is returned.
    """
    path = Path(path)
    save_dir = path.parent/"images"
    save_dir.mkdir(exist_ok=True)
    csv_save_path = path.parent/f"{path.stem}.csv"
    is_test = path.stem == "test"
    msg = f"Converting {path} to JPGs in {save_dir}."
    if not is_test: msg += f" Mapping saved in {csv_save_path}."
    print(msg)

    # Single pass over the file: each worker saves a product's images and hands
    # back only the small (_id, category_id) tuple. This avoids decoding the
    # BSON twice and pickling every product to the workers a second time.
    process = partial(_save_images_and_get_mapping, save_dir=save_dir, is_test=is_test)
    print("Starting call to save images.")
    with Pool() as pool, path.open("rb") as file:
        # imap yields results in input order, so rows follow the file's product order.
        results = pool.imap(process, bson.decode_file_iter(file), chunksize=10000)
        mappings = [mapping for mapping in results if mapping is not None]
    print("Finished saving images.")

    df = None
    if not is_test:
        df = pd.DataFrame(mappings, columns=["_id", "category_id"])
        df.to_csv(csv_save_path, index=False)
        print(f"Saved CSV to {csv_save_path}.")
    print("Completed successfully.")
    return df

In [None]:
!rm data/train_example.csv

In [None]:
path = Path("./data/train_example.bson")

In [None]:
%time bson_to_jpeg(path)

Converting data/train_example.bson to JPGs in data/images. Mapping saved in data/train_example.csv.
Starting call to save images.
Finished saving images.
Starting call to gather mapping.
Finished gathering mapping.


KeyboardInterrupt: 

In [None]:
#hide
# Run nbdev's notebook export; the output below lists the notebooks it converted.
from nbdev.export import notebook2script; notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.
