In [None]:
from base64 import b64encode
import requests

def get_image_blob(url: str, timeout: float = 10.0) -> str:
    """Download an image and return its bytes as a base64-encoded string.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole run.

    Returns:
        The response body encoded as base64 and decoded to UTF-8 text, or an
        empty string when ``url`` is empty, the request fails, or the server
        answers with a status other than 200.
    """
    # Missing URLs cannot be fetched; treat them like any other failed download.
    if not url:
        return ""
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network errors, timeouts and malformed URLs yield the same sentinel
        # as a non-200 response, so one bad row does not abort a bulk map.
        return ""
    if resp.status_code == 200:
        return b64encode(resp.content).decode("utf-8")
    return ""

### Books

In [None]:
from datasets import load_dataset

# Load the "train" split of the books dataset, using a local cache directory.
data = load_dataset(
    "ada-datadruids/books",
    split="train",
    cache_dir="../.data/datasets",
)

In [None]:
# Add a base64 copy of each cover image, then drop the listed columns.
weaviate_data = (
    data
    .map(lambda row: {"cover_image_blob": get_image_blob(row["cover_image"])})
    .remove_columns([
        "book_id",
        "rating_count",
        "review_count",
        "series",
        "series_num",
        "five_stars",
        "four_stars",
        "three_stars",
        "two_stars",
        "one_star",
        "standardized_rating",
        "normalized_rating",
    ])
)

In [None]:
# Row count before the title-based filter applied in the next cell.
len(weaviate_data)

In [None]:
# Titles to exclude from the dataset (the reason they are "broken" is not
# recorded here — TODO document why these rows are dropped).
broken_books = [
    "Take Three Tenses: A Fugue in Time",
    "Beloved",
    "Live Flesh",
    "On Stranger Tides",
    "Funny Farm",
    "Back Street",
    "Evil Come, Evil Go",
    "Fighting Caravans",
]

# Keep only the rows whose title is not on the exclusion list.
weaviate_data = weaviate_data.filter(lambda row: row["title"] not in broken_books)

In [None]:
# Row count after excluding the titles listed in broken_books.
len(weaviate_data)

Save the data with images converted to base64 strings

In [None]:
# Write the processed books dataset (including the base64 cover blobs) to disk.
weaviate_data.save_to_disk(dataset_path="../.data/datasets_saved_to_disk/books")

### Amazon

In [None]:
from datasets import load_dataset

# Load the "train" split of the Amazon product dataset, using a local cache directory.
# Note: this rebinds `data`, which previously held the books dataset.
data = load_dataset(
    "bprateek/amazon_product_description",
    split="train",
    cache_dir="../.data/datasets",
)

In [None]:
from helpers import get_image_blob

# NOTE(review): this cell re-imports get_image_blob from `helpers`, shadowing the
# function defined at the top of the notebook — confirm the two are equivalent.
# Build the renamed fields plus a base64 image blob, then drop the source columns.
amazon_data = (
    data
    .map(lambda row: {
        "name": row["Product Name"],
        "description": row["About Product"],
        "technical_details": row["Technical Details"],
        "image_url": row["Image"],
        "image_blob": get_image_blob(row["Image"]),
    })
    .remove_columns([
        "Uniq Id",
        "Product Name",
        "Brand Name",
        "Asin",
        "Category",
        "Upc Ean Code",
        "List Price",
        "Selling Price",
        "Quantity",
        "Model Number",
        "About Product",
        "Product Specification",
        "Technical Details",
        "Shipping Weight",
        "Product Dimensions",
        "Image",
        "Variants",
        "Sku",
        "Product Url",
        "Stock",
        "Product Details",
        "Dimensions",
        "Color",
        "Ingredients",
        "Direction To Use",
        "Is Amazon Seller",
        "Size Quantity Variant",
        "Product Description",
    ])
)

In [None]:
# Show the processed Amazon dataset as the cell output for a quick sanity check.
amazon_data

Save the data with images converted to base64 strings

In [None]:
# Write the processed Amazon dataset (including the base64 image blobs) to disk.
amazon_data.save_to_disk(dataset_path="../.data/datasets_saved_to_disk/amazon")