# Amazon Bedrock Multimodal Workshop
## Content Search -- Dataset Preparation

In this notebook we download and prepare the dataset that we will use for the content search example.

In [None]:
import csv
import json
import os
import subprocess
from collections import Counter

### Download the raw dataset

In [None]:
!wget https://amazon-berkeley-objects.s3.us-east-1.amazonaws.com/archives/abo-listings.tar --no-check-certificate
!wget https://amazon-berkeley-objects.s3.us-east-1.amazonaws.com/images/metadata/images.csv.gz --no-check-certificate
!mkdir items-metadata
!gzip -d images.csv.gz
!tar xvf abo-listings.tar -C items-metadata
!rm -f abo-listings.tar
!gzip -d items-metadata/listings/metadata/listings_0.json.gz
!cp items-metadata/listings/metadata/listings_0.json listings_0.json

### Load the raw dataset

In [None]:
items_file = "listings_0.json"

# The listings file holds one JSON document per line. Read it as UTF-8
# explicitly instead of relying on the platform's default encoding, and skip
# blank lines so a trailing newline cannot break json.loads.
with open(items_file, 'r', encoding='utf-8') as json_file:
    items_metadata = [json.loads(line) for line in json_file if line.strip()]

print("items metadata contains {} records".format(len(items_metadata)))

### Create the curated dataset
We are only going to use products that have an English (`en_US`) name, color, and description (bullet points).

In [None]:
# Keep a listing only when each of its color, item_name and bullet_point
# fields has at least one entry tagged "en_US". Fields are checked in that
# order and checking stops at the first field without such an entry.
en_US_items = [
    listing
    for listing in items_metadata
    if all(
        any(entry['language_tag'] == "en_US" for entry in listing.get(field, []))
        for field in ('color', 'item_name', 'bullet_point')
    )
]
print("There are {} items in with data in English".format(len(en_US_items)))

Now that we have our items, let's create a list containing the item data and image location.

In [None]:
# Load image information from the CSV file into a dictionary keyed by image_id
image_info = {}

# newline='' is what the csv module expects from a file object so that it can
# handle line endings itself; the explicit encoding avoids depending on the
# platform's default.
with open('images.csv', 'r', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.DictReader(csvfile)
    for row in csvreader:
        image_id = row['image_id']
        image_info[image_id] = {
            'height': int(row['height']),
            'width': int(row['width']),
            'path': row['path']
        }
        
def _localized_value(entries, language_tag="en_US"):
    """Return the 'value' of the first entry tagged with `language_tag`.

    Falls back to the first entry's value when no entry carries that tag
    (for example 'brand', which the English filter does not check), and to
    an empty string when `entries` is empty or missing.
    """
    if not entries:
        return ''
    for entry in entries:
        if entry.get('language_tag') == language_tag:
            return entry.get('value', '')
    return entries[0].get('value', '')


curated_dataset = []

for item in en_US_items:
    # A listing can carry the same field in several languages. Taking entry
    # [0] may pick a non-English value, so select the en_US entry explicitly.
    item_id = item.get('item_id', '')
    item_name = _localized_value(item.get('item_name'))
    color = _localized_value(item.get('color'))
    brand = _localized_value(item.get('brand'))

    # Build the description from the English bullet points only.
    bullet_points = [
        bp['value']
        for bp in item.get('bullet_point', [])
        if bp.get('language_tag') == "en_US"
    ]
    description = ' '.join(bullet_points)

    # Resolve the main image to its relative path; '' when the listing has no
    # main image or the image id is not present in images.csv.
    image_id = item.get('main_image_id', '')
    image_path = image_info.get(image_id, {}).get('path', '')

    curated_dataset.append({
        'item_id': item_id,
        'item_name': item_name,
        'color': color,
        'brand': brand,
        'description': description,
        'image_path': image_path
    })

# Count every image path once (O(n)) instead of calling list.count() per item.
image_paths = [item['image_path'] for item in curated_dataset]
path_counts = Counter(image_paths)
duplicate_image_paths = [path for path in image_paths if path_counts[path] > 1]

if duplicate_image_paths:
    print("Removing items with duplicated images:")
    # Every item sharing an image with another item is dropped (none is kept).
    duplicated = set(duplicate_image_paths)
    curated_dataset = [item for item in curated_dataset if item['image_path'] not in duplicated]
else:
    print("No items with duplicated images.")

# An item without an image path has nothing to download, so drop it here
# rather than letting it fail later.
curated_dataset = [item for item in curated_dataset if item['image_path']]

print("Curated dataset contains {} records.".format(len(curated_dataset)))

### Download the images 

In [None]:
# `os` and `subprocess` are imported in the import cell at the top of the notebook.

# Folder to store downloaded images
download_folder = 'images'
image_base_url = 'https://amazon-berkeley-objects.s3.us-east-1.amazonaws.com/images/original/'

# Create the download folder if it doesn't exist
os.makedirs(download_folder, exist_ok=True)

# Download images using wget and keep only the items whose image is available
curated_dataset_updated = []
failed_downloads = 0

for item in curated_dataset:
    image_path = item['image_path']
    image_basename = os.path.basename(image_path)
    image_filename = os.path.join(download_folder, image_basename)

    # Skip the download when a non-empty copy is already on disk. This also
    # keeps the cell re-runnable after image_path was reduced to a basename.
    already_downloaded = os.path.isfile(image_filename) and os.path.getsize(image_filename) > 0

    if not already_downloaded:
        # TLS certificate verification is left enabled (wget's default).
        result = subprocess.run(['wget', image_base_url + image_path, '-O', image_filename])

        if result.returncode != 0:
            # `wget -O` creates the output file before the transfer, so a
            # failed download leaves an empty/partial file behind; remove it
            # so the folder only contains usable images.
            if os.path.isfile(image_filename):
                os.remove(image_filename)
            failed_downloads += 1
            continue

    # Store a copy with the local file name instead of mutating the input item
    curated_dataset_updated.append({**item, 'image_path': image_basename})

print(f"Images available locally: {len(curated_dataset_updated)}; failed downloads: {failed_downloads}")

# Update curated_dataset with only the items that were successfully downloaded
curated_dataset = curated_dataset_updated
print(f"Number of items after removing failed downloads: {len(curated_dataset)}")

print(f"Image paths in curated_dataset now refer to files inside '{download_folder}/'.")

In [None]:
# Write the curated dataset to disk as JSON indented by two spaces
with open('curated_dataset.json', 'w') as out_file:
    out_file.write(json.dumps(curated_dataset, indent=2))

In [None]:
# Print the number of regular files in the download folder next to the
# number of dataset items so the two can be compared.
with os.scandir(download_folder) as folder_entries:
    num_images = sum(1 for entry in folder_entries if entry.is_file())
print(f"Number of images in {download_folder}: {num_images}")
print("Number of items in curated_dataset is: {}".format(len(curated_dataset)))

### Clean up files that are no longer needed

In [None]:
!rm -f images.csv
!rm -r items-metadata/