# Download, extract and store data locally

# User params

In [5]:
# Possible categories to download:
# --------------------------------
# Books
# CDs_and_Vinyl
# Cell_Phones_and_Accessories
# Clothing_Shoes_and_Jewelry
# Digital_Music
# Electronics
# Gift_Cards
# Grocery_and_Gourmet_Food
# Handmade_Products
# Health_and_Household
# Health_and_Personal_Care
# Home_and_Kitchen
# Industrial_and_Scientific
# Kindle_Store
# Magazine_Subscriptions
# Movies_and_TV
# Musical_Instruments
# Office_Products
# Patio_Lawn_and_Garden
# Pet_Supplies
# Software
# Sports_and_Outdoors
# Subscription_Boxes
# Tools_and_Home_Improvement
# Toys_and_Games
# Video_Games

# Category to fetch. The value is passed to download_data() below and is
# inserted as-is into the download URLs and the output file names, so it
# should be spelled like one of the names listed above.
category_to_download = "Subscription_Boxes"


# Utility Functions

In [3]:
# add src folder to path
import sys
import os
sys.path.append(os.path.abspath('../src'))

# other imports
import requests
from pathlib import Path
from src.utils.io import decompress_to_json


def get_data_urls(data_name: str) -> dict:
    """
    Build the download URLs for one category.

    The category name is inserted unchanged into the review and metadata
    URL patterns; nothing is validated and no network access happens here.

    Returns a dict with the keys "review_url" and "metadata_url".
    """
    reviews_root = "https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/"
    metadata_root = "https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/"

    return {
        "review_url": f"{reviews_root}{data_name}.jsonl.gz",
        # metadata archives carry a "meta_" prefix in front of the category
        "metadata_url": f"{metadata_root}meta_{data_name}.jsonl.gz",
    }

    
def download_and_convert_file(url, dest_folder, json_filename):
    """
    Download a .gz archive from a URL and store it decompressed.

    The archive is streamed to ``<dest_folder>/<json_filename>.gz``, passed
    to ``decompress_to_json`` to produce ``<dest_folder>/<json_filename>``,
    and the temporary ``.gz`` file is removed afterwards (also when the
    download or the decompression fails).

    Args:
        url: address of the .gz file to fetch.
        dest_folder: directory to write into; created if it does not exist.
        json_filename: name of the decompressed output file.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: connection problem or timeout.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(dest_folder, exist_ok=True)

    gz_filename = os.path.join(dest_folder, json_filename + ".gz")
    json_path = os.path.join(dest_folder, json_filename)

    try:
        # Download the file. The context manager releases the connection;
        # the timeout is (connect, read-between-chunks) in seconds, so it
        # does not cap the total duration of a large download.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            # Fail here instead of saving an HTTP error page as the archive
            response.raise_for_status()
            with open(gz_filename, 'wb') as gz_file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        gz_file.write(chunk)

        # decompress it
        decompress_to_json(gz_filename, json_path)
    finally:
        # Remove the .gz file after decompression, and do not leave a
        # partial archive behind if something went wrong above
        if os.path.exists(gz_filename):
            os.remove(gz_filename)

    print(f"Downloaded and converted {json_filename} to {dest_folder}")

def download_data(data_name: str):
    """
    Fetch the review file and the metadata file of one category.

    Both files are written to ../data/raw (relative to the current working
    directory) as <data_name>_review.json and <data_name>_metadata.json.
    """
    links = get_data_urls(data_name)
    if not links:
        print(f"No URLs found for {data_name}")
        return

    # One level above the current working directory, then data/raw
    raw_dir = Path("../data/raw")

    # (banner to print, key into the URL dict, output file name)
    steps = (
        ("-- download reviews --", "review_url", f"{data_name}_review.json"),
        ("-- download metadata --", "metadata_url", f"{data_name}_metadata.json"),
    )
    for banner, url_key, target_name in steps:
        print(banner)
        download_and_convert_file(links[url_key], raw_dir, target_name)


# Processing

In [6]:
# Download reviews and metadata for the category chosen in the user params
download_data(category_to_download)

Decompressed and saved to ..\data\raw\Subscription_Boxes_review.json
Downloaded and converted Subscription_Boxes_review.json to ..\data\raw
Decompressed and saved to ..\data\raw\Subscription_Boxes_metadata.json
Downloaded and converted Subscription_Boxes_metadata.json to ..\data\raw
