# Download All Files

## Imports

In [1]:
import urllib.request
import shutil
import gzip
import tarfile
import re
import os


## Definitions

In [2]:
# Probe for the Google Colab runtime by trying to import its package.
# `use_google_drive` ends up True only when both imports succeed.
try:
  import google.colab
  from google.colab import drive
except Exception:
  # Import failed: treat this as a local (non-Colab) session.
  use_google_drive = False
else:
  use_google_drive = True


In [3]:
# Locations used by the rest of the notebook:
#   metadata_folder - folder that receives the downloaded/extracted metadata
#   metadata_source - path of the metadata CSV inside that folder
#   tar_dataset     - path of the image archive
if use_google_drive:
  metadata_folder = "/content/gdrive/MyDrive/ColabData/amazon/"
  metadata_source = "/content/gdrive/MyDrive/ColabData/amazon/metadata.csv"
  tar_dataset = "/content/gdrive/MyDrive/ColabData/amazon/abo-images-small.tar"

  # The paths above live under this mount point, so mount the drive now.
  drive.mount("/content/gdrive")
else:
  # Build the local paths with os.path.join so the separator matches the host
  # OS. On Windows these evaluate to the same strings as the previous
  # hard-coded "metadata\\..." literals; elsewhere a literal backslash would
  # be part of the file name rather than a folder separator.
  metadata_folder = os.path.join("metadata", "")  # keeps the trailing separator
  metadata_source = os.path.join("metadata", "metadata.csv")
  tar_dataset = os.path.join("dataset", "abo-images-small.tar")


## Helper Functions

In [4]:
#https://stackoverflow.com/a/57923425/1930773

def gzip_file(gzip_file, work_dir):
  """Decompress a single gzip file into ``work_dir``.

  The output file keeps the archive's base name with a trailing ``.gz``
  (matched case-insensitively) removed. The ``(filename, extract_dir)``
  signature is the one ``shutil.register_unpack_format`` callbacks receive.

  Args:
    gzip_file: Path of the gzip-compressed file to read.
    work_dir: Folder the decompressed file is written to. It is created
      when missing; an empty value means the current working directory.

  Raises:
    OSError: If the input cannot be read or the output cannot be written.
  """
  filename = os.path.basename(gzip_file)
  filename = re.sub(r"\.gz$", "", filename, flags=re.IGNORECASE)

  # Create the destination first so extracting into a folder that does not
  # exist yet works instead of failing with FileNotFoundError. An empty
  # work_dir is left alone because os.makedirs("") would raise.
  if work_dir:
    os.makedirs(work_dir, exist_ok=True)

  with gzip.open(gzip_file, 'rb') as f_in, \
       open(os.path.join(work_dir, filename), 'wb') as f_out:
    # Stream in chunks so large archives are never held in memory at once.
    shutil.copyfileobj(f_in, f_out)


# Teach shutil.unpack_archive how to handle plain ".gz" files. The check keeps
# the cell re-runnable: the format is only registered when no entry named 'gz'
# is present yet.
if all(entry[0] != 'gz' for entry in shutil.get_unpack_formats()):
  shutil.register_unpack_format('gz', ['.gz', ], gzip_file)


## Downloading Files

In [5]:
def _download_if_missing(url, destination):
  """Download ``url`` to ``destination`` unless that file already exists.

  Args:
    url: Address of the file to fetch.
    destination: Local path the file is stored at.
  """
  if os.path.exists(destination):
    return

  # urlretrieve does not create missing folders, so make sure the target
  # folder is there ("." covers a destination without a folder part).
  os.makedirs(os.path.dirname(destination) or ".", exist_ok=True)

  # Fetch under a temporary name and rename at the end, so an interrupted
  # transfer is not mistaken for a complete archive on the next run.
  partial = destination + ".part"
  urllib.request.urlretrieve(url, partial)
  os.replace(partial, destination)


# Join with os.path.join instead of a hard-coded "\\": outside Windows a
# backslash is an ordinary file-name character, so the archive would be saved
# under a name the extraction cell (which uses os.path.join) never looks for.
_download_if_missing(
  "https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-listings.tar",
  os.path.join(metadata_folder, "abo-listings.tar"),
)

_download_if_missing(
  "https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-images-small.tar",
  tar_dataset,
)


## Extracting Metadata

In [6]:
# Pull only images/metadata/images.csv.gz out of the image archive.
# The context manager closes the archive handle even if extraction fails;
# previously the handle stayed open for the rest of the session.
with tarfile.open(tar_dataset) as tar:
  tar.extract(tar.getmember("images/metadata/images.csv.gz"), path=metadata_folder)

# Decompress the extracted .gz into the metadata folder (uses the 'gz' unpack
# format registered with shutil), then remove the temporary "images" tree.
shutil.unpack_archive(os.path.join(metadata_folder, "images", "metadata", "images.csv.gz"), metadata_folder)
shutil.rmtree(os.path.join(metadata_folder, "images"), ignore_errors=True)


In [7]:
# Unpack the listings archive into the metadata folder.
listings_archive = os.path.join(metadata_folder, "abo-listings.tar")
shutil.unpack_archive(listings_archive, metadata_folder)

# Decompress every entry found under listings/metadata into the metadata folder.
listings_metadata_dir = os.path.join(metadata_folder, "listings", "metadata")
for gzfile in os.listdir(listings_metadata_dir):
  compressed_path = os.path.join(listings_metadata_dir, gzfile)
  shutil.unpack_archive(compressed_path, metadata_folder)

# Remove the extracted "listings" tree; errors during removal are ignored.
shutil.rmtree(os.path.join(metadata_folder, "listings"), ignore_errors=True)
