# Data download from zenodo.org
In this notebook, we demonstrate how to use the DataDownloader to connect to zenodo.org, download a given .zip file, and unpack it into a specific directory. We also demonstrate how to use caching so that data which is already present is not downloaded a second time.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berni-lehner/zenodo_data_interface/blob/main/notebooks/data_download.ipynb)

In [1]:
# NOTE(review): not referenced by any cell shown in this notebook -- presumably a
# placeholder seed for randomized steps (None = unseeded); confirm or remove.
random_state = None

In [2]:
import sys
import os
import time
import glob
from pathlib import Path
import sklearn
import pandas as pd
import numpy as np
from collections import Counter


# --- configuration -----------------------------------------------------------

# Archive that gets downloaded and unpacked.
DATA_URL = "https://sandbox.zenodo.org/record/1179871/files/mask_dataset.zip"

# Directory that receives the downloaded data.
DATA_PATH = Path("../data/")

# Repository folder name; only used to build paths when running on Google Colab.
COLAB_REPO_NAME = "zenodo_data_interface"


In [3]:
def init():
    IN_COLAB = 'google.colab' in sys.modules

    if(IN_COLAB):
        !git clone https://github.com/berni-lehner/zenodo_data_interface.git
        sys.path.insert(0,"/content/" + COLAB_REPO_NAME + "/src")
        sys.path.insert(0,"/content/" + COLAB_REPO_NAME + "/data")
    else:
        sys.path.insert(0,"../src")
        sys.path.insert(0,"../data")

    try:
        from zippeekiyay import namelist
    except ImportError or ModuleNotFoundError:
        print('installing zippee-ki-yay...')
        !pip install git+https://github.com/berni-lehner/zippee-ki-yay.git

        from zippeekiyay import namelist

In [5]:
# Set up sys.path (and the zippeekiyay dependency) before importing project code.
init()

# Imported only after init() has extended sys.path -- presumably DataDownloader
# lives in the project's src folder; verify against the repository layout.
from DataDownloader import DataDownloader as ddl

# First download: measure the wall-clock time of fetching and unpacking DATA_URL
# into DATA_PATH with caching enabled.
start_time = time.perf_counter()
dl_succeed = ddl.download_and_unpack(DATA_URL, DATA_PATH, cache=True)
end_time = time.perf_counter()
print(f"time passed: {end_time-start_time:.2f} s")
print(f"downloading synthetic data successful: {dl_succeed}")

time passed: 659.04 s
downloading synthetic data successful: True


In [10]:
# Gather every .png found anywhere below DATA_PATH, then display how many there are.
file_names = [*DATA_PATH.glob('**/*.png')]
len(file_names)

41904

In [9]:
# Same call as the first download, repeated to demonstrate caching: with
# cache=True the recorded run finishes in seconds instead of minutes.
start_time = time.perf_counter()
dl_succeed = ddl.download_and_unpack(DATA_URL, DATA_PATH, cache=True)
end_time = time.perf_counter()
print(f"time passed: {end_time-start_time:.2f} s")
print(f"downloading successful: {dl_succeed}")

time passed: 4.52 s
downloading successful: True
