In [2]:
from azureml.core import Workspace, Datastore, Dataset
# Obtain the Azure ML workspace handle from the saved workspace configuration;
# `ws` is used below to look up the datastore and to register datasets.
ws = Workspace.from_config()

In [16]:
# Check how much data is currently present on the compute instance.
# Note: /tmp/ is wiped on every compute restart, so a "No such file or directory"
# error here means the download + unzip cells below have to be run again.
!du -h /tmp/geopotential_500
!du -h /tmp/temperature_850

du: cannot access '/tmp/geopotential_500': No such file or directory
2.7G	/tmp/temperature_850


# Note: Azure ML vs Azure Data Factory for initial load of data
Here we use a scripted download via the Azure ML compute instance to obtain external data, and upload it from this temporary location
to the Azure Data Lake as a one-off operation (the Data Lake is already connected as a linked Azure ML Datastore).

Azure Data Factory provides alternatives for directly ingesting external data into the Data Lake, thus skipping the download to a local temporary directory.

Once the data is in the Data Lake, we can register the data as Azure ML Dataset(s), allowing us to pass the reference when submitting Azure ML Jobs.



In [None]:
# Download the geopotential_500 (5.625deg) zip archive into the current working directory.
# -O fixes the local file name; --no-check-certificate disables TLS certificate
# verification for this request, so the server's identity is NOT validated.
!wget "https://dataserv.ub.tum.de/s/m1524895/download?path=%2F5.625deg%2Fgeopotential_500&files=geopotential_500_5.625deg.zip" -O geopotential_500_5.625deg.zip --no-check-certificate

In [None]:
# Create the local scratch directory (-p: no error if it already exists) and
# extract the downloaded geopotential archive into it.
!mkdir -p /tmp/geopotential_500
!unzip -d /tmp/geopotential_500/ geopotential_500_5.625deg.zip

In [4]:
# Download the temperature_850 (5.625deg) zip archive into the current working directory.
# -O fixes the local file name; --no-check-certificate disables TLS certificate
# verification for this request, so the server's identity is NOT validated.
!wget "https://dataserv.ub.tum.de/s/m1524895/download?path=%2F5.625deg%2Ftemperature_850&files=temperature_850_5.625deg.zip" -O temperature_850_5.625deg.zip --no-check-certificate

--2022-07-11 20:35:07--  https://dataserv.ub.tum.de/s/m1524895/download?path=%2F5.625deg%2Ftemperature_850&files=temperature_850_5.625deg.zip
Resolving dataserv.ub.tum.de (dataserv.ub.tum.de)... 138.246.224.34, 2001:4ca0:800::8af6:e022
Connecting to dataserv.ub.tum.de (dataserv.ub.tum.de)|138.246.224.34|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 2254621588 (2.1G) [application/zip]
Saving to: ‘temperature_850_5.625deg.zip’


2022-07-11 20:36:33 (25.2 MB/s) - ‘temperature_850_5.625deg.zip’ saved [2254621588/2254621588]



In [5]:
# Create the local scratch directory (-p: no error if it already exists) and
# extract the downloaded temperature archive into it.
!mkdir -p /tmp/temperature_850
!unzip -d /tmp/temperature_850/ temperature_850_5.625deg.zip

Archive:  temperature_850_5.625deg.zip
  inflating: /tmp/temperature_850/temperature_850hPa_1979_5.625deg.nc  
  inflating: /tmp/temperature_850/temperature_850hPa_1980_5.625deg.nc  
  inflating: /tmp/temperature_850/temperature_850hPa_1981_5.625deg.nc  
  inflating: /tmp/temperature_850/temperature_850hPa_1982_5.625deg.nc  
  inflating: /tmp/temperature_850/temperature_850hPa_1983_5.625deg.nc  
  inflating: /tmp/temperature_850/temperature_850hPa_1984_5.625deg.nc  
  inflating: /tmp/temperature_850/temperature_850hPa_1985_5.625deg.nc  
  inflating: /tmp/temperature_850/temperature_850hPa_1986_5.625deg.nc  
  inflating: /tmp/temperature_850/temperature_850hPa_1987_5.625deg.nc  
  inflating: /tmp/temperature_850/temperature_850hPa_1988_5.625deg.nc  
  inflating: /tmp/temperature_850/temperature_850hPa_1989_5.625deg.nc  
  inflating: /tmp/temperature_850/temperature_850hPa_1990_5.625deg.nc  
  inflating: /tmp/temperature_850/temperature_850hPa_1991_5.625deg.nc  
  inflating: /tmp/tempera

In [6]:
# Check the size of the local scratch directories after download + unzip.
# A directory that was not downloaded in this session is reported as missing by du.
!du -h /tmp/geopotential_500
!du -h /tmp/temperature_850


du: cannot access '/tmp/geopotential_500': No such file or directory
2.7G	/tmp/temperature_850


# Upload Local Files to datalake
doc link: https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.dataset_factory.filedatasetfactory?view=azure-ml-py#azureml-data-dataset-factory-filedatasetfactory-upload-directory


In [7]:
# Look up the target datastore that will hold the datasets.
# It is referenced by name and must already be configured in the workspace `ws`.
datastore = Datastore.get(ws, 'azuregigdatalake_bronze')


In [None]:
# Copy the geopotential NetCDF files from the local scratch directory to the datastore.
# Only files matching *.nc are sent, files already at the target path are not
# overwritten, and per-file progress is printed.
geo500 = Dataset.File.upload_directory(
    src_dir="/tmp/geopotential_500",
    target=(datastore, "nc/TUM/WeatherBench/5.625deg/geopotential_500"),
    pattern="*.nc",
    overwrite=False,
    show_progress=True,
)

In [None]:
# The upload returned an anonymous (not yet registered) file dataset object;
# show its type followed by its definition.
print(type(geo500), geo500, sep="\n")

In [None]:
# [DEMO]: Register the dataset
# Registering the uploaded geopotential file dataset under a name in the workspace
# makes it retrievable from any compute. Mirrors the temperature_850 registration
# further down in this notebook.
geo500ds = geo500.register(ws,
                 name='geopotential_500_5.625deg',
                 description="WeatherBench dataset. Source: Stephan Rasp, Peter D. Dueben, Sebastian Scher, Jonathan A. Weyn, Soukayna Mouatadid, and Nils Thuerey, 2020. WeatherBench: A benchmark dataset for data-driven weather forecasting. arXiv: https://arxiv.org/abs/2002.00469",
                 tags={
                     "data":"geopotential",
                     "grid":"5.625deg",
                     "vlevel":"500hpa",
                     "filetype":"nc" },
                 create_new_version=False
            )

In [8]:
# Copy the temperature NetCDF files from the local scratch directory to the datastore.
# Only files matching *.nc are sent, files already at the target path are not
# overwritten, and per-file progress is printed.
temp850 = Dataset.File.upload_directory(
    src_dir="/tmp/temperature_850",
    target=(datastore, "nc/TUM/WeatherBench/5.625deg/temperature_850"),
    pattern="*.nc",
    overwrite=False,
    show_progress=True,
)

Validating arguments.
Arguments validated.
Uploading file to nc/TUM/WeatherBench/5.625deg/temperature_850
Filtering files with pattern matching *.nc
Uploading an estimated of 40 files
Uploading /tmp/temperature_850/temperature_850hPa_1982_5.625deg.nc
Uploaded /tmp/temperature_850/temperature_850hPa_1982_5.625deg.nc, 1 files out of an estimated total of 40
Uploading /tmp/temperature_850/temperature_850hPa_2005_5.625deg.nc
Uploaded /tmp/temperature_850/temperature_850hPa_2005_5.625deg.nc, 2 files out of an estimated total of 40
Uploading /tmp/temperature_850/temperature_850hPa_2018_5.625deg.nc
Uploaded /tmp/temperature_850/temperature_850hPa_2018_5.625deg.nc, 3 files out of an estimated total of 40
Uploading /tmp/temperature_850/temperature_850hPa_2013_5.625deg.nc
Uploaded /tmp/temperature_850/temperature_850hPa_2013_5.625deg.nc, 4 files out of an estimated total of 40
Uploading /tmp/temperature_850/temperature_850hPa_1980_5.625deg.nc
Uploaded /tmp/temperature_850/temperature_850hPa_1980

In [12]:
# The upload returned an anonymous (not yet registered) file dataset object;
# show its type followed by its definition.
print(type(temp850), temp850, sep="\n")

<class 'azureml.data.file_dataset.FileDataset'>
FileDataset
{
  "source": [
    "('azuregigdatalake_bronze', '/nc/TUM/WeatherBench/5.625deg/temperature_850')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ]
}


In [15]:
# Register the uploaded temperature dataset under a name in the workspace.
# Once registered, this dataset object (a pointer to the data location) can
# always be retrieved from the workspace, from any compute.
temp850ds = temp850.register(
    ws,
    name='temperature_850_5.625deg',
    description="WeatherBench dataset 1979-2018. Source: Stephan Rasp, Peter D. Dueben, Sebastian Scher, Jonathan A. Weyn, Soukayna Mouatadid, and Nils Thuerey, 2020. WeatherBench: A benchmark dataset for data-driven weather forecasting. arXiv: https://arxiv.org/abs/2002.00469",
    tags={
        "data": "temperature",
        "grid": "5.625deg",
        "vlevel": "850hpa",
        "filetype": "nc",
    },
    create_new_version=False,
)