In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): download_file_gdrive below writes under '/content/gdrive/...',
# which is not this mount point — confirm which location is intended.
from google.colab import drive
drive.mount('/content/drive')

# Downloading the ClimateNet dataset

In [None]:
import requests, os, bs4
from bs4 import BeautifulSoup

### List all links to NetCDF files at a given url

def list_nc_datasets(index_url):
    """List the NetCDF links found on an HTML index page.

    Args:
        index_url: URL of the page to scan for ``<a>`` tags.

    Returns:
        The ``href`` values, in page order, that end with ``.nc``. Values are
        returned exactly as written in the page (they are not resolved
        against ``index_url``).

    Raises:
        requests.HTTPError: If the index page answers with a 4xx/5xx status.
    """

    # Fetch the index page; fail loudly on an HTTP error instead of parsing
    # the error page and silently returning an empty list.
    reqs = requests.get(index_url)
    reqs.raise_for_status()
    soup = BeautifulSoup(reqs.text, 'html.parser')

    # Collect the href of every link tag; anchors without an href yield None.
    hrefs = [link.get('href') for link in soup.find_all('a')]

    # Keep only links to NetCDF files, skipping anchors that have no href
    # (calling .endswith on None would raise AttributeError).
    return [href for href in hrefs if href and href.endswith('.nc')]

### Download a file to Google Drive

def download_file_gdrive(index_url, file_url, dest_dir,
                         base_dir='/content/gdrive/My Drive/CS230/Data/'):
    """Download one file and save it under ``base_dir/dest_dir``.

    Args:
        index_url: URL of the index page the link was found on.
        file_url: Link to the file; it is resolved against ``index_url``, so
            both relative and absolute links work.
        dest_dir: Sub-folder of ``base_dir`` to save into (created if missing).
        base_dir: Root folder for downloads.
            NOTE(review): the default lives under '/content/gdrive', while the
            notebook's first cell mounts Drive at '/content/drive' — confirm
            which location is intended.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    from urllib.parse import urljoin

    # Create folder
    target_dir = os.path.join(base_dir, dest_dir)
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, os.path.basename(file_url))

    # Stream GET request; the context manager releases the connection even
    # when the download fails part-way through.
    with requests.get(urljoin(index_url, file_url), stream=True) as r:
        # Refuse to save an HTML error page under a .nc file name.
        r.raise_for_status()

        # Save the file to folder, writing each chunk as it arrives instead
        # of buffering the whole payload in memory first.
        with open(target_path, "wb") as file:
            for block in r.iter_content(chunk_size=1 << 20):
                if block:
                    file.write(block)

            # After sequential writes the file position equals the file size.
            size = file.tell()

    # Display file size
    print("Download complete: "+file_url+" – Size: "+str(size)+" bytes.")

    return

### Download all NetCDF files from a target url ###

def download_climate_net(index_url, dest_dir):
    """Download every NetCDF file linked from ``index_url`` into ``dest_dir``.

    The links are obtained from ``list_nc_datasets`` and each one is handed to
    ``download_file_gdrive``. An "i/total " progress prefix is printed before
    each download.
    """
    links = list_nc_datasets(index_url)
    total = len(links)

    for position, link in enumerate(links, start=1):
        # No newline here: the downloader's own message completes the line.
        print(f"{position}/{total}", end=" ")
        download_file_gdrive(index_url, link, dest_dir)

    return

### Downloading test dataset



In [None]:
# Download every .nc file linked from the ClimateNet test index page into the 'Test' sub-folder.
download_climate_net('https://portal.nersc.gov/project/ClimateNet/climatenet_new/test/', 'Test')

1/61 Download complete: data-2011-06-01-01-1_0.nc – Size: 63744786 bytes.
2/61 Download complete: data-2011-06-03-01-1_0.nc – Size: 63744786 bytes.
3/61 Download complete: data-2011-06-03-01-1_1.nc – Size: 63744786 bytes.
4/61 Download complete: data-2011-06-16-01-1_0.nc – Size: 63744786 bytes.
5/61 Download complete: data-2011-07-26-01-1_0.nc – Size: 63744786 bytes.
6/61 Download complete: data-2011-07-26-01-1_1.nc – Size: 63744786 bytes.
7/61 Download complete: data-2011-07-29-01-1_0.nc – Size: 63744786 bytes.
8/61 Download complete: data-2011-08-08-01-1_0.nc – Size: 63744786 bytes.
9/61 Download complete: data-2011-08-08-01-1_1.nc – Size: 63744786 bytes.
10/61 Download complete: data-2011-09-06-01-1_0.nc – Size: 63744786 bytes.
11/61 Download complete: data-2011-09-06-01-1_1.nc – Size: 63744786 bytes.
12/61 Download complete: data-2011-09-09-01-1_0.nc – Size: 63744786 bytes.
13/61 Download complete: data-2011-09-09-01-1_1.nc – Size: 63744786 bytes.
14/61 Download complete: data-2011

## Downloading train dataset

In [None]:
# Download every .nc file linked from the ClimateNet train index page into the 'Train' sub-folder.
download_climate_net('https://portal.nersc.gov/project/ClimateNet/climatenet_new/train/', 'Train')

1/398 Download complete: data-1996-06-09-01-1_0.nc – Size: 63744786 bytes.
2/398 Download complete: data-1996-07-11-01-1_0.nc – Size: 63744786 bytes.
3/398 Download complete: data-1996-07-18-01-1_0.nc – Size: 63744786 bytes.
4/398 Download complete: data-1996-07-18-01-1_1.nc – Size: 63744786 bytes.
5/398 Download complete: data-1996-07-18-01-1_2.nc – Size: 63744786 bytes.
6/398 Download complete: data-1996-09-01-01-1_0.nc – Size: 63744786 bytes.
7/398 Download complete: data-1996-09-01-01-1_1.nc – Size: 63744786 bytes.
8/398 Download complete: data-1996-09-12-01-1_0.nc – Size: 63744786 bytes.
9/398 Download complete: data-1996-09-26-01-1_0.nc – Size: 63744786 bytes.
10/398 Download complete: data-1996-09-26-01-1_1.nc – Size: 63744786 bytes.
11/398 Download complete: data-1996-10-03-01-1_0.nc – Size: 63744786 bytes.
12/398 Download complete: data-1996-10-03-01-1_1.nc – Size: 63744786 bytes.
13/398 Download complete: data-1997-06-01-01-1_0.nc – Size: 63744786 bytes.
14/398 Download compl

# Upload to S3

In [None]:
! pip install boto3
! pip install --upgrade sagemaker
import os
import boto3


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Local folders, under the Drive mount point, that hold the files to upload.
train_data_path = "/content/drive/MyDrive/Stanford/CS230/Code/Data/train"
test_data_path = "/content/drive/MyDrive/Stanford/CS230/Code/Data/test"

# Read the AWS credentials from the environment instead of pasting them into
# the notebook, where they would be saved and shared along with it.
# Set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before running this cell;
# when a variable is unset, None is passed and boto3 falls back to its
# default credential lookup.

session = boto3.Session(
    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
)

s3 = session.resource('s3')


In [None]:
def upload(file_path, s3_path):
    """Upload every file under ``file_path`` to the "cs230-climatenet" bucket.

    Args:
        file_path: Local directory to walk (sub-directories included).
        s3_path: Key prefix; each object is stored as
            ``<s3_path>/<path relative to file_path>``, so a file directly
            inside ``file_path`` is stored as ``<s3_path>/<file name>``.

    Uses the module-level ``s3`` resource. Prints one progress line per file.
    """
    bucket = s3.Bucket("cs230-climatenet")

    # Gather (local path, object key, file name) first so the progress
    # counter covers the whole tree rather than restarting in each directory.
    pending = []
    for root, dirs, files in os.walk(file_path):
        dirs.sort()  # deterministic traversal order
        for file in sorted(files):
            # Build the path from `root`, not `file_path`: for a file inside
            # a sub-directory the two differ and the latter does not exist.
            local_path = os.path.join(root, file)
            relative = os.path.relpath(local_path, file_path).replace(os.sep, "/")
            pending.append((local_path, f"{s3_path}/{relative}", file))

    for file_num, (local_path, key, file) in enumerate(pending, start=1):
        bucket.upload_file(local_path, key)
        print(f"{file} file  {file_num} of {len(pending)} uploaded")

In [None]:
# Upload the files found under train_data_path to the bucket, using "train" as the key prefix.
upload(train_data_path, "train")

data-1996-07-11-01-1_0.nc file  0 of 398 uploaded
data-1996-07-18-01-1_0.nc file  1 of 398 uploaded
data-1996-06-09-01-1_0.nc file  2 of 398 uploaded
data-1996-07-18-01-1_1.nc file  3 of 398 uploaded
data-1996-07-18-01-1_2.nc file  4 of 398 uploaded
data-1996-09-01-01-1_0.nc file  5 of 398 uploaded
data-1996-09-01-01-1_1.nc file  6 of 398 uploaded
data-1996-09-12-01-1_0.nc file  7 of 398 uploaded
data-1996-09-26-01-1_0.nc file  8 of 398 uploaded
data-1996-09-26-01-1_1.nc file  9 of 398 uploaded
data-1996-10-03-01-1_0.nc file  10 of 398 uploaded
data-1996-10-03-01-1_1.nc file  11 of 398 uploaded
data-1997-06-01-01-1_0.nc file  12 of 398 uploaded
data-1997-06-02-01-1_0.nc file  13 of 398 uploaded
data-1997-06-05-01-1_0.nc file  14 of 398 uploaded
data-1997-06-18-01-1_0.nc file  15 of 398 uploaded
data-1997-07-09-01-1_0.nc file  16 of 398 uploaded
data-1997-07-09-01-1_1.nc file  17 of 398 uploaded
data-1997-08-14-01-1_0.nc file  18 of 398 uploaded
data-1997-08-14-01-1_1.nc file  19 of 398

In [None]:
# Upload the files found under test_data_path to the bucket, using "test" as the key prefix.
upload(test_data_path, "test")

data-2011-06-01-01-1_0.nc file  0 of 61 uploaded
data-2011-06-03-01-1_0.nc file  1 of 61 uploaded
data-2011-06-03-01-1_1.nc file  2 of 61 uploaded
data-2011-06-16-01-1_0.nc file  3 of 61 uploaded
data-2011-07-26-01-1_0.nc file  4 of 61 uploaded
data-2011-08-08-01-1_1.nc file  5 of 61 uploaded
data-2011-07-26-01-1_1.nc file  6 of 61 uploaded
data-2011-09-06-01-1_0.nc file  7 of 61 uploaded
data-2011-07-29-01-1_0.nc file  8 of 61 uploaded
data-2011-09-06-01-1_1.nc file  9 of 61 uploaded
data-2011-08-08-01-1_0.nc file  10 of 61 uploaded
data-2011-09-09-01-1_0.nc file  11 of 61 uploaded
data-2011-09-09-01-1_1.nc file  12 of 61 uploaded
data-2011-09-09-01-1_2.nc file  13 of 61 uploaded
data-2011-09-09-01-1_3.nc file  14 of 61 uploaded
data-2011-09-12-01-1_0.nc file  15 of 61 uploaded
data-2011-09-30-01-1_0.nc file  16 of 61 uploaded
data-2011-09-30-01-1_1.nc file  17 of 61 uploaded
data-2011-09-30-01-1_2.nc file  18 of 61 uploaded
data-2011-09-30-01-1_3.nc file  19 of 61 uploaded
data-2011-