https://www.youtube.com/watch?v=XGUS6DYZfCc&list=PLLxyyob7YmEE8S3QDs1PZQkiBxA4zn_Gx&index=2

In [23]:
import requests
import numpy as np
import os, sys

from functools import partial
from multiprocessing import Pool
import re
from typing import List

# https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
import requests
from bs4 import BeautifulSoup

In [24]:
import constants
# Make the project's own modules importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
if constants.MODULE_DIR not in sys.path:
    sys.path.append(constants.MODULE_DIR)
from utils import pprint_list

# Getting the List of All File Names

In [25]:
def list_files_with_extension(url: str, file_extension: str='') -> List[str]:
    '''
    Gets all the hyper-linked items ('a' - <a> ... </a>) from an url.

    The request is authenticated with the module-level `username` and
    `password`, which must be defined before this function is called.

    Parameters
    ----------
    url: str
        The url to get the files from
    file_extension: str
        The extension of the files being looked for
        (e.g. .nc for netcdf files). The default '' keeps every link.

    Returns
    -------
    - List of the urls (url + '/' + href) of all matching links on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. wrong credentials),
        instead of silently returning an empty list.
    '''
    # Get the page and fail loudly on an HTTP error
    response = requests.get(url, auth=(username, password))
    response.raise_for_status()
    # Turn the text of the page into html code
    soup = BeautifulSoup(response.text, 'html.parser')

    # Avoid a double slash when the url already ends with one
    base_url = url.rstrip('/')

    # Loop through all the hyper-linked items (find_all('a')) in soup.
    # href=True skips anchors that have no href attribute, which would
    # otherwise give None and break the .endswith check.
    # Then, only keep the items ending with 'file_extension'.
    files = [base_url + '/' + node['href']
             for node in soup.find_all('a', href=True)
             if node['href'].endswith(file_extension)]
    return files

In [26]:
def download_longrunmip_nc_file(fpath: str, SAVE_DIRECTORY: str='', debug=True) -> None:
    '''
    Downloads the netcdf file on longrunmip website

    The request is authenticated with the module-level `username` and
    `password`, which must be defined before this function is called.

    Parameters
    ----------
    fpath: str
        The url of the file to download
    SAVE_DIRECTORY: str
        The directory the file is saved in, under the base name of fpath.
        The default '' saves relative to the current working directory.
    debug: bool
        If True, print the file name before and after the download

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Nothing is written
        in that case, so an error page is never saved as a .nc file.
    '''
    # The name of the file
    fname = os.path.basename(fpath)
    save_path = os.path.join(SAVE_DIRECTORY, fname)
    # Download into a temporary name so an interrupted transfer never
    # leaves a truncated file under the final name
    partial_path = save_path + '.part'
    if debug:
        print(f'{fname} -> {SAVE_DIRECTORY}')

    # stream=True avoids holding the whole file in memory
    with requests.get(fpath, auth=(username, password),
                      allow_redirects=True, stream=True) as r:
        r.raise_for_status()

        # Saving the file chunk by chunk
        with open(partial_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    # Only a complete download gets the final name
    os.replace(partial_path, save_path)
    if debug:
        print(f'{fname} complete')

In [27]:
def experiment_is_needed(fname: str, needed_experiments: List[str]) -> bool:
    '''
    Checks if the experiment is actually needed. If not return False

    Parameters
    ----------
    fname: str
        The file name (or url) to check
    needed_experiments: List[str]
        The experiment names to look for. A name only counts as found when
        it has at least one word character (letter, digit or '_') directly
        on both sides of it in fname.

    Returns
    -------
    - True if any of the needed experiments is found in fname, else False
      (also False for an empty list).
    '''
    # Raw string: '\w' is not a valid escape in a normal string literal.
    # re.escape: the experiment name is matched literally, so characters
    # such as '.' or '+' in a name are not read as regex operators.
    return any(
        re.search(rf'\w+{re.escape(n_exp)}\w+', fname) is not None
        for n_exp in needed_experiments
    )

In [28]:
# Show the entries of constants.VARIABLE_INFO as a list
[*constants.VARIABLE_INFO]

['tas', 'pr', 'netTOA', 'sic', 'psl', 'tos', 'surf']

In [29]:
# Username and password for longrunmip.
# They are read from the LONGRUNMIP_USERNAME / LONGRUNMIP_PASSWORD environment
# variables so that the credentials are not stored in the notebook. When a
# variable is not set (or empty), the value is asked for interactively;
# getpass keeps the typed password out of the notebook output.
from getpass import getpass

username = os.environ.get('LONGRUNMIP_USERNAME') or input('longrunmip username: ')
password = os.environ.get('LONGRUNMIP_PASSWORD') or getpass('longrunmip password: ')

In [30]:
# The variable to download.
# Completed: tas, pr, netTOA, tas, psl, tos, surf
variable = 'tas'
# The page of the longrunmip website listing the regrid files of that variable
url = f'https://data.iac.ethz.ch/longrunmip/modeloutput/regrid/{variable}'
url

'https://data.iac.ethz.ch/longrunmip/modeloutput/regrid/tas'

In [32]:
# Only the netcdf files of these experiments are downloaded
needed_experiments = ['picontrol', 'control', 'abrupt4x']
files = [
    file_url
    for file_url in list_files_with_extension(url, file_extension='.nc')
    if experiment_is_needed(file_url, needed_experiments)
]
pprint_list(files)

lenght = 32
 0. https://data.iac.ethz.ch/longrunmip/modeloutput/regrid/tas/tas_ann_FAMOUS_abrupt4x_3000_g025.nc
 1. https://data.iac.ethz.ch/longrunmip/modeloutput/regrid/tas/tas_ann_FAMOUS_control_3000_g025.nc


In [33]:
# The files of a variable are saved in <LONGRUNMIP_DIR>/<variable>/regrid
save_directory_parts = (constants.LONGRUNMIP_DIR, variable, 'regrid')
SAVE_DIRECTORY = os.path.join(*save_directory_parts)
SAVE_DIRECTORY

'/g/data/w40/ab2313/PhD/longrunmip/tas/regrid'

In [35]:
# Number of entries already in the save directory.
# This cell comes before the one that creates the directory, so show 0
# instead of raising FileNotFoundError when the directory does not exist yet.
len(os.listdir(SAVE_DIRECTORY)) if os.path.isdir(SAVE_DIRECTORY) else 0

32

In [34]:
# Create the save directory. Unlike os.mkdir, os.makedirs also creates the
# missing parent directories (e.g. the directory of a new variable), and
# exist_ok=True makes re-running the cell a no-op when the directory exists.
os.makedirs(SAVE_DIRECTORY, exist_ok=True)

In [223]:
%%time
# Fix the save directory once, so that only the url changes between calls
download_longrunmip_nc_file_with_dir = partial(
    download_longrunmip_nc_file,
    SAVE_DIRECTORY=SAVE_DIRECTORY,
    debug=True,
)
# Download the files one after the other, printing the position of each
# file in the list in front of the debug output of the download
for index, file_url in enumerate(files):
    print(f'{index}: ', end='')
    download_longrunmip_nc_file_with_dir(file_url)

0: surf_ann_CCSM3_abrupt4x_2130_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/surf/regrid
surf_ann_CCSM3_abrupt4x_2130_g025.nc complete
1: surf_ann_CCSM3_control_1530_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/surf/regrid
surf_ann_CCSM3_control_1530_g025.nc complete
2: surf_ann_CESM104_abrupt4x_5900_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/surf/regrid
surf_ann_CESM104_abrupt4x_5900_g025.nc complete
3: surf_ann_CESM104_control_1000_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/surf/regrid
surf_ann_CESM104_control_1000_g025.nc complete
4: surf_ann_CNRMCM61_abrupt4x_1850_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/surf/regrid
surf_ann_CNRMCM61_abrupt4x_1850_g025.nc complete
5: surf_ann_CNRMCM61_control_2000_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/surf/regrid
surf_ann_CNRMCM61_control_2000_g025.nc complete
6: surf_ann_ECHAM5MPIOM_control_100_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/surf/regrid
surf_ann_ECHAM5MPIOM_control_100_g025.nc complete
7: surf_ann_FAMOUS_abrupt4x_3000_

In [135]:
# [download_longrunmip_nc_file_with_dir(fname) for fname in files]
# with Pool() as pool:
#     pool.map(download_longrunmip_nc_file_with_dir, files)
# [download_longrunmip_nc_file(fpath, SAVE_DIRECTORY) for fpath in files]