https://www.youtube.com/watch?v=XGUS6DYZfCc&list=PLLxyyob7YmEE8S3QDs1PZQkiBxA4zn_Gx&index=2

In [176]:
import requests
import numpy as np
import os

from functools import partial
from multiprocessing import Pool
import re
from typing import List

# https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
import requests
from bs4 import BeautifulSoup

In [183]:
# `sys` is needed for the sys.path manipulation below; it must be imported
# here so the cell works on a fresh kernel.
import sys

import constants
# Attribute access only: raises AttributeError early if the constants module
# does not define the data directory.
constants.LONGRUNMIP_DIR
# Make the project's helper modules importable. The guard keeps the cell
# idempotent so re-running it does not add duplicate entries to sys.path.
if constants.MODULE_DIR not in sys.path:
    sys.path.append(constants.MODULE_DIR)
from utils import pprint_list

# Getting the List of All File Names

In [98]:
def list_files_with_extension(url: str, file_extension: str='') -> List[str]:
    '''
    Gets all the hyper-linked items ('a' - <a> ... </a>) from an url.

    The page is requested with HTTP basic auth using the module-level
    ``username`` and ``password``.

    Parameters
    ----------
    url: str
        The url to get the files from
    file_extension: str
        The extensions on the files being looked for
        (e.g. .nc for netcdf files). The default '' keeps every link.

    Returns
    -------
    - List of all the paths to the files being downloaded, each built as
      ``<url>/<href>``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. 401 or 404).
        Without this check the error page itself would be parsed and an
        empty or meaningless list returned.
    '''
    # Get the page and fail loudly on an HTTP error status
    response = requests.get(url, auth=(username, password))
    response.raise_for_status()

    # Turn the page text into parsed html
    soup = BeautifulSoup(response.text, 'html.parser')

    # Avoid a double slash in the result when the caller passes 'https://.../dir/'
    base_url = url.rstrip('/')

    # Each item in soup is a contained piece of html (e.g. <h1> ... </h1>,
    # <p> ... </p>). Only hyper-linked items (<a> ... </a>) are inspected.
    # href=True skips anchors without an href attribute (for those,
    # node.get('href') is None and calling .endswith on it would raise).
    # Then only the links ending in 'file_extension' are kept.
    files = [f'{base_url}/{node["href"]}'
             for node in soup.find_all('a', href=True)
             if node['href'].endswith(file_extension)]
    return files

In [151]:
def download_longrunmip_nc_file(fpath: str, SAVE_DIRECTORY: str='', debug=True) -> None:
    '''
    Downloads the netcdf file on longrunmip website

    The request uses HTTP basic auth with the module-level ``username`` and
    ``password``. The file is saved under its own base name inside
    SAVE_DIRECTORY, overwriting any existing file with that name.

    Parameters
    ----------
    fpath: str
        The url of the file to download
    SAVE_DIRECTORY: str
        The directory the file is written to (default: current directory)
    debug: bool
        If True, print a message before and after the download

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. This is checked before
        the output file is opened, so an error page is never saved as a
        .nc file.
    '''
    # The name of the file
    fname = os.path.basename(fpath)
    if debug:
        print(f'{fname} -> {SAVE_DIRECTORY}')

    # stream=True avoids loading the whole file into memory before it is
    # written to disk.
    with requests.get(fpath, auth=(username, password),
                      allow_redirects=True, stream=True) as r:
        r.raise_for_status()

        # Saving the file in 1 MiB chunks
        with open(os.path.join(SAVE_DIRECTORY, fname), 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    if debug:
        print(f'{fname} complete')

In [177]:
def experiment_is_needed(fname: str, needed_experiments: List[str]) -> bool:
    '''
    Checks if the experiment is actually needed. If not return False

    Parameters
    ----------
    fname: str
        The file name (or url) to check
    needed_experiments: List[str]
        The experiment names to look for. They are matched literally.

    Returns
    -------
    - True if any experiment name occurs in fname with at least one word
      character (letter, digit or underscore) directly before and after it,
      otherwise False. An empty list of experiments gives False.
    '''
    # The raw f-string keeps '\w' from being read as an (invalid) string escape.
    # re.escape stops characters such as '.' or '+' in an experiment name
    # from being interpreted as regex syntax.
    return any(
        re.search(rf'\w+{re.escape(n_exp)}\w+', fname) is not None
        for n_exp in needed_experiments
    )

In [178]:
# Username and password for longrunmip.
# The LONGRUNMIP_USERNAME / LONGRUNMIP_PASSWORD environment variables take
# precedence when set, so the credentials can be changed without editing the
# notebook; otherwise the values below are used.
username = os.environ.get('LONGRUNMIP_USERNAME', 'longrunmip')
password = os.environ.get('LONGRUNMIP_PASSWORD', 'data4you')

# Short variable name -> human readable description.
# NOTE(review): 'Sea ice are fraction' looks like a typo for 'area'; the
# string is left unchanged in case the label is matched elsewhere.
variables = {'tas': 'Near-surface air temperature', 'pr':'Precipitation', 'netTOA': 'Net TOA flux',
            'sic': "Sea ice are fraction (monthly)", 'psl': 'Sea level pressure'}

In [196]:
# Short name of the variable to download in this run. It is used both in the
# remote url below and in the local save directory.
variable = 'psl' # Completed: tas, pr, sic, netTOA
# Remote directory that holds the regridded model output for this variable.
url = f'https://data.iac.ethz.ch/longrunmip/modeloutput/regrid/{variable}'
# Display the url as the cell output
url

'https://data.iac.ethz.ch/longrunmip/modeloutput/regrid/psl'

In [197]:
# Only netcdf files are wanted from the directory listing. This name was not
# defined anywhere in the notebook, so the cell failed on a fresh kernel.
file_extension = '.nc'

# Every link on the page that ends with the extension
all_files = list_files_with_extension(url, file_extension)
# Keep only the control and abrupt4x experiments
files = [f for f in all_files if experiment_is_needed(f, ['picontrol', 'control', 'abrupt4x'])]
pprint_list(files)

lenght = 28
 0. https://data.iac.ethz.ch/longrunmip/modeloutput/regrid/psl/psl_ann_FAMOUS_abrupt4x_3000_g025.nc
 1. https://data.iac.ethz.ch/longrunmip/modeloutput/regrid/psl/psl_ann_FAMOUS_control_3000_g025.nc


In [198]:
# Local directory the files are saved to: <LONGRUNMIP_DIR>/<variable>/regrid
SAVE_DIRECTORY = os.path.join(constants.LONGRUNMIP_DIR, variable, 'regrid')
# Display the path as the cell output
SAVE_DIRECTORY

'/g/data/w40/ab2313/PhD/longrunmip/psl/regrid'

In [199]:
# Create the save directory if it does not exist yet.
# os.makedirs also creates the missing parent (<LONGRUNMIP_DIR>/<variable>),
# which os.mkdir does not: with os.mkdir the first download of a new variable
# fails with FileNotFoundError. exist_ok=True keeps the cell re-runnable.
os.makedirs(SAVE_DIRECTORY, exist_ok=True)

In [None]:
%%time
# Bind the save directory and debug flag once, so that each call below only
# needs the url of the file.
download_longrunmip_nc_file_with_dir = partial(
    download_longrunmip_nc_file,
    SAVE_DIRECTORY=SAVE_DIRECTORY,
    debug=True,
)

# Download the files one after the other (each entry of `files` is a full url)
for file_url in files:
    download_longrunmip_nc_file_with_dir(file_url)

psl_ann_FAMOUS_abrupt4x_3000_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/psl/regrid
psl_ann_FAMOUS_abrupt4x_3000_g025.nc complete
psl_ann_FAMOUS_control_3000_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/psl/regrid
psl_ann_FAMOUS_control_3000_g025.nc complete
psl_ann_HadGEM2ES_abrupt4x_1299_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/psl/regrid
psl_ann_HadGEM2ES_abrupt4x_1299_g025.nc complete
psl_ann_HadGEM2ES_control_239_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/psl/regrid
psl_ann_HadGEM2ES_control_239_g025.nc complete
psl_ann_MPIESM11_abrupt4x_4459_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/psl/regrid
psl_ann_MPIESM11_abrupt4x_4459_g025.nc complete
psl_ann_MPIESM11_control_2000_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/psl/regrid
psl_ann_MPIESM11_control_2000_g025.nc complete
psl_mon_CCSM3_abrupt4x_2120_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/psl/regrid
psl_mon_CCSM3_abrupt4x_2120_g025.nc complete
psl_mon_CCSM3_control_1530_g025.nc -> /g/data/w40/ab2313/PhD/longrunmip/p

In [135]:
# [download_longrunmip_nc_file_with_dir(fname) for fname in files]
# with Pool() as pool:
#     pool.map(download_longrunmip_nc_file_with_dir, files)
# [download_longrunmip_nc_file(fpath, SAVE_DIRECTORY) for fpath in files]