In [1]:
# Beautiful Soup and requests are used for web scraping. Here, they are used to obtain a link to every NLDAS file on hydro1 (link below).

from bs4 import BeautifulSoup as bs
from bs4 import SoupStrainer
only_contentUrl_tags = SoupStrainer(itemprop="contentUrl")
import requests


#numpy is the math / matrix handling package.
import numpy as np

#used for getting lists of files in folders
import glob

#for moving / deleting / renaming things on operating system level
import os
import sys
import shutil

In [64]:
# IPython automagic: show the current working directory. All relative paths used in
# this notebook ('ncfiles/', 'nc_sorted/', the .npy checkpoints) resolve against it.
pwd

'/work/albertl_uri_edu/fluxtoflow/nldas'

In [66]:
# Builds the base URL of each yearly NLDAS directory that will be scraped.

year = np.arange(2015,2023,1)
print(year)

addy = [
    f'https://hydro1.gesdisc.eosdis.nasa.gov/data/NLDAS/NLDAS_VIC0125_H.2.0/{x}'
    for x in year
]


[2015 2016 2017 2018 2019 2020 2021 2022]


In [67]:
# Builds one directory URL per day for every yearly URL in `addy`.
# Day folders are named by zero-padded day-of-year ("001", "002", ...).
#
# Leap years are detected from the year number at the end of each URL rather than
# from its position in `addy`, so changing the year range cannot silently give a
# year the wrong number of days. 2022 is only partially covered, so it is limited
# to its first 60 days.
import calendar

PARTIAL_YEAR = 2022           # the year that is not fully covered

day = np.arange(1,367,1)      # day numbers for a leap year (1..366)
day1 = np.arange(1,366,1)     # day numbers for a common year (1..365)
day2 = np.arange(1,61,1)      # day numbers for the partial year (1..60)

daylink = []
for x in addy:
    year_number = int(x.rsplit('/', 1)[-1])
    if year_number == PARTIAL_YEAR:
        days_in_year = day2
    elif calendar.isleap(year_number):
        days_in_year = day
    else:
        days_in_year = day1
    daylink.extend(x + "/" + str(y).zfill(3) for y in days_in_year)

In [58]:
# arrow is a third-party date/time package; `a.now(tz)` returns the current time in
# the given timezone. It is used in the scraping cell to print start/end timestamps.
import arrow as a
a.now('US/Eastern')

<Arrow [2022-05-03T11:05:10.282080-04:00]>

In [69]:
# Scrapes the directory listing of every day URL in `daylink` and collects the full
# URL of every link ending in '.nc'. Start and end timestamps are printed because
# this cell is slow.
#
# - One requests.Session is reused so connections to the server are kept alive.
# - Each request has a timeout and its HTTP status is checked. A page that fails is
#   reported and remembered in `failed_pages` instead of silently contributing zero links.
# - Only anchors that actually carry an href are inspected (href=True), so an
#   anchor without one cannot raise KeyError.
#
# The list is saved with any duplicates still present; they are removed in a later cell.

print(a.now('US/Eastern'))
nclist = []
failed_pages = []
with requests.Session() as http:
    for x in daylink:
        try:
            page = http.get(x, timeout=60)
            page.raise_for_status()
        except requests.exceptions.RequestException as err:
            failed_pages.append(x)
            print('error', x, err)
            continue
        soup = bs(page.content,"html.parser")
        for y in soup.find_all("a", href=True):
            if y['href'].endswith('.nc'):
                nclist.append(x + "/" + y['href'])
print(a.now('US/Eastern'))
if failed_pages:
    print(f'{len(failed_pages)} day pages could not be read; see failed_pages')
nclist = np.asarray(nclist)
np.save('nclist.npy',nclist)

2022-05-03T11:59:26.461374-04:00
2022-05-03T12:13:43.017317-04:00


In [72]:
# drop repeated links, keeping the first occurrence of each
# (dict keys are unique and remember insertion order)
nclist = [*dict.fromkeys(nclist)]

In [74]:
# Checkpoint: report how many unique links were found and save them as a numpy file,
# so the slow scrape does not have to be repeated.
# The count is printed explicitly because a bare expression is only displayed when
# it is the last line of a cell.

print(len(nclist))
np.save('nclistnodupes.npy',nclist)

In [103]:
# reload the de-duplicated list of file URLs from the checkpoint written above
lis = np.load('nclistnodupes.npy')

In [117]:
# sanity check: dropping the first 79 characters of a URL leaves only the file name
# (see the output below); the same [79:] slice is used by the download cell
lis[0][79:]

'NLDAS_VIC0125_H.A20150101.0000.020.nc'

In [1]:
# loads list of files to download
# NOTE(review): the trailing [:3] keeps only the first three URLs, so everything below
# in this cell works on three files. This looks like a leftover test limit -- confirm,
# and remove the slice for a full download.
lis = np.load('nclistnodupes.npy')[:3]


# this script obtained from NASA Earthdata's wiki
# https://wiki.earthdata.nasa.gov/
# please note that to run, you'll need to include your NASA Earthdata credentials

class SessionWithHeaderRedirection(requests.Session):
    """requests.Session that keeps the Authorization header across redirects
    to or from the NASA Earthdata login host.

    The username/password pair is stored as the session's basic-auth credentials.
    """

    AUTH_HOST = 'urs.earthdata.nasa.gov'

    def __init__(self, username, password):
        super().__init__()
        self.auth = (username, password)

    def rebuild_auth(self, prepared_request, response):
        """Override of the library hook that is called when a redirect is followed.

        The Authorization header is removed only when the redirect changes host
        and neither the original host nor the new host is AUTH_HOST.
        """
        headers = prepared_request.headers
        if 'Authorization' not in headers:
            return

        source_host = requests.utils.urlparse(response.request.url).hostname
        target_host = requests.utils.urlparse(prepared_request.url).hostname

        host_changed = source_host != target_host
        involves_auth_host = self.AUTH_HOST in (source_host, target_host)
        if host_changed and not involves_auth_host:
            del headers['Authorization']


# create session with the user credentials that will be used to authenticate access to the data

###YOU THE READER OF FLUXTOFLOW NEED A NASA EARTHDATA ACCOUNT!!!
# Credentials are never written into the notebook. They are taken from the
# EARTHDATA_USERNAME / EARTHDATA_PASSWORD environment variables when set; otherwise
# you are prompted for them (the password prompt does not echo what you type).
import getpass
import os

username = os.environ.get('EARTHDATA_USERNAME') or input('NASA Earthdata username: ')
password = os.environ.get('EARTHDATA_PASSWORD') or getpass.getpass('NASA Earthdata password: ')
session = SessionWithHeaderRedirection(username, password)



# Splits the URLs in `lis` into files already present in the destination folder
# (`dled`) and files still to fetch (`not_dled`), so nothing is downloaded twice.
# File names are compared through a set, so the check stays fast even with tens of
# thousands of files.
destination_files = 'ncfiles/*.nc'
y = [os.path.basename(eeks) for eeks in glob.glob(destination_files)]
already_present = set(y)
not_dled = []
dled = []
for x in lis:
    # the file name is whatever follows the last '/' of the URL
    if str(x).rsplit('/', 1)[-1] in already_present:
        dled.append(x)
    else:
        not_dled.append(x)


# Keeps attempting to download every file in `not_dled` until none are missing, or
# until MAX_PASSES passes over the list have been made. The cap stops a permanent
# error (for example wrong credentials) from looping forever.
# The HTTP status code of each request is printed; see
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
#
# Each file is streamed to '<name>.part' and only renamed to '<name>.nc' once the
# whole body has been written. An interrupted transfer therefore never leaves a
# truncated .nc file that the next pass would count as downloaded.
import time

MAX_PASSES = 10               # passes over the list before giving up
RETRY_PAUSE_SECONDS = 30      # pause between passes, to avoid hammering the server
CHUNK_BYTES = 1024 * 1024     # size of each streamed chunk written to disk

os.makedirs('ncfiles', exist_ok=True)

for attempt in range(1, MAX_PASSES + 1):
    for i in not_dled:
        target = f'ncfiles/{i[79:]}'
        partial = target + '.part'
        try:
            # submit the request using the session; the connection is released on exit
            with session.get(i, stream=True, timeout=60) as response:
                print(response.status_code)
                # raise an exception in case of http errors
                response.raise_for_status()
                # save the file
                with open(partial, 'wb') as fd:
                    for data in response.iter_content(chunk_size=CHUNK_BYTES):
                        fd.write(data)
            os.replace(partial, target)
        except requests.exceptions.RequestException as e:
            # HTTP errors, timeouts and dropped connections: report and retry next pass
            print('error',e)
            if os.path.exists(partial):
                os.remove(partial)

    # re-scan the destination folder once per pass and re-split the list
    y = {os.path.basename(eeks) for eeks in glob.glob(destination_files)}
    dled = [x for x in lis if str(x[79:]) in y]
    not_dled = [x for x in lis if str(x[79:]) not in y]

    print(f'dled: \t \n \t')
    for x in dled:
        print('\t',x,'\t')
    print(f'not dled: \t \n \t')
    for x in not_dled:
        print('\t',x,'\t')

    if len(not_dled) == 0:
        # leave the loop instead of calling sys.exit(), which raises SystemExit
        print('no more left to download')
        break

    if attempt < MAX_PASSES:
        time.sleep(RETRY_PAUSE_SECONDS)
else:
    print(f'gave up after {MAX_PASSES} passes; {len(not_dled)} files still missing')

In [6]:
# paths of all .nc files currently in ncfiles/; the grouping cell below slices
# characters 25:29 of each path to read the year.
# NOTE(review): glob is not documented to return a sorted list -- do not rely on order.
x = glob.glob('ncfiles/*.nc')

In [None]:
# the below scripts are samples of a workflow to place the hourly files into their own folders on a per day basis.

In [18]:
# Splits the downloaded file paths in `x` into one list per year and prints the size
# of each list. Characters 25:29 of every path hold the four-digit year.

x2015, x2016, x2017, x2018, x2019, x2020, x2021, x2022 = (
    [path for path in x if path[25:29] == str(label)]
    for label in range(2015, 2023)
)

print("".join(
    f"num {label}: {len(group)} \n"
    for label, group in zip(
        range(2015, 2023),
        (x2015, x2016, x2017, x2018, x2019, x2020, x2021, x2022),
    )
))

num 2015: 8760 
num 2016: 8784 
num 2017: 8760 
num 2018: 8760 
num 2019: 8760 
num 2020: 8784 
num 2021: 8760 
num 2022: 1440 



In [19]:
# the years covered by the download: 2015 through 2022 inclusive
year = np.arange(2015, 2023)
print(year)

[2015 2016 2017 2018 2019 2020 2021 2022]


In [20]:
# Creates one folder per year under nc_sorted/.
# os.makedirs(..., exist_ok=True) also creates nc_sorted/ itself when it is missing
# and does nothing when a folder already exists, so this cell can safely be re-run.
# A separate loop variable is used so the file list `x` is not overwritten.

for yr in year:
    os.makedirs(f'nc_sorted/{yr}', exist_ok=True)

In [25]:
# Moves the 2015 hourly files into their own folder.
# Every entry of x2015 is considered (the earlier x2015[1:] skipped the first file
# on a fresh run). Files whose source path no longer exists, because they were
# already moved, are skipped so the cell can be re-run.
for x in x2015:
    if os.path.exists(x):
        shutil.move(x,'nc_sorted/2015/')

In [28]:
# Moves the hourly files of 2016-2022 into the folder of their year.
# Each (file list, destination folder) pair is handled in turn, in year order.

for batch, folder in (
    (x2016, 'nc_sorted/2016/'),
    (x2017, 'nc_sorted/2017/'),
    (x2018, 'nc_sorted/2018/'),
    (x2019, 'nc_sorted/2019/'),
    (x2020, 'nc_sorted/2020/'),
    (x2021, 'nc_sorted/2021/'),
    (x2022, 'nc_sorted/2022/'),
):
    for x in batch:
        shutil.move(x, folder)

In [38]:
# Creates the list of day labels (characters 36:40 of each path, the MMDD part of
# the file name) for every 2015 file.
# The glob result is sorted first so the list has a deterministic order, and so
# entry i corresponds to file i of sorted(glob.glob('nc_sorted/2015/*.nc')), which
# is how the move cell indexes this list.

days2015 = [x[36:40] for x in sorted(glob.glob('nc_sorted/2015/*.nc'))]

In [41]:
# one entry per distinct day, in first-seen order
# (dict keys are unique and remember insertion order)

days2015set = [*dict.fromkeys(days2015)]

In [42]:
# number of distinct day labels found among the 2015 files
len(days2015set)

365

In [43]:
# Makes one folder per year of the observed timeframe under nc_sorted_dayfolders/.
# os.makedirs(..., exist_ok=True) also creates the parent folder when it is missing
# and does nothing when a folder already exists, so the cell can be re-run.

for yr in year:
    os.makedirs(f'nc_sorted_dayfolders/{yr}', exist_ok=True)

In [44]:
# Makes one folder per day label found in 2015.
# os.makedirs(..., exist_ok=True) does nothing when a folder already exists and
# creates missing parent folders, so the cell can be re-run.

for day_label in days2015set:
    os.makedirs(f'nc_sorted_dayfolders/2015/{day_label}', exist_ok=True)

In [45]:
# Moves the 2015 files into their correct day folder.
# The day label is read from each file's own path (characters 36:40, the same slice
# used to build the day folders). It is deliberately not looked up by position in
# `days2015`: that list was built from an unsorted glob, so its order need not match
# the sorted order used here, and files could land in the wrong day folder.

for x in sorted(glob.glob('nc_sorted/2015/*.nc')):
    shutil.move(x,f'nc_sorted_dayfolders/2015/{x[36:40]}/')

In [46]:
# Gets all 2016 files, creates a folder for each day in 2016, and moves every file
# into the folder of its own day.
# The file list is globbed once and sorted, and the day labels are derived from that
# same list, so each file is paired with its own label. Previously the labels came
# from a separate, unsorted glob and were indexed by sorted position.

paths2016 = sorted(glob.glob('nc_sorted/2016/*.nc'))
days2016 = [x[36:40] for x in paths2016]      # MMDD part of each file name
days2016set = list(dict.fromkeys(days2016))   # distinct days, first-seen order
for x in days2016set:
    # exist_ok makes the cell re-runnable; missing parent folders are created too
    os.makedirs(f'nc_sorted_dayfolders/2016/{x}', exist_ok=True)

for x, day_label in zip(paths2016, days2016):
    shutil.move(x,f'nc_sorted_dayfolders/2016/{day_label}/')

In [48]:
# Gets all remaining files (2017-2022), creates a folder for each day in their year,
# and moves every file into the folder of its own day.


def sort_year_into_day_folders(year_label, src_root='nc_sorted', dst_root='nc_sorted_dayfolders'):
    """Move every .nc file of one year into a per-day sub-folder.

    The day label is the MMDD part of the date stamp in the file name
    (e.g. 'NLDAS_VIC0125_H.A20170101.0000.020.nc' -> '0101'). It is read from each
    file's own name, so a file can never be paired with another file's day.

    Parameters
    ----------
    year_label : int or str
        Name of the year folder below `src_root` and `dst_root`.
    src_root : str
        Folder holding one sub-folder of .nc files per year.
    dst_root : str
        Folder below which '<year>/<MMDD>/' folders are created.

    Returns
    -------
    (days, dayset) : tuple of lists
        `days` has one day label per file, in sorted file order; `dayset` has the
        distinct labels in first-seen order.
    """
    paths = sorted(glob.glob(f'{src_root}/{year_label}/*.nc'))
    # second dot-separated field is 'AYYYYMMDD'; characters 5:9 of it are MMDD
    days = [os.path.basename(path).split('.')[1][5:9] for path in paths]
    dayset = list(dict.fromkeys(days))

    for day_label in dayset:
        # exist_ok makes re-runs harmless; missing parent folders are created too
        os.makedirs(f'{dst_root}/{year_label}/{day_label}', exist_ok=True)

    for path, day_label in zip(paths, days):
        shutil.move(path, f'{dst_root}/{year_label}/{day_label}/')

    return days, dayset


days2017, days2017set = sort_year_into_day_folders(2017)
days2018, days2018set = sort_year_into_day_folders(2018)
days2019, days2019set = sort_year_into_day_folders(2019)
days2020, days2020set = sort_year_into_day_folders(2020)
days2021, days2021set = sort_year_into_day_folders(2021)
days2022, days2022set = sort_year_into_day_folders(2022)

In [49]:
# IPython automagic: show the current working directory, against which the relative
# folder names used in this notebook resolve.
pwd

'/work/albertl_uri_edu/fluxtoflow/nldas'

In [53]:
# shows all folders in 2015
# NOTE(review): this lists nc_sorted/2015/, while the cells above create the day
# folders under nc_sorted_dayfolders/2015/ -- confirm which folder name is current.

glob.glob('nc_sorted/2015/*')

['nc_sorted/2015/0101',
 'nc_sorted/2015/0102',
 'nc_sorted/2015/0103',
 'nc_sorted/2015/0104',
 'nc_sorted/2015/0105',
 'nc_sorted/2015/0106',
 'nc_sorted/2015/0107',
 'nc_sorted/2015/0108',
 'nc_sorted/2015/0109',
 'nc_sorted/2015/0110',
 'nc_sorted/2015/0111',
 'nc_sorted/2015/0112',
 'nc_sorted/2015/0113',
 'nc_sorted/2015/0114',
 'nc_sorted/2015/0115',
 'nc_sorted/2015/0116',
 'nc_sorted/2015/0117',
 'nc_sorted/2015/0118',
 'nc_sorted/2015/0119',
 'nc_sorted/2015/0120',
 'nc_sorted/2015/0121',
 'nc_sorted/2015/0122',
 'nc_sorted/2015/0123',
 'nc_sorted/2015/0124',
 'nc_sorted/2015/0125',
 'nc_sorted/2015/0126',
 'nc_sorted/2015/0127',
 'nc_sorted/2015/0128',
 'nc_sorted/2015/0129',
 'nc_sorted/2015/0130',
 'nc_sorted/2015/0131',
 'nc_sorted/2015/0201',
 'nc_sorted/2015/0202',
 'nc_sorted/2015/0203',
 'nc_sorted/2015/0204',
 'nc_sorted/2015/0205',
 'nc_sorted/2015/0206',
 'nc_sorted/2015/0207',
 'nc_sorted/2015/0208',
 'nc_sorted/2015/0209',
 'nc_sorted/2015/0210',
 'nc_sorted/2015