In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
from config import NDVI_DATA_URL
from netCDF4 import Dataset
import io
import xarray as xr
import collections
import aiohttp
import asyncio
import json

In [2]:
async def get_files(year, files):
    """Download several NDVI files of one year concurrently.

    Parameters
    ----------
    year : int or str
        Name of the year sub-directory on the NCEI server.
    files : iterable of str
        File names located inside that year directory.

    Returns
    -------
    contents : list of bytes
        Body of every file that was downloaded successfully, in request order.
    exceptions : list of str
        Full URL of every file that could not be downloaded (connection
        failure, HTTP error status, or error while reading the body).
    """
    # Build the URLs up front so a failed request can still be reported by URL:
    # with return_exceptions=True a failure is an exception object, which has
    # no ``.url`` attribute.
    urls = [
        f'https://www.ncei.noaa.gov/data/land-normalized-difference-vegetation-index/access/{year}/{file}'
        for file in files
    ]
    contents = []
    exceptions = []
    async with aiohttp.ClientSession() as session:
        responses = await asyncio.gather(
            *(session.get(url) for url in urls),
            return_exceptions=True)

        for url, response in zip(urls, responses):
            # The request itself failed: gather handed back the exception.
            if isinstance(response, BaseException):
                exceptions.append(url)
                continue
            try:
                # Treat 4xx/5xx as failures instead of storing an error page
                # as if it were NetCDF content.
                response.raise_for_status()
                contents.append(await response.read())
            except Exception:
                # Best effort: remember the URL so it can be retried later.
                exceptions.append(url)

    return contents, exceptions

In [None]:
exceptionsMap = collections.defaultdict(list) #hashmap to list every missed data from download
for year in range(2004, datetime.today().year + 1):
    print(year)
    if not str(year) in os.listdir(NDVI_DATA_URL):
        os.mkdir(f"{NDVI_DATA_URL}/{year}")
    r = requests.get(f"https://www.ncei.noaa.gov/data/land-normalized-difference-vegetation-index/access/{year}/")
    soup = BeautifulSoup(r.content, 'html.parser')
    tds = soup.find_all("td")
    files = []
    for td in tds:
        if td.text[-3:] == ".nc":
            files.append(td.text)

    #looks like we are limited in requests so we chunck them
    total_files = len(files)
    chunck = 18
    n_times_chunck = total_files // chunck
    
    #loop through the array in 24-hour chunks
    for n in range(n_times_chunck):
        #calculate start and end indices for this day
        start_idx = n * chunck       # 0, 18, 36, ...
        end_idx = start_idx + chunck   # 18, 36, 54, ...

        contents, exceptions = await get_files(year, files[start_idx:end_idx])

        exceptionsMap[year].extend(exceptions)

        datasets = [xr.open_dataset(io.BytesIO(content)) for content in contents]
        for dataset in datasets:
            subset = dataset.sel( #select subset for france only to save a lot a space 
                longitude=slice(-5, 10),
                latitude=slice(51, 42))
            subset.to_netcdf(f"{NDVI_DATA_URL}/{year}/{subset.attrs["id"]}") #save new nc file
    with open(f"{NDVI_DATA_URL}/{year}/{year}Exceptions.json", 'w') as json_file: #save exception file in case script break (internet loss...)
        json.dump(dict(exceptionsMap), json_file)
    

In [None]:
# Retry the files whose download failed during the async pass.
# They are fetched one by one because only a small number of files should remain.
with open(f"{NDVI_DATA_URL}/Exceptions.json", "r") as json_file:
    data = json.load(json_file)

for year, files in data.items():
    print(year)
    # Make sure the target directory exists even when this cell runs on its own.
    os.makedirs(f"{NDVI_DATA_URL}/{year}", exist_ok=True)
    for url in files:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        file_req = requests.get(url, timeout=60)
        file_req.raise_for_status()

        # The context manager closes the dataset once the subset is written.
        with xr.open_dataset(io.BytesIO(file_req.content)) as dataset:
            subset = dataset.sel(  # keep only the France bounding box to save a lot of space
                longitude=slice(-5, 10),
                latitude=slice(51, 42))
            out_name = subset.attrs["id"]  # the file's "id" attribute is used as output name
            subset.to_netcdf(f"{NDVI_DATA_URL}/{year}/{out_name}")  # save the cropped nc file

In [12]:
# Display the netCDF4 description of the NDVI variable (dtype, attributes, shape)
# for one stored 1981 file.
# NOTE(review): the download loop in this notebook starts at 2004, so this 1981
# file presumably comes from an earlier run — confirm.
# NOTE(review): the Dataset handle opened here is never closed explicitly.
Dataset(f"{NDVI_DATA_URL}/1981/AVHRR-Land_v005_AVH13C1_NOAA-07_19810711_c20170609200548.nc").variables["NDVI"]

<class 'netCDF4.Variable'>
int16 NDVI(time, latitude, longitude)
    _FillValue: -9999
    long_name: NOAA Climate Data Record of Normalized Difference Vegetation Index
    units: 1
    valid_range: [-1000 10000]
    grid_mapping: crs
    standard_name: normalized_difference_vegetation_index
    add_offset: 0.0
    scale_factor: 0.0001
unlimited dimensions: 
current shape = (1, 180, 300)
filling on

In [19]:
# Read the whole NDVI array of 1981/out.nc with [:] and display its shape.
# NOTE(review): out.nc is not written by any cell visible in this notebook —
# confirm where it comes from and why its shape differs from the file above.
Dataset(f"{NDVI_DATA_URL}/1981/out.nc").variables["NDVI"][:].shape

(1, 190, 295)