# Download matchup data from OC-CCI

**Last updated: 29/04/2024**

This script downloads L3 matchup data from the European Space Agency's [**Ocean Colour Climate Change Initiative (OC-CCI)**](https://www.oceancolour.org). It uses OPeNDAP to download data from OC-CCI.

This script is configured to download chlorophyll matchup data from two OC-CCI products for an **HPLC dataset** for the North Sea provided by CEFAS: 
* **daily data at 1 km resolution**
* **daily data at 4 km resolution**

**You can use this script as a template and modify the code sections below to customise it for your own area of study and datasets.**

## Import libraries, functions and define download directory

In [1]:
import os
from pathlib import Path
import pandas as pd
from pandas import DataFrame
import numpy as np
import xarray as xr
from datetime import datetime, timedelta

In [2]:
def sort_dimension(dataset, dim_name):
    """
    Return the dataset with the coordinate of `dim_name` in ascending order.

    The coordinate values of the specified dimension are inspected and, if any
    value is smaller than the one before it (descending or otherwise unsorted
    axis), the dataset is sorted along that dimension. A dataset whose
    coordinate is already ascending is returned unchanged.

    Parameters
    ----------
    dataset : object providing ``dataset[dim_name].values`` and
        ``dataset.sortby(dim_name, ascending=True)`` (e.g. an xarray Dataset).
    dim_name : name of the dimension/coordinate to check.

    Returns
    -------
    The input dataset itself when no sorting is needed, otherwise the result
    of ``dataset.sortby(dim_name, ascending=True)``.
    """
    # Get the coordinate values for the specified dimension.
    coords = dataset[dim_name].values

    # Any adjacent pair that decreases means the axis is not ascending.
    # Comparing every value only against the first element would miss axes
    # that are unsorted but do not start with their maximum.
    if (coords[1:] < coords[:-1]).any():
        dataset = dataset.sortby(dim_name, ascending=True)

    return dataset

**Modify the parameter values in the following cell based on your needs.**

In [3]:
# Build the output directory path and create it if it does not exist yet
PATH_ROOT_DIR = Path.cwd().resolve().parents[1]  # absolute path two levels above the working directory
NAME_DOWNLOAD_DIR_HPLC_MATCHUPS = 'data_matchups_HPLC_OCCCI_csv'
full_path_download_dir_hplc = str(
    PATH_ROOT_DIR / "data" / "raw" / "OCCCI_data" / NAME_DOWNLOAD_DIR_HPLC_MATCHUPS
)
# Kept as a string because later cells interpolate it into file names
Path(full_path_download_dir_hplc).mkdir(parents=True, exist_ok=True)

## Read our in situ HPLC observations

**Modify as needed.**

In [4]:
# Load the in situ HPLC table; the csv was created by the Matlab function prepareHPLCdata.m
NAME_HPLC_DATA_FILE = 'cefasHPLCfiltered.csv'
full_path_hplc_data_dir = str(
    Path(PATH_ROOT_DIR) / 'data' / 'processed' / NAME_HPLC_DATA_FILE
)
# The file is comma-separated, which is the pandas default separator
matchup_hplc_locations_list = pd.read_csv(full_path_hplc_data_dir)
matchup_hplc_locations_list  # display the table in the notebook

Unnamed: 0,idd,Survey_name,Station_number,Prime_number,DateTime,Latitude,Longitude,Smartbuoy,Sample_depth,TP_ug_L,...,Lut_ug_L,Myxo_ug_L,Croc_ug_L,x19_Keto_Hex_fuco_ug_L,Hexkfuco_ug_L,HexkfucoL_ug_L,x4keto_hex_ug_L,x4keto_hexL_ug_L,bathymetry_m,season
0,625,CEND19_17,230.0,102.0,28-Oct-2017 01:18:00,48.350783,-5.750117,,4,0.701378,...,0.000000,0.000000,0.00000,,,,0.000000,,-116.982548,Autumn
1,671,CEND17_18,169.0,102.0,25-Oct-2018 03:42:00,48.366280,-5.725660,,6,0.796561,...,0.000000,,0.00000,,,,0.004086,,-116.179468,Autumn
2,672,CEND17_18,176.0,106.0,25-Oct-2018 20:40:00,48.547920,-4.928820,,6,0.628834,...,0.000662,,0.00000,,,,0.002824,,-74.689037,Autumn
3,626,CEND19_17,231.0,106.0,28-Oct-2017 05:48:00,48.552300,-4.915950,,4,0.858928,...,0.000000,0.000000,0.00000,,,,0.000000,,-77.795798,Autumn
4,55,CEND09_11,44.0,,22-May-2011 15:50:00,48.778933,-4.390117,,5,1.906309,...,0.004340,0.003500,0.00701,,,,,,-88.030648,Spring
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669,499,CEND15_13,191.0,72.0,29-Aug-2013 12:40:00,61.232000,-0.401000,,4,0.620383,...,0.001790,0.000000,,0.00000,0.0,0.0,,,-164.470400,Summer
670,166,CEND13_10,160.0,74.0,01-Sep-2010 04:06:00,61.251333,1.393667,,3,6.715475,...,0.024246,0.013840,,0.11529,,,,,-148.207900,Summer
671,445,CEND18_15,125.0,73.0,30-Aug-2015 14:47:00,61.285900,0.488133,,4,2.892239,...,0.014518,0.000000,0.00000,,,,0.016483,,-167.815296,Summer
672,533,CEND18_16,120.0,73.0,29-Aug-2016 09:35:00,61.288283,0.500183,,4,2.378788,...,0.001587,0.006305,0.00000,,,,0.000000,,-168.083801,Summer


In [5]:
# Select the columns of the HPLC table that the matchups need
LIST_OBS_CHLA = matchup_hplc_locations_list.loc[:, "TChlA_ug_L"]
LIST_OBS_LON = matchup_hplc_locations_list.loc[:, "Longitude"]
LIST_OBS_LAT = matchup_hplc_locations_list.loc[:, "Latitude"]

# Parse the sampling timestamps (strings such as '28-Oct-2017 01:18:00') into datetimes
LIST_OBS_DATETIME = pd.to_datetime(matchup_hplc_locations_list.loc[:, "DateTime"])

## Download matchup satellite observations

### Define the OPeNDAP URL

In [6]:
# OPeNDAP request parameters, one entry per OC-CCI product
# See https://www.oceancolour.org/thredds/dodsC/CCI_ALL-v6.0-1km-DAILY.html

# Dataset identifiers on the THREDDS server
LIST_DATASET_IDS = [
    "CCI_ALL-v6.0-1km-DAILY",  # 1 km spatial resolution
    "CCI_ALL-v6.0-DAILY",      # 4 km spatial resolution
]

# Labels used in output file and column names
LIST_OUTPUT_NAMES = ["1km_1day", "4km_1day"]

# Index ranges [start:stride:stop] requested along each axis of each product
LIST_SAT_LAT = ["[0:1:17279]", "[0:1:4319]"]
LIST_SAT_LON = ["[0:1:34559]", "[0:1:8639]"]
LIST_SAT_TIME = ["[0:1:9531]", "[0:1:9496]"]

# URL layout: dataset, then the requested variables with their index ranges.
# chlor_a takes its three ranges in the order time, lat, lon.
OPENDAP_URL_TEMPLATE = (
    'https://www.oceancolour.org/thredds/dodsC/{dataset_id}'
    '?lat{lat},'
    'lon{lon},'
    'chlor_a{time}{lat}{lon},'
    'time{time}'
)

# Fill in the template for every product and keep the resulting links
list_of_urls = []

for dataset_id, lat, lon, time in zip(LIST_DATASET_IDS, LIST_SAT_LAT, LIST_SAT_LON, LIST_SAT_TIME):
    url = OPENDAP_URL_TEMPLATE.format(dataset_id=dataset_id, lat=lat, lon=lon, time=time)
    list_of_urls.append(url)
    print(url)

https://www.oceancolour.org/thredds/dodsC/CCI_ALL-v6.0-1km-DAILY?lat[0:1:17279],lon[0:1:34559],chlor_a[0:1:9531][0:1:17279][0:1:34559],time[0:1:9531]
https://www.oceancolour.org/thredds/dodsC/CCI_ALL-v6.0-DAILY?lat[0:1:4319],lon[0:1:8639],chlor_a[0:1:9496][0:1:4319][0:1:8639],time[0:1:9496]


### Examine one of the links created above

In [7]:
# Open the first link (the 1 km product) from the OPeNDAP server for inspection
ds = xr.open_dataset(list_of_urls[0])

# Put the spatial axes in ascending order
# (latitude in OC-CCI products is inverted, so it needs sorting)
for axis_name in ('lat', 'lon'):
    ds = sort_dimension(ds, axis_name)
ds

### Download matchups

In [8]:
%%time

# For every satellite product, look up the chlorophyll value closest to each
# in situ observation and save the result as one csv file per product.

for url, output_name in zip(list_of_urls, LIST_OUTPUT_NAMES):

    # Open the remote dataset in a context manager so that it is closed once
    # all values for this product have been read, instead of being left open.
    with xr.open_dataset(url) as remote_ds:
        ds = sort_dimension(remote_ds, 'lat')
        ds = sort_dimension(ds, 'lon')

        list_sat_chla = []

        # Iterate over the values themselves rather than LIST_OBS_LAT[i]:
        # integer indexing of a Series is label-based and would only work
        # while the observations carry a default 0..N-1 index.
        for obs_lat, obs_lon, obs_time in zip(LIST_OBS_LAT, LIST_OBS_LON, LIST_OBS_DATETIME):

            # Extract the desired variable and store it.
            # NOTE(review): method='nearest' is used without a tolerance, so an
            # observation is always matched to the closest available pixel and
            # time step, however far away - confirm this is the intended
            # matchup criterion.
            subset = ds.sel(lat=obs_lat, lon=obs_lon, time=obs_time, method='nearest')
            # Read the value while the dataset is still open
            list_sat_chla.append(subset['chlor_a'].values)

    # Create a DataFrame with extracted satellite data
    data = {
        'latitude': LIST_OBS_LAT,
        'longitude': LIST_OBS_LON,
        'time': LIST_OBS_DATETIME,
        'chla': list_sat_chla
    }

    df = pd.DataFrame(data)

    # Save the DataFrame to a csv file (overwrites any previous download)
    df.to_csv(f"{full_path_download_dir_hplc}/occci_{output_name}_matchups.csv", mode='w', index=False)

CPU times: user 5.77 s, sys: 681 ms, total: 6.45 s
Wall time: 10min 32s


### Create a matchup table for the data

In [9]:
# Start the combined table from the observation coordinates and times
df_coords_with_data = pd.DataFrame({
    'latitude': LIST_OBS_LAT,
    'longitude': LIST_OBS_LON,
    'time': LIST_OBS_DATETIME,
})

# Add one satellite chlorophyll column per product, read back from its saved csv
for output_name in LIST_OUTPUT_NAMES:
    column_label = f"occci_{output_name}"
    matchup_csv = f"{full_path_download_dir_hplc}/{column_label}_matchups.csv"
    if not os.path.exists(matchup_csv):
        continue  # no file saved for this product: its column is left out
    df_coords_with_data[column_label] = pd.read_csv(matchup_csv)["chla"]

# Write the combined table to disk and display it
df_coords_with_data.to_csv(f"{full_path_download_dir_hplc}/occci_hplc_matchups.csv", index=False)
df_coords_with_data

Unnamed: 0,latitude,longitude,time,occci_1km_1day,occci_4km_1day
0,48.350783,-5.750117,2017-10-28 01:18:00,,
1,48.366280,-5.725660,2018-10-25 03:42:00,,
2,48.547920,-4.928820,2018-10-25 20:40:00,,
3,48.552300,-4.915950,2017-10-28 05:48:00,,
4,48.778933,-4.390117,2011-05-22 15:50:00,0.697642,0.685077
...,...,...,...,...,...
669,61.232000,-0.401000,2013-08-29 12:40:00,,
670,61.251333,1.393667,2010-09-01 04:06:00,,
671,61.285900,0.488133,2015-08-30 14:47:00,,
672,61.288283,0.500183,2016-08-29 09:35:00,,
