# Data Preparation
***
**Author:** [Andrés Piñango](https://github.com/andresawa/)  
Laboratório de Estudos dos Oceanos e Clima – LEOC, Instituto de Oceanografia, Universidade Federal do Rio Grande.  
email: andreseloy@furg.br  
**Last change:** 28/08/2021
***

This notebook contains the code applied for the generation of the datafiles `north_data.nc` and `equator_data.nc`, used in the practical exercise `atividade_pratica.ipynb`. The data variables in those files are:
* `fco2`: CO₂ fugacity in seawater, taken from the SOCAT version 2021 gridded product.
* `temperature`: Sea surface temperature (SST), taken from the SOCAT version 2021 gridded product.
* `salinity`: Salinity, taken from the SOCAT version 2021 gridded product.
* `wind_speed`: 10 meter wind speed, taken from the ERA5 dataset.
* `pco2_atm`: Global monthly mean partial pressure of CO₂ (pCO₂), taken from the Global Monitoring Laboratory - NOAA

More information can be found on the site of each product:
* [SOCAT 2021](http://socat.info)
* [ERA5](https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels-monthly-means)
* [GML NOAA](https://gml.noaa.gov/ccgg/trends/global.html)

***
## Import the libraries

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import wget
import tempfile
import cdsapi
from zipfile import ZipFile

## Download the compressed SOCAT data and extract it to a temporary directory
SOCAT gridded data is available as compressed (35 MB) and uncompressed (3.5 GB) files. Downloading the compressed file and extracting it locally is faster than downloading the uncompressed data.

In [None]:
# Work inside a temporary directory so the large intermediate files are not
# kept on disk once the notebook has finished with them.
temp_dir = tempfile.TemporaryDirectory()
temp_file = "/".join([temp_dir.name, "socat_compressed.zip"])
wget.download("https://www.socat.info/socat_files/v2021/SOCATv2021_tracks_gridded_monthly.nc.zip", temp_file)

# The context manager closes the archive even if the extraction fails.
with ZipFile(temp_file, 'r') as zf:
    zf.extractall(temp_dir.name)

# Expected path of the extracted NetCDF file inside the temporary directory.
socat_file = "/".join([temp_dir.name, "SOCATv2021_tracks_gridded_monthly.nc"])

## Download the ERA5 wind speed data to the temporary directory

In [None]:
# Both regions use the same ERA5 request (product, variable and period);
# only the bounding box and the output file differ.
ERA5_DATASET = 'reanalysis-era5-single-levels-monthly-means'
ERA5_YEARS = [str(year) for year in range(2000, 2021)]      # '2000' ... '2020'
ERA5_MONTHS = [f"{month:02d}" for month in range(1, 13)]    # '01' ... '12'


def _download_era5_wind(client, area, target):
    """Download the ERA5 monthly mean 10 m wind speed for one region.

    Parameters
    ----------
    client : cdsapi.Client
        Client used to send the request.
    area : list
        Bounding box in degrees, ordered as [north, west, south, east].
    target : str
        Path of the NetCDF file where the result is saved.
    """
    client.retrieve(
        ERA5_DATASET,
        {
            'product_type': 'monthly_averaged_reanalysis',
            'variable': '10m_wind_speed',
            'year': ERA5_YEARS,
            'month': ERA5_MONTHS,
            'time': '00:00',
            'area': area,
            'format': 'netcdf',
        },
        target)


c = cdsapi.Client()

# Download the data for the north atlantic (40°N - 60°N)
north_file = "/".join([temp_dir.name, "wind_north.nc"])
_download_era5_wind(c, [60, -60, 40, 0], north_file)

# Download the data for the tropical atlantic (10°S - 10°N)
d = c  # one client serves both requests; the name `d` is kept for any later cell that uses it
equator_file = "/".join([temp_dir.name, "wind_equator.nc"])
_download_era5_wind(d, [10, -60, -10, 0], equator_file)

## Import the data

In [None]:
# Open the gridded products stored in the temporary directory.
socat = xr.open_dataset(socat_file)
wind_north = xr.open_dataset(north_file)
wind_equator = xr.open_dataset(equator_file)

# Read the whitespace-separated NOAA table of global monthly mean CO2.
# The separator is a raw string because "\s" is an invalid escape sequence
# in a regular string literal.
# NOTE(review): skiprows=58 is tied to the length of the file header when the
# notebook was written - confirm it still matches the current NOAA file.
atmospheric_pco2 = pd.read_csv(
    "https://gml.noaa.gov/webdata/ccgg/trends/co2/co2_mm_gl.txt",
    sep=r"\s+",
    names=["year", "month", "decimal", "average", "trend"],
    skiprows=58,
)

# NOTE(review): xr.open_dataset loads the data lazily, so the datasets above
# still point to files inside the directory removed here - confirm that this
# is safe on the platform where the notebook runs.
temp_dir.cleanup() # Close and erase the temporary directory

## Drop unwanted variables in the SOCAT data and select the studied regions

In [None]:
# SOCAT variables that are dropped from the regional subsets.
unwanted_vars = [
    "tmnth_bnds", "count_ncruise",
    "fco2_count_nobs", "fco2_ave_unwtd", "fco2_min_unwtd",
    "fco2_max_unwtd", "fco2_std_weighted", "fco2_std_unwtd",
    "sst_count_nobs", "sst_ave_unwtd", "sst_min_unwtd",
    "sst_max_unwtd", "sst_std_weighted", "sst_std_unwtd",
    "salinity_count_nobs", "salinity_ave_unwtd", "salinity_min_unwtd",
    "salinity_max_unwtd", "salinity_std_weighted", "salinity_std_unwtd",
    "lat_offset_unwtd", "lon_offset_unwtd",
]

# Mapping from the SOCAT dimension names to the names used in the wind data.
new_names_dims = dict(xlon="longitude", ylat="latitude", tmnth="time")

# Same mapping for the coordinate variables, plus shorter data variable names.
new_names_vars = {
    **new_names_dims,
    "fco2_ave_weighted": "fco2",
    "sst_ave_weighted": "temperature",
    "salinity_ave_weighted": "salinity",
}


def _select_socat_region(dataset, south, north):
    """Return the 2000-2020 SOCAT subset between two latitudes (60°W - 0°).

    The unwanted variables are dropped, dimensions and variables are renamed,
    the time coordinate is replaced by the one of the ERA5 data and the
    latitude is sorted in descending order. Meant to be called in this cell
    only, since it reads the name mappings defined above.
    """
    region = dataset.copy()
    region = region.sel(xlon=slice(-60, 0),
                        ylat=slice(south, north),
                        tmnth=slice("2000", "2020"))
    region = region.drop_vars(unwanted_vars)
    region = region.rename_dims(new_names_dims)
    region = region.rename_vars(new_names_vars)
    # Use the ERA5 time stamps, presumably so both products share one time axis.
    region = region.assign_coords(time=wind_equator.time.values)
    # North-to-south order, as used later when slicing the ERA5 latitude.
    return region.sortby("latitude", ascending=False)


# North Atlantic (40°N - 60°N) and tropical Atlantic (10°S - 10°N)
socat_north = _select_socat_region(socat, 40, 60)
socat_equator = _select_socat_region(socat, -10, 10)

## Downsample the ERA5 (wind) data to match the SOCAT resolution


In [None]:
# ERA5 short name of the 10 m wind speed -> name used in the output files.
new_names_vars = {"si10" : "wind_speed"}


def _match_socat_grid(wind, north, south, target):
    """Average the ERA5 wind in blocks of 4 cells and label it with the SOCAT grid.

    `wind` is cut to 59.5°W - 0.5°W and to the latitudes from `north` down to
    `south`, the wind variable is renamed, and the result is averaged first
    along the longitude and then along the latitude. The coordinates of
    `target` (the SOCAT subset of the same region) replace the averaged ones.
    """
    trimmed = wind.sel(longitude=slice(-59.5, -0.5), latitude=slice(north, south))
    renamed = trimmed.rename_vars(new_names_vars)
    # boundary="pad" keeps the last, incomplete block instead of raising.
    by_longitude = renamed.coarsen(longitude=4, boundary="pad").mean()
    by_both = by_longitude.coarsen(latitude=4, boundary="pad").mean()
    return by_both.assign_coords(longitude=target.longitude.values,
                                 latitude=target.latitude.values)


wind_north = _match_socat_grid(wind_north, 59.5, 40.5, socat_north)
wind_equator = _match_socat_grid(wind_equator, 9.5, -9.5, socat_equator)

## Transform the atmospheric CO₂ data into an xarray.DataArray

In [None]:
# The unnamed pandas index becomes a dimension called "index" in xarray.
new_name_coor = {"index" : "time"}

# Keep only the monthly means of the years 2000 to 2020.
years = atmospheric_pco2["year"]
in_period = (years >= 2000) & (years < 2021)
pco2_atm = pd.Series(atmospheric_pco2.loc[in_period]["average"])

# Convert to a DataArray and label it with the time stamps of the ERA5 data.
pco2_atm = xr.DataArray.from_series(pco2_atm, sparse=False)
pco2_atm = pco2_atm.rename(new_name_coor)
pco2_atm = pco2_atm.assign_coords(time=wind_equator.time.values)
pco2_atm.name = "pco2_atm"

## Merge all the data in a file for the study section located in the North Atlantic (40°N - 60°N)

In [None]:
# Join the SOCAT, wind and atmospheric CO2 data of the northern section
# and save the result as a NetCDF file.
north_zone = socat_north.merge(wind_north)
north_zone = north_zone.merge(pco2_atm)
north_zone.to_netcdf(path="~/fluxos-co2/north_data.nc")

## Merge all the data in a file for the study section located in the tropical Atlantic (10°S - 10°N)

In [None]:
# Join the SOCAT, wind and atmospheric CO2 data of the tropical section
# and save the result as a NetCDF file.
equator_zone = socat_equator.merge(wind_equator)
equator_zone = equator_zone.merge(pco2_atm)
equator_zone.to_netcdf(path="~/fluxos-co2/equator_data.nc")