# preprocess essential climate variables

In [1]:
import os
import pandas as pd
import netCDF4 as nc
import xarray as xr

In [2]:
def select_bounding_box_region(path, years, ecv, lon_min, lon_max, lat_min, lat_max):
    
    data_final = pd.DataFrame(columns=['lat', 'lon', 'time', ecv])
    
    for year in years:
        files = os.listdir(path+year)
        
        for file in files:
            data = xr.open_dataset(path+year+'/'+file)
            data = data.to_dataframe().reset_index()
            data_filtered = data[(data.lon <= lon_max) & (data.lon >= lon_min) & (data.lat >= lat_min) & (data.lat <= lat_max)].dropna(subset=[ecv])[['lat', 'lon', 'time', ecv]]
            data_final = data_final.append(data_filtered)
            
    data_final = data_final.reset_index(drop=True)
    
    return data_final

In [3]:
years = ['2010',
         '2011',
         '2012',
         '2013',
         '2014',
         '2015']

In [4]:
lon_min, lon_max, lat_min, lat_max = 60, 100, 0, 40

## sea surface salinity

In [5]:
!bash download_sea_surface_salinity_data.sh

Download sea surface salinity data...
--2021-02-15 22:07:42--  ftp://anon-ftp.ceda.ac.uk/neodc/esacci/sea_surface_salinity/data/v02.31/30days/2010
           => ‘../data/sea_surface_salinity/2010/neodc/esacci/sea_surface_salinity/data/v02.31/30days/.listing’
Resolving anon-ftp.ceda.ac.uk (anon-ftp.ceda.ac.uk)... 130.246.130.100
Connecting to anon-ftp.ceda.ac.uk (anon-ftp.ceda.ac.uk)|130.246.130.100|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /neodc/esacci/sea_surface_salinity/data/v02.31/30days ... done.
==> PASV ... done.    ==> LIST ... done.

neodc/esacci/sea_su     [ <=>                ]     818  --.-KB/s    in 0,03s   

2021-02-15 22:07:43 (23,9 KB/s) - ‘../data/sea_surface_salinity/2010/neodc/esacci/sea_surface_salinity/data/v02.31/30days/.listing’ saved [818]

--2021-02-15 22:07:43--  ftp://anon-ftp.ceda.ac.uk/neodc/esacci/sea_surface_salinity/data/v02.31/30days/2010/2010
      

In [6]:
path = '../data/sea_surface_salinity/'

In [7]:
data = nc.Dataset(path+years[0]+'/'+sorted(os.listdir(path+years[0]))[0])

In [8]:
for var in data.variables.values():
    print(var)

<class 'netCDF4._netCDF4.Variable'>
int16 noutliers(time, lat, lon)
    long_name: Count of the Number of Outliers within this bin cell
    _FillValue: -1
    valid_max: 10000
    valid_min: 0
unlimited dimensions: time
current shape = (1, 584, 1388)
filling on
<class 'netCDF4._netCDF4.Variable'>
int16 total_nobs(time, lat, lon)
    long_name: Number of SSS in the time interval
    _FillValue: -1
    valid_max: 10000
    valid_min: 0
unlimited dimensions: time
current shape = (1, 584, 1388)
filling on
<class 'netCDF4._netCDF4.Variable'>
float32 sss_bias(time, lat, lon)
    _FillValue: nan
    long_name: Bias in Sea Surface Salinity
    valid_max: 100.0
    valid_min: -100.0
unlimited dimensions: time
current shape = (1, 584, 1388)
filling on
<class 'netCDF4._netCDF4.Variable'>
float32 sss_random_error(time, lat, lon)
    _FillValue: nan
    long_name: Sea Surface Salinity Random Error
    valid_max: 100.0
    valid_min: 0.0
unlimited dimensions: time
current shape = (1, 584, 1388)
fill

  


In [9]:
%%time

sss = select_bounding_box_region(path, years, 'sss', lon_min, lon_max, lat_min, lat_max)

CPU times: user 28.6 s, sys: 1.93 s, total: 30.5 s
Wall time: 29.3 s


In [10]:
sss.shape

(1817422, 4)

In [11]:
sss.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1817422 entries, 0 to 1817421
Data columns (total 4 columns):
 #   Column  Dtype         
---  ------  -----         
 0   lat     float64       
 1   lon     float64       
 2   time    datetime64[ns]
 3   sss     float32       
dtypes: datetime64[ns](1), float32(1), float64(2)
memory usage: 48.5 MB


In [12]:
sss.head()

Unnamed: 0,lat,lon,time,sss
0,0.098082,60.043228,2010-04-15,35.378178
1,0.098082,60.302593,2010-04-15,35.311237
2,0.098082,60.561958,2010-04-15,35.226299
3,0.098082,60.821327,2010-04-15,35.049088
4,0.098082,61.080692,2010-04-15,35.022762


In [13]:
sss['year'] = sss.time.apply(lambda x: x.year)

In [14]:
sss['month'] = sss.time.apply(lambda x: x.month)

In [15]:
sss = sss[['lat', 'lon', 'year', 'month', 'sss']].groupby(['lat', 'lon', 'year', 'month']).aggregate('mean').reset_index()

In [16]:
sss.shape

(908712, 5)

In [17]:
sss.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 908712 entries, 0 to 908711
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   lat     908712 non-null  float64
 1   lon     908712 non-null  float64
 2   year    908712 non-null  int64  
 3   month   908712 non-null  int64  
 4   sss     908712 non-null  float32
dtypes: float32(1), float64(2), int64(2)
memory usage: 31.2 MB


In [18]:
sss.head()

Unnamed: 0,lat,lon,year,month,sss
0,0.098082,60.043228,2010,1,35.265869
1,0.098082,60.043228,2010,2,35.191532
2,0.098082,60.043228,2010,3,35.426231
3,0.098082,60.043228,2010,4,35.37043
4,0.098082,60.043228,2010,5,35.31086


In [19]:
sss.to_pickle(path+'monthly_sss_2010_2015.pkl.gz', compression='gzip')

## chlorophyll-a concentration

In [20]:
!bash download_chlorophyll_a_concentration_data.sh

Download chlorophyll-a concentration data...
--2021-02-15 22:20:42--  ftp://anon-ftp.ceda.ac.uk/neodc/esacci/ocean_colour/data/v4.2-release/geographic/netcdf/chlor_a/monthly/v4.2/2010
           => ‘../data/chlorophyll_a_concentration/2010/neodc/esacci/ocean_colour/data/v4.2-release/geographic/netcdf/chlor_a/monthly/v4.2/.listing’
Resolving anon-ftp.ceda.ac.uk (anon-ftp.ceda.ac.uk)... 130.246.130.100
Connecting to anon-ftp.ceda.ac.uk (anon-ftp.ceda.ac.uk)|130.246.130.100|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /neodc/esacci/ocean_colour/data/v4.2-release/geographic/netcdf/chlor_a/monthly/v4.2 ... done.
==> PASV ... done.    ==> LIST ... done.

neodc/esacci/ocean_     [  <=>               ]   1,48K  5,84KB/s    in 0,3s    

2021-02-15 22:20:42 (5,84 KB/s) - ‘../data/chlorophyll_a_concentration/2010/neodc/esacci/ocean_colour/data/v4.2-release/geographic/netcdf/chlor_a/monthly/v4.2/.list

In [21]:
path = '../data/chlorophyll_a_concentration/'

In [22]:
data = nc.Dataset(path+years[0]+'/'+sorted(os.listdir(path+years[0]))[0])

In [23]:
for var in data.variables.values():
    print(var)

<class 'netCDF4._netCDF4.Variable'>
float32 MERIS_nobs_sum(time, lat, lon)
    _FillValue: 0.0
    long_name: Count of the number of observations from the MERIS sensor contributing to this bin cell
    number_of_files_composited: 31
unlimited dimensions: 
current shape = (1, 4320, 8640)
filling on
<class 'netCDF4._netCDF4.Variable'>
float32 MODISA_nobs_sum(time, lat, lon)
    _FillValue: 0.0
    long_name: Count of the number of observations from the MODIS (Aqua) sensor contributing to this bin cell
    number_of_files_composited: 31
unlimited dimensions: 
current shape = (1, 4320, 8640)
filling on
<class 'netCDF4._netCDF4.Variable'>
float32 SeaWiFS_nobs_sum(time, lat, lon)
    _FillValue: 0.0
    long_name: Count of the number of observations from the SeaWiFS (GAC and LAC) sensor contributing to this bin cell
    number_of_files_composited: 31
unlimited dimensions: 
current shape = (1, 4320, 8640)
filling on
<class 'netCDF4._netCDF4.Variable'>
float32 VIIRS_nobs_sum(time, lat, lon)
  

  


In [24]:
%%time

chlor_a = select_bounding_box_region(path, years, 'chlor_a', lon_min, lon_max, lat_min, lat_max)

CPU times: user 8min 15s, sys: 2min 40s, total: 10min 55s
Wall time: 10min 35s


In [25]:
chlor_a.shape

(26001084, 4)

In [26]:
chlor_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26001084 entries, 0 to 26001083
Data columns (total 4 columns):
 #   Column   Dtype         
---  ------   -----         
 0   lat      float64       
 1   lon      float64       
 2   time     datetime64[ns]
 3   chlor_a  float32       
dtypes: datetime64[ns](1), float32(1), float64(2)
memory usage: 694.3 MB


In [27]:
chlor_a.head()

Unnamed: 0,lat,lon,time,chlor_a
0,39.895832,64.770836,2010-04-01,8.540924
1,39.895832,64.8125,2010-04-01,8.540924
2,39.895832,64.854164,2010-04-01,9.433975
3,39.854168,64.729164,2010-04-01,8.993294
4,39.854168,64.770836,2010-04-01,8.993294


In [28]:
chlor_a['year'] = chlor_a.time.apply(lambda x: x.year)

In [29]:
chlor_a['month'] = chlor_a.time.apply(lambda x: x.month)

In [30]:
chlor_a.drop('time', axis=1, inplace=True)

In [31]:
chlor_a.head()

Unnamed: 0,lat,lon,chlor_a,year,month
0,39.895832,64.770836,8.540924,2010,4
1,39.895832,64.8125,8.540924,2010,4
2,39.895832,64.854164,9.433975,2010,4
3,39.854168,64.729164,8.993294,2010,4
4,39.854168,64.770836,8.993294,2010,4


In [32]:
chlor_a.to_pickle(path+'monthly_chlor_a_2010_2015.pkl.gz', compression='gzip')