# preprocess essential climate variables

In [1]:
import os
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
from pyproj import CRS
import gc

In [2]:
# Study period: 2010 through 2018 inclusive. Built with the built-in range so the
# list holds plain Python ints rather than NumPy scalars (which display as
# np.int64(...) under NumPy 2 and are not needed for str(year) / path building).
years = list(range(2010, 2019))
years

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]

In [3]:
# Bounding box used to crop every dataset; the values are compared against the
# datasets' 'lon' and 'lat' coordinates when the masks are built.
lon_min, lon_max = 60, 100
lat_min, lat_max = 0, 40

In [4]:
def preprocess_ecv(file, ecv, mask_lon, mask_lat):
    """
    Preprocess one file of a given essential climate variable (ecv).

    The file is opened with xarray, reduced to the 'time', 'lat', 'lon' and
    `ecv` variables, cropped to the bounding box described by the two masks,
    flattened into a dataframe, and stripped of rows where `ecv` is missing.
    The 'time' column is then replaced by integer 'year' and 'month' columns.

    Parameters
    ----------
    file : str
        Path of the dataset file to open with ``xr.open_dataset``.
    ecv : str
        Name of the data variable to extract (e.g. 'sss', 'chlor_a', 'lst').
    mask_lon, mask_lat : boolean masks
        Longitude and latitude masks; their conjunction is passed to
        ``Dataset.where(..., drop=True)`` to crop the data.

    Returns
    -------
    pandas.DataFrame
        Columns 'lat', 'lon', `ecv`, 'year', 'month'; one row per grid cell
        and time step with a non-missing `ecv` value.
    """

    # Open the dataset in a context manager so the underlying file handle is
    # released as soon as the data has been materialised; this function is
    # called once per input file, so unclosed handles would accumulate.
    with xr.open_dataset(file) as all_data:
        preprocessed_data = (
            all_data[['time', 'lat', 'lon', ecv]]
            .where(mask_lon & mask_lat, drop=True)
            .to_dataframe()  # loads the cropped values into memory
            .dropna(subset=[ecv])
            .reset_index()
        )

    timestamps = preprocessed_data['time']
    if pd.api.types.is_datetime64_any_dtype(timestamps):
        # Vectorised extraction instead of a Python-level call per row.
        year, month = timestamps.dt.year, timestamps.dt.month
    else:
        # Fallback for time values that are not datetime64 (any object exposing
        # .year / .month), mirroring the element-wise behaviour.
        year = timestamps.map(lambda stamp: stamp.year)
        month = timestamps.map(lambda stamp: stamp.month)

    # Cast explicitly so the column dtype does not depend on the pandas version.
    preprocessed_data['year'] = year.astype('int64')
    preprocessed_data['month'] = month.astype('int64')

    return preprocessed_data.drop('time', axis=1)

## sea surface salinity

In [5]:
#!bash download_sea_surface_salinity_data.sh

In [6]:
path = '../data/sea_surface_salinity'

In [7]:
# Pick the alphabetically first file of the first year as a sample; it is opened
# below to inspect the dataset and to build the lon/lat masks.
file = os.path.join(path, str(years[0]), sorted(os.listdir(os.path.join(path, str(years[0]))))[0])

In [8]:
data = xr.open_dataset(file)

In [9]:
data

In [10]:
data.info()

xarray.Dataset {
dimensions:
	lat = 584 ;
	lon = 1388 ;
	time = 1 ;

variables:
	float32 noutliers(time, lat, lon) ;
		noutliers:long_name = Count of the Number of Outliers within this bin cell ;
		noutliers:valid_max = 10000 ;
		noutliers:valid_min = 0 ;
	float32 total_nobs(time, lat, lon) ;
		total_nobs:long_name = Number of SSS in the time interval ;
		total_nobs:valid_max = 10000 ;
		total_nobs:valid_min = 0 ;
	float32 sss_bias(time, lat, lon) ;
		sss_bias:long_name = Bias in Sea Surface Salinity ;
		sss_bias:valid_max = 100.0 ;
		sss_bias:valid_min = -100.0 ;
	float32 sss_random_error(time, lat, lon) ;
		sss_random_error:long_name = Sea Surface Salinity Random Error ;
		sss_random_error:valid_max = 100.0 ;
		sss_random_error:valid_min = 0.0 ;
	float32 sss_bias_std(time, lat, lon) ;
		sss_bias_std:long_name = Standard Deviation of the Bias in Sea Surface Salinity ;
		sss_bias_std:valid_max = 100.0 ;
		sss_bias_std:valid_min = 0.0 ;
	float32 pct_var(time, lat, lon) ;
		pct_var:long_

In [11]:
# Boolean masks over the sample dataset's lon/lat coordinates that select the
# bounding box (bounds inclusive); handed to preprocess_ecv for every file.
mask_lon = (lon_min <= data['lon']) & (data['lon'] <= lon_max)
mask_lat = (lat_min <= data['lat']) & (data['lat'] <= lat_max)

In [12]:
%%time

# For every year: preprocess each file, average per grid cell and month, and
# write the result as a point shapefile next to the yearly folders.
for year in years:
    print('Processing {}...'.format(year))
    year_dir = os.path.join(path, str(year))
    # Sorted so the files are processed in a reproducible order.
    files = sorted(os.listdir(year_dir))

    # Collect the per-file frames and concatenate once. DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated data on every call.
    sss_frames = []
    for file in files:
        sss_temp = preprocess_ecv(os.path.join(year_dir, file), 'sss', mask_lon, mask_lat)
        sss_frames.append(sss_temp)

    if not sss_frames:
        # pd.concat raises on an empty list; there is nothing to write anyway.
        print('No files found for {}, skipping.'.format(year))
        continue

    sss = pd.concat(sss_frames, ignore_index=True)
    del sss_frames  # release the per-file copies before aggregating

    # Several files may fall into the same month; average them per grid cell.
    sss = sss.groupby(['lat', 'lon', 'year', 'month']).aggregate('mean').reset_index()

    # Turn lon/lat into point geometries in WGS 84 (EPSG:4326).
    sss = gpd.GeoDataFrame(
        sss,
        geometry=gpd.points_from_xy(sss.lon, sss.lat),
        crs=CRS.from_epsg(4326),
    )

    # The coordinates now live in the geometry column.
    sss = sss.drop(['lat', 'lon'], axis=1)

    sss.to_file(os.path.join(path, 'monthly_sss_'+str(year)+'.shp'))

Processing 2010...
Processing 2011...
Processing 2012...
Processing 2013...
Processing 2014...
Processing 2015...
Processing 2016...
Processing 2017...
Processing 2018...
CPU times: user 4min 9s, sys: 9.85 s, total: 4min 18s
Wall time: 4min 20s


In [13]:
sss.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [14]:
sss.shape

(151452, 4)

In [15]:
sss.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 151452 entries, 0 to 151451
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   year      151452 non-null  int64   
 1   month     151452 non-null  int64   
 2   sss       151452 non-null  float32 
 3   geometry  151452 non-null  geometry
dtypes: float32(1), geometry(1), int64(2)
memory usage: 4.0 MB


In [16]:
sss.head()

Unnamed: 0,year,month,sss,geometry
0,2018,1,35.54454,POINT (60.04323 0.09808)
1,2018,2,35.47496,POINT (60.04323 0.09808)
2,2018,3,35.657524,POINT (60.04323 0.09808)
3,2018,4,35.48687,POINT (60.04323 0.09808)
4,2018,5,35.285263,POINT (60.04323 0.09808)


In [17]:
# Drop the references to the last year's frames and force a garbage collection
# to free memory before the next dataset is processed.
del [sss, sss_temp]
gc.collect()

15

## chlorophyll-a concentration

In [18]:
#!bash download_chlorophyll_a_concentration_data.sh

In [19]:
path = '../data/chlorophyll_a_concentration'

In [20]:
# Pick the alphabetically first file of the first year as a sample; it is opened
# below to inspect the dataset and to build the lon/lat masks.
file = os.path.join(path, str(years[0]), sorted(os.listdir(os.path.join(path, str(years[0]))))[0])

In [21]:
data = xr.open_dataset(file)

In [22]:
data

In [23]:
data.info()

xarray.Dataset {
dimensions:
	lat = 4320 ;
	lon = 8640 ;
	time = 1 ;

variables:
	float32 MERIS_nobs_sum(time, lat, lon) ;
		MERIS_nobs_sum:long_name = Count of the number of observations from the MERIS sensor contributing to this bin cell ;
		MERIS_nobs_sum:number_of_files_composited = 31 ;
	float32 MODISA_nobs_sum(time, lat, lon) ;
		MODISA_nobs_sum:long_name = Count of the number of observations from the MODIS (Aqua) sensor contributing to this bin cell ;
		MODISA_nobs_sum:number_of_files_composited = 31 ;
	float32 SeaWiFS_nobs_sum(time, lat, lon) ;
		SeaWiFS_nobs_sum:long_name = Count of the number of observations from the SeaWiFS (GAC and LAC) sensor contributing to this bin cell ;
		SeaWiFS_nobs_sum:number_of_files_composited = 31 ;
	float32 VIIRS_nobs_sum(time, lat, lon) ;
		VIIRS_nobs_sum:long_name = Count of the number of observations from the VIIRS sensor contributing to this bin cell ;
		VIIRS_nobs_sum:number_of_files_composited = 31 ;
	float32 chlor_a(time, lat, lon) ;
		ch

In [24]:
# Boolean masks over the sample dataset's lon/lat coordinates that select the
# bounding box (bounds inclusive); handed to preprocess_ecv for every file.
mask_lon = (lon_min <= data['lon']) & (data['lon'] <= lon_max)
mask_lat = (lat_min <= data['lat']) & (data['lat'] <= lat_max)

In [25]:
# NOTE(review): this initialisation looks redundant - the yearly loop in the next
# cell re-creates `chlor_a` at the start of every iteration. Consider removing.
chlor_a = pd.DataFrame(columns=['lat', 'lon', 'chlor_a', 'year', 'month'])

In [26]:
%%time

# For every year: preprocess each file, average per grid cell and month,
# log-transform the concentration, and write the result as a point shapefile.
for year in years:
    print('Processing {}...'.format(year))
    year_dir = os.path.join(path, str(year))
    # Sorted so the files are processed in a reproducible order.
    files = sorted(os.listdir(year_dir))

    # Collect the per-file frames and concatenate once. DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated data on every call.
    chlor_a_frames = []
    for file in files:
        chlor_a_temp = preprocess_ecv(os.path.join(year_dir, file), 'chlor_a', mask_lon, mask_lat)
        chlor_a_frames.append(chlor_a_temp)

    if not chlor_a_frames:
        # pd.concat raises on an empty list; there is nothing to write anyway.
        print('No files found for {}, skipping.'.format(year))
        continue

    chlor_a = pd.concat(chlor_a_frames, ignore_index=True)
    del chlor_a_frames  # release the per-file copies before aggregating

    # Several files may fall into the same month; average them per grid cell.
    chlor_a = chlor_a.groupby(['lat', 'lon', 'year', 'month']).aggregate('mean').reset_index()

    # From here on the 'chlor_a' column holds the natural log of the monthly mean.
    chlor_a['chlor_a'] = np.log(chlor_a['chlor_a'])

    # Turn lon/lat into point geometries in WGS 84 (EPSG:4326).
    chlor_a = gpd.GeoDataFrame(
        chlor_a,
        geometry=gpd.points_from_xy(chlor_a.lon, chlor_a.lat),
        crs=CRS.from_epsg(4326),
    )

    # The coordinates now live in the geometry column.
    chlor_a = chlor_a.drop(['lat', 'lon'], axis=1)

    chlor_a.to_file(os.path.join(path, 'monthly_chlor_a_'+str(year)+'.shp'))

Processing 2010...
Processing 2011...
Processing 2012...
Processing 2013...
Processing 2014...
Processing 2015...
Processing 2016...
Processing 2017...
Processing 2018...
CPU times: user 1h 42min 37s, sys: 4min 27s, total: 1h 47min 4s
Wall time: 1h 47min 5s


In [27]:
chlor_a.shape

(4067400, 4)

In [28]:
chlor_a.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 4067400 entries, 0 to 4067399
Data columns (total 4 columns):
 #   Column    Dtype   
---  ------    -----   
 0   year      int64   
 1   month     int64   
 2   chlor_a   float32 
 3   geometry  geometry
dtypes: float32(1), geometry(1), int64(2)
memory usage: 108.6 MB


In [29]:
chlor_a.head()

Unnamed: 0,year,month,chlor_a,geometry
0,2018,1,-1.772296,POINT (60.02083 0.02083)
1,2018,2,-2.034966,POINT (60.02083 0.02083)
2,2018,3,-1.320624,POINT (60.02083 0.02083)
3,2018,4,-2.163096,POINT (60.02083 0.02083)
4,2018,5,-2.307471,POINT (60.02083 0.02083)


In [30]:
# Drop the references to the last year's frames and force a garbage collection
# to free memory before the next dataset is processed.
del [chlor_a, chlor_a_temp]
gc.collect()

60

## land surface temperature

In [31]:
#!bash download_land_surface_temperature_data.sh

In [32]:
path = '../data/land_surface_temperature'

In [33]:
# Pick the alphabetically first file of the first year as a sample; it is opened
# below to inspect the dataset and to build the lon/lat masks.
file = os.path.join(path, str(years[0]), sorted(os.listdir(os.path.join(path, str(years[0]))))[0])

In [34]:
data = xr.open_dataset(file)

In [35]:
data

In [36]:
data.info()

xarray.Dataset {
dimensions:
	channel = 2 ;
	lat = 3600 ;
	length_scale = 1 ;
	lon = 7200 ;
	time = 1 ;

variables:
	datetime64[ns] time(time) ;
		time:long_name = reference time of file ;
		time:standard_name = time ;
	timedelta64[ns] dtime(time, lat, lon) ;
		dtime:long_name = time difference from reference time ;
		dtime:valid_min = 0.0 ;
		dtime:valid_max = 86400.0 ;
	float32 lat(lat) ;
		lat:long_name = latitude_coordinates ;
		lat:standard_name = latitude ;
		lat:units = degrees_north ;
		lat:valid_min = -90.0 ;
		lat:valid_max = 90.0 ;
		lat:reference_datum = geographical coordinates, WGS84 projection ;
	float32 lon(lon) ;
		lon:long_name = longitude_coordinates ;
		lon:standard_name = latitude ;
		lon:units = degrees_east ;
		lon:valid_min = -180.0 ;
		lon:valid_max = 180.0 ;
		lon:reference_datum = geographical coordinates, WGS84 projection ;
	float32 satze(time, lat, lon) ;
		satze:long_name = satellite zenith angle ;
		satze:units = degrees ;
		satze:valid_min = 0 ;
		satze:

In [37]:
# Boolean masks over the sample dataset's lon/lat coordinates that select the
# bounding box (bounds inclusive); handed to preprocess_ecv for every file.
mask_lon = (lon_min <= data['lon']) & (data['lon'] <= lon_max)
mask_lat = (lat_min <= data['lat']) & (data['lat'] <= lat_max)

In [38]:
%%time

# For every year: preprocess each file, average per grid cell and month, and
# write the result as a point shapefile next to the yearly folders.
for year in years:
    print('Processing {}...'.format(year))
    year_dir = os.path.join(path, str(year))
    # Sorted so the files are processed in a reproducible order.
    files = sorted(os.listdir(year_dir))

    # Collect the per-file frames and concatenate once. DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated data on every call.
    lst_frames = []
    for file in files:
        lst_temp = preprocess_ecv(os.path.join(year_dir, file), 'lst', mask_lon, mask_lat)
        lst_frames.append(lst_temp)

    if not lst_frames:
        # pd.concat raises on an empty list; there is nothing to write anyway.
        print('No files found for {}, skipping.'.format(year))
        continue

    lst = pd.concat(lst_frames, ignore_index=True)
    del lst_frames  # release the per-file copies before aggregating

    # Several files may fall into the same month; average them per grid cell.
    lst = lst.groupby(['lat', 'lon', 'year', 'month']).aggregate('mean').reset_index()

    # Turn lon/lat into point geometries in WGS 84 (EPSG:4326).
    lst = gpd.GeoDataFrame(
        lst,
        geometry=gpd.points_from_xy(lst.lon, lst.lat),
        crs=CRS.from_epsg(4326),
    )

    # The coordinates now live in the geometry column.
    lst = lst.drop(['lat', 'lon'], axis=1)

    lst.to_file(os.path.join(path, 'monthly_lst_'+str(year)+'.shp'))

Processing 2010...
Processing 2011...
Processing 2012...
Processing 2013...
Processing 2014...
Processing 2015...
Processing 2016...
Processing 2017...
Processing 2018...
CPU times: user 1h 22min 45s, sys: 3min 38s, total: 1h 26min 23s
Wall time: 1h 26min 24s


In [39]:
lst.shape

(3419527, 4)

In [40]:
lst.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3419527 entries, 0 to 3419526
Data columns (total 4 columns):
 #   Column    Dtype   
---  ------    -----   
 0   year      int64   
 1   month     int64   
 2   lst       float32 
 3   geometry  geometry
dtypes: float32(1), geometry(1), int64(2)
memory usage: 91.3 MB


In [41]:
lst.head()

Unnamed: 0,year,month,lst,geometry
0,2018,2,273.089996,POINT (97.77500 0.02500)
1,2018,4,294.190002,POINT (97.77500 0.02500)
2,2018,6,265.100006,POINT (97.77500 0.02500)
3,2018,9,270.079987,POINT (97.77500 0.02500)
4,2018,11,248.229996,POINT (97.77500 0.02500)


In [42]:
# Drop the references to the last year's frames and force a garbage collection
# to free the memory they held.
del [lst, lst_temp]
gc.collect()

60