# preprocess essential climate variables

In [None]:
import os
import numpy as np
import pandas as pd
import netCDF4 as nc
import xarray as xr
import geopandas as gpd

In [None]:
def select_bounding_box_region(path, years, ecv, lon_min, lon_max, lat_min, lat_max):
    """Collect the non-missing values of one variable inside a lon/lat box.

    Every file listed in the directory ``path + year`` (for each entry of
    ``years``) is opened with xarray, flattened into a table, and reduced to
    the rows whose ``lon``/``lat`` lie inside the inclusive bounding box and
    whose ``ecv`` value is not NaN.

    Parameters
    ----------
    path : str
        Directory prefix. It is concatenated directly with each year, so it
        has to end with a path separator (e.g. ``'../data/foo/'``).
    years : iterable of str
        Names of the per-year sub-directories to scan.
    ecv : str
        Name of the data variable (column) to keep.
    lon_min, lon_max, lat_min, lat_max : number
        Inclusive bounds of the region to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``['lat', 'lon', 'time', ecv]`` with a fresh ``0..n-1`` index.
        An empty frame with those columns is returned when no file was found.
    """
    columns = ['lat', 'lon', 'time', ecv]
    frames = []

    for year in years:
        year_dir = path + year

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order of the result reproducible between runs and machines.
        for file in sorted(os.listdir(year_dir)):
            # The context manager releases the file handle as soon as the
            # content has been materialised as a DataFrame.
            with xr.open_dataset(os.path.join(year_dir, file)) as dataset:
                data = dataset.to_dataframe().reset_index()

            in_box = (
                (data.lon >= lon_min) & (data.lon <= lon_max)
                & (data.lat >= lat_min) & (data.lat <= lat_max)
            )
            frames.append(data.loc[in_box, columns].dropna(subset=[ecv]))

    if not frames:
        # pd.concat raises on an empty list; keep the documented schema.
        return pd.DataFrame(columns=columns)

    # A single concat replaces the repeated DataFrame.append calls, which
    # copied the accumulated data on every file and no longer exist in
    # pandas >= 2.0.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Year sub-directory names to process, as strings: '2010' through '2015'.
years = [str(year) for year in range(2010, 2016)]

In [None]:
# Inclusive bounding box passed to select_bounding_box_region
# (presumably degrees east / degrees north -- confirm against the data files).
lon_min, lon_max = 60, 100
lat_min, lat_max = 0, 40

## sea surface salinity

In [None]:
!bash download_sea_surface_salinity_data.sh

In [None]:
path = '../data/sea_surface_salinity/'

In [None]:
data = nc.Dataset(path+years[0]+'/'+sorted(os.listdir(path+years[0]))[0])

In [None]:
for var in data.variables.values():
    print(var)

In [None]:
%%time

# Gather every non-missing 'sss' value inside the bounding box from all files
# of all years into one DataFrame with columns lat, lon, time, sss.
sss = select_bounding_box_region(path, years, 'sss', lon_min, lon_max, lat_min, lat_max)

In [None]:
# Quick checks on the loaded table: size, dtypes / non-null counts, first rows.
sss.shape

In [None]:
sss.info()

In [None]:
sss.head()

In [None]:
sss['year'] = sss.time.apply(lambda x: x.year)

In [None]:
sss['month'] = sss.time.apply(lambda x: x.month)

In [None]:
sss['month'] = sss.month.astype(str).str.pad(2, side='left',fillchar='0')

In [None]:
sss = sss[['lat', 'lon', 'year', 'month', 'sss']].groupby(['lat', 'lon', 'year', 'month']).aggregate('mean').reset_index()

In [None]:
sss.shape

In [None]:
sss.info()

In [None]:
sss.head()

In [None]:
sss = gpd.GeoDataFrame(sss, geometry=gpd.points_from_xy(sss.lon, sss.lat), crs={'init': 'epsg:4326'})

In [None]:
sss.drop(['lat', 'lon'], axis=1, inplace=True)

In [None]:
sss.info()

In [None]:
sss.to_file(path+'monthly_sss_2010_2015.shp')

## chlorophyll-a concentration

In [None]:
!bash download_chlorophyll_a_concentration_data.sh

In [None]:
path = '../data/chlorophyll_a_concentration/'

In [None]:
data = nc.Dataset(path+years[0]+'/'+sorted(os.listdir(path+years[0]))[0])

In [None]:
for var in data.variables.values():
    print(var)

In [None]:
%%time

# Gather every non-missing 'chlor_a' value inside the bounding box from all
# files of all years into one DataFrame with columns lat, lon, time, chlor_a.
chlor_a = select_bounding_box_region(path, years, 'chlor_a', lon_min, lon_max, lat_min, lat_max)

In [None]:
# Quick checks on the loaded table: size, dtypes / non-null counts, first rows.
chlor_a.shape

In [None]:
chlor_a.info()

In [None]:
chlor_a.head()

In [None]:
chlor_a['year'] = chlor_a.time.apply(lambda x: x.year)

In [None]:
chlor_a['month'] = chlor_a.time.apply(lambda x: x.month)

In [None]:
chlor_a['month'] = chlor_a.month.astype(str).str.pad(2, side='left',fillchar='0')

In [None]:
chlor_a['chlor_a_log'] = np.log(chlor_a['chlor_a'])

In [None]:
chlor_a.drop(['time', 'chlor_a'], axis=1, inplace=True)

In [None]:
chlor_a.head()

In [None]:
chlor_a = gpd.GeoDataFrame(chlor_a, geometry=gpd.points_from_xy(chlor_a.lon, chlor_a.lat), crs={'init': 'epsg:4326'})

In [None]:
chlor_a.drop(['lat', 'lon'], axis=1, inplace=True)

In [None]:
chlor_a.info()

In [None]:
chlor_a.to_file(path+'monthly_chlor_a_2010_2015.shp')