# **Data loader and main data filters**

In [None]:
# Execute reader.ipynb from this notebook.
# NOTE(review): the *_read helpers, df_drop, distance_from_station, closest_in_time
# and `pd` used in the cells below are not defined in this notebook; presumably they
# all come from reader.ipynb. Confirm, since a fresh-kernel run depends on it.
%run reader.ipynb

In [None]:
# Load every single-file dataset. All paths are relative to the working directory.

# AIRS: the two CSV files differ only by the "A" / "D" tag in their name.
airs_data_asc, airs_data_disc = (
    airs_read('AIRS/AIRS3STD_7_0_TotH2OVap_A.20160701-20220531.68W_76N_68W_76N.csv'),
    airs_read('AIRS/AIRS3STD_7_0_TotH2OVap_D.20160701-20220531.68W_76N_68W_76N.csv'),
)

cimel_data = cimel_read('CIMEL/20170101_20211231_Thule.lev20')
ftir_data = ftir_read('FTIR/h2o_FTIR_2016_2020.dat')

# MODIS: the MOD05 and MYD05 products share the same reader.
mod_data, myd_data = (
    modis_read('MODIS/MOD05_5km_totale.csv'),
    modis_read('MODIS/MYD05_5km_totale.csv'),
)

radiosounding_data = radiosounding_read('RADIOSOUNDING/PWVradiosondeThule.txt')
vespa_data = vespa_read('VESPA/Vapor_20160716_20220311.txt')

In [None]:
# Years for which one GNSS file and one HATPRO file are read.
year = [2016, 2017, 2018, 2019]

# Each reader receives the per-year file path and the year itself;
# the yearly results are then stacked with pd.concat.
gnss_data = pd.concat(
    [gnss_read(f'GNSS/THU3_{yr}global.txt', yr) for yr in year]
)
hatpro_data = pd.concat(
    [hatpro_read(f'HATPRO/IWV_{yr}_HATPRO.txt', yr) for yr in year]
)

In [None]:
#gnss_data

In [None]:
#hatpro_data

# **Main data filters** 

In [None]:
# passaggio parametri di filtraggio dati
%store -r filter_param

max_IWV = filter_param.get('max_IWV')
max_IWV_unit = filter_param.get('max_IWV_unit')
max_dist_from_THAAO = filter_param.get('max_dist_from_THAAO')
max_dist_from_THAAO_unit = filter_param.get('max_dist_from_THAAO_unit')
deltatime = filter_param.get('deltatime')

In [None]:
# Sanity check: echo the filter parameters applied by the cells below.
# The units are taken from filter_param (max_IWV_unit, max_dist_from_THAAO_unit)
# so the printout cannot disagree with the stored configuration. The former
# hard-coded labels ('mm', 'Km') are the fallback when a unit is not set,
# so the output is unchanged in that case.
print(f'max_IWV = {max_IWV} {max_IWV_unit or "mm"}\n')
print(f'max_dist_from_THAAO = {max_dist_from_THAAO} {max_dist_from_THAAO_unit or "Km"}\n')
print(f'deltatime = {deltatime}\n')

### Drop rows with water values > `max_IWV` 

In [None]:
# Apply df_drop to both MODIS frames with the mask "IWV_MODIS > max_IWV".
# df_drop is defined outside this section; per the section heading it removes
# the rows selected by the mask.
mod_data, myd_data = (
    df_drop(frame, frame['IWV_MODIS'] > max_IWV)
    for frame in (mod_data, myd_data)
)

### Drop rows with distance values > `max_dist_from_THAAO` [Km] 

In [None]:
# Run distance_from_station on both MODIS frames, passing the configured
# maximum distance as `drop_above`.
mod_data, myd_data = (
    distance_from_station(frame, drop_above=max_dist_from_THAAO)
    for frame in (mod_data, myd_data)
)

# Combined MOD + MYD frame; sort=True sorts the columns if the two frames'
# columns are not already aligned.
mxd_data = pd.concat([mod_data, myd_data], sort=True)

## **Store the complete Modis and Vespa datasets**

In [None]:
# Persist the filtered MODIS frames and the VESPA dataset in IPython's %store
# database so they can be reloaded elsewhere with `%store -r`.
%store mod_data myd_data mxd_data vespa_data

## **Merging pandas dataframes to filter MODIS and other instruments (THAAO, ...) data points closest in time**

__TODO: revise this explanation — it is both too simplistic and too convoluted__ <br>
Because the time series are long, it is hard to judge how close in time two data points are, even though the notebook plots can be zoomed. Comparing a MODIS and a THAAO IWV data point makes little sense when they are hours or days apart, and such pairs are easily mistaken for coincident ones if one relies solely on visual inspection of the plots; a tool that selects and compares only the data points that are close in time is therefore required. The following code filters the data on a time-span basis by suitably merging the MODIS and THAAO pandas dataframes, using a time-delta parameter (`delta_time`, expressed in `s`, `m`, `h` or `d`, i.e. seconds, minutes, hours or days) to drive the time-based filter. Different choices of the time delta widen or narrow the time span around each MODIS data point, and only the THAAO data points that fall within the chosen interval are selected for the comparison. The selection is anchored on the MODIS data points because the THAAO measurements are far denser. <br>  

## **Subset ModisVsVespa dataset less than `deltatime` minutes**

In [None]:
# Match each MODIS frame (MYD first, then MOD) against the VESPA data on 'Time',
# using the configured `deltatime` as the matching window.
new_myd_vespa, new_mod_vespa = (
    closest_in_time(modis, vespa_data, 'IWV_MODIS', 'IWV_THAAO',
                    on='Time', delta_time=deltatime)
    for modis in (myd_data, mod_data)
)

In [None]:
new_mod_vespa = new_mod_vespa.dropna(subset=['IWV_MODIS','IWV_THAAO'])
new_myd_vespa = new_myd_vespa.dropna(subset=['IWV_MODIS','IWV_THAAO'])
new_mxd_vespa = pd.concat([new_mod_vespa,new_myd_vespa], sort=True)

%store new_mod_vespa new_myd_vespa new_mxd_vespa

## **Subset AIRS dataset less than 24 hours**

In [None]:
# Original remark: AIRS is on AQUA, so the comparison should be done only with MYD.
# NOTE(review): despite that remark, the MOD pairs are also computed here (and
# stored further down). Confirm whether the MOD/AIRS comparison is intended.

# Matching window is fixed at 24 hours for AIRS.
new_mod_airs_asc, new_myd_airs_asc = (
    closest_in_time(modis, airs_data_asc, 'IWV_MODIS', 'IWV[mm]',
                    on='Time', delta_time='24h')
    for modis in (mod_data, myd_data)
)
new_mod_airs_disc, new_myd_airs_disc = (
    closest_in_time(modis, airs_data_disc, 'IWV_MODIS', 'IWV[mm]',
                    on='Time', delta_time='24h')
    for modis in (mod_data, myd_data)
)

In [None]:
new_mod_airs_asc = new_mod_airs_asc.dropna(subset=['IWV_MODIS','IWV[mm]'])
new_myd_airs_asc = new_myd_airs_asc.dropna(subset=['IWV_MODIS','IWV[mm]'])
new_mod_airs_disc = new_mod_airs_disc.dropna(subset=['IWV_MODIS','IWV[mm]'])
new_myd_airs_disc = new_myd_airs_disc.dropna(subset=['IWV_MODIS','IWV[mm]'])

new_mxd_airs_asc = pd.concat([new_mod_airs_asc,new_myd_airs_asc], sort=True)
new_mxd_airs_disc = pd.concat([new_mod_airs_disc,new_myd_airs_disc], sort=True)

new_mxd_airs = pd.concat([new_mxd_airs_asc,new_mxd_airs_disc], sort=True)

%store new_mod_airs_asc new_mod_airs_disc new_mxd_airs new_mxd_airs_asc new_mxd_airs_disc
%store new_myd_airs_asc new_myd_airs_disc

In [None]:
%store airs_data_asc airs_data_disc
%store cimel_data ftir_data hatpro_data radiosounding_data
%store gnss_data

## **Subset ModisVsCimel dataset less than `deltatime` minutes**

In [None]:
# Match each MODIS frame (MYD first, then MOD) against the CIMEL data on 'Time',
# using the configured `deltatime` as the matching window.
new_myd_cimel, new_mod_cimel = (
    closest_in_time(modis, cimel_data, 'IWV_MODIS', 'IWV[mm]',
                    on='Time', delta_time=deltatime)
    for modis in (myd_data, mod_data)
)

In [None]:
new_mod_cimel = new_mod_cimel.dropna(subset=['IWV_MODIS','IWV[mm]'])
new_myd_cimel = new_myd_cimel.dropna(subset=['IWV_MODIS','IWV[mm]'])
new_mxd_cimel = pd.concat([new_mod_cimel,new_myd_cimel], sort=True)

%store new_mod_cimel new_myd_cimel new_mxd_cimel

## **Subset ModisVsFTIR dataset less than `deltatime` minutes**

In [None]:
# Match each MODIS frame (MYD first, then MOD) against the FTIR data on 'Time',
# using the configured `deltatime` as the matching window.
new_myd_ftir, new_mod_ftir = (
    closest_in_time(modis, ftir_data, 'IWV_MODIS', 'IWV[mm]',
                    on='Time', delta_time=deltatime)
    for modis in (myd_data, mod_data)
)

In [None]:
new_mod_ftir = new_mod_ftir.dropna(subset=['IWV_MODIS','IWV[mm]'])
new_myd_ftir = new_myd_ftir.dropna(subset=['IWV_MODIS','IWV[mm]'])
new_mxd_ftir = pd.concat([new_mod_ftir,new_myd_ftir], sort=True)

%store new_mod_ftir new_myd_ftir new_mxd_ftir

## **Subset ModisVsGnss dataset less than `deltatime` minutes** (skipped: problems reading the file header)

## **Subset ModisVsHatpro dataset less than `deltatime` minutes** (skipped: problems reading the file header)

## **Subset ModisVsRadio dataset less than 10 minutes** (fixed `'10m'` window, not `deltatime`)

In [None]:
# Match each MODIS frame (MYD first, then MOD) against the radiosounding data on 'Time'.
# NOTE(review): the window is hard-coded to '10m' here, whereas the VESPA, CIMEL and
# FTIR cells pass the configurable `deltatime`. Confirm this is intentional.
new_myd_radio, new_mod_radio = (
    closest_in_time(modis, radiosounding_data, 'IWV_MODIS', 'IWV[mm]',
                    on='Time', delta_time='10m')
    for modis in (myd_data, mod_data)
)

In [None]:
new_mod_radio = new_mod_radio.dropna(subset=['IWV_MODIS','IWV[mm]'])
new_myd_radio = new_myd_radio.dropna(subset=['IWV_MODIS','IWV[mm]'])
new_mxd_radio = pd.concat([new_mod_radio,new_myd_radio], sort=True)

%store new_mod_radio new_myd_radio new_mxd_radio