# **Data analysis for validation of MODIS vs VESPA-22 IWV measurements**

In [None]:
## **Import libraries**

In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices

from scipy import stats

## **Selected data reception from Thule-mix-LAADS.ipynb**

In [None]:
# Restore the selection parameters and the selected data frames that were
# persisted with %store (source notebook: Thule-mix-LAADS.ipynb).
%store -r max_IWV
%store -r max_dist_from_THAAO
%store -r deltatime
%store -r new_mod_vespa
%store -r new_myd_vespa
%store -r new_mxd_vespa
# hand-over of plot_defaults
%store -r plot_defaults
# hand-over of the complete data set, not filtered with deltatime
%store -r vespa_data

### **Controllo dati ricevuti**

In [None]:
vespa_data.head(1)     # quick check: display the first row of the restored frame


In [None]:
#vespa_data.drop(columns='ts_1', inplace=True)

In [None]:
vespa_data.head(1)     # quick check: display the first row of the restored frame

In [None]:
# Number of selected data points per restored frame (MOD, MYD, MXD),
# measured as the length of each frame's IWV_MODIS column.
num_mod_data_p_Dt, num_myd_data_p_Dt, num_mxd_data_p_Dt = (
    len(frame["IWV_MODIS"])
    for frame in (new_mod_vespa, new_myd_vespa, new_mxd_vespa)
)
# Combined MOD + MYD count, printed next to the MXD count for comparison.
num_modmyd_data_p_Dt = sum((num_mod_data_p_Dt, num_myd_data_p_Dt))

In [None]:
# Report the selection parameters and the number of data points selected.
# The station acronym is spelled THAAO, matching the variable name.
print('-- Selection parameters --')
print(f'max IWV = {max_IWV} [mm]')
print(f'max distance from THAAO = {max_dist_from_THAAO} [km]')
print(f'Deltatime chosen for data points selection = {deltatime}\n')
print('-- Data points selected --')
print(f'Selected number of MOD data points =          {num_mod_data_p_Dt}')
print(f'Selected number of MYD data points =          {num_myd_data_p_Dt}\n')
print(f'Total selected number of MXD data points     =  {num_mxd_data_p_Dt}')
print(f'Total selected number of MOD+MYD data points =  {num_modmyd_data_p_Dt}\n')

## **Analisi spettrale**

### **Selezione e plot dei dati in un intervallo temporale** 

In [None]:
# Imported for its side effect: presumably this is what makes the `.hvplot`
# accessor used in the plotting cells below available on pandas objects.
import hvplot.pandas

# Colour used for the THAAO (VESPA) series in the plots below.
color_THAAO = 'green'

In [None]:
# Select the time interval to analyse (both bounds are passed to Series.between).
# NOTE: .copy() is needed so that the selection is an independent frame and
# later column assignments do not trigger SettingWithCopyWarning.
vespa_time_range = vespa_data.loc[
    vespa_data["Time"].between('2019-01-01 00:00:00', '2021-01-01 00:00:00'),
    :,
].copy()

In [None]:
vespa_time_range.tail(2)  # show the last two rows (alternative: head(2))

In [None]:
# Convert the selected 'Time' values from datetime to a timestamp expressed
# as float seconds (the numeric value is in ns by default, hence / 10 ** 9).
vespa_time_range['timestamp'] = pd.to_numeric(vespa_time_range['Time'].values) / 10 ** 9
# Use positional access: after the .loc selection the frame keeps the index
# labels of vespa_data, so the label 2 is not guaranteed to exist.
type(vespa_time_range['timestamp'].iloc[2])

In [None]:
# check the selection: number of rows, then the first three rows
print(len(vespa_time_range))
vespa_time_range.head(3)  # alternative: .tail(3)

In [None]:
# Convert the timestamps (float seconds) back to UTC datetimes.
vespa_time_range['datetime'] = pd.to_datetime(vespa_time_range['timestamp'],
                                              utc=True, unit='s')
# Positional access (.iloc): the index labels come from vespa_data, so the
# label 2 may be missing from the selected interval.
type(vespa_time_range['datetime'].iloc[2].value)
vespa_time_range.head(3)

In [None]:
# Scatter plot of the IWV_THAAO column of the selected interval, labelled 'VESPA'.
# plot_defaults (restored via %store) supplies the remaining plot options.
# NOTE(review): the purpose of fields={'date': 'Date'} is not visible here — confirm.
vespa_ts_scatter = vespa_time_range.IWV_THAAO.hvplot.scatter(color=color_THAAO, label='VESPA',
                                                       fields={'date': 'Date'},
                                                       title='IWV THAAO - VESPA', 
                                                       **plot_defaults)

In [None]:
# Second scatter plot, in red and labelled 'VESPA_rec'.
# NOTE(review): this plots the same IWV_THAAO column as vespa_ts_scatter; if it
# is meant to show the series on the reconverted 'datetime' axis, the data
# passed here needs to change — confirm the intent.
vespa_ts_scatter_rec = vespa_time_range.IWV_THAAO.hvplot.scatter(color='red', label='VESPA_rec',
                                                       fields={'date': 'Date'},
                                                       title='IWV THAAO - VESPA', 
                                                       **plot_defaults)

In [None]:
# Combine the two scatter plots with `*` (presumably an overlay) and show the grid.
(vespa_ts_scatter*vespa_ts_scatter_rec).opts(show_grid=True)

### **Interpolazione**
Poiché i dati non sono equispaziati e contengono buchi provo ad interpolare prima di fare lo spettro <br>

vedi [gaps](http://mres.uni-potsdam.de/index.php/2017/08/22/data-voids-and-spectral-analysis-dont-be-afraid-of-gaps/) <br>
**NOTA**: Since all these (and other) interpolation techniques might introduce artifacts into the data, <br> 
it is always advisable to (1) keep the total number of data points constant before and after <br>
interpolation, (2) report the method employed for estimating the evenly-spaced data sequence, <br>
and (3) explore the effect of interpolation on the variance of the data.

In [None]:
# build the time series of evenly spaced timestamps

In [None]:
# First and last 'Time' value of the selected interval, and the sample count.
# .iloc[0] is positional, matching the .iloc[-1] below; a label lookup [0]
# fails when the index label 0 is not part of the selection.
print(vespa_time_range['Time'].iloc[0])
print(vespa_time_range['Time'].iloc[-1])
print(len(vespa_time_range['Time']))

In [None]:
# Evenly spaced datetime grid spanning the selected interval, with as many
# points as the original series (keeps the number of data points constant).
# Positional .iloc[0]/.iloc[-1] are used because the index labels of the
# selection are inherited from vespa_data and need not start at 0.
date_rng = pd.date_range(start=vespa_time_range['Time'].iloc[0],
                         end=vespa_time_range['Time'].iloc[-1],
                         periods=len(vespa_time_range['Time']))
print(len(date_rng))
print(date_rng[0])
print(date_rng[-1])
print(type(date_rng))

In [None]:
# Evenly spaced grid expressed as timestamps in float seconds, using the same
# conversion as the 'timestamp' column. date_rng has one entry per row of
# vespa_time_range, so the values are assigned positionally.
# NOTE(review): the original assignment had no right-hand side; this completion
# follows the cell's stated purpose — confirm it is what was intended.
vespa_time_range['date_rng_timestamp'] = pd.to_numeric(date_rng.values) / 10 ** 9

In [None]:
# Convert the selected 'Time' values from datetime to a timestamp expressed
# as float seconds (the numeric value is in ns by default, hence / 10 ** 9).
# NOTE(review): this repeats the conversion already done in an earlier cell.
vespa_time_range['timestamp'] = pd.to_numeric(vespa_time_range['Time'].values) / 10 ** 9
# Positional access: the label 2 is not guaranteed to exist in the selection.
type(vespa_time_range['timestamp'].iloc[2])

#### **Linear interpolation**

#### **Cubic spline interpolation**

In [None]:
from scipy.interpolate import CubicSpline, PchipInterpolator, Akima1DInterpolator

In [None]:
from scipy.interpolate import interp1d

# Linear interpolant built on the 'timestamp' column (float seconds).
# The name `scipy` is never bound in this notebook and `y` was undefined at
# this point, so interp1d is imported directly and the data series is explicit.
# NOTE(review): IWV_THAAO is assumed to be the series to interpolate — confirm.
y1 = interp1d(vespa_time_range['timestamp'], vespa_time_range['IWV_THAAO'], kind='linear')

In [None]:
# TODO: evaluate resampling onto a regular grid; the bare name `Series` is not
# imported in this notebook, so the method is referenced through pandas.
pd.Series.resample

#### **Piecewise Cubic Hermite Interpolating Polynomial interpolation**

#### **Plot of interpolations and original data**

#### **Periodogrammi**

## **Statistica**

### **Scipy stats**

#### **Pearson and Spearman correlation coefficients**

In [None]:
# Pearson correlation between the IWV_MODIS and IWV_THAAO columns
# of the selected MXD sample.
res_pearson = stats.pearsonr(
    new_mxd_vespa["IWV_MODIS"],
    new_mxd_vespa["IWV_THAAO"],
)

In [None]:
# Show the Pearson result object and the confidence interval it reports.
print(res_pearson)
print(res_pearson.confidence_interval())

**test**: the Pearson correlation p-value is reported as 0.0 because its true value is smaller than the smallest representable floating-point number; see 
[pvalue](https://stackoverflow.com/questions/45914221/minimal-p-value-for-scipy-stats-pearsonr) <br>

In [None]:
from scipy.stats import beta

# Recompute the two-sided Pearson p-value by hand. Under the null hypothesis
# the sample correlation r follows a beta distribution on [-1, 1] with equal
# shape parameters a = b = n/2 - 1, so p = 2 * CDF(-|r|); mapped onto the
# standard beta on [0, 1] the evaluation point is 0.5 * (1 - |r|).
# (Shape n/2 without the -1 and a missing factor 2 gave a different value.)
ab = 0.5 * num_mxd_data_p_Dt - 1
prob = 2 * beta(ab, ab).cdf(0.5 * (1 - abs(res_pearson.statistic)))
prob

In [None]:
# Spearman rank correlation between the IWV_MODIS and IWV_THAAO columns
# of the selected MXD sample.
res_spearman = stats.spearmanr(
    new_mxd_vespa["IWV_MODIS"],
    new_mxd_vespa["IWV_THAAO"],
)

In [None]:
# Report the Spearman statistic and its p-value.
print(f'Spearman: statistic={res_spearman.statistic}, pvalue={res_spearman.pvalue}')

### **Statsmodels**

#### **Ordinary Least Square regression (OLS)**

In [None]:
# Ordinary Least Square regression
mod = smf.ols(formula='IWV_MODIS ~ IWV_THAAO + diff_int + diff_distance + vza', data=new_mxd_vespa)
res = mod.fit()
print(res.summary())

In [None]:
# Ordinary Least Square regression
mod = smf.ols(formula='IWV_MODIS ~ IWV_THAAO', data=new_mxd_vespa)
res = mod.fit()
print(res.summary())

In [None]:
# Fitted parameters of the most recent fit stored in `res`.
res.params

In [None]:
# R-squared of the most recent fit stored in `res`.
res.rsquared

In [None]:
# Partial regression plot of IWV_MODIS against IWV_THAAO, with diff_distance
# passed as the additional variable; observation labels are switched off.
sm.graphics.plot_partregress('IWV_MODIS','IWV_THAAO', ['diff_distance'], data=new_mxd_vespa, obs_labels=False)

#### **Error/Trend/Seasonality (ETS) model**

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
import numpy as np


In [None]:
# Work on a copy of the selected sample with a fresh 0..n-1 index
# (drop=True discards the previous index instead of keeping it as a column).
new_mxd_vespa_1 = new_mxd_vespa.reset_index(drop=True)

In [None]:
# Index the sample by 'Time' and sort it chronologically. The frame is derived
# from new_mxd_vespa in a single chain so the cell can be re-run safely:
# calling set_index('Time') on a frame already indexed by 'Time' raises
# KeyError. set_index replaces the existing index, so the result is the same
# as resetting the index first.
new_mxd_vespa_1 = new_mxd_vespa.set_index('Time').sort_index()

In [None]:
#new_mxd_vespa_1 = new_mxd_vespa_1.asfreq(pd.infer_freq(new_mxd_vespa_1.index),method='bfill')

In [None]:
#new_mxd_vespa_1.asfreq(method='bfill')

In [None]:
# Additive seasonal decomposition of the time-indexed IWV_MODIS series.
# NOTE(review): period=500 is a hard-coded number of samples per cycle; its
# rationale is not documented here — confirm the value.
result_add = seasonal_decompose(new_mxd_vespa_1['IWV_MODIS'],
                                #period=pd.to_datetime(new_mxd_vespa['Time_IWV_MODIS']).astype(np.int64),
                                period=500,
                                model = 'add')

# Trailing ';' suppresses the textual repr of the returned figure.
result_add.plot();
#result_mul = seasonal_decompose(df['Sales'],model = 'mul')
#result_mul.plot();

In [None]:
from statsmodels.tsa.seasonal import STL

# STL decomposition of the time-indexed IWV_MODIS series.
# - `seasonal` (length of the seasonal smoother) must be an odd integer, so
#   the even value 150 is replaced by 151.
# - `period` is passed explicitly because the series index carries no regular
#   frequency from which STL could infer it.
# NOTE(review): period=500 mirrors the seasonal_decompose cell — confirm.
stl = STL(new_mxd_vespa_1['IWV_MODIS'], period=500, seasonal=151)
res = stl.fit()
fig = res.plot()

# **Prove**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Toy data set: 24 monthly records, 2017 first and then 2016, months
# running from 12 down to 1 within each year.
data = {
    'year': [2017] * 12 + [2016] * 12,
    'month': list(range(12, 0, -1)) * 2,
    'interest_rate': [2.75, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.25, 2.25, 2.25, 2, 2,
                      2, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75],
    'unemployment_rate': [5.3, 5.3, 5.3, 5.3, 5.4, 5.6, 5.5, 5.5, 5.5, 5.6, 5.7, 5.9,
                          6, 5.9, 5.8, 6.1, 6.2, 6.1, 6.1, 6.1, 5.9, 6.2, 6.2, 6.1],
    'index_price': [1464, 1394, 1357, 1293, 1256, 1254, 1234, 1195, 1159, 1167, 1130, 1075,
                    1047, 965, 943, 958, 971, 949, 884, 866, 876, 822, 704, 719],
}

df = pd.DataFrame(data)

# Scatter plot of the index price against the interest rate.
plt.scatter(df['interest_rate'], df['index_price'], color='red')
plt.title('Index Price Vs Interest Rate', fontsize=14)
plt.xlabel('Interest Rate', fontsize=14)
plt.ylabel('Index Price', fontsize=14)
plt.grid(True)
plt.show()

In [None]:
# Scatter plot of the index price against the unemployment rate
# (same toy data frame `df` as the previous cell).
plt.scatter(df['unemployment_rate'], df['index_price'], color='green')
plt.title('Index Price Vs Unemployment Rate', fontsize=14)
plt.xlabel('Unemployment Rate', fontsize=14)
plt.ylabel('Index Price', fontsize=14)
plt.grid(True)
plt.show()

In [None]:
#import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm

# Toy data set: 24 monthly records, 2017 first and then 2016, months
# running from 12 down to 1 within each year.
data = {
    'year': [2017] * 12 + [2016] * 12,
    'month': list(range(12, 0, -1)) * 2,
    'interest_rate': [2.75, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.25, 2.25, 2.25, 2, 2,
                      2, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75],
    'unemployment_rate': [5.3, 5.3, 5.3, 5.3, 5.4, 5.6, 5.5, 5.5, 5.5, 5.6, 5.7, 5.9,
                          6, 5.9, 5.8, 6.1, 6.2, 6.1, 6.1, 6.1, 5.9, 6.2, 6.2, 6.1],
    'index_price': [1464, 1394, 1357, 1293, 1256, 1254, 1234, 1195, 1159, 1167, 1130, 1075,
                    1047, 965, 943, 958, 971, 949, 884, 866, 876, 822, 704, 719],
}

df = pd.DataFrame(data)

# Two regressors, one response.
x = df[['interest_rate', 'unemployment_rate']]
y = df['index_price']

# Fit with scikit-learn (fit() returns the estimator itself).
regr = linear_model.LinearRegression().fit(x, y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# Fit with statsmodels: the intercept column has to be added explicitly.
x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
predictions = model.predict(x)

print_model = model.summary()
print(print_model)
