# features
feature| GPS |mobile network|
---|:----:|:---------:|
latency|---|timestamp_index - timestamp_transfer|
coverage|determination_position|if data is cached before it is sent, no connection was available|
quality|signal_quality_satellite|signal_quality_hdop|



In [None]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from os.path import join
import numpy as np

In [None]:
# Load only the columns needed for the latency, coverage and quality features.
parquet_path = join('..', '..', 'data', 'TUDA_data', 'all_TUDA_data.parquet')
df = pd.read_parquet(
    parquet_path,
    columns=[
        "timestamp_index",
        "timestamp_transfer",
        "timestamp_measure_position",
        "determination_position",
        "latitude",
        "longitude",
        "loading_state",
        "signal_quality_satellite",
        "signal_quality_hdop",
    ],
)

In [None]:
# Latency feature: index timestamp minus transfer timestamp, row by row.
df['latency'] = df['timestamp_index'] - df['timestamp_transfer']

## Coverage: 
**GPS**: binär, als Indikator `determination_position` nehmen
+ wenn determination_position = 1 -> covered (1)
+ wenn determination_position = 4 -> not covered (0)

**mobile**: als Indikator Dauer der Zwischenspeicherung nehmen
+ Zwischenspeicherung: timestamp_transfer - timestamp_measure_position 

In [None]:
# GPS coverage flag: 1 where determination_position == 1, 0 for every other
# value (including missing ones, since NaN == 1 evaluates to False).
# The integer column is assigned in one step instead of calling
# `df.coverage_gps.replace(..., inplace=True)`: that chained in-place call
# works on a temporary Series and is not guaranteed to write back into `df`
# (it does not under pandas copy-on-write).
df['coverage_gps'] = (df['determination_position'] == 1).astype(int)

In [None]:
# Mobile coverage indicator: how long a record was buffered, i.e. transfer
# timestamp minus position-measurement timestamp.
df['coverage_mobile'] = df['timestamp_transfer'] - df['timestamp_measure_position']

## check correlation between features

In [None]:
# Work on an independent copy holding only the feature columns under study.
df_features = df.loc[:, [
    'determination_position',
    'latency',
    'coverage_gps',
    'coverage_mobile',
    'signal_quality_satellite',
    'signal_quality_hdop',
]].copy()

# Pairwise correlation between all feature columns (pandas default method).
corrMatrix = df_features.corr()

# Directory that receives every figure written by this notebook.
output_path = join('..', 'output')

# Annotated heatmap of the correlation matrix, saved as a PNG.
fig = plt.figure(figsize=(7, 7))
sn.heatmap(data=corrMatrix, annot=True)
fig.savefig(join(output_path, 'features_correlation.png'), dpi=300, bbox_inches='tight')

### relative Standardabweichung

In [None]:
# Relative standard deviation per feature: column std divided by column mean.
std_rel = df_features.std().div(df_features.mean()).to_numpy()

In [None]:
# Bar chart of the relative standard deviation, one bar per feature column.
fig = plt.figure(figsize=(10, 3))
ax = fig.add_axes([0, 0, 1, 1])  # axes fill the whole figure area
ax.bar(df_features.columns, std_rel)
ax.set_title('relative standard deviation')
ax.grid(axis='y')
fig.savefig(join(output_path, 'features_std.png'), dpi=300, bbox_inches='tight')

### error analysis

In [None]:
# Standard error of the mean for each feature, and the same value relative to
# the feature's mean.
# The standard error is std / sqrt(n), not std / n. Series.sem() computes
# exactly that, using the same ddof=1 as Series.std().
std_errs = []
std_errs_rel = []
for feature in df_features.columns:
    # Drop missing values so the count and the statistics use the same rows.
    clean_feature = df_features[feature].dropna()
    std_err = clean_feature.sem()
    std_errs.append(std_err)
    std_errs_rel.append(std_err / clean_feature.mean())

In [None]:
# Bar chart of the standard error, one bar per feature column.
fig = plt.figure(figsize=(10, 3))
ax = fig.add_axes([0, 0, 1, 1])  # axes fill the whole figure area
ax.bar(df_features.columns, std_errs)
ax.set_title('standard error')
ax.grid(axis='y')
fig.savefig(join(output_path, 'features_std_error.png'), dpi=300, bbox_inches='tight')

In [None]:
# Bar chart of the relative standard error, one bar per feature column.
fig = plt.figure(figsize=(10, 3))
ax = fig.add_axes([0, 0, 1, 1])  # axes fill the whole figure area
ax.bar(df_features.columns, std_errs_rel)
ax.set_title('relative standard error')
ax.grid(axis='y')
fig.savefig(join(output_path, 'features_std_error_rel.png'), dpi=300, bbox_inches='tight')

In [None]:
# Disabled code: the helpers below are wrapped in a string literal, so none of
# them is defined at runtime. They hand-roll a Pearson correlation coefficient
# and a two-sided t-test (alpha = 0.05) for its significance.
# NOTE(review): get_statistics uses np.std (default ddof=0) while the
# covariance is divided by len(x) - 1, and the shortcut for a perfect
# correlation only checks r close to +1, not -1 — review before re-enabling.
'''from scipy.stats import t

def get_statistics(x: iter) -> tuple:
    """
    calculate mean and standard deviation of an iterable
    """
    mean = np.mean(x)
    stddev = np.std(x)
    return mean, stddev


def get_correlation_coefficient(x: iter, y: iter) -> float:
    """
    calculate correlation coefficient of two iterables of the same lengths
    """
    x_mean, x_stddev = get_statistics(x)
    y_mean, y_stddev = get_statistics(y)
    sum_products = 0
    for xi, yi in zip(x, y):
        sum_products += (xi - x_mean) * (yi - y_mean)
    s_xy = 1 / (len(x) - 1) * sum_products
    return s_xy / (x_stddev * y_stddev)


def correlation_test(x, y):
    # make sure that x and y have same length:
    len_min = min((len(x), len(y)))
    x = x[:len_min]
    y = y[:len_min]
    r_xy = get_correlation_coefficient(x, y)
    if abs(r_xy - 1) < 1e-6:
        r_xy = int(r_xy)
        p_value = 0.0
    else:
        t_score = r_xy * np.sqrt(len_min - 2) / np.sqrt(1 - r_xy ** 2)
        p_value = t.sf(abs(t_score), len_min - 2) * 2
    if p_value < 0.05:
        correlated = True
    else:
        correlated = False
    return correlated, p_value, r_xy'''

In [None]:
#correlation_test(df_features.dropna(subset=['signal_quality_hdop']).signal_quality_hdop.to_numpy(), df_features.dropna(subset=['signal_quality_hdop']).coverage_mobile.to_numpy())

In [None]:
# Disabled code: the loop below is wrapped in a string literal and never runs.
# It would fill a symmetric feature-by-feature matrix with the coefficient
# returned by correlation_test and set entries to NaN where the test does not
# report a significant correlation.
# NOTE(review): correlation_test itself only appears inside another disabled
# string literal in this notebook — confirm it is defined before re-enabling.
"""n_features = len(df_features.columns)
correlation_mat = np.zeros((n_features, n_features))
for i, col_i in enumerate(df_features.columns):
    for j, col_j in enumerate(df_features.columns):
        if i >= j:
            correlated, _, corrcoef = correlation_test(df_features[col_i], df_features[col_j])
            # check NaN
            if corrcoef == corrcoef:
                # use symmetry of correlation matrix
                correlation_mat[i, j] = corrcoef
                correlation_mat[j, i] = corrcoef
            else:
                print("i: %s, j: %s" %(col_i, col_j))
                #correlated, _, corrcoef = correlation_test(df_features.dropna(subset=['col_i,'])[col_i], df_features.dropna()[col_j])
            if not correlated:
                correlation_mat[i, j] = np.nan
                correlation_mat[j, i] = np.nan"""

In [None]:
#sn.heatmap(correlation_mat, vmin=-1, annot=True)