# Sat Density Exploration

In this Jupyter notebook we analyze the given satellite (sat) density values. Outlier detection and regularities (e.g., sinusoidal patterns) are of particular interest, as is the general magnitude of the values. This knowledge can also be taken into account when looking at the other datasets, since the `SAT_DENSITY` files contain a `file id` by which the datasets can be related.

In [None]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib notebook

In [None]:
# Locations of the aggregated density tables (Feather files), one per source.
aggregated_dir = Path('../data/preprocessed/aggregated')
champ_path = aggregated_dir.joinpath('champ_-density.feather')
gr_of1_path = aggregated_dir.joinpath('gr-of1-density.feather')
grace1_path = aggregated_dir.joinpath('grace1-density.feather')
grace2_path = aggregated_dir.joinpath('grace2-density.feather')
swarma_path = aggregated_dir.joinpath('swarma-density.feather')

In [None]:
# Load the aggregated gr-of1 table and plot a 100-bin histogram over all of
# its values; `stack()` flattens every column into a single Series first.
df = pd.read_feather(gr_of1_path)
df.stack().hist(bins=100)
plt.show()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the current `df`.
df.info()

In [None]:
# Keep only the rows whose orbit mean density is strictly positive (rows with
# zero, negative or NaN density are dropped). The trailing expression renders
# the summary statistics as the cell output.
filtered_df: pd.DataFrame = df.loc[df['Orbit Mean Density (kg/m^3)'] > 0.0]
filtered_df.describe()

In [None]:
# Flatten all remaining values of the filtered frame into one Series and plot
# their distribution with a finer binning (250 bins) than the unfiltered plot.
# NOTE: `stack()` on this frame yields a Series, hence the annotation.
stacked_df: pd.Series = filtered_df.stack()
stacked_df.hist(bins=250)
plt.show()

## Combined

After looking at all sources individually, we now analyze them in combination.

In [None]:
# Sources to compare; `labels` must stay in the same order as `paths` because
# the histogram pairs them up by position.
paths = [champ_path, gr_of1_path, grace1_path, grace2_path, swarma_path]
labels = ["champ", "gr-of1", "grace1", "grace2", "swarma"]

# Read every table, keep the rows with a strictly positive orbit mean density
# and flatten each table into a single Series of values.
dfs = [
    frame[frame['Orbit Mean Density (kg/m^3)'] > 0.0].stack()
    for frame in map(pd.read_feather, paths)
]

# One filled step histogram per source, all sharing the same 500 bins.
fig, ax = plt.subplots()
ax.hist(dfs, bins=500, histtype="step", fill=True, label=labels)
fig.legend(loc="upper right")
fig.show()

In [None]:
# Same combined histogram of `dfs`, but with a logarithmic count axis
# (`log=True`) so that sparsely populated bins remain visible.
fig, ax = plt.subplots()

ax.hist(dfs, bins=500, histtype="step", fill=True, log=True, label=labels)
fig.legend()
fig.show()

## Individual analysis

Now we plot some files of the dataset individually to get a feel for their regularities and overall shape.

In [None]:
import os

# Directory holding the individual SAT_DENSITY files, which the cells below
# read one by one with `pd.read_parquet`.
data_path = Path('../data/preprocessed/SAT_DENSITY')

In [None]:
# `os.listdir` returns entries in arbitrary order, so sort them to make both
# the sample below and the fixed index 12 reproducible between runs.
files_list = sorted(os.listdir(data_path))

# Take roughly every 100th part of the listing (about 100 files). The step is
# clamped to 1 so that directories with fewer than 100 files do not produce a
# zero step, which `range`/slicing would reject.
sample_step = max(1, len(files_list) // 100)
dfs = [
    frame[frame['Orbit Mean Density (kg/m^3)'] > 0.0]
    for frame in (pd.read_parquet(data_path / file_name) for file_name in files_list[::sample_step])
]

# A single file for a closer look, restricted to strictly positive densities.
df = pd.read_parquet(data_path / files_list[12])
df = df[df['Orbit Mean Density (kg/m^3)'] > 0.0]

In [None]:
# Line plot of the columns of the current `df` against its index.
df.plot()
plt.show()

In [None]:
# Overlay the orbit mean density of every sampled file in one figure.
# The loop variable is deliberately not called `df`: in a notebook it would
# overwrite the global `df` that other cells plot.
fig, ax = plt.subplots()
for sample_df in dfs:
    # Plot against the row position so all files share the same x axis,
    # whatever index labels the frames carry.
    ax.plot(range(sample_df.shape[0]), sample_df['Orbit Mean Density (kg/m^3)'])
fig.show()

In [None]:
# as confidence intervals for better visibility
import numpy as np
from scipy.stats import norm

# mapping functions
def _prepare_df(_df: pd.DataFrame) -> np.ndarray:
    """Return the frame's values as a flat array with negative densities interpolated.

    Rows whose ``'Orbit Mean Density (kg/m^3)'`` is below zero are treated as
    missing: they are set to NaN and then filled with pandas' default
    interpolation in both directions, so leading and trailing gaps are filled
    as well. Values that cannot be filled (e.g. a column with no valid value
    at all) remain NaN.

    The input frame is left untouched; all work happens on a copy.

    Args:
        _df: Frame containing an ``'Orbit Mean Density (kg/m^3)'`` column.

    Returns:
        One-dimensional array with all values of the cleaned frame
        (``to_numpy()`` flattened in row-major order).
    """
    column = 'Orbit Mean Density (kg/m^3)'
    # Copy first so the caller's DataFrame is not modified as a side effect.
    cleaned = _df.copy()
    cleaned.loc[cleaned[column] < 0.0, column] = np.nan
    cleaned = cleaned.interpolate(limit_direction='both')
    return cleaned.to_numpy().reshape(-1)

def _ensure_minimum_points(_arr: np.ndarray, min_points: int) -> np.ndarray:
    """Pad ``_arr`` at the end with its last value until it has ``min_points`` entries.

    Arrays that already have at least ``min_points`` entries are returned
    unchanged (the very same object). When padding is applied, the padded
    result is additionally passed through ``np.nan_to_num`` with the padding
    value as the NaN replacement.

    Args:
        _arr: One-dimensional input array; must not be empty when padding is needed.
        min_points: Required minimum length of the result.

    Returns:
        ``_arr`` itself, or a new array of length ``min_points``.
    """
    missing = min_points - _arr.shape[0]
    if missing <= 0:
        return _arr

    # Repeat the final value for every missing position.
    tail = np.full(missing, _arr[-1], dtype=_arr.dtype)
    padded = np.concatenate((_arr, tail))
    return np.nan_to_num(padded, copy=False, nan=tail[0])

def confidence_intervals(arr: np.ndarray, confidence_level: float = 0.6) -> tuple[np.ndarray, np.ndarray]:
    """Return per-column lower and upper bounds of the central percentile band.

    For every column of ``arr`` the band contains the central
    ``confidence_level`` fraction of the values in that column: the lower bound
    is the ``(1 - confidence_level) / 2`` quantile and the upper bound is the
    ``1 - (1 - confidence_level) / 2`` quantile, both taken over the rows
    (axis 0) with NumPy's default linear interpolation.

    Args:
        arr: Two-dimensional array of shape ``(n_samples, n_points)`` with at
            least one row.
        confidence_level: Fraction of samples the band should cover, within
            ``[0, 1]``.

    Returns:
        ``(lower_bounds, upper_bounds)``, each a one-dimensional array of
        length ``arr.shape[1]``. The result is floating point even for
        integer input.

    Raises:
        ValueError: If ``arr`` is not two-dimensional, has no rows, or
            ``confidence_level`` lies outside ``[0, 1]``.
    """
    if arr.ndim != 2:
        raise ValueError(f"arr must be two-dimensional, got {arr.ndim} dimension(s)")
    if arr.shape[0] == 0:
        raise ValueError("arr must contain at least one row")
    if not 0.0 <= confidence_level <= 1.0:
        raise ValueError(f"confidence_level must be within [0, 1], got {confidence_level}")

    # Split the excluded mass evenly between the two tails so the band is
    # centred and never collapses or inverts for small sample counts.
    tail = (1.0 - confidence_level) / 2.0
    _lower_bounds, _upper_bounds = np.quantile(arr, [tail, 1.0 - tail], axis=0)
    return _lower_bounds, _upper_bounds


In [None]:
fig, ax = plt.subplots()

# Read and clean every file, yielding one flat array per file.
data_points = [_prepare_df(pd.read_parquet(data_path / file_name)) for file_name in files_list]

# Pad all arrays to the length of the longest one, then drop any array that
# still contains NaN so the stacked matrix is rectangular and NaN-free.
max_points = max(points.shape[0] for points in data_points)
data_points = [
    padded
    for padded in (_ensure_minimum_points(points, max_points) for points in data_points)
    if not np.isnan(padded).any()
]
y = np.vstack(data_points)
x = np.arange(y.shape[1])

# Mean curve over all files plus the band returned by `confidence_intervals`,
# drawn on a logarithmic y axis.
lower_bounds, upper_bounds = confidence_intervals(y, confidence_level=0.95)
ax.plot(x, np.mean(y, axis=0))
ax.fill_between(x, lower_bounds, upper_bounds, alpha=0.5)
ax.set_yscale("log")
fig.show()

In [None]:
%matplotlib notebook