In [None]:
from pathlib import Path
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from matplotlib.backends.backend_pdf import PdfPages

In [None]:
# Locations of the three snapshots of node attributes (CSV files).
ANALYSIS_DIR = Path("analysis")
eda_path_1 = ANALYSIS_DIR / "1_eda.csv"
eda_path_2 = ANALYSIS_DIR / "2_eda.csv"
eda_path_3 = ANALYSIS_DIR / "3_eda.csv"

In [None]:
# Read every snapshot CSV, using its first column as the row index.
eda_1, eda_2, eda_3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (eda_path_1, eda_path_2, eda_path_3)
)

# Preview the first rows of the third snapshot.
eda_3.head()

In [None]:
# Optional step: restrict all three snapshots to a single gender.
# Keeping the selection on the last entry of `genders` leaves the frames untouched.
# NOTE: the filter overwrites eda_1..eda_3, so switching to another gender
# afterwards requires re-running the loading cell first.
genders = ("female", "male", "all_genders")
selected_gender = genders[2]  # <- select proper gender!!!

if selected_gender != genders[-1]:
    eda_1, eda_2, eda_3 = (
        frame.loc[frame["metric_Gender"] == selected_gender]
        for frame in (eda_1, eda_2, eda_3)
    )

In [None]:
# Show the column labels, the number of columns and the number of rows of the first snapshot.
eda_1.columns, eda_1.shape[1], eda_1.shape[0]

In [None]:
def plot_time_curve(snapshots: List[pd.DataFrame], column: str, pdf_writer: PdfPages) -> None:
    """Append one page to ``pdf_writer`` with a box plot of ``column`` per snapshot.

    One box is drawn for every frame in ``snapshots``; boxes are labelled
    ``snapshot: 1``, ``snapshot: 2``, ... in the order the frames are given.
    Missing values are dropped from each snapshot before plotting.

    Args:
        snapshots: Data frames to compare. The caller is expected to pass them
            ordered by time ascending, since the position becomes the x-axis label.
        column: Name of the column to plot; it must be present in every snapshot
            and hold values convertible to ``float``.
        pdf_writer: Open ``PdfPages`` object the figure is saved into.

    Raises:
        KeyError: If ``column`` is missing from one of the snapshots.
        ValueError, TypeError: If the column values cannot be converted to float.
    """
    series: Dict[str, np.ndarray] = {}
    for idx, snapshot in enumerate(snapshots, 1):  # snapshots are ordered by time ascending!!!
        # dropna() + explicit float conversion also copes with nullable / object-backed
        # numeric columns, on which np.isnan would raise a TypeError.
        series[f"snapshot: {idx}"] = snapshot[column].dropna().to_numpy(dtype=float)

    fig, ax = plt.subplots()
    try:
        # meanline only has an effect together with showmeans=True; without it
        # matplotlib does not draw the mean at all.
        ax.boxplot(list(series.values()), patch_artist=False, showmeans=True, meanline=True)
        ax.set_xticklabels(list(series.keys()))
        fig.suptitle(f"Change of {column} during examination")
        fig.savefig(pdf_writer, format="pdf")
    finally:
        # Always release the figure, even if plotting or saving fails, so that
        # looping over many columns does not accumulate open figures.
        plt.close(fig)
    

In [None]:
# Write one box-plot page per numeric column into a single PDF named after the selected gender.
with PdfPages(f"analysis/time_plots_{selected_gender}.pdf") as pdf:
    for col in eda_1.columns:
        # Column selection is based on the dtypes of the first snapshot only.
        if not pd.api.types.is_numeric_dtype(eda_1[col]):
            continue
        # plot_time_curve returns None, so its result is intentionally not stored.
        plot_time_curve([eda_1, eda_2, eda_3], col, pdf)