In [None]:
%reload_ext autoreload
%autoreload 2

from pathlib import Path

import polars as pl
from polars import DataFrame
import pandas as pd

from scipy.stats import norm
import numpy as np
import scipy as sc


# Folder that holds the input data files read by this notebook.
# NOTE(review): absolute, machine-specific path — consider making it
# configurable so the notebook can run on another machine.
DATA_DIR = Path("/Volumes/secure/data/early_markers/cribsy")

# def fit_norm(df: DataFrame) -> DataFrame:
#    # Extract the "Value" column and fit the normal distribution
#    values = df["Value"]
#    params = norm.fit(values)
#    # You might want to return something meaningful here, like the fitted parameters
#    df_stats = pl.DataFrame({"mean": params[0], "std": params[1]})
#    return df.hstack(df_stats)


# Load the merged feature table and give every row a combined
# "<part>_<feature_name>" identifier in the `feature` column.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling —
# only load files from a trusted source.
features_pd = pd.read_pickle(DATA_DIR / "features_merged.pkl")
df_features = pl.from_pandas(features_pd).with_columns(
    (pl.col("part") + "_" + pl.col("feature_name")).alias("feature")
)

# Slim view of the raw feature values, joined with the surprise scores later on.
df_features_x = df_features.select("infant", "feature", "Value", "category").sort("feature")

# Reference group: only rows with category == 0.
df_filtered = df_features.filter(pl.col("category") == 0).sort("feature")

# Mean / SD / variance (all with ddof=0) of every feature per age bracket,
# computed on the reference group only.
value_col = pl.col("Value")
df_ref_stats = df_filtered.group_by(["feature", "age_bracket"]).agg(
    mean_ref=value_col.mean(),
    sd_ref=value_col.std(ddof=0),
    var_ref=value_col.var(ddof=0),
)

# Attach the reference statistics to every row and evaluate, per value,
#     -(0.5 * ln(2 * pi * var_ref) + (Value - mean_ref)^2 / (2 * var_ref))
# i.e. the Gaussian log-density of the value under its reference distribution.
# NOTE(review): as written this is log p(Value), not -log p(Value), despite the
# column name `minus_log_pfeature` — confirm the intended sign.
# The inner join drops rows whose (feature, age_bracket) has no reference stats.
var_ref = pl.col("var_ref")
squared_dev = (pl.col("Value") - pl.col("mean_ref")) ** 2
log_norm_const = 0.5 * (2 * np.pi * var_ref).log()

df_features = df_features.join(
    df_ref_stats, on=["feature", "age_bracket"], how="inner"
).with_columns(
    minus_log_pfeature=-1 * (log_norm_const + squared_dev / (2 * var_ref))
)
    

In [None]:
# Preview the first ten rows in a stable, readable order.
df_features.sort(["feature", "category", "age_bracket", "infant"]).head(10)


In [None]:
# The 38 movement features that enter the Bayesian Surprise calculations.
feature_list = [
    "Ankle_medianx", "Wrist_medianx", "Ankle_mediany", "Wrist_mediany",
    "Knee_mean_angle", "Elbow_mean_angle",
    "Ankle_IQRx", "Wrist_IQRx", "Ankle_IQRy", "Wrist_IQRy",
    "Knee_stdev_angle", "Elbow_stdev_angle",
    "Ankle_medianvelx", "Wrist_medianvelx", "Ankle_medianvely", "Wrist_medianvely",
    "Knee_median_vel_angle", "Elbow_median_vel_angle",
    "Ankle_IQRvelx", "Wrist_IQRvelx", "Ankle_IQRvely", "Wrist_IQRvely",
    "Knee_IQR_vel_angle", "Elbow_IQR_vel_angle",
    "Ankle_IQRaccx", "Wrist_IQRaccx", "Ankle_IQRaccy", "Wrist_IQRaccy",
    "Knee_IQR_acc_angle", "Elbow_IQR_acc_angle",
    "Ankle_meanent", "Wrist_meanent", "Knee_entropy_angle", "Elbow_entropy_angle",
    "Ankle_lrCorr_x", "Wrist_lrCorr_x", "Knee_lrCorr_angle", "Elbow_lrCorr_angle",
]

# Keep only the rows whose feature is one of the listed ones.
is_listed_feature = pl.col("feature").is_in(feature_list)
df_features = df_features.filter(is_listed_feature)
df_features.sort(["feature", "category", "age_bracket", "infant"]).head(10)

In [None]:

# Per-infant score: sum of `minus_log_pfeature` over all of the infant's features.
surprise_keys = ["infant", "age_in_weeks", "risk", "age_bracket", "category"]
df_surprise = df_features.group_by(surprise_keys).agg(pl.col("minus_log_pfeature").sum())

# Mean and SD of the summed score within the reference group (category == 0).
ref_scores = df_surprise.filter(pl.col("category") == 0).select("minus_log_pfeature")
mean_neg_log_p = ref_scores.mean().item()
sd_neg_log_p = ref_scores.std().item()

# z: the summed score standardised against the reference group.
# p: two-sided tail probability 2 * sf(|z|) under a standard normal, rounded to
#    three decimals. The smaller p is, the further the infant's score lies from
#    the reference population, in either direction.
# Idea (not implemented here): use p < 0.10 as the cut-off for flagging a
# test subject as at risk.
z_expr = (pl.col("minus_log_pfeature") - mean_neg_log_p) / sd_neg_log_p
p_expr = (
    pl.col("z").abs().map_elements(lambda z_abs: norm.sf(z_abs), return_dtype=pl.Float64) * 2
).round(3)

df_surprise = (
    df_surprise
    .with_columns(z=z_expr)
    .with_columns(p=p_expr)
    .sort(["category", "age_bracket", "infant"])
)
df_surprise.head(10)

In [None]:

# Drop the test infant: every row whose infant id contains "clin_100_".
is_test_infant = pl.col("infant").str.contains("clin_100_")
df_surprise = df_surprise.filter(~is_test_infant)

# df_surprise_gt keeps a copy of the scores computed with ALL 38 features.
df_surprise_gt = df_surprise.clone()
print(f"{df_surprise_gt.height} rows")
df_surprise_gt


In [None]:
# Combine the raw feature values with each infant's surprise scores
# (minus_log_pfeature, z, p).
# df_features_x was built before df_features was restricted to feature_list, so
# it still contains every feature in the input file. Restrict it here to the 38
# listed features; otherwise unlisted features end up in the RFE training matrix.
df_result = (
    df_features_x
    .filter(pl.col("feature").is_in(feature_list))
    .join(df_surprise, on="infant", how="inner")
)
df_result.sort(["infant", "feature"])

In [None]:
# Generate the training samples for RFE feature elimination.
# X: one row per infant and one column per feature (wide format), sorted by infant.
df_X = df_result.pivot(index="infant", on="feature", values="Value").sort("infant")

# y: one z value per infant, sorted by infant so the rows line up with df_X.
# The "z" column is selected BEFORE converting to numpy: calling to_numpy() on
# the mixed (string, float) frame and slicing column 1 yields an object-dtype
# array instead of a float array.
y = (
    df_result.group_by("infant")
    .agg(pl.col("z").first())
    .sort("infant")
    .get_column("z")
    .to_numpy()
)

In [None]:
# Feature matrix handed to RFE: df_X without the "infant" identifier column.
df_X.drop("infant")

In [None]:
# Regression target for RFE: one z value per infant, ordered by infant.
y

In [None]:
# RECURSIVE FEATURE ELIMINATION
# Reduce the feature columns of df_X to the FEATURE_SIZE most useful ones for
# predicting y with a random-forest regressor.
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor


FEATURE_SIZE = 10
# Random forests are stochastic; a fixed seed makes the selected feature set
# reproducible across kernel restarts and re-runs.
RANDOM_SEED = 42

model = RandomForestRegressor(random_state=RANDOM_SEED)
selector = RFE(model, n_features_to_select=FEATURE_SIZE)
# The "infant" identifier is not a feature, so it is dropped before fitting.
X_selected = selector.fit_transform(df_X.drop("infant"), y)

In [None]:
# Collect the names of the columns that RFE kept (selector.support_ is the
# per-column keep/drop mask, in the same order as the fitted columns).
candidate_columns = df_X.drop("infant").columns
selected_features = []
for column_name, is_kept in zip(candidate_columns, selector.support_):
    if is_kept:
        selected_features.append(column_name)
np.array(selected_features)

# Feature names recorded from a previous run.
# NOTE(review): the Hip_* entries below are not part of feature_list.
# ['Ankle_IQRaccx',
#  'Ankle_IQRaccy',
#  'Ankle_IQRvelx',
#  'Ankle_IQRvely',
#  'Ankle_IQRx',
#  'Elbow_IQR_acc_angle',
#  'Hip_entropy_angle',
#  'Hip_stdev_angle',
#  'Knee_IQR_acc_angle',
#  'Wrist_IQRaccx']