# 1.3-agifford-FindFrequencyPeaksTraining
This notebook cycles through the training dataset to identify peak frequencies by activity label that cross a pre-determined threshold. We will adjust the threshold manually to ensure that each label only contributes ~1-2 frequencies to the feature set (not including 0 Hz).

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
from scipy.fftpack import fft, fftshift
from pathlib import Path

In [None]:
def _make_single_annot_frame(df, shift):
    """Return the rows of ``df`` whose ``label`` differs from a neighbouring row.

    ``shift=1`` compares every row with the previous one, ``shift=-1`` with
    the next one. Rows whose label is missing are dropped afterwards, and the
    original index is preserved as an ``index`` column by ``reset_index``.
    """
    annot_df = df[df.label != df.label.shift(shift)]
    annot_df = annot_df.dropna(subset=["label"]).reset_index()
    return annot_df


def make_annot_dataframe(df, t_start=None, t_end=None):
    """Build the boundary frames of every run of identical labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``time`` and ``label`` columns.
    t_start, t_end : optional
        Inclusive time bounds. ``None`` means "use the minimum / maximum of
        ``df.time``".

    Returns
    -------
    tuple of pandas.DataFrame
        ``(act_starts_df, act_ends_df)``: the first and the last row of each
        run of identical, non-missing labels inside the time window.
    """
    # Compare against None explicitly: 0 is a valid time bound, and the
    # previous ``t_start or ...`` idiom silently replaced it with the default.
    if t_start is None:
        t_start = df.time.min()
    if t_end is None:
        t_end = df.time.max()

    df = df[(df.time >= t_start) & (df.time <= t_end)].copy()

    act_starts_df, act_ends_df = (
        _make_single_annot_frame(df, shift) for shift in (1, -1)
    )
    return act_starts_df, act_ends_df

def local_fmax_above_thresh(freq, x_w, threshold):
    """Return the positive frequencies at which ``x_w`` has a local maximum above ``threshold``.

    A local maximum is an interior sample where the sign of the first
    difference drops (rising -> falling). Only maxima whose value exceeds
    ``threshold`` and whose frequency is strictly positive are returned.
    """
    slope_sign = np.sign(np.diff(x_w))
    # +1 maps positions in the second difference back onto sample indices.
    peak_ix = np.nonzero(np.diff(slope_sign) < 0)[0] + 1

    peak_vals = x_w[peak_ix]
    peak_freqs = freq[peak_ix]

    keep = (peak_vals > threshold) & (peak_freqs > 0)
    return peak_freqs[keep]

def calculate_frequencies(n_fft, fs=50):
    """Return the centred frequency axis (in Hz) of an ``n_fft``-point FFT.

    The axis matches the bin order produced by ``fftshift``: bins are spaced
    ``fs / n_fft`` apart and the DC bin is exactly 0 Hz. The previous
    ``linspace(-1, 1, n)`` axis spaced the bins ``fs / (n - 1)`` apart and,
    for even ``n``, gave the DC bin a small positive frequency.

    Parameters
    ----------
    n_fft : int
        Number of FFT points (length of the spectrum).
    fs : float, default 50
        Sampling rate in Hz.

    Returns
    -------
    numpy.ndarray
        ``n_fft`` frequencies in ascending order; empty when ``n_fft`` is 0.
    """
    n_points = int(n_fft)
    if n_points == 0:
        # fftfreq divides by n; keep the historical "empty in, empty out".
        return np.empty(0)
    return np.fft.fftshift(np.fft.fftfreq(n_points, d=1.0 / fs))

def calculate_normed_spectrum(df, col, fs=50):
    """Return the Hann-windowed magnitude spectrum of ``df[col]`` in dB.

    The spectrum is normalised by its largest magnitude, so the strongest bin
    is at 0 dB, and it is reordered with ``fftshift`` so the zero-frequency
    bin sits in the middle.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the signal; every row is used as one sample.
    col : str
        Name of the column to transform.
    fs : float, default 50
        Accepted for signature compatibility; not used by the computation.

    Returns
    -------
    numpy.ndarray
        Array of ``len(df)`` values in dB.
    """
    n_fft = df.shape[0]
    # ``signal.hann`` is no longer available at the top level of scipy.signal
    # in recent SciPy releases; the windows submodule is the supported home.
    window = signal.windows.hann(n_fft)
    X_w = fft(window * df[col].values)
    X_w_norm = 20 * np.log10(np.abs(fftshift(X_w / abs(X_w).max())))
    return X_w_norm

def _round(local_fmax, round_level):
    """Round ``local_fmax`` to the nearest multiple of ``1 / round_level``."""
    scaled = local_fmax * round_level
    nearest = np.round(scaled)
    return nearest / round_level


def find_single_file_peaks(df, thresholds, round_level):
    """Collect the rounded spectral peak frequencies of every labelled segment in ``df``.

    Each run of identical labels (as delimited by ``make_annot_dataframe``)
    is transformed with ``calculate_normed_spectrum`` for the six sensor
    columns. For every threshold, the local maxima above that threshold are
    rounded with ``_round`` and reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``time``, ``label``, ``label_group``, ``file_id``,
        ``subject_id``, ``data_id`` plus the six ``accel_*`` / ``gyro_*``
        columns.
    thresholds : iterable
        Threshold values passed to ``local_fmax_above_thresh``.
    round_level : number
        Rounding granularity passed to ``_round``.

    Returns
    -------
    pandas.DataFrame
        One row per (threshold, segment, measure, peak frequency) with the
        columns ``file_id``, ``subject_id``, ``data_id``, ``threshold``,
        ``label``, ``label_group``, ``measure`` and ``peak_fs``. An empty
        frame with those columns is returned when no peak is found.
    """
    out_cols = [
        "file_id", "subject_id", "data_id", "threshold",
        "label", "label_group", "measure", "peak_fs",
    ]
    data_cols = ["accel_x", "accel_y", "accel_z", "gyro_x", "gyro_y", "gyro_z"]

    activity_starts_df, activity_ends_df = make_annot_dataframe(df)
    df_ = df.dropna(subset=["label"])

    # The spectra depend only on the segment, never on the threshold, so they
    # are computed once per segment instead of once per (threshold, segment).
    segments = []
    for r_ix in range(activity_starts_df.shape[0]):
        t_first = activity_starts_df.loc[r_ix, "time"]
        t_last = activity_ends_df.loc[r_ix, "time"]
        df_snip = df_[(df_.time >= t_first) & (df_.time <= t_last)].reset_index()

        spectra = [calculate_normed_spectrum(df_snip, col) for col in data_cols]
        freq = calculate_frequencies(len(spectra[0]))
        meta = {
            "file_id": df_snip.loc[0, "file_id"],
            "subject_id": df_snip.loc[0, "subject_id"],
            "data_id": df_snip.loc[0, "data_id"],
            "label": activity_starts_df.loc[r_ix, "label"],
            "label_group": activity_starts_df.loc[r_ix, "label_group"],
        }
        segments.append((meta, freq, spectra))

    # Gather the per-segment frames and concatenate once at the end; growing
    # a DataFrame with concat inside the loop copies all earlier rows each time.
    frames = []
    for thresh in thresholds:
        for meta, freq, spectra in segments:
            peak_freqs = []
            peak_cols = []
            for col, X_w in zip(data_cols, spectra):
                local_fmax = local_fmax_above_thresh(freq, X_w, thresh)
                rounded_fmax = _round(local_fmax, round_level)
                peak_freqs.extend(rounded_fmax)
                peak_cols.extend([col] * len(rounded_fmax))

            if not peak_freqs:
                continue  # nothing crossed the threshold in this segment

            frames.append(pd.DataFrame({
                "file_id": meta["file_id"],
                "subject_id": meta["subject_id"],
                "data_id": meta["data_id"],
                "threshold": thresh,
                "label": meta["label"],
                "label_group": meta["label_group"],
                "measure": peak_cols,
                "peak_fs": peak_freqs,
            }))

    if not frames:
        return pd.DataFrame(columns=out_cols)
    return pd.concat(frames, ignore_index=True)

In [None]:
# Read the train/validation split and keep only the list of training files.
split_path = Path("../../src/data/train_val_files.json")
train_val_files = json.loads(split_path.read_text(encoding="utf-8"))
train_files = train_val_files["train"]

In [None]:
thresholds = [-5, -10, -15, -20]
round_level = 1
pks_columns = ["file_id", "subject_id", "data_id", "threshold", "label", "label_group", "measure", "peak_fs"]

# Collect one peak frame per file and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously collected row.
per_file_pks = []
for ix, file in enumerate(train_files):
    print(f"analyzing file {ix+1} of {len(train_files)}", end="\r")
    df = pd.read_parquet(file, engine="fastparquet")
    per_file_pks.append(find_single_file_peaks(df, thresholds, round_level))

# Empty frames carry no rows and only degrade the column dtypes to object.
non_empty_pks = [pks for pks in per_file_pks if not pks.empty]
if non_empty_pks:
    all_files_pks_df = pd.concat(non_empty_pks, ignore_index=True).reindex(columns=pks_columns)
else:
    # No files, or no peaks at all: keep the expected schema.
    all_files_pks_df = pd.DataFrame(columns=pks_columns)
    

In [None]:
# Number of distinct peak frequencies per threshold / measure / label.
peak_groups = all_files_pks_df.groupby(["threshold", "measure", "label"], as_index=False)
peak_groups.agg(count=("peak_fs", "nunique"))

So even with rounding to whole frequencies and the most restrictive frequency threshold, some labels have many "significant peaks".

In [None]:
# Per threshold / measure / label: how many distinct peaks and the lowest one,
# ordered by measure and then by descending number of distinct peaks.
peak_summary = all_files_pks_df.groupby(
    ["threshold", "measure", "label"], as_index=False
).agg(
    UniquePeaks=("peak_fs", "nunique"),
    MinFreq=("peak_fs", "min"),
)
npeaks_by_thresh_label = peak_summary.sort_values(
    by=["measure", "UniquePeaks"],
    ascending=[True, False],
)

In [None]:
# Rows at the -5 threshold that still have more than three distinct peaks.
at_minus5 = npeaks_by_thresh_label["threshold"] == -5
many_peaks = npeaks_by_thresh_label["UniquePeaks"] > 3
npeaks_by_thresh_label[at_minus5 & many_peaks]

Given this, I think I will keep the -5 threshold and "fix" the labels with >3 peak frequencies to select "ideal" (determined arbitrarily at this point) subsets. I will try to find the 2 peaks (besides 0 Hz) that are most differentiating to each of the labels compared to the other labels (for each measure).

In [None]:
data_cols = ["accel_x", "accel_y", "accel_z", "gyro_x", "gyro_y", "gyro_z"]

# For every measure at the -5 threshold, print how many labels have
# >= 3, exactly 2 and exactly 1 distinct peak frequencies (in that order).
at_selected_thresh = npeaks_by_thresh_label["threshold"] == -5
n_unique = npeaks_by_thresh_label["UniquePeaks"]
peak_count_masks = [n_unique >= 3, n_unique == 2, n_unique == 1]

for col in data_cols:
    print(col)
    for_measure = npeaks_by_thresh_label["measure"] == col
    for count_mask in peak_count_masks:
        matching = npeaks_by_thresh_label[at_selected_thresh & count_mask & for_measure]
        print(matching.shape[0])

Of the labels that have only one peak, is it the case that it is always 0 Hz? -> For all but `gyro_z`, this is the case. For `gyro_z`, at least one label with only a single peak frequency has its peak at 1 Hz rather than 0 Hz.

In [None]:
# Restrict the summary to the -5 threshold, then list, per measure, the
# frequencies found for labels that have exactly one distinct peak.
select_thresh_pks = npeaks_by_thresh_label[npeaks_by_thresh_label["threshold"] == -5]
single_peak_rows = select_thresh_pks[select_thresh_pks["UniquePeaks"] == 1]
for col in data_cols:
    print(col)
    one_peak_labels = single_peak_rows[single_peak_rows["measure"] == col]
    print(one_peak_labels.MinFreq.unique())

In [None]:
# Inspect the column names/order of the -5 threshold summary.
select_thresh_pks.columns

In [None]:
# Attach the per-(threshold, measure, label) UniquePeaks count to every peak
# row; dropping the column first keeps the cell re-runnable.
all_files_pks_df = all_files_pks_df.drop(columns=["UniquePeaks"], errors="ignore")
peak_counts = select_thresh_pks[["threshold", "measure", "label", "UniquePeaks"]]
thresh_files_pks_df = all_files_pks_df.merge(
    peak_counts,
    on=["threshold", "measure", "label"],
)
# Keep labels with >= 3 distinct peaks; 0 Hz is kept regardless, so it is
# excluded from the analysis.
has_many_peaks = thresh_files_pks_df.UniquePeaks >= 3
is_nonzero = thresh_files_pks_df.peak_fs > 0
select_labels_df = thresh_files_pks_df[has_many_peaks & is_nonzero]

In [None]:
# Preview the non-zero peak rows of the labels with >= 3 distinct peaks.
select_labels_df.head()

In [None]:
# Set of non-zero peak frequencies per label x measure for the labels with
# >= 3 distinct peaks.
frqs_by_activity = (
    select_labels_df
    .groupby(["label", "measure"], as_index=False)
    .agg(Peaks=("peak_fs", set))
)

# Labels with fewer than 3 distinct peaks. other_labels_df itself keeps its
# 0 Hz rows; only the per-label sets below leave 0 Hz out.
other_labels_df = thresh_files_pks_df[thresh_files_pks_df.UniquePeaks < 3]
nonzero_other_df = other_labels_df[other_labels_df.peak_fs > 0]
frqs_by_other = (
    nonzero_other_df
    .groupby(["label", "measure"], as_index=False)
    .agg(Peaks=("peak_fs", set))
)

By looking at the labels x measures with only 1 or 2 peak frequencies (and after removing 0 Hz from all), we see that the overwhelming majority of `labels x measures` include 1 Hz (98%) and consist ONLY of 1 Hz (97%). However, there are some `labels x measures` whose other peak frequency is something other than 1 Hz. This tells me that I should not use 1 Hz for any of the labels in `frqs_by_activity` that include it (unless there are no other options). It also tells me I can generally ignore these other labels when comparing the unique frequencies for labels in `frqs_by_activity` to determine the most discriminating frequencies among the labels.

In [None]:
# Share of label x measure rows whose peak set contains 1 Hz.
contains_1hz = frqs_by_other["Peaks"].apply(lambda peaks: 1 in peaks)
contains_1hz.sum() / frqs_by_other.shape[0]

In [None]:
# Share of label x measure rows whose peak set is exactly {1 Hz}.
only_1hz = frqs_by_other["Peaks"].apply(lambda peaks: peaks == {1})
only_1hz.sum() / frqs_by_other.shape[0]

In [None]:
# Distinct values of the largest peak frequency in each label x measure set.
largest_peak = frqs_by_other["Peaks"].apply(max)
largest_peak.unique()

Here, for each `label`, I am searching for how often each `peak_fs` is differentiating among the other labels. For simplicity, I'm only comparing the peak frequencies within a measurement type (e.g., comparing only "accel_x" peaks for discriminating frequencies among accel_x data across labels).

In [None]:
# For every ordered pair of distinct labels within a measure, record the peak
# frequencies the first label has and the second lacks. A frequency therefore
# appears once per "other" label it distinguishes the label from.
# Rows are gathered in a list and turned into a frame once; concatenating
# inside the nested loop copied the growing frame on every pair.
uniq_rows = []
for col in data_cols:
    print(col)
    col_df = frqs_by_activity[frqs_by_activity["measure"] == col]
    labelled_sets = list(zip(col_df["label"], col_df["Peaks"]))
    for ix, (ilabel, if_set) in enumerate(labelled_sets):
        for jx, (_, jf_set) in enumerate(labelled_sets):
            if ix == jx:
                continue
            uniq_rows.extend(
                {"label": ilabel, "measure": col, "peak_fs": freq}
                for freq in if_set - jf_set
            )

labels_uniqf = pd.DataFrame(uniq_rows, columns=["label", "measure", "peak_fs"])


Now I aggregate by `label` and `peak_fs` to identify how often each frequency for each label was differentiating, and separately count how often each frequency showed up as a peak for each label.

In [None]:
freq_keys = ["label", "measure", "peak_fs"]

# DiffCount: how many rows labels_uniqf holds for each label/measure/frequency.
best_freqs = (
    labels_uniqf
    .groupby(freq_keys, as_index=False)
    .agg(DiffCount=("peak_fs", "count"))
)
# Count: how many rows select_labels_df holds for each label/measure/frequency.
sel_labels_frq_cnt = (
    select_labels_df
    .groupby(freq_keys, as_index=False)
    .agg(Count=("peak_fs", "count"))
)

In [None]:
# Attach DiffCount to the per-label counts; dropping a stale DiffCount first
# keeps the cell re-runnable.
sel_labels_frq_cnt = (
    sel_labels_frq_cnt
    .drop(columns=["DiffCount"], errors="ignore")
    .merge(best_freqs, on=["label", "measure", "peak_fs"])
)

Finally, here I am determining how often each frequency showed up as a peak frequency across all labels.

In [None]:
# FreqCount: number of rows (one per label) sharing a measure/frequency pair.
freq_counts = (
    sel_labels_frq_cnt
    .groupby(["measure", "peak_fs"], as_index=False)
    .agg(FreqCount=("peak_fs", "count"))
)
# Drop any stale FreqCount before merging so the cell can be re-run.
sel_labels_frq_cnt = (
    sel_labels_frq_cnt
    .drop(columns=["FreqCount"], errors="ignore")
    .merge(freq_counts, on=["measure", "peak_fs"])
)

Since I will try to not use 1 Hz frequencies as much as possible, I will find all rows where `peak_fs` == 1 and where `label x measure` has more than 2 peak frequencies. Then, for these rows, I will delete 1 Hz from the options.

In [None]:
# NFreqs: number of candidate frequencies available to each label x measure.
sel_labels_nfreqs = sel_labels_frq_cnt.groupby(["label", "measure"], as_index=False).agg(NFreqs=("peak_fs", "count"))
sel_labels_frq_cnt = sel_labels_frq_cnt.drop(columns="NFreqs", errors="ignore")
sel_labels_frq_cnt = sel_labels_frq_cnt.merge(
    sel_labels_nfreqs,
    on=["label", "measure"]
)
# Remove 1 Hz only where the label x measure has more than 2 candidates.
# The previous filter, (peak_fs != 1) & (NFreqs > 2), also discarded every
# row of groups with 2 or fewer candidates, dropping those groups entirely.
is_redundant_1hz = (
    (sel_labels_frq_cnt["peak_fs"] == 1) &
    (sel_labels_frq_cnt["NFreqs"] > 2)
)
sel_labels_frq_cnt = sel_labels_frq_cnt[~is_redundant_1hz]

What I want to find now are the "best" 2 frequencies for each `label x measure`. Here, I am defining "best" as those frequencies that show up often for an individual label AND are highly differentiating among other labels. To combine these characteristics, I create a column "DiscrimFactor", which simply multiplies "DiffCount" by "Count". Then, we sort the dataframe by "DiscrimFactor" (descending) and grab the first 2 frequencies per `label x measure`. To account for potential ties in this metric, we additionally sort by "DiffCount" (descending), "Count" (descending), and finally "FreqCount" (ascending). In this way, we find:
1. first, the frequencies with the highest "DiscrimFactor"
2. next, the frequencies that are most differentiating among the other labels ("DiffCount")
3. next, the frequencies that are most common for the given label ("Count")
4. finally, the frequencies that are least common over all labels ("FreqCount") -> this last point ensures that, all other things being equal, the frequency I pick for a given label is the most likely to be different from the ones already selected in the other labels (given random chance)

In [None]:
# DiscrimFactor combines how differentiating a frequency is with how often it occurs.
sel_labels_frq_cnt["DiscrimFactor"] = sel_labels_frq_cnt["DiffCount"] * sel_labels_frq_cnt["Count"]

# Sort column -> ascending flag; the tie-breakers follow DiscrimFactor.
sort_spec = {
    "label": True,
    "measure": True,
    "DiscrimFactor": False,
    "DiffCount": False,
    "Count": False,
    "FreqCount": True,
}
sel_labels_frq_cnt = sel_labels_frq_cnt.sort_values(
    by=list(sort_spec),
    ascending=list(sort_spec.values()),
)

In [None]:
# Display the ranked candidate frequencies.
sel_labels_frq_cnt

In [None]:
# The frame is already ranked, so the first two rows of every
# label x measure group are its two best frequencies.
ranked_groups = sel_labels_frq_cnt.groupby(["label", "measure"])
select_freqs_labels = ranked_groups.head(2)

Let's make sure we didn't lose any labels:

In [None]:
labels_check = list(other_labels_df.label.unique())
labels_check.extend(select_freqs_labels.label.unique())

all_labels = thresh_files_pks_df.label.unique()

# Every label present before the selection must still be covered afterwards.
# The previous check wrapped any(...) in a list, which is always truthy, so
# the assertion could never fail.
missing_labels = set(all_labels) - set(labels_check)
assert not missing_labels, f"labels lost during frequency selection: {missing_labels}"

Given this checks out, let's combine the dataframes back and then select all of the unique frequencies.

In [None]:
# only want the non-duplicated labels-frequency combinations
all_select_freqs = other_labels_df.drop_duplicates(subset=["label", "measure", "peak_fs"])[["label", "measure", "peak_fs"]]
all_select_freqs = pd.concat([all_select_freqs, select_freqs_labels[["label", "measure", "peak_fs"]]], ignore_index=True)

All in all, there are 65 frequencies, which means there will be 65 features included in the model.

In [None]:
# Per measure: the distinct selected frequencies and how many there are.
freq_feats_meas = (
    all_select_freqs
    .groupby("measure", as_index=False)
    .agg(
        Frequencies=("peak_fs", lambda freqs: list(set(freqs))),
        NFreqs=("peak_fs", "nunique"),
    )
)
# Total number of frequency features across all measures.
print(freq_feats_meas.NFreqs.sum())
freq_feats_meas

In [None]:
# Print every measure followed by its list of selected frequencies.
for measure, freqs in zip(freq_feats_meas["measure"], freq_feats_meas["Frequencies"]):
    print(measure)
    print(freqs)

We will write them to a JSON to store for later use in further EDA and model building.

In [None]:
# Map each measure to its selected frequencies.
frequency_features = dict(
    zip(freq_feats_meas["measure"], freq_feats_meas["Frequencies"])
)

# Write the mapping only if the file does not exist yet; an existing file is
# left untouched.
freq_feat_path = Path("../../src/features/frequency_features.json")
if not freq_feat_path.exists():
    with freq_feat_path.open("w", encoding="utf-8") as outfile:
        json.dump(frequency_features, outfile)