# 1.3-agifford-FindFrequencyPeaksTraining
This notebook cycles through the training dataset to identify, for each activity label, the peak frequencies that cross a pre-determined threshold. We will adjust the threshold manually to ensure that each label contributes only ~1-2 frequencies to the feature set (not including 0 Hz).

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
from scipy.fftpack import fft, fftshift

In [None]:
def _make_single_annot_frame(df, shift):
    """Return the rows of ``df`` that lie on the edge of a run of equal labels.

    A row is kept when its ``label`` differs from the label ``shift`` rows
    away: ``shift=1`` compares with the previous row (run starts) and
    ``shift=-1`` with the next row (run ends).  Rows with a missing label are
    dropped afterwards, and the index is reset without ``drop=True``, so the
    previous index is moved into a regular column.
    """
    neighbour_label = df.label.shift(shift)
    boundary_df = df[df.label != neighbour_label]
    return boundary_df.dropna(subset="label").reset_index()


def make_annot_dataframe(df, t_start=None, t_end=None):
    """Build the start-row and end-row frames of every labelled run in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``time`` and a ``label`` column.
    t_start, t_end : optional
        Inclusive bounds on ``time``.  ``None`` (the default) means the
        smallest / largest value of ``df.time``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(act_starts_df, act_ends_df)``: the first and the last row of each
        run of identical labels inside the requested time window.
    """
    # Test against None explicitly: a bound of 0 is a valid time, and the
    # truthiness shortcut (`t_start or ...`) would silently discard it.
    if t_start is None:
        t_start = df.time.min()
    if t_end is None:
        t_end = df.time.max()

    in_window = (df.time >= t_start) & (df.time <= t_end)
    windowed_df = df[in_window].copy()

    act_starts_df = _make_single_annot_frame(windowed_df, 1)
    act_ends_df = _make_single_annot_frame(windowed_df, -1)
    return act_starts_df, act_ends_df

def local_fmax_above_thresh(freq, x_w, threshold):
    """Return the positive frequencies where ``x_w`` peaks above ``threshold``.

    Parameters
    ----------
    freq : numpy.ndarray
        Frequency of each sample of ``x_w``.
    x_w : numpy.ndarray
        Spectrum values, same length as ``freq``.
    threshold : number
        A peak is reported only when its value is strictly greater than this.

    Returns
    -------
    numpy.ndarray
        Entries of ``freq`` at the detected peaks, restricted to ``freq > 0``.
    """
    slope_sign = np.sign(np.diff(x_w))
    # A drop in the sign of the slope marks a peak; +1 converts the position
    # in the twice-differenced array back to an index into x_w.
    peak_ix = np.nonzero(np.diff(slope_sign) < 0)[0] + 1

    peak_freqs = freq[peak_ix]
    peak_values = x_w[peak_ix]

    keep = (peak_values > threshold) & (peak_freqs > 0)
    return peak_freqs[keep]

def calculate_normed_spectrum(df, fs=50):
    """Return the Hann-windowed, peak-normalised spectrum of ``df.accel_x``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``accel_x`` column; every row is used as one sample.
    fs : number, default 50
        Sampling rate used to scale the frequency axis.

    Returns
    -------
    X_w_norm : numpy.ndarray
        ``20 * log10`` of the FFT magnitude divided by its maximum, reordered
        with ``fftshift`` so the zero-frequency bin sits in the middle.
    freq : numpy.ndarray
        ``len(df)`` evenly spaced values from ``-fs/2`` to ``fs/2`` inclusive.
    """
    n_fft = df.shape[0]
    # `scipy.signal.hann` is no longer available in recent SciPy releases;
    # the window function lives in `scipy.signal.windows`.
    window = signal.windows.hann(n_fft)
    X_w = fft(window * df.accel_x.values)
    X_w_norm = 20 * np.log10(np.abs(fftshift(X_w / np.abs(X_w).max())))

    # The axis has exactly one point per FFT bin.
    # NOTE(review): linspace includes both end points, so the spacing is
    # fs / (n_fft - 1) rather than the exact bin spacing fs / n_fft. It is
    # kept as-is because the peak values used downstream were derived with
    # this axis - confirm before changing.
    freq = fs / 2 * np.linspace(-1, 1, n_fft)
    return X_w_norm, freq

def _round(local_fmax, round_level):
    return np.round(local_fmax * round_level) / round_level


def find_single_file_peaks(df, thresholds, round_level, fs=50):
    """Collect the rounded spectral peak frequencies of every labelled run.

    Each run of identical labels reported by ``make_annot_dataframe`` is one
    segment.  The normalised spectrum of the segment is computed once with
    ``calculate_normed_spectrum``; for every threshold the peaks returned by
    ``local_fmax_above_thresh`` are rounded with ``_round`` and recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns read here: ``time``, ``label``,
        ``label_group``, ``file_id``, ``subject_id``, ``data_id`` and
        ``accel_x``.
    thresholds : iterable
        Threshold values, each applied to every segment.
    round_level : number
        Passed to ``_round``.
    fs : number, default 50
        Sampling rate forwarded to ``calculate_normed_spectrum``.

    Returns
    -------
    pandas.DataFrame
        One row per (threshold, segment, peak) with the columns ``file_id``,
        ``subject_id``, ``data_id``, ``threshold``, ``label``,
        ``label_group`` and ``peak_fs``.  Rows are ordered by threshold, then
        by segment, then by peak.  The frame is empty (columns only) when no
        peak is found.
    """
    columns = ["file_id", "subject_id", "data_id", "threshold", "label", "label_group", "peak_fs"]

    activity_starts_df, activity_ends_df = make_annot_dataframe(df)
    labelled_df = df.dropna(subset="label")

    # A segment's spectrum does not depend on the threshold, so run the FFT
    # once per segment instead of once per (threshold, segment) pair.
    segments = []
    for r_ix in range(activity_starts_df.shape[0]):
        seg_start = activity_starts_df.loc[r_ix, "time"]
        seg_end = activity_ends_df.loc[r_ix, "time"]
        in_segment = (labelled_df.time >= seg_start) & (labelled_df.time <= seg_end)
        df_snip = labelled_df[in_segment].reset_index()

        X_w_norm, freq = calculate_normed_spectrum(df_snip, fs=fs)
        # Identifiers come from the first sample of the segment, the label
        # information from the row that starts the run.
        segment_info = {
            "file_id": df_snip.loc[0, "file_id"],
            "subject_id": df_snip.loc[0, "subject_id"],
            "data_id": df_snip.loc[0, "data_id"],
            "label": activity_starts_df.loc[r_ix, "label"],
            "label_group": activity_starts_df.loc[r_ix, "label_group"],
        }
        segments.append((segment_info, X_w_norm, freq))

    # Gather the per-segment frames and concatenate once at the end; growing
    # a DataFrame with pd.concat inside the loop copies all earlier rows on
    # every iteration.
    peak_frames = []
    for thresh in thresholds:
        for segment_info, X_w_norm, freq in segments:
            local_fmax = local_fmax_above_thresh(freq, X_w_norm, thresh)
            rounded_fmax = _round(local_fmax, round_level)
            if len(rounded_fmax) == 0:
                continue  # no peak above this threshold: contributes no rows

            # Scalars are broadcast to the length of `peak_fs`.
            data = dict(segment_info, threshold=thresh, peak_fs=rounded_fmax)
            peak_frames.append(pd.DataFrame(data=data, columns=columns))

    if not peak_frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(peak_frames, ignore_index=True)

In [None]:
# Read the train/validation split manifest and keep the entry stored under
# its "train" key.
with open("../../src/data/train_val_files.json", "r", encoding="utf-8") as infile:
    train_val_files = json.loads(infile.read())
train_files = train_val_files["train"]

In [None]:
# Thresholds applied to the normalised spectrum of each labelled segment.
thresholds = [-5, -10, -15, -20]
# A round level of 1 rounds the peak frequencies to whole numbers.
round_level = 1
peak_columns = ["file_id", "subject_id", "data_id", "threshold", "label", "label_group", "peak_fs"]

# Collect the per-file results and concatenate once after the loop: calling
# pd.concat on a growing frame copies every earlier row on each iteration,
# and concatenating onto an empty placeholder frame is a deprecated pattern.
per_file_pks = []
for ix, file in enumerate(train_files):
    print(f"analyzing file {ix+1} of {len(train_files)}", end="\r")
    df = pd.read_parquet(file, engine="fastparquet")
    all_pks_df = find_single_file_peaks(df, thresholds, round_level)
    if not all_pks_df.empty:
        per_file_pks.append(all_pks_df)

if per_file_pks:
    all_files_pks_df = pd.concat(per_file_pks, ignore_index=True)
else:
    # No file produced a peak (or there were no files): keep the schema.
    all_files_pks_df = pd.DataFrame(columns=peak_columns)
    

So even with rounding to whole frequencies and the most restrictive frequency threshold, some labels have many "significant peaks".

In [None]:
# Display the rows whose rounded peak frequency equals 0.
all_files_pks_df.loc[all_files_pks_df["peak_fs"] == 0]

In [None]:
# For every (threshold, label) pair: how many distinct peak frequencies were
# found and which one is the smallest; most distinct frequencies first.
peaks_by_thresh_label = all_files_pks_df.groupby(["threshold", "label"], as_index=False)
npeaks_by_thresh_label = peaks_by_thresh_label.agg(
    UniquePeaks=("peak_fs", "nunique"),
    MinFreq=("peak_fs", "min"),
)
npeaks_by_thresh_label = npeaks_by_thresh_label.sort_values(by="UniquePeaks", ascending=False)

In [None]:
# Display the labels that have more than 3 distinct peak frequencies at the
# -5 threshold.
at_selected_thresh = npeaks_by_thresh_label["threshold"] == -5
over_three_peaks = npeaks_by_thresh_label["UniquePeaks"] > 3
npeaks_by_thresh_label[at_selected_thresh & over_three_peaks]

Given this, I think I will keep the -5 threshold and "fix" the 11 labels with >3 peak frequencies to select an "ideal" (determined arbitrarily at this point) subset. I will try to find the 2 peaks (besides 0 Hz) that are most differentiating to each of the labels compared to the other labels.

In [None]:
# At the -5 threshold, print how many labels have at least three, exactly
# two and exactly one distinct peak frequency (in that order).
thresh_rows = npeaks_by_thresh_label[npeaks_by_thresh_label["threshold"] == -5]
for has_n_peaks in (
    thresh_rows["UniquePeaks"] >= 3,
    thresh_rows["UniquePeaks"] == 2,
    thresh_rows["UniquePeaks"] == 1,
):
    print(thresh_rows[has_n_peaks].shape[0])

Of the labels that have only one peak, is it the case that it is always 0 Hz? -> Yes

In [None]:
# Keep only the -5 threshold, then look at the labels with a single distinct
# peak frequency and display which frequencies those are.
is_selected_thresh = npeaks_by_thresh_label["threshold"].eq(-5)
select_thresh_pks = npeaks_by_thresh_label.loc[is_selected_thresh]
has_one_peak = select_thresh_pks["UniquePeaks"].eq(1)
one_peak_labels = select_thresh_pks.loc[has_one_peak]
one_peak_labels["MinFreq"].unique()

In [None]:
# Drop a previously merged count column so that re-running this cell does not
# produce duplicated "UniquePeaks" columns.
all_files_pks_df = all_files_pks_df.drop(columns=["UniquePeaks"], errors="ignore")

# Attach the per-(threshold, label) number of distinct peaks to every peak
# row. The columns are selected by name rather than by position so the merge
# keeps working if the aggregation gains or reorders columns. The default
# inner merge keeps only rows at the selected threshold.
peak_counts = select_thresh_pks[["threshold", "label", "UniquePeaks"]]
thresh_files_pks_df = all_files_pks_df.merge(peak_counts, on=["threshold", "label"])

has_many_peaks = thresh_files_pks_df["UniquePeaks"] >= 3
# don't need to analyze 0Hz since we're keeping it regardless
is_nonzero = thresh_files_pks_df["peak_fs"] > 0
select_labels_df = thresh_files_pks_df[has_many_peaks & is_nonzero]

In [None]:
# Preview the first five rows of the labels that still need pruning.
select_labels_df.head(5)

In [None]:
# Set of (non-zero) peak frequencies per label, for the labels with at least
# three distinct peaks.
activity_groups = select_labels_df.groupby("label", as_index=False)
frqs_by_activity = activity_groups.agg(Peaks=("peak_fs", set))

# All remaining labels. Their 0 Hz rows stay in `other_labels_df`; they are
# only left out when building the per-label frequency sets.
has_few_peaks = thresh_files_pks_df["UniquePeaks"] < 3
other_labels_df = thresh_files_pks_df.loc[has_few_peaks]
other_nonzero_df = other_labels_df.loc[other_labels_df["peak_fs"] > 0]
frqs_by_other = other_nonzero_df.groupby("label", as_index=False).agg(Peaks=("peak_fs", set))

The fact that the only non-zero peak frequency for the "other" activities is 1 Hz tells me that every 2-peak activity has its peaks at exactly 0 Hz and 1 Hz. This tells me that I should not use 1 Hz for any of the labels in `frqs_by_activity` that include it. It also tells me I can ignore these other labels when comparing the unique frequencies for labels in `frqs_by_activity` to determine the most discriminating frequencies among the labels.

In [None]:
# Display the non-zero peak frequencies of the labels with fewer than three
# distinct peaks.
frqs_by_other

Here, for each `label`, I am searching for how often each `peak_fs` is differentiating among the other labels.

In [None]:
# For every ordered pair of distinct labels (i, j), record each frequency
# that label i has and label j lacks. A frequency therefore appears once per
# other label it distinguishes label i from.
label_peak_sets = list(zip(frqs_by_activity["label"], frqs_by_activity["Peaks"]))

# Build all rows first and create the frame once; concatenating inside the
# nested loop copies the accumulated rows for every pair of labels.
uniq_rows = []
for ix, (ilabel, if_set) in enumerate(label_peak_sets):
    for jx, (_, jf_set) in enumerate(label_peak_sets):
        if ix == jx:
            continue
        uniq_rows.extend((ilabel, peak) for peak in if_set - jf_set)

labels_uniqf = pd.DataFrame(uniq_rows, columns=["label", "peak_fs"])


Now I aggregate by `label` and `peak_fs` to identify how often each frequency for each label was differentiating, and separately count how often each frequency showed up as a peak for each label.

In [None]:
label_freq_keys = ["label", "peak_fs"]

# DiffCount: number of rows recorded for the (label, frequency) pair in
# `labels_uniqf`.
best_freqs = (
    labels_uniqf
    .groupby(label_freq_keys, as_index=False)
    .agg(DiffCount=("peak_fs", "count"))
)

# Count: number of peak rows for the (label, frequency) pair in
# `select_labels_df`.
sel_labels_frq_cnt = (
    select_labels_df
    .groupby(label_freq_keys, as_index=False)
    .agg(Count=("peak_fs", "count"))
)

In [None]:
# Drop a previously merged "DiffCount" (if any) so the cell can be re-run,
# then attach the differentiation counts to the per-label frequency counts.
without_diffcount = sel_labels_frq_cnt.drop(columns=["DiffCount"], errors="ignore")
sel_labels_frq_cnt = without_diffcount.merge(best_freqs, on=["label", "peak_fs"])

Finally, here I am determining how often each frequency showed up as a peak frequency across all labels.

In [None]:
# FreqCount: number of rows of `sel_labels_frq_cnt` that share the frequency.
rows_by_freq = sel_labels_frq_cnt.groupby("peak_fs", as_index=False)
freq_counts = rows_by_freq.agg(FreqCount=("peak_fs", "count"))

# Drop a previously merged "FreqCount" (if any) so the cell can be re-run.
without_freqcount = sel_labels_frq_cnt.drop(columns=["FreqCount"], errors="ignore")
sel_labels_frq_cnt = without_freqcount.merge(freq_counts, on=["peak_fs"])

What I want to find are the "best" 2 frequencies for each label. Here, I am defining "best" as those frequencies that show up often for an individual label AND are highly differentiating among other labels. To combine these characteristics, I create a column "DiscrimFactor", which simply multiplies "DiffCount" by "Count". Then, we sort the dataframe in descending order by "DiscrimFactor" and grab the first 2 frequencies per label. To account for potential ties in this metric, we next sort by "DiffCount" (descending), "Count" (descending), and finally "FreqCount" (ascending). In this way, we find:
1. first, the frequencies with the highest "DiscrimFactor"
2. next, the frequencies that are most differentiating among the other labels ("DiffCount")
3. next, the frequencies that are most common for the given label ("Count")
4. finally, the frequencies that are least common over all labels ("FreqCount") -> this last point ensures that, all other things being equal, the frequency I pick for a given label is the most likely to be different from the ones already selected in the other labels (given random chance)

In [None]:
# Combine "how differentiating" and "how frequent" into a single score.
sel_labels_frq_cnt["DiscrimFactor"] = sel_labels_frq_cnt["DiffCount"].mul(sel_labels_frq_cnt["Count"])

# Sort key -> ascending flag. Rows are grouped by label first; within a label
# the best score comes first and the remaining keys break ties.
sort_order = {
    "label": True,
    "DiscrimFactor": False,
    "DiffCount": False,
    "Count": False,
    "FreqCount": True,
}
sel_labels_frq_cnt = sel_labels_frq_cnt.sort_values(
    by=list(sort_order.keys()),
    ascending=list(sort_order.values()),
)

In [None]:
# Keep the first two rows of every label. Grouping by "peak_fs" would instead
# keep two rows per frequency, i.e. the first two labels that share it.
# Assumes `sel_labels_frq_cnt` is already sorted best-first within each label.
select_freqs_labels = sel_labels_frq_cnt.groupby("label").head(2).reset_index(drop=True)

Let's make sure we didn't lose any labels:

In [None]:
# Labels that survive the selection: the ones that needed no pruning plus the
# ones for which frequencies were selected.
labels_check = list(other_labels_df.label.unique())
labels_check.extend(select_freqs_labels.label.unique())

# Every label present at the selected threshold.
all_labels = thresh_files_pks_df.label.unique()

# A label is lost when it is in `all_labels` but not in `labels_check`.
# Wrapping each `any(...)` in a list would make every element truthy and the
# check could never fail, so compare the two sets directly.
missing_labels = set(all_labels) - set(labels_check)
assert not missing_labels, f"labels lost during frequency selection: {sorted(missing_labels, key=str)}"

Given this checks out, let's combine the dataframes back and then select all of the unique frequencies.

In [None]:
label_freq_cols = ["label", "peak_fs"]

# One row per distinct (label, frequency) pair of the labels that needed no
# pruning ...
other_select_freqs = other_labels_df.drop_duplicates(subset=label_freq_cols)[label_freq_cols]

# ... followed by the frequencies selected for the remaining labels.
all_select_freqs = pd.concat(
    [other_select_freqs, select_freqs_labels[label_freq_cols]],
    ignore_index=True,
)

All in all, there are 24 frequencies, which means there will be 24 features included in the model.

In [None]:
# Display the number of distinct selected frequencies and their values.
selected_peaks = all_select_freqs["peak_fs"]
selected_peaks.nunique(), selected_peaks.unique()

We will write them to a JSON to store for later use in further EDA and model building.

In [None]:
# Persist the distinct selected frequencies under the "frequencies" key.
frequency_features = {"frequencies": all_select_freqs.peak_fs.unique().tolist()}
with open("../../src/features/frequency_features.json", "w", encoding="utf-8") as outfile:
    outfile.write(json.dumps(frequency_features))