In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from message.config import DATA_DIR

FINA_COLUMNS = ['patient_id', 'patient_name', 'patient_age', 'session_group',
       'therapy_name', 'session_is_nok', 'session_number', 'pain', 'fatigue',
       'quality', 'quality_reason_movement_detection',
       'quality_reason_my_self_personal', 'quality_reason_other',
       'quality_reason_exercises',
       'quality_reason_tablet_and_or_motion_trackers',
       'quality_reason_easy_of_use', 'quality_reason_tablet',
       'quality_reason_session_speed', 'leave_session',
       'leave_exercise_system_problem', 'leave_exercise_other',
       'leave_exercise_unable_perform', 'leave_exercise_pain',
       'leave_exercise_tired', 'leave_exercise_technical_issues',
       'leave_exercise_difficulty', 'prescribed_repeats', 'training_time',
       'perc_correct_repeats', 'number_exercises',
       'number_of_distinct_exercises', 'exercise_with_most_incorrect',
       'first_exercise_skipped']

In [2]:

df = pd.read_parquet(
    Path(DATA_DIR, "exercise_results.parquet")
)
df.head()

Unnamed: 0,session_exercise_result_sword_id,session_group,patient_id,therapy_name,exercise_name,exercise_side,exercise_order,prescribed_repeats,training_time,correct_repeats,...,quality_reason_other,quality_reason_exercises,quality_reason_tablet_and_or_motion_trackers,quality_reason_easy_of_use,quality_reason_tablet,quality_reason_session_speed,session_number,session_is_nok,patient_name,patient_age
0,39810278,lg1c88p/9QtkOmQwiwd5stMlmOU=,glRS/3uRDZt6RpmB+LaLyx/a7wk=,low_back,prone_press_ups,center,10,5,35,1,...,0,0,0,0,0,0,318,False,Sonya Berg,76
1,39810303,lg1c88p/9QtkOmQwiwd5stMlmOU=,glRS/3uRDZt6RpmB+LaLyx/a7wk=,low_back,child's_pose,center,12,1,33,1,...,0,0,0,0,0,0,318,False,Sonya Berg,76
2,39810255,lg1c88p/9QtkOmQwiwd5stMlmOU=,glRS/3uRDZt6RpmB+LaLyx/a7wk=,low_back,plank,center,7,3,36,1,...,0,0,0,0,0,0,318,False,Sonya Berg,76
3,39810227,lg1c88p/9QtkOmQwiwd5stMlmOU=,glRS/3uRDZt6RpmB+LaLyx/a7wk=,low_back,pelvic_anterior_posterior_tilt,center,4,20,23,20,...,0,0,0,0,0,0,318,False,Sonya Berg,76
4,39810238,lg1c88p/9QtkOmQwiwd5stMlmOU=,glRS/3uRDZt6RpmB+LaLyx/a7wk=,low_back,pelvic_side_tilt,center,5,20,24,20,...,0,0,0,0,0,0,318,False,Sonya Berg,76


In [3]:
df.shape

(1126654, 28)

In [4]:
df.isna().sum()

session_exercise_result_sword_id                      0
session_group                                         0
patient_id                                            0
therapy_name                                       1489
exercise_name                                         0
exercise_side                                         0
exercise_order                                        0
prescribed_repeats                                    0
training_time                                         0
correct_repeats                                   22538
wrong_repeats                                     22538
leave_exercise                                  1113800
leave_session                                   1106065
pain                                              89362
fatigue                                           89401
quality                                           93521
quality_reason_movement_detection                     0
quality_reason_my_self_personal                 

In [6]:

df_expected = pd.read_parquet(
    Path(DATA_DIR, "features_expected.parquet")
)

df_expected.head()

Unnamed: 0,patient_id,patient_name,patient_age,session_group,therapy_name,session_is_nok,session_number,pain,fatigue,quality,...,leave_exercise_tired,leave_exercise_technical_issues,leave_exercise_difficulty,prescribed_repeats,training_time,perc_correct_repeats,number_exercises,number_of_distinct_exercises,exercise_with_most_incorrect,first_exercise_skipped
0,8kjNYQWFdg6/P/57yCcf9GOt6RI=,Michael Contreras,44,pTxNzYvQu7dF+SSTQuGRcQxnLpw=,wrist_hand,False,166,2.0,2.0,5.0,...,0.0,0.0,0.0,154.0,910.0,0.985915,20,11,wris_prono_supination,prayer_position_stretch
1,ffUN1kRvtDG/KEH2m94XNnWKqvE=,Michelle Johnson,92,JfT2D1Pn8KzeJ9EJMF/tFcxADmE=,low_back,False,25,2.0,4.0,4.0,...,0.0,0.0,0.0,162.0,522.0,0.966667,17,7,trunk_side_bending,cat_camel
2,8Y38TEHzJF2Znv8X6DD1S4wOJF0=,Barbara Vargas,75,/cyvYRRuw7wmkQApjg3uz8DkSvs=,knee,False,26,2.0,0.0,4.0,...,0.0,0.0,0.0,142.0,531.0,0.930769,16,11,hip_hyperextension,bridge
3,LqcI2XEv2cCJBJipyIWcFsTO3AE=,Casey Bright,89,EPKZQPfZv51C4+MyZmaBwZuWgEg=,elbow,True,3,2.0,2.0,2.0,...,0.0,0.0,0.0,135.0,1054.0,0.944954,11,8,elbow_flexion,diagonal_2_flexion
4,BPy3iol9fM6YK2I331Enkt741OA=,Desiree Chavez,51,8bRMabEcjpPZxkfPTIgynBwxtBU=,shoulder,True,5,2.0,4.0,5.0,...,1.0,0.0,0.0,100.0,409.0,0.842105,10,8,shoulder_abduction,shoulder_flexion


In [19]:
(df["session_group"].unique() == df_expected["session_group"]).sum()

1

## Tinkering

In [7]:
# Grouping by session_group to aggregate session-level data
grouped = df.groupby("session_group").agg(
    patient_id=("patient_id", "first"),
    patient_name=("patient_name", "first"),
    patient_age=("patient_age", "first"),
    pain=("pain", "first"),
    fatigue=("fatigue", "first"),
    therapy_name=("therapy_name", "first"),
    session_number=("session_number", "first"),
    leave_session=("leave_session", "first"),
    quality=("quality", "first"),
    session_is_nok=("session_is_nok", "first"),
    prescribed_repeats=("prescribed_repeats", "sum"),
    training_time=("training_time", "sum"),
    correct_repeats=("correct_repeats", "sum"),
    wrong_repeats=("wrong_repeats", "sum"),
    number_exercises=("exercise_name", "count"),
    number_of_distinct_exercises=("exercise_name", "nunique"),
).reset_index()
grouped.set_index("session_group", inplace=True)

grouped.head()

Unnamed: 0_level_0,patient_id,patient_name,patient_age,pain,fatigue,therapy_name,session_number,leave_session,quality,session_is_nok,prescribed_repeats,training_time,correct_repeats,wrong_repeats,number_exercises,number_of_distinct_exercises
session_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
++//wixk6DpH8NMGvqLqvpzWbzY=,3FDm7kzjNVgmqUPhyODoZpMIIGc=,Taylor Mendez,55,6,4,shoulder,6,,4,True,96,356,95,1,8,8
++2JgoMe8JGBtUHdsOiLGO8UZ18=,KRnwvSlSa6U62Edl3dHJa0nVM5A=,Danielle Miller,75,4,0,knee,8,,5,False,200,767,199,1,19,10
++4kUzy7ewH5u7FjNoU8CW6thbY=,euuPYQwygUQC94V0LvmFOzHPoXQ=,Jeremy Randall,70,4,2,low_back,9,,5,False,150,683,149,1,18,12
++8Q+lFKrp9IKCWsBT0IO0XEV1Y=,R0S8jUhp1lC00zuUYuB0QAnj6as=,Richard Robinson,98,4,4,low_back,2,,4,True,50,279,44,6,7,7
++9PC4/46Jmrl/PHbzkM1BCPg2g=,Uj48yKb4R53oteHJHRcnzpUjBdY=,Hector Perry,28,2,2,knee,17,,5,False,147,466,145,2,16,11


In [8]:
# Calculating percentage of correct repetitions
grouped["perc_correct_repeats"] = grouped["correct_repeats"] / (grouped["correct_repeats"] + grouped["wrong_repeats"])
grouped.head()

Unnamed: 0_level_0,patient_id,patient_name,patient_age,pain,fatigue,therapy_name,session_number,leave_session,quality,session_is_nok,prescribed_repeats,training_time,correct_repeats,wrong_repeats,number_exercises,number_of_distinct_exercises,perc_correct_repeats
session_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
++//wixk6DpH8NMGvqLqvpzWbzY=,3FDm7kzjNVgmqUPhyODoZpMIIGc=,Taylor Mendez,55,6,4,shoulder,6,,4,True,96,356,95,1,8,8,0.989583
++2JgoMe8JGBtUHdsOiLGO8UZ18=,KRnwvSlSa6U62Edl3dHJa0nVM5A=,Danielle Miller,75,4,0,knee,8,,5,False,200,767,199,1,19,10,0.995
++4kUzy7ewH5u7FjNoU8CW6thbY=,euuPYQwygUQC94V0LvmFOzHPoXQ=,Jeremy Randall,70,4,2,low_back,9,,5,False,150,683,149,1,18,12,0.993333
++8Q+lFKrp9IKCWsBT0IO0XEV1Y=,R0S8jUhp1lC00zuUYuB0QAnj6as=,Richard Robinson,98,4,4,low_back,2,,4,True,50,279,44,6,7,7,0.88
++9PC4/46Jmrl/PHbzkM1BCPg2g=,Uj48yKb4R53oteHJHRcnzpUjBdY=,Hector Perry,28,2,2,knee,17,,5,False,147,466,145,2,16,11,0.986395


In [9]:
[x in ["system_problem", "other", "unable_perform", "pain", "tired", "technical_issues", "difficulty"] for x in df["leave_session"].unique()]

[False, True, True, True, True]

In [10]:
# Counting leave_exercise reasons
leave_exercise_reasons = ["system_problem", "other", "unable_perform", "pain", "tired", "technical_issues", "difficulty"]
quality_reasons = ["movement_detection", "my_self_personal", "other", "exercises", "tablet", "tablet_and_or_motion_trackers", "easy_of_use", "session_speed"]

for reason in leave_exercise_reasons:
    grouped[f"leave_exercise_{reason}"] = df[df["leave_exercise"] == reason].groupby("session_group")["leave_exercise"].count()
    grouped[f"leave_exercise_{reason}"].fillna(0, inplace=True)
for reason in quality_reasons:
    grouped[f"quality_{reason}"] = df[df["quality"] == reason].groupby("session_group")["quality"].count()
    grouped[f"quality_{reason}"].fillna(0, inplace=True)

grouped

Unnamed: 0_level_0,patient_id,patient_name,patient_age,pain,fatigue,therapy_name,session_number,leave_session,quality,session_is_nok,...,leave_exercise_technical_issues,leave_exercise_difficulty,quality_movement_detection,quality_my_self_personal,quality_other,quality_exercises,quality_tablet,quality_tablet_and_or_motion_trackers,quality_easy_of_use,quality_session_speed
session_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
++//wixk6DpH8NMGvqLqvpzWbzY=,3FDm7kzjNVgmqUPhyODoZpMIIGc=,Taylor Mendez,55,6,4,shoulder,6,,4,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
++2JgoMe8JGBtUHdsOiLGO8UZ18=,KRnwvSlSa6U62Edl3dHJa0nVM5A=,Danielle Miller,75,4,0,knee,8,,5,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
++4kUzy7ewH5u7FjNoU8CW6thbY=,euuPYQwygUQC94V0LvmFOzHPoXQ=,Jeremy Randall,70,4,2,low_back,9,,5,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
++8Q+lFKrp9IKCWsBT0IO0XEV1Y=,R0S8jUhp1lC00zuUYuB0QAnj6as=,Richard Robinson,98,4,4,low_back,2,,4,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
++9PC4/46Jmrl/PHbzkM1BCPg2g=,Uj48yKb4R53oteHJHRcnzpUjBdY=,Hector Perry,28,2,2,knee,17,,5,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzjYItanfFoaxcOT72WuLVdFlpU=,p5r85M4UOxA+ooLsAnpGA091yec=,Brianna George,28,2,0,shoulder,11,,5,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzkMg0F+Nr92E2UwQld7T2c9DxY=,Q8xrPkPRSOYhbjrzTxF77Fp40IA=,Sophia Harris,85,8,4,neck,13,,5,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzwyhOME0/jCt/TlocGDlnM7Nx4=,4tqtZxzLS9+7QJtKa3pW0beUG3s=,Donna Moreno,34,0,0,elbow,13,,5,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzy4uJWf1oWcSbMEHF4RG15ELcU=,bWap/fmf6koGTRDMpejv7/J+ovw=,Nancy Walter,96,4,0,low_back,1,,3,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
grouped["session_is_nok"].isna().sum()

1070

In [12]:
# assuming session is ok
grouped["session_is_nok"] = grouped["session_is_nok"].fillna(False)

In [13]:
grouped

Unnamed: 0_level_0,patient_id,patient_name,patient_age,pain,fatigue,therapy_name,session_number,leave_session,quality,session_is_nok,...,leave_exercise_technical_issues,leave_exercise_difficulty,quality_movement_detection,quality_my_self_personal,quality_other,quality_exercises,quality_tablet,quality_tablet_and_or_motion_trackers,quality_easy_of_use,quality_session_speed
session_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
++//wixk6DpH8NMGvqLqvpzWbzY=,3FDm7kzjNVgmqUPhyODoZpMIIGc=,Taylor Mendez,55,6,4,shoulder,6,,4,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
++2JgoMe8JGBtUHdsOiLGO8UZ18=,KRnwvSlSa6U62Edl3dHJa0nVM5A=,Danielle Miller,75,4,0,knee,8,,5,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
++4kUzy7ewH5u7FjNoU8CW6thbY=,euuPYQwygUQC94V0LvmFOzHPoXQ=,Jeremy Randall,70,4,2,low_back,9,,5,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
++8Q+lFKrp9IKCWsBT0IO0XEV1Y=,R0S8jUhp1lC00zuUYuB0QAnj6as=,Richard Robinson,98,4,4,low_back,2,,4,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
++9PC4/46Jmrl/PHbzkM1BCPg2g=,Uj48yKb4R53oteHJHRcnzpUjBdY=,Hector Perry,28,2,2,knee,17,,5,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzjYItanfFoaxcOT72WuLVdFlpU=,p5r85M4UOxA+ooLsAnpGA091yec=,Brianna George,28,2,0,shoulder,11,,5,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzkMg0F+Nr92E2UwQld7T2c9DxY=,Q8xrPkPRSOYhbjrzTxF77Fp40IA=,Sophia Harris,85,8,4,neck,13,,5,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzwyhOME0/jCt/TlocGDlnM7Nx4=,4tqtZxzLS9+7QJtKa3pW0beUG3s=,Donna Moreno,34,0,0,elbow,13,,5,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzy4uJWf1oWcSbMEHF4RG15ELcU=,bWap/fmf6koGTRDMpejv7/J+ovw=,Nancy Walter,96,4,0,low_back,1,,3,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Filter out sessions that have no wrong repeats before applying idxmax
df_nonzero_wrong = df[df["wrong_repeats"] > 0]

if not df_nonzero_wrong.empty:
    most_incorrect = df_nonzero_wrong.loc[df_nonzero_wrong.groupby("session_group")["wrong_repeats"].idxmax(), 
                                          ["session_group", "exercise_name"]]
else:
    most_incorrect = pd.DataFrame(columns=["session_group", "exercise_name"])  # Empty DataFrame to merge

grouped = grouped.merge(most_incorrect, on="session_group", how="left")
grouped.rename(columns={"exercise_name": "exercise_with_most_incorrect"}, inplace=True)

grouped["exercise_with_most_incorrect"]

0                shoulder_abduction
1                     hip_abduction
2                hip_hyperextension
3                      knee_flexion
4                hip_hyperextension
                    ...            
74785            diagonal_1_flexion
74786     sitting_neck_side_bending
74787                  standing_row
74788            hip_hyperextension
74789    shoulder_internal_rotation
Name: exercise_with_most_incorrect, Length: 74790, dtype: object

In [15]:
# Finding the first skipped exercise
skipped_exercises = df[df["leave_exercise"].notnull()].sort_values(by=["session_group", "exercise_order"])
first_skipped = skipped_exercises.groupby("session_group").first().reset_index()[["session_group", "exercise_name"]]
grouped = grouped.merge(first_skipped, on="session_group", how="left")
grouped.rename(columns={"exercise_name": "first_exercise_skipped"}, inplace=True)

In [16]:
grouped.columns

Index(['session_group', 'patient_id', 'patient_name', 'patient_age', 'pain',
       'fatigue', 'therapy_name', 'session_number', 'leave_session', 'quality',
       'session_is_nok', 'prescribed_repeats', 'training_time',
       'correct_repeats', 'wrong_repeats', 'number_exercises',
       'number_of_distinct_exercises', 'perc_correct_repeats',
       'leave_exercise_system_problem', 'leave_exercise_other',
       'leave_exercise_unable_perform', 'leave_exercise_pain',
       'leave_exercise_tired', 'leave_exercise_technical_issues',
       'leave_exercise_difficulty', 'quality_movement_detection',
       'quality_my_self_personal', 'quality_other', 'quality_exercises',
       'quality_tablet', 'quality_tablet_and_or_motion_trackers',
       'quality_easy_of_use', 'quality_session_speed',
       'exercise_with_most_incorrect', 'first_exercise_skipped'],
      dtype='object')

In [17]:
columns_order = [
        "session_group",
        "patient_id",
        "patient_name",
        "patient_age",
        "pain",
        "fatigue",
        "therapy_name",
        "session_number",
        "leave_session",
        "quality",
        * grouped.columns[grouped.columns.str.startswith("quality_reason_")],
        "session_is_nok",
        * grouped.columns[grouped.columns.str.startswith("leave_exercise_")],
        "prescribed_repeats",
        "training_time",
        "perc_correct_repeats",
        "number_exercises",
        "number_of_distinct_exercises",
        "exercise_with_most_incorrect",
        "first_exercise_skipped",
    ]
columns_order

['session_group',
 'patient_id',
 'patient_name',
 'patient_age',
 'pain',
 'fatigue',
 'therapy_name',
 'session_number',
 'leave_session',
 'quality',
 'session_is_nok',
 'leave_exercise_system_problem',
 'leave_exercise_other',
 'leave_exercise_unable_perform',
 'leave_exercise_pain',
 'leave_exercise_tired',
 'leave_exercise_technical_issues',
 'leave_exercise_difficulty',
 'prescribed_repeats',
 'training_time',
 'perc_correct_repeats',
 'number_exercises',
 'number_of_distinct_exercises',
 'exercise_with_most_incorrect',
 'first_exercise_skipped']