# Second attempt at extracting running session data

In this second attempt, we want to make the process more efficient.

In [6]:
import core.constants as c
import os

import json

import pandas as pd

from collections import defaultdict

# Load the session metadata table from the CSV file referenced by the project
# constant RICKD_RUNNING_METADATA_CLEANED_FILE.
run_data_meta_cleaned = pd.read_csv(c.RICKD_RUNNING_METADATA_CLEANED_FILE)

In [15]:
from pydantic import BaseModel, Field


class Marker(BaseModel):
    """A single 3D position, stored as one float per spatial axis."""

    x: float
    y: float
    z: float

    @classmethod
    def from_coordinates_list(cls, lst: list) -> "Marker":
        """Build a marker from a sequence laid out as ``[x, y, z, ...]``.

        Only the first three items are read; anything after them is ignored.
        """
        axes = ("x", "y", "z")
        return cls(**{axis: lst[position] for position, axis in enumerate(axes)})

    def append_to_dict(self, var_prefix: str, target_dict: dict) -> None:
        """Append each coordinate to the list stored under ``<var_prefix>_<axis>``.

        ``target_dict`` must already hold a list under each of those keys, or
        create one on first access (for example a ``defaultdict(list)``).
        """
        for axis in ("x", "y", "z"):
            target_dict[var_prefix + "_" + axis].append(getattr(self, axis))


def get_dv_var_name(side, orig_name) -> str:
    """Return the column name ``<side>_<orig_name lower-cased>`` for a descriptive variable."""
    prefix = side + "_"
    return prefix + str.lower(orig_name)

def get_marker_var_name(group, orig_name) -> str:
    """Return the column prefix ``<group>_<orig_name lower-cased>`` for a marker."""
    prefix = group + "_"
    return prefix + str.lower(orig_name)


In [None]:
def _append_aligned(columns: dict, row: dict, n_rows_before: int) -> list:
    """Append one session's values to column-oriented storage, keeping rows aligned.

    Args:
        columns: Mapping of column name -> list of values, one entry per session
            processed so far. Must create an empty list for unknown keys
            (e.g. ``defaultdict(list)``).
        row: Mapping of column name -> value for the current session.
        n_rows_before: Number of sessions already stored in ``columns``.

    Returns:
        Names of the existing columns for which this session supplied no value
        (they were padded with ``None``).
    """
    for name, value in row.items():
        column = columns[name]
        # A variable seen for the first time has no entries for earlier
        # sessions; back-fill them so its values land on the correct row.
        column.extend([None] * (n_rows_before - len(column)))
        column.append(value)

    padded = []
    for name, column in columns.items():
        missing = n_rows_before + 1 - len(column)
        if missing > 0:
            # This session did not provide the variable: insert a placeholder
            # instead of letting later sessions shift up by one row.
            column.extend([None] * missing)
            padded.append(name)
    return padded


# Progress is reported against the number of metadata rows that are iterated.
# (Session paths have the form "<sub_id>/<file>.json", so counting the entries
# of the source folder does not give the number of sessions.)
num_sessions = len(run_data_meta_cleaned)
descriptive_variables_data = defaultdict(list)
marker_data = defaultdict(list)

# `row_pos` is the 0-based count of sessions already stored; `idx` is the
# DataFrame index label and is only used for the progress message.
for row_pos, (idx, session) in enumerate(run_data_meta_cleaned.iterrows()):
    print(f"Processing row: {idx} / {num_sessions}")
    subject_id = session['sub_id']
    relative_session_path = session['session_file_path']
    print(f"Processing session: {relative_session_path} of subject {subject_id}")

    # Use a context manager so the file handle is closed after every session
    # instead of being leaked once per iteration.
    session_path = os.path.join(c.RICKD_SOURCE_DATA_FOLDER, relative_session_path)
    with open(session_path) as session_file:
        json_session = json.load(session_file)

    # Extract Descriptive Variable data
    # ---------------------------------
    # Columns are named <side>_<variable_name>, where <side> is 'l' or 'r'
    # (left / right side of the body) and <variable_name> is the lower-cased
    # key from the json file.
    dv_r = json_session['dv_r']
    variables = {
        'l': dv_r['left'],
        'r': dv_r['right']
    }

    dv_row = {}
    num_dv_vars = None  # None = no side inspected yet for this session
    for side, data in variables.items():
        # Notify if the two sides do not carry the same number of variables.
        if num_dv_vars is not None and len(data) != num_dv_vars:
            print(f"Error: Number of descriptive variables mismatch. Expected {num_dv_vars} variables, but got {len(data)}.")
        num_dv_vars = len(data)

        for name, value in data.items():
            dv_row[get_dv_var_name(side, name)] = value

    missing_dv = _append_aligned(descriptive_variables_data, dv_row, row_pos)
    if missing_dv:
        print(f"Warning: session {relative_session_path} has no value for descriptive variables: {missing_dv}")

    # Extract Marker data
    # -------------------
    # Columns are named <group>_<variable_name>_<x/y/z>, where <group> is
    # 'indiv' for entries under the json key 'joints' and 'cluster' for entries
    # under 'neutral', <variable_name> is the lower-cased key from the json
    # file, and <x/y/z> is the coordinate axis.
    marker_vars = {
        "indiv": json_session['joints'],
        "cluster": json_session['neutral']
    }

    marker_row = {}
    for group, var_set in marker_vars.items():
        for var, coordinates in var_set.items():
            # Going through Marker validates that each coordinate is a float.
            marker = Marker.from_coordinates_list(coordinates)
            variable_prefix = get_marker_var_name(group, var)
            for axis in ("x", "y", "z"):
                marker_row[variable_prefix + "_" + axis] = getattr(marker, axis)

    missing_markers = _append_aligned(marker_data, marker_row, row_pos)
    if missing_markers:
        print(f"Warning: session {relative_session_path} has no value for marker columns: {missing_markers}")

# Every column now has exactly one entry per processed session, so building the
# frames cannot fail on unequal column lengths.
dv_df = pd.DataFrame(descriptive_variables_data)
display(dv_df)

marker_df = pd.DataFrame(marker_data)
display(marker_df)

Processing row: 0 / 1799
Processing session: 100433/20101005t132240.json
Processing row: 1 / 1799
Processing session: 100434/20101117t132240.json
Processing row: 2 / 1799
Processing session: 100537/20120703t102550.json
Processing row: 3 / 1799
Processing session: 100560/20120717t103748.json
Processing row: 4 / 1799
Processing session: 101481/20120717t105021.json
Processing row: 5 / 1799
Processing session: 100591/20120809t100115.json
Processing row: 6 / 1799
Processing session: 100595/20120829t125604.json
Processing row: 7 / 1799
Processing session: 100646/20121101t095248.json
Processing row: 8 / 1799
Processing session: 100658/20121122t140316.json
Processing row: 9 / 1799
Processing session: 100671/20121213t105755.json
Processing row: 10 / 1799
Processing session: 100727/20130410t105446.json
Processing row: 11 / 1799
Processing session: 100767/20130606t134651.json
Processing row: 12 / 1799
Processing session: 100776/20130620t121501.json
Processing row: 13 / 1799
Processing session: 10

In [18]:
# Show the metadata record at integer position 18 (iloc is position-based,
# not label-based).
run_data_meta_cleaned.iloc[18]

id                            100844_20130924t115413
sub_id                                        100844
datestring                       2013-09-24 11:54:13
filename                        20130924t115413.json
speed_r                                     2.923853
age                                               19
Height                                         178.3
Weight                                          69.0
Gender                                          male
DominantLeg                                    right
InjDefn                  continuing to train in pain
InjJoint                                       ankle
InjSide                                         left
SpecInjury                       achilles tendonitis
InjDuration                                      NaN
InjJoint2                                  no injury
InjSide2                                       right
SpecInjury2                                      NaN
Activities           running, weight training 