# Third attempt at extracting running session data

In this third attempt, we want to use multiprocessing.

In [1]:
import core.constants as c
import os
import pandas as pd
from collections import defaultdict
from typing import List


# Load the cleaned running-session metadata table from its CSV file.
run_data_meta_cleaned = pd.read_csv(
    filepath_or_buffer=c.RICKD_RUNNING_METADATA_CLEANED_FILE,
)

In [2]:
from core.processing import process_row, SessionData
from tqdm.notebook import tqdm
from functools import partial
import concurrent.futures
import multiprocessing

# Pre-bind the source data folder so each mapped call only needs the row item.
process_row_with_folder = partial(
    process_row,
    source_data_folder=c.RICKD_SOURCE_DATA_FOLDER,
)

# Materialise the (index, row) pairs up front so the progress bar knows the total.
rows = list(run_data_meta_cleaned.iterrows())

# Use spawn method for better compatibility
if __name__ == '__main__':
    multiprocessing.set_start_method('spawn', force=True)

# One worker process per CPU reported by the OS; executor.map yields results in
# the same order as `rows`.
worker_count = multiprocessing.cpu_count()
with concurrent.futures.ProcessPoolExecutor(max_workers=worker_count) as executor:
    session_iter = executor.map(process_row_with_folder, rows)
    results: List[SessionData] = list(tqdm(session_iter, total=len(rows)))

  0%|          | 0/1832 [00:00<?, ?it/s]

In [3]:


# Flatten the per-session results into two tables with one row per session:
# the descriptive variables and the marker-center data.
descriptive_variables = pd.DataFrame([s.desc_variables for s in results])
marker_centers = pd.DataFrame([s.marker_center_data for s in results])
print(f"descriptive_variables.shape: {descriptive_variables.shape}")
print(f"marker_centers.shape: {marker_centers.shape}")

# Persist both tables. DataFrame.to_csv() called without a path only returns the
# CSV text and writes nothing, so the descriptive variables need an explicit
# target: the file that the later session-data merge reads back.
# index=False: the default RangeIndex carries no information and would come back
# from read_csv as a stray "Unnamed: 0" column.
descriptive_variables.to_csv(c.RICKD_DESCRIPTIVE_VARIABLES_FILE, index=False)
marker_centers.to_csv(os.path.join(c.RICKD_PROCESSED_DATA_FOLDER, "marker_centers.csv"))

descriptive_variables.shape: (1832, 155)
marker_centers.shape: (1832, 135)


In [4]:
# Bucket each session's marker frame under its integer-cast session_hz value.
marker_data_by_freq = defaultdict(list)
for session in results:
    hz = int(session.session_hz)
    session_frame = pd.DataFrame.from_dict(session.marker_data)
    marker_data_by_freq[hz].append(session_frame)

# Stack every bucket into a single frame per frequency, then report each shape.
combined_marker_data_by_freq = {
    hz: pd.concat(frames) for hz, frames in marker_data_by_freq.items()
}
for hz, combined in combined_marker_data_by_freq.items():
    print(f"marker_data_{hz:03d}hz.shape: {combined.shape}")

marker_data_200hz.shape: (12227292, 93)
marker_data_120hz.shape: (25598, 93)


In [6]:
# List the frequency keys present in the combined marker data.
combined_marker_data_by_freq.keys()

dict_keys([200, 120])

In [8]:
%%time
combined_marker_data_by_freq[120].to_csv(c.RICKD_MARKER_DATA_120HZ_FILE)

CPU times: user 994 ms, sys: 20.4 ms, total: 1.01 s
Wall time: 1.02 s


In [9]:
%%time
combined_marker_data_by_freq[200].to_csv(c.RICKD_MARKER_DATA_200HZ_FILE)

CPU times: user 7min 48s, sys: 18.1 s, total: 8min 6s
Wall time: 9min 30s


In [10]:
# Rebuild the contents of RICKD_SESSION_DATA_FULL_FILE: the cleaned running
# metadata left-joined with the descriptive variables on 'id'. Both inputs are
# re-read from disk rather than taken from earlier cells.
run_data_meta_cleaned = pd.read_csv(c.RICKD_RUNNING_METADATA_CLEANED_FILE)
descriptive_variables = pd.read_csv(c.RICKD_DESCRIPTIVE_VARIABLES_FILE)

session_data_full = run_data_meta_cleaned.merge(
    descriptive_variables,
    how='left',
    on='id',
)
display(session_data_full)

Unnamed: 0,id,sub_id,datestring,filename,speed_r,age,Height,Weight,Gender,DominantLeg,...,r_knee_add_vel_percent_stance,r_hip_abd_peak_vel,r_hip_abd_vel_percent_stance,r_knee_rot_peak_vel,r_hip_rot_peak_vel,r_pronation_onset,r_pronation_offset,r_peak_hip_add_velocity,r_peak_pelvic_drop_velocity,r_vertical_oscillation
0,100433_20101005t132240,100433,2010-10-05 13:22:40,20101005t132240.json,1.610861,53,,,unknown,,...,0,-17.620105,0,121.078813,95.598653,17,57,59.323001,-75.058744,51.466247
1,100434_20101117t132240,100434,2010-11-17 13:22:40,20101117t132240.json,2.237294,51,,,female,,...,0,-42.544797,0,292.718838,17.255165,8,53,183.573751,-96.178927,62.307667
2,100537_20120703t102550,100537,2012-07-03 10:25:50,20120703t102550.json,2.127441,255,173.1,67.6,female,right,...,0,-55.286282,0,318.798151,201.561725,13,56,257.430836,-62.804925,81.753809
3,100560_20120717t103748,100560,2012-07-17 10:37:48,20120717t103748.json,2.657365,33,179.3,83.0,female,right,...,0,-109.157604,0,349.195418,84.881142,20,55,320.445059,-141.687354,111.686204
4,101481_20120717t105021,101481,2012-07-17 10:50:21,20120717t105021.json,2.625088,32,176.3,58.6,female,,...,0,-24.314138,0,226.067684,93.835737,17,57,169.271213,-49.506248,99.045820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1827,200986_20150312t143944,200986,2015-03-12 14:39:44,20150312t143944.json,4.876998,20,174.0,56.8,female,right,...,0,-64.155468,0,256.832528,56.272377,26,55,113.244646,-35.757575,84.589713
1828,200987_20150312t160840,200987,2015-03-12 16:08:40,20150312t160840.json,2.765022,50,164.0,60.0,female,right,...,0,-53.086396,0,153.266603,62.976728,18,52,171.902428,-70.457833,68.524480
1829,201100_20150409t155915,201100,2015-04-09 15:59:15,20150409t155915.json,2.790966,52,170.0,80.0,male,right,...,0,-67.818836,0,240.909620,15.134823,19,68,203.683223,-120.697186,86.710600
1830,201101_20150413t143152,201101,2015-04-13 14:31:52,20150413t143152.json,2.828602,21,162.0,65.5,male,right,...,0,-21.455733,0,252.971737,72.610820,17,50,168.473505,-79.974041,67.343927


In [13]:
# Save the merged session table without the row index.
session_data_full.to_csv(
    path_or_buf=c.RICKD_SESSION_DATA_FULL_FILE,
    index=False,
)