# Second attempt at extracting running session data

In this second attempt, we want to make the process more efficient by using multithreading.

In [1]:
import core.constants as c
import os
import pandas as pd
from collections import defaultdict
import concurrent.futures
from typing import List


# Load the metadata CSV whose path is core.constants.RICKD_RUNNING_METADATA_CLEANED_FILE.
# Later cells iterate over the rows of this frame to process each entry.
run_data_meta_cleaned = pd.read_csv(c.RICKD_RUNNING_METADATA_CLEANED_FILE)

In [5]:
from tqdm.notebook import tqdm
from functools import partial

from core.processing import process_row, SessionData
# Materialise the (index, row) pairs produced by iterrows() so the progress
# bar can be given an exact total.
rows = list(run_data_meta_cleaned.iterrows())

# Pre-bind the source folder so the executor only has to supply each row pair.
process_row_with_folder = partial(
    process_row,
    source_data_folder=c.RICKD_SOURCE_DATA_FOLDER,
)

# executor.map yields results in input order, so results[i] corresponds to rows[i].
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
    results: List[SessionData] = [
        session
        for session in tqdm(executor.map(process_row_with_folder, rows), total=len(rows))
    ]

  0%|          | 0/1832 [00:03<?, ?it/s]

--- 3158.9457201957703 seconds ---


In [6]:
# Build two summary tables with one row per entry in `results`:
# one from each session's `desc_variables`, one from its `marker_center_data`.
descriptive_variables = pd.DataFrame([s.desc_variables for s in results])
marker_centers = pd.DataFrame([s.marker_center_data for s in results])
print(f"descriptive_variables.shape: {descriptive_variables.shape}")
print(f"marker_centers.shape: {marker_centers.shape}")

# Make sure the output folder exists before writing: to_csv raises OSError on a
# missing directory, which would otherwise only surface after the long
# processing cell has already finished.
os.makedirs(c.RICKD_PROCESSED_DATA_FOLDER, exist_ok=True)
descriptive_variables.to_csv(os.path.join(c.RICKD_PROCESSED_DATA_FOLDER, "descriptive_variables.csv"))
marker_centers.to_csv(os.path.join(c.RICKD_PROCESSED_DATA_FOLDER, "marker_centers.csv"))

In [None]:
# Bucket each session's marker table under its sampling frequency
# (session_hz truncated to an int).
marker_data_by_freq = defaultdict(list)
for session in results:
    marker_data_by_freq[int(session.session_hz)].append(
        pd.DataFrame.from_dict(session.marker_data)
    )

# Stack the per-session tables of each frequency into a single frame and
# report its shape.
combined_marker_data_by_freq = {}
for freq, session_frames in marker_data_by_freq.items():
    combined = pd.concat(session_frames)
    combined_marker_data_by_freq[freq] = combined
    print(f"marker_data_{freq:03d}hz.shape: {combined.shape}")

In [9]:
# Write one CSV per sampling frequency into the processed-data folder,
# with the frequency zero-padded to three digits in the file name.
for freq, combined_frame in combined_marker_data_by_freq.items():
    csv_name = f"marker_data_{freq:03d}hz.csv"
    combined_frame.to_csv(os.path.join(c.RICKD_PROCESSED_DATA_FOLDER, csv_name))