## Convert DOZ files to structured Parquet files to simplify processing by the file reader

In [1]:
from concurrent.futures import ThreadPoolExecutor
from doz_file_utils import compose_doz, parse_doz
from doz_file_utils.doz_dataclasses import ParsedDOZ
import glob
import numpy as np
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

from parse_data import _parse_single_file

In [2]:
# Convert every subject directory under ../corr/train/raw into one Parquet
# file per subject, written to data/<subject>.parquet.
sub_res = {}  # NOTE(review): not used in this cell; kept in case later cells read it.

output_dir = "data"
# pq.write_table does not create missing parent directories, so make sure the
# destination exists before any parsing work is done.
os.makedirs(output_dir, exist_ok=True)

for subject in glob.glob("../corr/train/raw/*"):
    # Collect every regular file anywhere below the subject directory.
    all_paths = glob.glob(os.path.join(subject, "**/*"), recursive=True)
    file_paths = [p for p in all_paths if os.path.isfile(p)]

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in the same order as file_paths.
        parsed_iter = executor.map(_parse_single_file, file_paths)
        progress = tqdm(parsed_iter, desc="Parsing DOZ files", total=len(file_paths))
        # Files for which the parser returned None are dropped.
        results = [parsed for parsed in progress if parsed is not None]

    # Presumably each parsed item is a 3-field record matching these columns;
    # verify against _parse_single_file.
    df = pd.DataFrame(results, columns=["ts", "gain", "signals"])
    table = pa.Table.from_pandas(df, preserve_index=False)

    # Take the subject name from the last path component instead of a fixed
    # split index, so it survives a different input root depth or OS separator.
    subject_name = os.path.basename(os.path.normpath(subject))
    pq.write_table(table, os.path.join(output_dir, f"{subject_name}.parquet"))

Parsing DOZ files: 100%|██████████| 2756/2756 [00:02<00:00, 1215.76it/s]
Parsing DOZ files: 100%|██████████| 1600/1600 [00:01<00:00, 1074.47it/s]
Parsing DOZ files: 100%|██████████| 2178/2178 [00:01<00:00, 1093.94it/s]
Parsing DOZ files: 100%|██████████| 1396/1396 [00:01<00:00, 1109.90it/s]
Parsing DOZ files: 100%|██████████| 469/469 [00:00<00:00, 1567.97it/s]
Parsing DOZ files: 100%|██████████| 1420/1420 [00:01<00:00, 1273.27it/s]
Parsing DOZ files: 100%|██████████| 2183/2183 [00:00<00:00, 4486.10it/s] 
Parsing DOZ files: 100%|██████████| 2154/2154 [00:01<00:00, 1163.07it/s]
Parsing DOZ files: 100%|██████████| 2201/2201 [00:01<00:00, 1169.05it/s]
Parsing DOZ files: 100%|██████████| 2071/2071 [00:01<00:00, 1237.71it/s]
Parsing DOZ files: 100%|██████████| 1987/1987 [00:00<00:00, 2443.25it/s] 
Parsing DOZ files: 100%|██████████| 1977/1977 [00:01<00:00, 1489.58it/s]
Parsing DOZ files: 100%|██████████| 209/209 [00:00<00:00, 1249.31it/s]
Parsing DOZ files: 100%|██████████| 761/761 [00:00<00