In [1]:
import geopandas as gpd
import pandas as pd
from pathlib import Path

regional_dir = Path("data/validation_set/regional_roads")
parquet_files = list(regional_dir.glob("*.parquet"))
output_path = Path("data/validation_set/regional_roads/processed_regional.parquet")
output_dir = regional_dir / "sampled"
output_dir.mkdir(parents=True, exist_ok=True)

rename_map = {
    'startTs': 'DateTime',
    'Stevilo_OA_sum': 'Number_OA',
    'Stevilo_BUS_sum': 'Number_BUS',
    'Stevilo_LT_sum': 'Number_LT',
    'Stevilo_ST_sum': 'Number_ST',
    'Stevilo_TT_sum': 'Number_TT',
    'Stevilo_TP_sum': 'Number_TP',
    'Stevilo_TPP_sum': 'Number_TPP',
    'PovpHitrost_avg': 'PovpSpeed',
    'MinHitrost_avg': 'MinSpeed',
    'MaxHitrost_avg': 'MaxSpeed',
    'Zasedenost_avg': 'Occupancy'
}

columns_to_keep = list(rename_map.keys()) + ['groupKey', 'geometry']

samples = []
for f in parquet_files:
    try:
        gdf = gpd.read_parquet(f, columns=[col for col in columns_to_keep if col != 'TrafficStatus'])
        if 'TrafficStatus' in gdf.columns:
            gdf['TrafficStatus'] = gdf['TrafficStatus']
        gdf = gdf.rename(columns=rename_map)
        gdf['DateTime'] = pd.to_datetime(gdf['DateTime'], dayfirst=True, errors='coerce')
        gdf = gdf.dropna(subset=['DateTime'])

        sample = gdf.sample(n=min(1000, len(gdf)), random_state=42)
        samples.append(sample)

        sample.to_parquet(output_dir / f.name.replace(".parquet", "_sampled.parquet"), index=False)
        print(f"✔ Sampled and saved: {f.name}")
    except Exception as e:
        print(f"❌ Error with {f.name}: {e}")

if samples:
    pd.concat(samples, ignore_index=True).to_parquet(output_path, index=False)
    print(f"\n✅ Final unified file saved to: {output_path}")
    del samples
else:
    print("⚠ No valid samples collected.")


✔ Sampled and saved: 2023-03-27.parquet
✔ Sampled and saved: 2023-03-28.parquet
✔ Sampled and saved: 2023-03-29.parquet
✔ Sampled and saved: 2023-03-30.parquet
✔ Sampled and saved: 2023-03-31.parquet
✔ Sampled and saved: 2023-04-01.parquet
✔ Sampled and saved: 2023-04-02.parquet
❌ Error with processed_regional.parquet: No match for FieldRef.Name(startTs) in DateTime: timestamp[ns]
Number_OA: double
Number_BUS: double
Number_LT: double
Number_ST: double
Number_TT: double
Number_TP: double
Number_TPP: double
PovpSpeed: double
MinSpeed: double
MaxSpeed: double
Occupancy: double
groupKey: string
geometry: binary
__fragment_index: int32
__batch_index: int32
__last_in_fragment: bool
__filename: string

✅ Final unified file saved to: data\validation_set\regional_roads\processed_regional.parquet


In [8]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Path to all highway xlsx files
highway_dir = Path("data/validation_set/highways")
xlsx_files = list(highway_dir.glob("*.xlsx"))

# Container for processed DataFrames
processed_highways = []

# Iterate through each file and process
for file in tqdm(xlsx_files, desc="Loading files", unit="files"):
    try:
        # Read data starting from row 4 (header is at row 3)
        df = pd.read_excel(file, sheet_name="Seznam", header=3)

        # Rename the timestamp column and parse it
        df = df.rename(columns={
                df.columns[0]: "Date",
                df.columns[1]: "Time"
            })
        df["DateTime"] = pd.to_datetime(
                        df["Date"].astype(str) + " " + df["Time"].astype(str),
                        format="%d.%m.%Y %H:%M",  # adjust if needed
                        errors="coerce"
                    )
        df = df.drop(columns=["Date", "Time"])
        # Add filename as station ID
        df["station_file"] = file.name

        # Store it
        processed_highways.append(df)
    except Exception as e:
        print(f"❌ Error with {file.name}: {e}")

# Combine and save if we got anything
if processed_highways:
    combined_df = pd.concat(processed_highways, ignore_index=True)
    output_path = highway_dir / "processed_highways.parquet"
    combined_df.to_parquet(output_path, index=False)
    output_path
else:
    "⚠ No valid highway files were processed."


Loading files: 100%|██████████| 243/243 [09:29<00:00,  2.34s/files]


MemoryError: Unable to allocate 36.5 MiB for an array with shape (2, 2389203) and data type object