In [10]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import timedelta
from math import radians, cos, sin, asin, sqrt
import tqdm

# Fast vectorized haversine function
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in meters between two points given in degrees.

    Every argument may be a scalar or a NumPy array; arrays are combined
    element-wise following NumPy broadcasting rules.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude / longitude of the second point(s), in degrees.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) in meters on a sphere of radius 6,371,000 m.
    """
    R = 6371000  # sphere radius in meters
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    # np.cos (not math.cos) so that array inputs are handled element-wise.
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R * np.arcsin(np.sqrt(a))

# Stay point extraction logic
def extract_stay_points(df, D_thres=200, T_thres=timedelta(minutes=20)):
    """Detect stay points per device from time-ordered GPS fixes.

    For each device the fixes are sorted by ``datetime``. Starting from an
    anchor fix ``i``, later fixes ``j`` are scanned until the first one whose
    haversine distance from the anchor exceeds ``D_thres``. If the time elapsed
    between fix ``i`` and fix ``j`` is greater than ``T_thres``, a stay point is
    recorded whose coordinates are the mean of fixes ``i`` .. ``j - 1``.
    Whether or not a stay was recorded, the scan then restarts with ``j`` as
    the new anchor.

    NOTE: if the scan reaches the last fix without ever exceeding ``D_thres``,
    nothing is recorded for that trailing segment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``deviceid``, ``datetime``, ``lat``, ``lon``.
    D_thres : float, default 200
        Distance threshold, in the unit returned by ``haversine``.
    T_thres : datetime.timedelta, default 20 minutes
        Minimum elapsed time for a segment to count as a stay.

    Returns
    -------
    pandas.DataFrame
        One row per stay point with the columns ``deviceid``,
        ``arrival_time``, ``leave_time``, ``stay_lat``, ``stay_lon`` and
        ``duration_min``. The columns are present even when no stay point is
        found, so downstream code and the written file keep a stable schema.
    """
    columns = ["deviceid", "arrival_time", "leave_time",
               "stay_lat", "stay_lon", "duration_min"]
    stay_points = []
    for device_id, group in tqdm.tqdm(df.groupby('deviceid', observed=True), desc="Processing devices"):
        group = group.sort_values("datetime")
        points = group[["datetime", "lat", "lon"]].values
        # Column views are looked up once per device instead of once per stay.
        lat_col = group["lat"]
        lon_col = group["lon"]
        n_points = len(points)
        i = 0
        while i < n_points:
            j = i + 1
            while j < n_points:
                dist = haversine(points[i][1], points[i][2], points[j][1], points[j][2])
                if dist > D_thres:
                    # Fix j is the first one outside the radius around anchor i.
                    delta_t = points[j][0] - points[i][0]
                    if delta_t > T_thres:
                        stay_points.append({
                            "deviceid": device_id,
                            "arrival_time": points[i][0],
                            "leave_time": points[j][0],
                            # Centroid of the fixes inside the radius (j excluded).
                            "stay_lat": lat_col.iloc[i:j].mean(),
                            "stay_lon": lon_col.iloc[i:j].mean(),
                            "duration_min": delta_t.total_seconds() / 60
                        })
                    break
                j += 1
            # Continue from the first fix that left the radius (or stop when
            # the scan ran off the end of the trajectory).
            i = j
    # Explicit columns keep the schema stable when `stay_points` is empty.
    return pd.DataFrame(stay_points, columns=columns)

# Entry point: process a file
def process_file(parquet_path, output_path):
    """Read one parquet file of GPS fixes, extract stay points and save them.

    The columns ``deviceid``, ``date``, ``time``, ``lon`` and ``lat`` are read
    into memory in one go; ``date`` and ``time`` are combined into a single
    ``datetime`` column (expected format ``%d.%m.%Y %H:%M:%S``) before
    ``extract_stay_points`` is called with its default thresholds.

    Parameters
    ----------
    parquet_path : str or pathlib.Path
        Input parquet file.
    output_path : str or pathlib.Path
        Destination parquet file for the stay points. Its parent directory is
        created if it does not exist yet.
    """
    # Accept plain strings as well as Path objects (`.name` is used below).
    parquet_path = Path(parquet_path)
    output_path = Path(output_path)

    print(f"Reading {parquet_path.name}...")
    df = pd.read_parquet(parquet_path, columns=["deviceid", "date", "time", "lon", "lat"])
    df["datetime"] = pd.to_datetime(df["date"].astype(str) + " " + df["time"].astype(str), format="%d.%m.%Y %H:%M:%S")
    df = df.drop(columns=["date", "time"])

    stay_df = extract_stay_points(df)
    # Writing fails if the target folder is missing, so make sure it exists.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    stay_df.to_parquet(output_path, index=False)
    print(f"Saved {len(stay_df)} stay points to {output_path.name}")

# Run this part per file
if __name__ == "__main__":
    # Single-day run: source file of raw fixes and destination for its stay points.
    input_path, output_path = (
        Path("data/20230327.parquet"),
        Path("staypoints/staypoints_20230327.parquet"),
    )
    # The destination folder has to exist before the result is written.
    output_path.parent.mkdir(exist_ok=True, parents=True)
    process_file(input_path, output_path)


Reading 20230327.parquet...


ArrowMemoryError: realloc of size 67108864 failed

In [7]:
def add_line_from_to(initial: tuple[float, float], final: tuple[float, float], m: folium.Map):
    """Draw an animated line between two points on a folium map.

    Parameters
    ----------
    initial : tuple[float, float]
        Start point; callers in this notebook pass ``(lat, lon)`` pairs.
    final : tuple[float, float]
        End point, same layout as ``initial``.
    m : folium.Map
        Map the line is added to (modified in place).
    """
    # Use the folium AntPath plugin to draw a line between two points
    AntPath(
        locations=[initial, final],
        dash_array=[20, 20],
        delay=1000,
        color="#A00000",
        pulse_color="#A00000",
        weight=5,
        tooltip="From start to finish"
    ).add_to(m)

In [None]:
# Draw the sequence of stay points of a few devices, one HTML map per device.
# NOTE(review): `dd` is assumed to be `dask.dataframe` -- confirm against the
# notebook's import cell. Dask frames are lazy and do not support positional
# row access (`[:10]` on a Series, `.iloc[i]` on rows), so the rows needed
# here are materialised as pandas objects first.
stay_points = dd.read_parquet('stay_points.parquet')
display(stay_points)
unique_devices = stay_points['deviceid'].unique().compute()
unique_devices = unique_devices.iloc[:10]  # Limit to 10 devices for demonstration
os.makedirs('maps', exist_ok=True)

# Load the rows of the selected devices once instead of filtering the lazy
# frame again for every device.
selected_points = stay_points[stay_points['deviceid'].isin(list(unique_devices))].compute()

for device in unique_devices:
    device_df = selected_points[selected_points['deviceid'] == device]
    m = folium.Map(location=[device_df['stay_lat'].mean(), device_df['stay_lon'].mean()], zoom_start=12)
    # Plain [lat, lon] float pairs, in the row order of the file
    # (presumably chronological -- verify if the file was re-partitioned).
    coords = device_df[['stay_lat', 'stay_lon']].to_numpy().tolist()
    # Connect each stay point with the next one.
    for initial, final in zip(coords, coords[1:]):
        add_line_from_to(initial, final, m)
    m.save(f'maps/{device}.html')

NameError: name 'pd' is not defined

In [None]:
# Automatically find all .parquet files in the data folder
days = sorted(["data/" + f for f in os.listdir("data") if f.endswith(".parquet")])

# Ensure the output directory exists (once, not on every iteration)
os.makedirs("sampled_data", exist_ok=True)

for day in tqdm.tqdm(days, desc="Processing days"):
    current = dd.read_parquet(day)
    # Uniform random 10% sample of the rows (fixed seed for reproducibility).
    # This is plain random sampling, not stratified by any column.
    sample = current.sample(frac=0.1, random_state=42)
    # Save the sampled data under the same file name in sampled_data/.
    # NOTE(review): `dd` is assumed to be `dask.dataframe`, whose to_parquet
    # takes `write_index` (pandas' `index=` keyword is not part of its API).
    sample.to_parquet(day.replace("data/", "sampled_data/", 1), write_index=False)

Processing days:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Sorted relative paths of every .parquet entry found in sampled_data/.
days_sample = sorted(
    "sampled_data/" + entry
    for entry in os.listdir("sampled_data")
    if entry.endswith(".parquet")
)