In [1]:
#Imports
import numpy as np  # noqa: F401
import dask.dataframe as dd
import matplotlib.pyplot as plt # noqa: F401
from geopy.distance import geodesic
from datetime import datetime, timedelta
import os # noqa: F401
import tqdm 
import folium
from folium.plugins import AntPath

In [None]:
df = dd.read_parquet('sampled_data/20230327.parquet', columns= ['deviceid', 'date', 'time', 'lon', 'lat'])

# Load data
df['datetime'] = dd.to_datetime(df['date'].astype(str) + ' ' + df['time'].astype(str), format='%d.%m.%Y %H:%M:%S')

df = df.sort_values(by=['deviceid', 'datetime']).reset_index(drop=True)

# Thresholds
D_thres = 200  # meters
T_thres = timedelta(minutes=20)

stay_points = []

# Process per device
for device_id, group in tqdm.tqdm(df.groupby('deviceid'), desc="Processing devices"):
    points = group[['datetime', 'lat', 'lon']].values
    i = 0
    while i < len(points):
        j = i + 1
        while j < len(points):
            dist = geodesic((points[i][1], points[i][2]), (points[j][1], points[j][2])).meters
            if dist > D_thres:
                delta_t = points[j][0] - points[i][0]
                if delta_t > T_thres:
                    lat_mean = group.iloc[i:j]['lat'].mean()
                    lon_mean = group.iloc[i:j]['lon'].mean()
                    stay_points.append({
                        'deviceid': device_id,
                        'arrival_time': points[i][0],
                        'leave_time': points[j][0],
                        'stay_lat': lat_mean,
                        'stay_lon': lon_mean,
                        'duration_min': delta_t.total_seconds() / 60
                    })
                break
            j += 1
        i = j

stay_df = dd.DataFrame(stay_points)

  for device_id, group in tqdm.tqdm(df.groupby('deviceid'), desc="Processing devices"):
Processing devices: 100%|██████████| 751168/751168 [34:43<00:00, 360.59it/s]  


In [7]:
def add_line_from_to(initial:list[int, int], final:list[int, int], m:folium.Map):
    # Use the folium AntPath plugin to draw a line between two points
    AntPath(
        locations=[initial, final],
        dash_array=[20, 20],
        delay=1000,
        color="#A00000",
        pulse_color="#A00000",
        weight=5,
        tooltip="From start to finish"
    ).add_to(m)

In [None]:
stay_points = dd.read_parquet('stay_points.parquet')
display(stay_points)
unique_devices = stay_points['deviceid'].unique()
unique_devices = unique_devices[:10]  # Limit to 10 devices for demonstration
os.makedirs('maps', exist_ok=True)
for device in unique_devices:
    device_df = stay_points[stay_points['deviceid'] == device]
    m = folium.Map(location=[device_df['stay_lat'].mean(), device_df['stay_lon'].mean()], zoom_start=12)
    for i in range(len(device_df)-1):
        initial = (device_df.iloc[i]['stay_lat'], device_df.iloc[i]['stay_lon'])
        final = (device_df.iloc[i+1]['stay_lat'], device_df.iloc[i+1]['stay_lon'])
        add_line_from_to(initial, final, m)
    m.save(f'maps/{device}.html')

NameError: name 'pd' is not defined

In [None]:
# Automatically find all .parquet files in the data folder
days = sorted(["data/" + f for f in os.listdir("data") if f.endswith(".parquet")])

for day in tqdm.tqdm(days, desc="Processing days"):
    current = dd.read_parquet(day)
    # Randomly sample 10% of the data following the same distribution as the original data
    sample = current.sample(frac=0.1, random_state=42)
    # Save the sampled data to a new parquet file
    # Ensure the directory exists
    os.makedirs("sampled_data", exist_ok=True)
    sample.to_parquet(day.replace("data/", "sampled_data/"), index=False)

Processing days:   0%|          | 0/7 [00:00<?, ?it/s]