In [4]:
from pathlib import Path
import logging
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.strtree import STRtree
from shapely.geometry import Point
import time
import os

In [5]:
IN_DIR = Path("data_binned") 
OUT_DIR = Path("data_transitions")
parquet_files = list(IN_DIR.glob("*.parquet"))
print(f"Found {len(parquet_files)} Parquet files in {IN_DIR}")

Found 7 Parquet files in data_binned


In [8]:
def save_transitions_from_df(df, output_file):
    """
    Save transition data directly from a dataframe with FROM, TO and VALID columns.
    
    Args:
        df: DataFrame with FROM, TO and VALID columns
        output_file: Path to save the file
        format: 'parquet'
        
        how it looks like:
        time_bin  FROM      TO  count
0         0       19.0    19.0     68
1         0       19.0   787.0      1
2         0       19.0  1435.0      2
...
    """
    
    # Filter to only rows with valid transitions
    valid_df = df[df['VALID'] == True].copy()
    
    # Group by time_bin, FROM, TO and count
    transitions_df = valid_df.groupby(['time_bin', 'FROM', 'TO']).size().reset_index(name='count')

    transitions_df.to_parquet(output_file, compression='snappy')
    # Print statistics
    file_size = os.path.getsize(output_file) / (1024*1024)
    
    print(f"Saved transitions to {output_file}")
    print(f"File size: {file_size:.2f} MB")
    print(f"Total transitions saved: {len(transitions_df)}")
    print(f"Unique time bins: {len(transitions_df['time_bin'].unique())}")
    print(f"Unique FROM zones: {len(transitions_df['FROM'].unique())}")
    print(f"Unique TO zones: {len(transitions_df['TO'].unique())}")
    
    return transitions_df

def create_transition_hashmap(df):
    """
    Create a transition matrix as a nested dictionary (hash map) from the dataframe.
    Structure will be: result[time_bin][from_zone][to_zone] = count
    """
    # Initialize the transition hash map
    # Structure: transition_map[time_bin][from_zone][to_zone] = count
    transition_map = {}
    
    # Filter to include only valid transitions
    valid_df = df[df['VALID'] == True].copy()
    
    # Print basic stats
    total_rows = len(df)
    valid_rows = len(valid_df)
    print(f"Total rows in dataframe: {total_rows}")
    print(f"Valid transitions: {valid_rows} ({valid_rows/total_rows*100:.2f}% of total)")
    
    # Process each valid row
    for _, row in valid_df.iterrows():
        from_zone = row['FROM']  # Using the FROM column
        to_zone = row['TO']      # Using the TO column
        time_bin = row['time_bin']
        
        # Initialize nested dictionaries if they don't exist
        if time_bin not in transition_map:
            transition_map[time_bin] = {}
        
        if from_zone not in transition_map[time_bin]:
            transition_map[time_bin][from_zone] = {}
        
        # Increment the transition count
        if to_zone in transition_map[time_bin][from_zone]:
            transition_map[time_bin][from_zone][to_zone] += 1
        else:
            transition_map[time_bin][from_zone][to_zone] = 1
    
    # Print transition statistics by time bin
    print("\nTransitions by time bin:")
    for time_bin in sorted(transition_map.keys()):
        time_bin_transitions = sum(sum(counts.values()) for counts in transition_map[time_bin].values())
        print(f"  Time bin {time_bin}: {time_bin_transitions} transitions ({time_bin_transitions/valid_rows*100:.2f}% of valid)")
    
    return transition_map

def print_stats(df):
    """
    Calculate and print the percentage of valid and invalid transitions
    using the VALID column.
    """
    total_rows = len(df)

    # Count valid transitions (VALID == True)
    valid_transitions = df['VALID'].sum()
    valid_pct = (valid_transitions / total_rows) * 100

    # Count invalid transitions (VALID == False)
    invalid_transitions = total_rows - valid_transitions
    invalid_pct = (invalid_transitions / total_rows) * 100

    print(f"Total rows: {total_rows}")
    print(f"Valid transitions: {valid_transitions} ({valid_pct:.2f}%)")
    print(f"Invalid transitions: {invalid_transitions} ({invalid_pct:.2f}%)")

def create_zone_transitions_sequential_approach1(df):
    # Extract arrays from dataframe
    zone_ids = df["zone_id"].to_numpy()
    time_bins = df["time_bin"].to_numpy()
    dc = df["device_change"].to_numpy()
    
    # Create arrays for the next row's values using roll
    zone_next = np.roll(zone_ids, -1)
    time_bin_next = np.roll(time_bins, -1)
    
    # Handle device changes - mark the last row of each device
    last_row = np.roll(dc, -1)
    last_row[-1] = True  # Last row of the entire dataframe
    
    # Valid transitions are when:
    # 1. Not at a device change boundary
    # 2. Time bins are the same
    valid_idx = (~last_row) & (time_bins == time_bin_next)
    
    # Create FROM and TO columns (use NaN for invalid transitions)
    from_zones = np.full(len(df), np.nan, dtype=float)
    to_zones = np.full(len(df), np.nan, dtype=float)
    
    # Set values only for valid transitions
    from_zones[valid_idx] = zone_ids[valid_idx]
    to_zones[valid_idx] = zone_next[valid_idx]
    
    # Add columns to dataframe
    df["FROM"] = from_zones
    df["TO"] = to_zones
    
    # Add VALID column - TRUE when both FROM and TO are not NaN
    # This is equivalent to valid_idx
    df["VALID"] = valid_idx
    
    # Add the same_zone column for convenience
    df["same_zone"] = (zone_ids == zone_next) & valid_idx
    
    return df


def create_zone_transition_matrix_approach1(df):
    # Get unique zone_ids and time_bins
    unique_zones = df['zone_id'].unique()
    time_bins = df['time_bin'].unique()
    
    # Initialize transition matrices for each time_bin
    transition_matrices = {tb: pd.DataFrame(0, index=unique_zones, columns=unique_zones) 
                          for tb in time_bins}
    
    # Iterate through rows sequentially
    for i in range(len(df) - 1):
        current_row = df.iloc[i]
        next_row = df.iloc[i + 1]
        
        # Check if we're still tracking the same device and in the same time bin
        if (not next_row['device_change'] and 
            current_row['time_bin'] == next_row['time_bin']):
            
            # Get from and to zones
            from_zone = current_row['zone_id']
            to_zone = next_row['zone_id']
            time_bin = current_row['time_bin']
            
            # Count all transitions, even those between the same zone
            transition_matrices[time_bin].loc[from_zone, to_zone] += 1
    
    return transition_matrices

def create_zone_transition_by_dwell_time(df):
    """
    Create a transition matrix based on dominant dwell zones per user and time_bin.

    Returns:
        DataFrame with columns: FROM, TO, count
    """
    # Ensure the timestamps are in datetime format
    df["timestamp"] = pd.to_datetime(df["date"].astype(str) + " " + df["time"].astype(str))

    # Sort the dataframe for accurate duration calculation
    df = df.sort_values(by=["deviceid", "time_bin", "timestamp"])

    # Calculate duration between consecutive points
    df["next_timestamp"] = df.groupby(["deviceid", "time_bin"])["timestamp"].shift(-1)
    df["duration_minutes"] = (df["next_timestamp"] - df["timestamp"]).dt.total_seconds() / 60.0

    # Assume final point in each group lasts 0 minute if not known
    df["duration_minutes"] = df["duration_minutes"].fillna(0)

    # Get zone with max dwell time per device per time_bin
    dwell_df = df.groupby(["deviceid", "time_bin", "zone_id"])["duration_minutes"].sum().reset_index()
    dominant = dwell_df.loc[dwell_df.groupby(["deviceid", "time_bin"])["duration_minutes"].idxmax()]

    # Sort by device and time bin to prepare for transition extraction
    dominant = dominant.sort_values(by=["deviceid", "time_bin"])

    # Shift to get next row's zone and time_bin
    dominant["next_zone"] = dominant.groupby("deviceid")["zone_id"].shift(-1)
    dominant["next_time_bin"] = dominant.groupby("deviceid")["time_bin"].shift(-1)

    # Only keep transitions between consecutive time bins
    valid_transitions = dominant[dominant["time_bin"] + 1 == dominant["next_time_bin"]].copy()

    # Rename columns to match other transition formats
    valid_transitions.rename(columns={"zone_id": "FROM", "next_zone": "TO"}, inplace=True)

    # Count transitions
    transition_matrix = (
        valid_transitions.groupby(["time_bin", "FROM", "TO"])
        .size()
        .reset_index(name="count")
    )

    return transition_matrix


transition_matrices_per_day = []


OUT_DIR.mkdir(parents=True, exist_ok=True)
# Loop through each file and process it individually
for file_path in parquet_files:
    print(f"\nProcessing {file_path.name}...")
    
    # Load the current parquet file
    df = pd.read_parquet(file_path)
    
    # Print information about this file
    print(f"Columns: {df.columns.tolist()}")
    print(f"Shape: {df.shape}")
    
    # Add timer before function call
    start_time = time.time()
    
    # Perform any necessary operations on df here
    #transition_matrices_per_day.append(create_zone_transition_matrix_approach1(df))
    df = create_zone_transition_by_dwell_time(df)
    
    elapsed_time = time.time() - start_time
    print(f"Time taken to add to the df FROM/TO: {elapsed_time:.2f} seconds")
    
    #start_time = time.time()
    #transition_map = create_transition_hashmap(df)
    #elapsed_time = time.time() - start_time
    #print(f"Processing transition matrix took: {elapsed_time:.2f} seconds")
    out_path = OUT_DIR / file_path.name
    
    df.to_parquet(out_path, compression='snappy')
    # The dataframe will be garbage collected after each iteration
    # as it goes out of scope
    print(f"Finished processing {file_path.name}")
    print("-" * 50)
    
print('Finished')


Processing 20230331.parquet...
Columns: ['deviceid', 'date', 'time', 'lon', 'lat', 'datetime', 'device_change', 'dist_m', 'dt', 'speed_m_s', 'zone_id', 'time_bin']
Shape: (87260420, 12)


  df["timestamp"] = pd.to_datetime(df["date"].astype(str) + " " + df["time"].astype(str))


Time taken to add to the df FROM/TO: 382.21 seconds
Finished processing 20230331.parquet
--------------------------------------------------

Processing 20230328.parquet...
Columns: ['deviceid', 'date', 'time', 'lon', 'lat', 'datetime', 'device_change', 'dist_m', 'dt', 'speed_m_s', 'zone_id', 'time_bin']
Shape: (86532152, 12)


  df["timestamp"] = pd.to_datetime(df["date"].astype(str) + " " + df["time"].astype(str))


Time taken to add to the df FROM/TO: 405.75 seconds
Finished processing 20230328.parquet
--------------------------------------------------

Processing 20230327.parquet...
Columns: ['deviceid', 'date', 'time', 'lon', 'lat', 'datetime', 'device_change', 'dist_m', 'dt', 'speed_m_s', 'zone_id', 'time_bin']
Shape: (87981395, 12)


  df["timestamp"] = pd.to_datetime(df["date"].astype(str) + " " + df["time"].astype(str))


Time taken to add to the df FROM/TO: 457.47 seconds
Finished processing 20230327.parquet
--------------------------------------------------

Processing 20230401.parquet...
Columns: ['deviceid', 'date', 'time', 'lon', 'lat', 'datetime', 'device_change', 'dist_m', 'dt', 'speed_m_s', 'zone_id', 'time_bin']
Shape: (78501155, 12)
Time taken to add to the df FROM/TO: 375.11 seconds
Finished processing 20230401.parquet
--------------------------------------------------

Processing 20230329.parquet...
Columns: ['deviceid', 'date', 'time', 'lon', 'lat', 'datetime', 'device_change', 'dist_m', 'dt', 'speed_m_s', 'zone_id', 'time_bin']
Shape: (88523283, 12)


  df["timestamp"] = pd.to_datetime(df["date"].astype(str) + " " + df["time"].astype(str))


Time taken to add to the df FROM/TO: 499.47 seconds
Finished processing 20230329.parquet
--------------------------------------------------

Processing 20230330.parquet...
Columns: ['deviceid', 'date', 'time', 'lon', 'lat', 'datetime', 'device_change', 'dist_m', 'dt', 'speed_m_s', 'zone_id', 'time_bin']
Shape: (88407616, 12)


  df["timestamp"] = pd.to_datetime(df["date"].astype(str) + " " + df["time"].astype(str))


Time taken to add to the df FROM/TO: 474.47 seconds
Finished processing 20230330.parquet
--------------------------------------------------

Processing 20230402.parquet...
Columns: ['deviceid', 'date', 'time', 'lon', 'lat', 'datetime', 'device_change', 'dist_m', 'dt', 'speed_m_s', 'zone_id', 'time_bin']
Shape: (75946916, 12)
Time taken to add to the df FROM/TO: 316.52 seconds
Finished processing 20230402.parquet
--------------------------------------------------
Finished
