In [2]:
from pathlib import Path
import logging
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.strtree import STRtree
from shapely.geometry import Point
import time
import os

In [3]:
# Directory holding the binned input Parquet files, and the directory the
# aggregated transition tables are written to.
IN_DIR = Path("data_binned")
OUT_DIR = Path("data_transitions")

# Path.glob yields entries in an arbitrary, filesystem-dependent order; sort
# them so every run processes (and logs) the files in the same order.
parquet_files = sorted(IN_DIR.glob("*.parquet"))
print(f"Found {len(parquet_files)} Parquet files in {IN_DIR}")


Found 7 Parquet files in data_binned


In [7]:
def calculate_nan_percentages(df):
    """
    Report how many rows of ``df`` are flagged valid versus invalid.

    Prints the total number of rows, followed by the count and percentage of
    rows whose 'VALID' value is True and of the remaining rows. For an empty
    frame both percentages are reported as 0.

    Args:
        df: DataFrame with a 'VALID' column
    """
    n_total = len(df)
    n_valid = df['VALID'].sum()  # True counts as 1, False as 0
    n_invalid = n_total - n_valid

    def share(count):
        # Percentage of all rows; 0 for an empty frame to avoid dividing by zero.
        return (count / n_total) * 100 if n_total > 0 else 0

    print(f"Total rows: {n_total}")
    print(f"Valid rows (VALID=True): {n_valid} ({share(n_valid):.2f}%)")
    print(f"Invalid rows (VALID=False): {n_invalid} ({share(n_invalid):.2f}%)")


def save_transitions_from_df(df, output_file):
    """
    Aggregate the valid transitions of ``df`` and write them to a Parquet file.

    Rows whose 'VALID' flag is True are counted per (time_bin, FROM, TO)
    combination. The aggregated table is written to ``output_file`` with
    snappy compression and a short summary is printed.

    Args:
        df: DataFrame with time_bin, FROM, TO and VALID columns
        output_file: Path of the Parquet file to write

    Returns:
        The aggregated DataFrame with columns time_bin, FROM, TO and count.

    Layout of the saved table:
           time_bin  FROM      TO  count
        0         0  19.0    19.0     68
        1         0  19.0   787.0      1
        2         0  19.0  1435.0      2
        ...
    """
    # Keep only the rows flagged as valid transitions.
    is_valid = df['VALID'] == True

    # One output row per distinct (time_bin, FROM, TO) with its frequency.
    transitions_df = (
        df[is_valid]
        .groupby(['time_bin', 'FROM', 'TO'])
        .size()
        .reset_index(name='count')
    )

    transitions_df.to_parquet(output_file, compression='snappy')

    # Size of the file just written, in MiB.
    file_size = os.path.getsize(output_file) / (1024 * 1024)

    print(f"Saved transitions to {output_file}")
    print(f"File size: {file_size:.2f} MB")
    print(f"Total transitions saved: {len(transitions_df)}")
    for label, column in (("time bins", 'time_bin'),
                          ("FROM zones", 'FROM'),
                          ("TO zones", 'TO')):
        print(f"Unique {label}: {transitions_df[column].nunique()}")

    return transitions_df

def create_transition_hashmap(df):
    """
    Build a nested dictionary of transition counts from the valid rows of ``df``.

    The result has the structure ``result[time_bin][from_zone][to_zone] = count``
    where ``count`` is the number of rows with ``VALID == True`` that carry
    that (time_bin, FROM, TO) combination. Dictionary keys appear in the order
    in which they are first encountered in ``df``. Valid rows with a missing
    time_bin, FROM or TO value are not counted.

    A short summary (row totals and the number of transitions per time bin)
    is printed as a side effect.

    Args:
        df: DataFrame with time_bin, FROM, TO and VALID columns

    Returns:
        dict mapping time_bin -> from_zone -> to_zone -> int count; empty when
        ``df`` has no valid rows (including an empty ``df``).
    """
    # Boolean indexing already yields a new frame, so no extra copy is needed.
    valid_df = df[df['VALID'] == True]

    total_rows = len(df)
    valid_rows = len(valid_df)
    # Guard the percentage so an empty frame reports 0% instead of raising.
    valid_pct = valid_rows / total_rows * 100 if total_rows > 0 else 0
    print(f"Total rows in dataframe: {total_rows}")
    print(f"Valid transitions: {valid_rows} ({valid_pct:.2f}% of total)")

    # Count every (time_bin, FROM, TO) combination in one pass instead of
    # walking the frame row by row; sort=False keeps first-appearance order.
    pair_counts = valid_df.groupby(['time_bin', 'FROM', 'TO'], sort=False).size()

    transition_map = {}
    for (time_bin, from_zone, to_zone), count in pair_counts.items():
        by_from = transition_map.setdefault(time_bin, {})
        by_from.setdefault(from_zone, {})[to_zone] = int(count)

    # Per-bin totals; valid_rows is necessarily > 0 whenever the map is non-empty.
    print("\nTransitions by time bin:")
    for time_bin in sorted(transition_map.keys()):
        time_bin_transitions = sum(
            sum(counts.values()) for counts in transition_map[time_bin].values()
        )
        print(f"  Time bin {time_bin}: {time_bin_transitions} transitions ({time_bin_transitions/valid_rows*100:.2f}% of valid)")

    return transition_map

def print_stats(df):
    """
    Print the number and percentage of valid and invalid transitions.

    A row counts as a valid transition when its 'VALID' value is True; all
    remaining rows count as invalid. For an empty frame both percentages are
    reported as 0 rather than dividing by zero.

    Args:
        df: DataFrame with a 'VALID' column
    """
    total_rows = len(df)

    # Summing the flag column counts the True entries.
    valid_transitions = df['VALID'].sum()
    invalid_transitions = total_rows - valid_transitions

    if total_rows > 0:
        valid_pct = (valid_transitions / total_rows) * 100
        invalid_pct = (invalid_transitions / total_rows) * 100
    else:
        # Nothing to divide by; report 0% for both shares.
        valid_pct = invalid_pct = 0

    print(f"Total rows: {total_rows}")
    print(f"Valid transitions: {valid_transitions} ({valid_pct:.2f}%)")
    print(f"Invalid transitions: {invalid_transitions} ({invalid_pct:.2f}%)")

    
def _clock_to_seconds(t):
    """Convert an 'HH:MM:SS' string, or an object with hour/minute/second attributes, to seconds since midnight."""
    if isinstance(t, str):
        parts = t.split(':')
        return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
    return t.hour * 3600 + t.minute * 60 + t.second


def create_zone_transitions_sequential_approach2(df, max_gap_seconds=3600):
    """
    Create zone transitions based on the zone where a device spends the most
    time in each time bin.

    Rows are expected in sequence per device, with ``device_change`` True on
    the first row of each new device. The steps are:

    1. Dwell time: for every row that is followed by another row of the same
       device, the seconds until that next row are credited to the row's own
       zone and time bin. Gaps longer than ``max_gap_seconds`` are credited
       as 0 seconds; the last row of each device credits nothing.
    2. Primary zone: for every (device, time_bin) the zone with the largest
       total dwell time; on a tie the zone encountered first wins.
    3. Transition: a row is marked valid when the next row belongs to the same
       device, the next row's time_bin is exactly ``time_bin + 1`` and both
       bins have a primary zone. FROM/TO are the primary zones of the current
       and the next bin (not necessarily the row's own zone).

    Devices are identified by the 'device_id' column when present, otherwise
    by counting ``device_change`` markers. Rows with a missing device, time_bin
    or zone_id do not contribute dwell time.

    Args:
        df: DataFrame with zone_id, time_bin, device_change columns and a
            'time' column holding 'HH:MM:SS' strings (or objects exposing
            hour/minute/second). zone_id must be numeric because FROM/TO are
            stored as floats with NaN meaning "no transition".
        max_gap_seconds: Largest gap between consecutive points that still
            counts as dwell time (default 3600 seconds = 1 hour).

    Returns:
        A copy of ``df`` with FROM, TO, VALID and same_zone columns added;
        ``df`` itself is not modified.

    Raises:
        ValueError: if ``df`` has no 'time' column.
    """
    if 'time' not in df.columns:
        raise ValueError("DataFrame must have a 'time' column in HH:MM:SS format")

    # Work on a copy to avoid modifying the caller's frame.
    result_df = df.copy()
    n_rows = len(result_df)

    # Result columns, filled positionally; NaN / False mean "no transition".
    from_zones = np.full(n_rows, np.nan, dtype=float)
    to_zones = np.full(n_rows, np.nan, dtype=float)
    valid = np.zeros(n_rows, dtype=bool)

    if n_rows > 0:
        zone_ids = result_df["zone_id"].to_numpy()
        time_bins = result_df["time_bin"].to_numpy()
        device_changes = result_df["device_change"].to_numpy().astype(bool)

        # A row is the last one of its device when the next row starts a new
        # device, or when it is the final row of the frame.
        is_last_of_device = np.append(device_changes[1:], True)
        interior = ~is_last_of_device

        # Rows after which the same device continues in the directly following bin.
        steps_to_next_bin = np.zeros(n_rows, dtype=bool)
        steps_to_next_bin[:-1] = time_bins[1:] == time_bins[:-1] + 1
        candidates = np.flatnonzero(interior & steps_to_next_bin)

        if candidates.size:
            if 'device_id' in result_df.columns:
                device_ids = result_df["device_id"].to_numpy()
            else:
                # Synthetic ids: start at 0 and increase at every change
                # marker after the first row.
                device_ids = np.cumsum(device_changes) - int(device_changes[0])

            # Seconds since midnight, then seconds until the next point.
            seconds = result_df['time'].apply(_clock_to_seconds).to_numpy()
            dwell = np.zeros(n_rows, dtype=seconds.dtype)
            dwell[:-1] = np.diff(seconds)
            dwell[is_last_of_device | (dwell > max_gap_seconds)] = 0

            # Total dwell per (device, bin, zone); sort=False keeps the groups
            # in first-appearance order, which the tie-break below relies on.
            dwell_by_zone = (
                pd.DataFrame({
                    'device': device_ids[interior],
                    'bin': time_bins[interior],
                    'zone': zone_ids[interior],
                    'dwell': dwell[interior],
                })
                .groupby(['device', 'bin', 'zone'], sort=False)['dwell']
                .sum()
                .reset_index()
            )

            # idxmax returns the first maximum, i.e. the earliest-seen zone on ties.
            best_rows = dwell_by_zone.groupby(['device', 'bin'], sort=False)['dwell'].idxmax()
            primary = dwell_by_zone.loc[best_rows, ['device', 'bin', 'zone']]

            # Look up the primary zone of the current and of the next bin for
            # every candidate row; a left merge keeps the candidate order.
            keys = pd.DataFrame({
                'device': device_ids[candidates],
                'bin': time_bins[candidates],
                'next_bin': time_bins[candidates + 1],
            })
            from_lookup = keys.merge(
                primary, how='left', on=['device', 'bin'], validate='many_to_one'
            )['zone'].to_numpy(dtype=float)
            to_lookup = keys.merge(
                primary.rename(columns={'bin': 'next_bin'}),
                how='left', on=['device', 'next_bin'], validate='many_to_one'
            )['zone'].to_numpy(dtype=float)

            # Only keep candidates for which both bins have a primary zone.
            found = ~(np.isnan(from_lookup) | np.isnan(to_lookup))
            hits = candidates[found]
            from_zones[hits] = from_lookup[found]
            to_zones[hits] = to_lookup[found]
            valid[hits] = True

    result_df['FROM'] = from_zones
    result_df['TO'] = to_zones
    result_df['VALID'] = valid

    # Add the same_zone column for convenience
    result_df["same_zone"] = (result_df["FROM"] == result_df["TO"]) & result_df["VALID"]

    return result_df

def create_zone_transitions_sequential_approach1(df):
    """
    Annotate every row with the transition to the directly following row.

    A row is a valid transition when the next row does not start a new device
    (its ``device_change`` flag is False) and lies in the same time bin. For
    valid rows FROM is the row's own zone and TO the next row's zone; all
    other rows, including the last row of the frame, get NaN and VALID=False.

    NOTE: the FROM, TO, VALID and same_zone columns are added to ``df`` in
    place, and that same object is returned.

    Args:
        df: DataFrame with zone_id, time_bin and device_change columns.
            zone_id must be numeric because FROM/TO are stored as floats.

    Returns:
        ``df`` with the four columns added; an empty frame is returned with
        empty columns instead of raising.
    """
    zone_ids = df["zone_id"].to_numpy()
    time_bins = df["time_bin"].to_numpy()
    # Force a boolean array so that `~` below is a logical NOT.
    device_changes = df["device_change"].to_numpy().astype(bool)
    n_rows = len(df)

    # Compare each row with its successor; the final row has none, so it
    # stays False. This also makes the empty frame a no-op.
    valid_idx = np.zeros(n_rows, dtype=bool)
    if n_rows > 1:
        valid_idx[:-1] = ~device_changes[1:] & (time_bins[:-1] == time_bins[1:])
    sources = np.flatnonzero(valid_idx)

    # FROM/TO default to NaN and are filled only for valid transitions.
    from_zones = np.full(n_rows, np.nan, dtype=float)
    to_zones = np.full(n_rows, np.nan, dtype=float)
    from_zones[sources] = zone_ids[sources]
    to_zones[sources] = zone_ids[sources + 1]

    # True where a valid transition stays within the same zone.
    same_zone = np.zeros(n_rows, dtype=bool)
    same_zone[sources] = zone_ids[sources] == zone_ids[sources + 1]

    df["FROM"] = from_zones
    df["TO"] = to_zones
    df["VALID"] = valid_idx
    df["same_zone"] = same_zone

    return df


def create_zone_transition_matrix_approach1(df):
    """
    Build one zone-to-zone transition count matrix per time bin.

    Every pair of consecutive rows counts as one transition when the second
    row does not start a new device (its ``device_change`` flag is False) and
    both rows share the same time bin. Transitions that stay within the same
    zone are counted as well. Pairs with a missing time_bin or zone_id are
    skipped.

    Args:
        df: DataFrame with zone_id, time_bin and device_change columns

    Returns:
        dict mapping each distinct time_bin to a square DataFrame of counts,
        with rows as the FROM zone and columns as the TO zone, both labelled
        by the distinct zone ids of ``df``.
    """
    unique_zones = df['zone_id'].unique()
    time_bins = df['time_bin'].unique()

    # Start every time bin with an all-zero zones x zones matrix.
    transition_matrices = {tb: pd.DataFrame(0, index=unique_zones, columns=unique_zones)
                           for tb in time_bins}

    zone_ids = df['zone_id'].to_numpy()
    row_bins = df['time_bin'].to_numpy()
    device_changes = df['device_change'].to_numpy().astype(bool)

    # Compare every row with its successor in one vectorised pass instead of
    # materialising two row Series per iteration.
    keep = ~device_changes[1:] & (row_bins[:-1] == row_bins[1:])

    pair_counts = (
        pd.DataFrame({
            'time_bin': row_bins[:-1][keep],
            'from_zone': zone_ids[:-1][keep],
            'to_zone': zone_ids[1:][keep],
        })
        .groupby(['time_bin', 'from_zone', 'to_zone'], sort=False)
        .size()
    )

    # Each distinct (bin, from, to) combination is written exactly once.
    for (time_bin, from_zone, to_zone), count in pair_counts.items():
        transition_matrices[time_bin].at[from_zone, to_zone] = int(count)

    return transition_matrices


# Collects per-file transition matrices when the matrix-based approach
# (create_zone_transition_matrix_approach1) is enabled in the loop below.
transition_matrices_per_day = []

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Process the files one at a time so only a single file is held in memory.
for file_path in parquet_files:
    print(f"\nProcessing {file_path.name}...")

    # Drop the reference to the previous iteration's frame BEFORE loading the
    # next file. A loop variable does not go out of scope between iterations,
    # so without this the old frame stays alive while the new one is read.
    df = None
    df = pd.read_parquet(file_path)

    # Print information about this file
    print(f"Columns: {df.columns.tolist()}")
    print(f"Shape: {df.shape}")
    print("Sample data:")
    print(df.head())

    # Time the transition computation; perf_counter is monotonic, so the
    # measurement is not affected by system clock adjustments.
    start_time = time.perf_counter()

    # Alternatives that can be swapped in here:
    #   transition_matrices_per_day.append(create_zone_transition_matrix_approach1(df))
    #   df = create_zone_transitions_sequential_approach1(df)
    df = create_zone_transitions_sequential_approach2(df)

    elapsed_time = time.perf_counter() - start_time
    print(f"Time taken to add to the df FROM/TO: {elapsed_time:.2f} seconds")
    calculate_nan_percentages(df)

    # Optional in-memory view of the same counts:
    #   transition_map = create_transition_hashmap(df)

    # The output keeps the input's file name, in the output directory.
    out_path = OUT_DIR / file_path.name
    save_transitions_from_df(df, out_path)

    print(f"Finished processing {file_path.name}")
    print("-" * 50)

print('Finished')


Processing 20230331.parquet...
Columns: ['deviceid', 'date', 'time', 'lon', 'lat', 'datetime', 'device_change', 'dist_m', 'dt', 'speed_m_s', 'zone_id', 'time_bin']
Shape: (87260420, 12)
Sample data:
   deviceid        date      time       lon        lat            datetime  \
0         0  31.03.2023  02:29:54  14.52146  46.052380 2023-03-31 02:29:54   
1         0  31.03.2023  07:40:40  14.52137  46.051311 2023-03-31 07:40:40   
2         0  31.03.2023  07:41:13  14.52137  46.051311 2023-03-31 07:41:13   
3         0  31.03.2023  08:00:11  14.52139  46.053879 2023-03-31 08:00:11   
4         0  31.03.2023  08:00:14  14.52139  46.053879 2023-03-31 08:00:14   

   device_change      dist_m     dt  speed_m_s  zone_id  time_bin  
0          False  119.063995   2218   0.053681     1133         2  
1          False  119.063995  18646   0.006385     1133         7  
2          False   30.052505     30   1.001750     1133         7  
3          False  285.569763   1138   0.250940     1827    

KeyboardInterrupt: 