In [1]:
import gzip
import json
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pytz
import matplotlib.dates as mdates
import matplotlib.patches as mpatches
import os
import random

import imn_loading

In [2]:
def read_poi_data(filepath):
    poi_data = {}
    with gzip.open(filepath, 'rt', encoding='utf-8') as f:
        for line in f:
            row = json.loads(line)
            # Keep UID as int to match IMN data format
            poi_data[row['uid']] = row
    return poi_data

poi_to_activity = {
    "education": "school", "food_and_drink": "eat", "shopping": "shop",
    "entertainment_and_recreation": "leisure", "transportation": "transit",
    "healthcare": "health", "public_services": "admin", "finance": "finance",
    "utilities": "utility", "other": "unknown"
}

def enrich_imn_with_poi(imn, poi_info):
    poi_classes = poi_info["poi_classes"]
    enriched = {}
    for loc_id, loc in imn["locations"].items():
        vec = poi_info["poi_freq"].get(loc_id, [0.0]*len(poi_classes))
        top_idx = int(np.argmax(vec))
        label = poi_to_activity.get(poi_classes[top_idx], "unknown")
        if loc_id == imn.get("home"): label = "home"
        if loc_id == imn.get("work"): label = "work"
        enriched[loc_id] = {**loc, "activity_label": label}
    imn["locations"] = enriched
    return imn

def extract_stays_from_trips(trips, locations):
    """Convert trips into stays by considering the destination of each trip as a stay."""
    stays = []
    
    # First pass: create stays with start times
    for from_id, to_id, st, et in trips:
        activity_label = locations[to_id].get('activity_label', 'unknown')
        # Create stay with start time as trip end time
        stays.append(Stay(to_id, activity_label, et, None))
    
    # Second pass: set end times based on next stay's start time
    for i in range(len(stays)-1):
        # set end time to the START of the next trip, not the next stay's start
        next_trip_start = trips[i+1][2]  # start time of the next trip
        stays[i].set_end_time(next_trip_start)
    
    # Handle the last stay
    if stays:
        # For the last stay, if there's a next day's first trip, use that as end time
        # Otherwise, use a default duration of 1 hour
        last_stay = stays[-1]
        if last_stay.start_time is not None:
            last_stay.set_end_time(last_stay.start_time + 3600)  # Default 1 hour duration
    
    return stays

def extract_stays_by_day(stays):
    """Group stays by day, handling cross-day stays."""
    stays_by_day = defaultdict(list)
    
    for stay in stays:
        if stay.start_time is None or stay.end_time is None:
            continue
            
        start_dt = datetime.fromtimestamp(stay.start_time, tz)
        end_dt = datetime.fromtimestamp(stay.end_time, tz)
        
        # If stay spans multiple days, split it
        current_dt = start_dt.replace(hour=0, minute=0, second=0, microsecond=0)
        end_of_day = current_dt + timedelta(days=1)
        
        while current_dt < end_dt:
            day_start = max(start_dt, current_dt)
            day_end = min(end_dt, end_of_day)
            
            # Create stay for this day
            day_stay = Stay(
                stay.location_id,
                stay.activity_label,
                int(day_start.timestamp()),
                int(day_end.timestamp())
            )
            
            stays_by_day[current_dt.date()].append(day_stay)
            
            # Move to next day
            current_dt = end_of_day
            end_of_day = current_dt + timedelta(days=1)
    
    return stays_by_day


class Stay:
    def __init__(self, location_id, activity_label, start_time, end_time):
        self.location_id = location_id
        self.activity_label = activity_label
        self.start_time = start_time
        self.end_time = end_time
        self.duration = self.end_time - self.start_time if self.end_time is not None and self.start_time is not None else None

    def set_end_time(self, end_time):
        self.end_time = end_time
        if self.start_time is not None:
            self.duration = self.end_time - self.start_time

    def to_dict(self):
        return {
            'location_id': self.location_id,
            'activity_label': self.activity_label,
            'start_time': self.start_time,
            'end_time': self.end_time,
            'duration': self.duration
        }

In [3]:
# Load IMNs and POI data from the new test files
imns = imn_loading.read_imn('data/test_milano_imns.json.gz')
poi_data = read_poi_data('data/test_milano_imns_pois.json.gz')
print(f"Loaded {len(imns)} IMNs and POI data for {len(poi_data)} users")

Loaded 11658 IMNs and POI data for 11658 users


In [4]:
user_id = 650

In [5]:
imn = imns[user_id]
imn.keys()

dict_keys(['locations', 'trips', 'home', 'work'])

In [6]:
imn

{'locations': {'0': {'coordinates': [9.175958, 45.528595], 'frequency': 20},
  '1': {'coordinates': [9.20885, 45.414547], 'frequency': 4},
  '2': {'coordinates': [9.174006, 45.527006], 'frequency': 4},
  '3': {'coordinates': [9.166437, 45.557348], 'frequency': 4},
  '4': {'coordinates': [9.175718, 45.526646], 'frequency': 4},
  '5': {'coordinates': [9.05514, 45.41317], 'frequency': 2},
  '6': {'coordinates': [9.23827, 45.552969], 'frequency': 2},
  '7': {'coordinates': [9.174558, 45.528896], 'frequency': 2},
  '8': {'coordinates': [9.1779, 45.508082], 'frequency': 2},
  '9': {'coordinates': [9.150353, 45.520783], 'frequency': 2},
  '10': {'coordinates': [9.178946, 45.498679], 'frequency': 2},
  '11': {'coordinates': [9.238809, 45.552381], 'frequency': 2},
  '12': {'coordinates': [9.078704, 45.426008], 'frequency': 2}},
 'trips': [('0', '5', 1175434055, 1175435457),
  ('5', '0', 1175442486, 1175445449),
  ('0', '6', 1175499996, 1175501080),
  ('6', '0', 1175506936, 1175507644),
  ('0', 

In [7]:
poi_data[user_id]

{'uid': 650,
 'poi_classes': ['other',
  'transportation',
  'healthcare',
  'public_services',
  'finance',
  'food_and_drink',
  'utilities',
  'education',
  'entertainment_and_recreation',
  'shopping'],
 'poi_freq': {'0': [1.0, 1.0, 1.0, 3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
  '1': [0.0, 2.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
  '2': [3.0, 2.0, 2.0, 3.0, 1.0, 3.0, 2.0, 0.0, 0.0, 0.0],
  '3': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  '4': [1.0, 2.0, 1.0, 0.0, 1.0, 1.0, 3.0, 1.0, 0.0, 1.0],
  '5': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  '6': [1.0, 2.0, 0.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0],
  '7': [2.0, 1.0, 1.0, 2.0, 1.0, 5.0, 1.0, 0.0, 0.0, 0.0],
  '8': [1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 2.0, 1.0, 0.0, 0.0],
  '9': [189.0, 16.0, 1.0, 1.0, 0.0, 5.0, 3.0, 0.0, 0.0, 0.0],
  '10': [27.0, 4.0, 1.0, 19.0, 0.0, 4.0, 4.0, 0.0, 0.0, 0.0],
  '11': [1.0, 3.0, 0.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, 0.0],
  '12': [0.0, 7.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}}

In [8]:
# Enrich IMN with POI data
enriched = enrich_imn_with_poi(imn, poi_data[user_id])
enriched

{'locations': {'0': {'coordinates': [9.175958, 45.528595],
   'frequency': 20,
   'activity_label': 'home'},
  '1': {'coordinates': [9.20885, 45.414547],
   'frequency': 4,
   'activity_label': 'work'},
  '2': {'coordinates': [9.174006, 45.527006],
   'frequency': 4,
   'activity_label': 'unknown'},
  '3': {'coordinates': [9.166437, 45.557348],
   'frequency': 4,
   'activity_label': 'unknown'},
  '4': {'coordinates': [9.175718, 45.526646],
   'frequency': 4,
   'activity_label': 'utility'},
  '5': {'coordinates': [9.05514, 45.41317],
   'frequency': 2,
   'activity_label': 'unknown'},
  '6': {'coordinates': [9.23827, 45.552969],
   'frequency': 2,
   'activity_label': 'eat'},
  '7': {'coordinates': [9.174558, 45.528896],
   'frequency': 2,
   'activity_label': 'eat'},
  '8': {'coordinates': [9.1779, 45.508082],
   'frequency': 2,
   'activity_label': 'utility'},
  '9': {'coordinates': [9.150353, 45.520783],
   'frequency': 2,
   'activity_label': 'unknown'},
  '10': {'coordinates': [9

In [9]:
# Extract stays from trips
stays = extract_stays_from_trips(enriched['trips'], enriched['locations'])
stays

[<__main__.Stay at 0x3127ec590>,
 <__main__.Stay at 0x30e806d50>,
 <__main__.Stay at 0x30e806c10>,
 <__main__.Stay at 0x3094df100>,
 <__main__.Stay at 0x3094defd0>,
 <__main__.Stay at 0x1077b68d0>,
 <__main__.Stay at 0x105573680>,
 <__main__.Stay at 0x30957af10>,
 <__main__.Stay at 0x30ec4c550>,
 <__main__.Stay at 0x30ec4c950>,
 <__main__.Stay at 0x3095b15e0>,
 <__main__.Stay at 0x3095b17c0>,
 <__main__.Stay at 0x3095cd7f0>,
 <__main__.Stay at 0x3095cdc50>,
 <__main__.Stay at 0x1058af380>,
 <__main__.Stay at 0x30ecd4590>,
 <__main__.Stay at 0x30ecd4350>,
 <__main__.Stay at 0x30950e200>,
 <__main__.Stay at 0x30950e360>,
 <__main__.Stay at 0x3095df2f0>,
 <__main__.Stay at 0x3095df250>,
 <__main__.Stay at 0x3095c5490>,
 <__main__.Stay at 0x309819550>,
 <__main__.Stay at 0x3098195d0>,
 <__main__.Stay at 0x309620c90>,
 <__main__.Stay at 0x3096203d0>]

In [10]:
# --- Timezone config ---
tz = pytz.timezone("Europe/Rome")

# Group stays by day
stays_by_day = extract_stays_by_day(stays)
for day, stays in stays_by_day.items():
    print(f"Day: {day}")
    for stay in stays:
        print(f"  - {stay.location_id} ({stay.activity_label}) from {datetime.fromtimestamp(stay.start_time, tz)} to {datetime.fromtimestamp(stay.end_time, tz)}")


Day: 2007-04-01
  - 5 (unknown) from 2007-04-01 15:50:57+02:00 to 2007-04-01 17:48:06+02:00
  - 0 (home) from 2007-04-01 18:37:29+02:00 to 2007-04-02 00:00:00+02:00
Day: 2007-04-02
  - 0 (home) from 2007-04-02 00:00:00+02:00 to 2007-04-02 09:46:36+02:00
  - 6 (eat) from 2007-04-02 10:04:40+02:00 to 2007-04-02 11:42:16+02:00
  - 0 (home) from 2007-04-02 11:54:04+02:00 to 2007-04-02 15:29:11+02:00
  - 1 (work) from 2007-04-02 16:11:36+02:00 to 2007-04-02 16:39:13+02:00
  - 0 (home) from 2007-04-02 17:25:23+02:00 to 2007-04-03 00:00:00+02:00
Day: 2007-04-03
  - 0 (home) from 2007-04-03 00:00:00+02:00 to 2007-04-03 08:43:23+02:00
  - 7 (eat) from 2007-04-03 08:45:45+02:00 to 2007-04-03 09:13:29+02:00
  - 2 (unknown) from 2007-04-03 09:16:42+02:00 to 2007-04-03 14:57:47+02:00
  - 1 (work) from 2007-04-03 15:34:12+02:00 to 2007-04-03 17:07:08+02:00
  - 0 (home) from 2007-04-03 18:19:45+02:00 to 2007-04-04 00:00:00+02:00
Day: 2007-04-04
  - 0 (home) from 2007-04-04 00:00:00+02:00 to 2007-04-0

## User-Specific Distributions

In [None]:
def build_stay_distributions(stays_by_day):
    """Build distributions for stay durations and activity types across all days."""
    duration_dist = defaultdict(list)
    activity_transitions = defaultdict(list)
    trip_durations = []
    
    # Collect data from all days
    for day, day_stays in stays_by_day.items():
        for i in range(len(day_stays)-1):
            current_stay = day_stays[i]
            next_stay = day_stays[i+1]
            
            # Record duration for this activity type
            if current_stay.duration is not None:
                duration_dist[current_stay.activity_label].append(current_stay.duration)
            
            # Record activity transition
            activity_transitions[current_stay.activity_label].append(next_stay.activity_label)
            
            # Record trip duration (gap between stays)
            if current_stay.end_time is not None and next_stay.start_time is not None:
                trip_duration = next_stay.start_time - current_stay.end_time
                if trip_duration > 0:
                    trip_durations.append(trip_duration)
    
    # Convert lists to probability distributions
    duration_probs = {}
    for activity, durations in duration_dist.items():
        if len(durations) > 0:
            hist, bins = np.histogram(durations, bins=20, density=False)  # Use counts instead of density
            duration_probs[activity] = (hist, bins)
    
    transition_probs = {}
    for from_activity, to_activities in activity_transitions.items():
        if len(to_activities) > 0:
            unique_activities, counts = np.unique(to_activities, return_counts=True)
            probs = counts / counts.sum()
            transition_probs[from_activity] = dict(zip(unique_activities, probs))
    
    # Trip duration distribution
    trip_duration_probs = None
    if len(trip_durations) > 0:
        hist, bins = np.histogram(trip_durations, bins=20, density=False)  # Use counts instead of density
        trip_duration_probs = (hist, bins)
    
    return duration_probs, transition_probs, trip_duration_probs

# Build user-specific distributions
user_duration_probs, user_transition_probs, user_trip_duration_probs = build_stay_distributions(stays_by_day)

        