In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
PURPOSES = ["Residential", "Study", "Work", "Shopping", "Leisure", "Other"]

In [3]:
# Regenerating the entire dataset with structured households, housemates, and unrelated individuals

# Define fixed zones (5 zones)
zones = [{"zone_id": f"Z{i+1}"} for i in range(5)]

# Define fixed purposes (Each zone has 5 activity types with attractiveness scores)
purposes = [{"purpose_id": f"P_{zone['zone_id']}_{purpose}", "zone_id": zone["zone_id"], 
             "purpose_type": purpose, "attractiveness_score": random.uniform(0.5, 2.5)}
            for zone in zones for purpose in PURPOSES]

# Define structured family households (6 households)
family_households = [
    {"household_id": "H1", "zone_id": "Z1", "household_income": 70000, "household_size": 4, "main_person": "P1"},
    {"household_id": "H2", "zone_id": "Z2", "household_income": 50000, "household_size": 3, "main_person": "P4"},
    {"household_id": "H3", "zone_id": "Z3", "household_income": 90000, "household_size": 5, "main_person": "P6"},
    {"household_id": "H4", "zone_id": "Z4", "household_income": 40000, "household_size": 2, "main_person": "P10"},
    {"household_id": "H5", "zone_id": "Z5", "household_income": 60000, "household_size": 4, "main_person": "P12"},
    {"household_id": "H6", "zone_id": "Z3", "household_income": 80000, "household_size": 6, "main_person": "P16"},
]

# Define structured family people data
family_people = [
    # Household 1
    {"person_id": "P1", "household_id": "H1", "age": 35, "employment_status": "Employed", "role_in_household": "Main"},
    {"person_id": "P2", "household_id": "H1", "age": 33, "employment_status": "Employed", "role_in_household": "Spouse"},
    {"person_id": "P3", "household_id": "H1", "age": 8, "employment_status": "Student", "role_in_household": "Child"},
    
    # Household 2
    {"person_id": "P4", "household_id": "H2", "age": 45, "employment_status": "Employed", "role_in_household": "Main"},
    {"person_id": "P5", "household_id": "H2", "age": 20, "employment_status": "Student", "role_in_household": "Child"},
    
    # Household 3
    {"person_id": "P6", "household_id": "H3", "age": 40, "employment_status": "Employed", "role_in_household": "Main"},
    {"person_id": "P7", "household_id": "H3", "age": 38, "employment_status": "Employed", "role_in_household": "Spouse"},
    {"person_id": "P8", "household_id": "H3", "age": 15, "employment_status": "Student", "role_in_household": "Child"},
    {"person_id": "P9", "household_id": "H3", "age": 10, "employment_status": "Student", "role_in_household": "Child"},
    
    # Household 4
    {"person_id": "P10", "household_id": "H4", "age": 50, "employment_status": "Employed", "role_in_household": "Main"},
    {"person_id": "P11", "household_id": "H4", "age": 48, "employment_status": "Employed", "role_in_household": "Spouse"},
    
    # Household 5
    {"person_id": "P12", "household_id": "H5", "age": 42, "employment_status": "Employed", "role_in_household": "Main"},
    {"person_id": "P13", "household_id": "H5", "age": 39, "employment_status": "Employed", "role_in_household": "Spouse"},
    {"person_id": "P14", "household_id": "H5", "age": 17, "employment_status": "Student", "role_in_household": "Child"},
    {"person_id": "P15", "household_id": "H5", "age": 12, "employment_status": "Student", "role_in_household": "Child"},
    
    # Household 6
    {"person_id": "P16", "household_id": "H6", "age": 55, "employment_status": "Employed", "role_in_household": "Main"},
    {"person_id": "P17", "household_id": "H6", "age": 53, "employment_status": "Employed", "role_in_household": "Spouse"},
    {"person_id": "P18", "household_id": "H6", "age": 30, "employment_status": "Employed", "role_in_household": "Child"},
    {"person_id": "P19", "household_id": "H6", "age": 25, "employment_status": "Employed", "role_in_household": "Child"},
    {"person_id": "P20", "household_id": "H6", "age": 22, "employment_status": "Student", "role_in_household": "Child"},
]

# Define housemate households (only unrelated individuals)
housemate_households = [
    {"household_id": "H7", "zone_id": f"Z{random.randint(1,5)}", "household_income": 50000, "household_size": 3, "main_person": "P21"},
    {"household_id": "H8", "zone_id": f"Z{random.randint(1,5)}", "household_income": 55000, "household_size": 3, "main_person": "P24"},
]

# Define housemate individuals (unrelated people sharing a household)
housemates = []
for i, household in enumerate(housemate_households):
    base_person_id = 21 + (i * 3)
    for j in range(3):
        housemates.append({
            "person_id": f"P{base_person_id + j}",
            "household_id": household["household_id"],
            "age": 22 + j * 5,
            "employment_status": "Employed" if j % 2 == 0 else "Student",
            "role_in_household": "Housemate"
        })

# Combine households and people
all_households = family_households + housemate_households
all_people = family_people + housemates

# Convert data to DataFrames
zones_df = pd.DataFrame(zones)
purposes_df = pd.DataFrame(purposes)
households_df = pd.DataFrame(all_households)
people_df = pd.DataFrame(all_people)



In [4]:
# Generate prediction dataset by relocating people to new homes
test_people = []
for person in all_people:
    new_household = f"H{int(person['household_id'][1:]) + 8}"  # Shift household IDs for the prediction set
    test_people.append({
        "person_id": f"P{int(person['person_id'][1:]) + 50}",  # Shift person IDs for uniqueness
        "household_id": new_household,
        "age": person["age"],
        "employment_status": person["employment_status"],
        "role_in_household": person["role_in_household"],
    })

# Generate new households for relocated people in the prediction dataset
prediction_households = [{"household_id": f"H{i}", "zone_id": f"Z{random.randint(1, 5)}", 
                          "household_income": random.randint(30000, 100000), 
                          "household_size": random.randint(1, 5), 
                          "main_person": f"P{50 + i}"}
                         for i in range(9, 17)]

# Merge with existing households
all_prediction_households = prediction_households

# Convert data to DataFrames
prediction_households_df = pd.DataFrame(all_prediction_households)
test_people_df = pd.DataFrame(test_people)


In [5]:
# Combine training people data and relocated prediction people data for the final to_predict_data
to_predict_people = people_df.copy()
to_predict_people = pd.concat([to_predict_people, test_people_df], ignore_index=True)

# Combine training households and prediction households for final to_predict_households
to_predict_households = households_df.copy()
to_predict_households = pd.concat([to_predict_households, prediction_households_df], ignore_index=True)

# Convert to DataFrames
to_predict_people_df = pd.DataFrame(to_predict_people)
to_predict_households_df = pd.DataFrame(to_predict_households)

In [6]:
# Define zone connectivity (only neighboring zones should have travel links)
zone_connections = {
    "Z1": ["Z2", "Z3"],
    "Z2": ["Z1", "Z5", "Z3"],
    "Z3": ["Z1", "Z2", "Z4"],
    "Z4": ["Z3"],
    "Z5": ["Z2"]
}

# Generate OD matrix based on connectivity
od_matrix = []
for origin, destinations in zone_connections.items():
    for destination in destinations:
        distance_km = random.uniform(2, 10)  # Random realistic distance in km
        travel_time = int(distance_km * random.uniform(2, 4))  # 2-4 min per km

        od_matrix.append({
            "origin": origin,
            "destination": destination,
            "distance_km": round(distance_km, 2),
            "travel_time_min": travel_time
        })

# Convert to DataFrame
od_matrix_df = pd.DataFrame(od_matrix)
od_matrix_df

Unnamed: 0,origin,destination,distance_km,travel_time_min
0,Z1,Z2,5.15,11
1,Z1,Z3,3.33,9
2,Z2,Z1,9.66,31
3,Z2,Z5,2.21,6
4,Z2,Z3,7.26,17
5,Z3,Z1,4.7,14
6,Z3,Z2,9.37,35
7,Z3,Z4,9.58,24
8,Z4,Z3,5.76,16
9,Z5,Z2,8.55,24


In [7]:
# Manually defining the **Travel Diaries** for each person, ensuring **realistic activity schedules**
# Joint activities will be assigned within the same household or housemate group.

travel_diaries = [
    # Household 1 (P1, P2, P3)
    {"person_id": "P1", "zone_id": "Z1", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},  # Home
    {"person_id": "P1", "zone_id": "Z3", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P2
    {"person_id": "P1", "zone_id": "Z1", "purpose": "Leisure", "duration": 120, "ranking_in_day": 3, "joint_activity": 0},  # Solo activity
    {"person_id": "P1", "zone_id": "Z1", "purpose": "Residential", "duration": 420, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P2
    
    {"person_id": "P2", "zone_id": "Z1", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},  # Home
    {"person_id": "P2", "zone_id": "Z3", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P1
    {"person_id": "P2", "zone_id": "Z5", "purpose": "Shopping", "duration": 90, "ranking_in_day": 3, "joint_activity": 0},  # Solo
    {"person_id": "P2", "zone_id": "Z1", "purpose": "Residential", "duration": 450, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P1
    
    {"person_id": "P3", "zone_id": "Z1", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},  # Home
    {"person_id": "P3", "zone_id": "Z2", "purpose": "Study", "duration": 390, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P5
    {"person_id": "P3", "zone_id": "Z1", "purpose": "Leisure", "duration": 120, "ranking_in_day": 3, "joint_activity": 1},  # Joint with P5
    {"person_id": "P3", "zone_id": "Z1", "purpose": "Residential", "duration": 510, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P5

    # Household 2 (P4, P5)
    {"person_id": "P4", "zone_id": "Z2", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P4", "zone_id": "Z3", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 0},  # Solo work
    {"person_id": "P4", "zone_id": "Z2", "purpose": "Shopping", "duration": 120, "ranking_in_day": 3, "joint_activity": 1},  # Joint with P5
    {"person_id": "P4", "zone_id": "Z2", "purpose": "Residential", "duration": 420, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P5
    
    {"person_id": "P5", "zone_id": "Z2", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P5", "zone_id": "Z2", "purpose": "Study", "duration": 390, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P3
    {"person_id": "P5", "zone_id": "Z1", "purpose": "Leisure", "duration": 120, "ranking_in_day": 3, "joint_activity": 1},  # Joint with P3
    {"person_id": "P5", "zone_id": "Z2", "purpose": "Residential", "duration": 510, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P3

    # Household 3 (P6, P7, P8, P9)
    {"person_id": "P6", "zone_id": "Z3", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P6", "zone_id": "Z3", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P7
    {"person_id": "P6", "zone_id": "Z3", "purpose": "Other", "duration": 120, "ranking_in_day": 3, "joint_activity": 0},  # Solo
    {"person_id": "P6", "zone_id": "Z3", "purpose": "Residential", "duration": 420, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P7

    {"person_id": "P7", "zone_id": "Z3", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P7", "zone_id": "Z3", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P6
    {"person_id": "P7", "zone_id": "Z4", "purpose": "Leisure", "duration": 120, "ranking_in_day": 3, "joint_activity": 0},  # Solo
    {"person_id": "P7", "zone_id": "Z3", "purpose": "Residential", "duration": 420, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P6

    {"person_id": "P8", "zone_id": "Z3", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P8", "zone_id": "Z3", "purpose": "Study", "duration": 390, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P9
    {"person_id": "P8", "zone_id": "Z4", "purpose": "Other", "duration": 120, "ranking_in_day": 3, "joint_activity": 0},  # Solo
    {"person_id": "P8", "zone_id": "Z3", "purpose": "Residential", "duration": 510, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P9

    {"person_id": "P9", "zone_id": "Z3", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P9", "zone_id": "Z3", "purpose": "Study", "duration": 390, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P8
    {"person_id": "P9", "zone_id": "Z4", "purpose": "Other", "duration": 120, "ranking_in_day": 3, "joint_activity": 0},  # Solo
    {"person_id": "P9", "zone_id": "Z3", "purpose": "Residential", "duration": 510, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P8
     
    # Household 4 (P10, P11)
    {"person_id": "P10", "zone_id": "Z4", "purpose": "Residential", "duration": 480, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P10", "zone_id": "Z3", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P11
    {"person_id": "P10", "zone_id": "Z4", "purpose": "Leisure", "duration": 120, "ranking_in_day": 3, "joint_activity": 0},  # Solo
    {"person_id": "P10", "zone_id": "Z4", "purpose": "Residential", "duration": 360, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P11

    {"person_id": "P11", "zone_id": "Z4", "purpose": "Residential", "duration": 480, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P11", "zone_id": "Z3", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P10
    {"person_id": "P11", "zone_id": "Z4", "purpose": "Shopping", "duration": 90, "ranking_in_day": 3, "joint_activity": 0},  # Solo
    {"person_id": "P11", "zone_id": "Z4", "purpose": "Residential", "duration": 390, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P10

    # Household 5 (P12, P13, P14, P15)
    {"person_id": "P12", "zone_id": "Z5", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P12", "zone_id": "Z2", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P13
    {"person_id": "P12", "zone_id": "Z5", "purpose": "Other", "duration": 120, "ranking_in_day": 3, "joint_activity": 0},  # Solo
    {"person_id": "P12", "zone_id": "Z5", "purpose": "Residential", "duration": 420, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P13

    {"person_id": "P13", "zone_id": "Z5", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P13", "zone_id": "Z2", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P12
    {"person_id": "P13", "zone_id": "Z4", "purpose": "Shopping", "duration": 90, "ranking_in_day": 3, "joint_activity": 0},  # Solo
    {"person_id": "P13", "zone_id": "Z5", "purpose": "Residential", "duration": 390, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P12

    {"person_id": "P14", "zone_id": "Z5", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P14", "zone_id": "Z3", "purpose": "Study", "duration": 390, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P15
    {"person_id": "P14", "zone_id": "Z5", "purpose": "Leisure", "duration": 120, "ranking_in_day": 3, "joint_activity": 0},  # Solo
    {"person_id": "P14", "zone_id": "Z5", "purpose": "Residential", "duration": 510, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P15

    {"person_id": "P15", "zone_id": "Z5", "purpose": "Residential", "duration": 420, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P15", "zone_id": "Z3", "purpose": "Study", "duration": 390, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P14
    {"person_id": "P15", "zone_id": "Z5", "purpose": "Leisure", "duration": 120, "ranking_in_day": 3, "joint_activity": 0},  # Solo
    {"person_id": "P15", "zone_id": "Z5", "purpose": "Residential", "duration": 510, "ranking_in_day": 4, "joint_activity": 1},  # Joint with P14

    # Housemates in Household 7 (P21, P22, P23)
    {"person_id": "P21", "zone_id": "Z1", "purpose": "Residential", "duration": 480, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P21", "zone_id": "Z3", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P22
    {"person_id": "P21", "zone_id": "Z1", "purpose": "Leisure", "duration": 120, "ranking_in_day": 3, "joint_activity": 1},  # Joint with P23
    {"person_id": "P21", "zone_id": "Z1", "purpose": "Residential", "duration": 360, "ranking_in_day": 4, "joint_activity": 1},  # Back home with P23

    {"person_id": "P22", "zone_id": "Z1", "purpose": "Residential", "duration": 480, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P22", "zone_id": "Z3", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P21
    {"person_id": "P22", "zone_id": "Z5", "purpose": "Shopping", "duration": 90, "ranking_in_day": 3, "joint_activity": 0},  # Solo
    {"person_id": "P22", "zone_id": "Z1", "purpose": "Residential", "duration": 390, "ranking_in_day": 4, "joint_activity": 0},  # Back home solo

    {"person_id": "P23", "zone_id": "Z1", "purpose": "Residential", "duration": 480, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P23", "zone_id": "Z3", "purpose": "Other", "duration": 300, "ranking_in_day": 2, "joint_activity": 0},  # Solo
    {"person_id": "P23", "zone_id": "Z1", "purpose": "Leisure", "duration": 120, "ranking_in_day": 3, "joint_activity": 1},  # Joint with P21
    {"person_id": "P23", "zone_id": "Z1", "purpose": "Residential", "duration": 360, "ranking_in_day": 4, "joint_activity": 1},  # Back home with P21

    # Housemates in Household 8 (P24, P25, P26)
    {"person_id": "P24", "zone_id": "Z3", "purpose": "Residential", "duration": 480, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P24", "zone_id": "Z4", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P25
    {"person_id": "P24", "zone_id": "Z3", "purpose": "Leisure", "duration": 120, "ranking_in_day": 3, "joint_activity": 1},  # Joint with P26
    {"person_id": "P24", "zone_id": "Z3", "purpose": "Residential", "duration": 360, "ranking_in_day": 4, "joint_activity": 1},  # Back home with P26

    {"person_id": "P25", "zone_id": "Z3", "purpose": "Residential", "duration": 480, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P25", "zone_id": "Z4", "purpose": "Work", "duration": 480, "ranking_in_day": 2, "joint_activity": 1},  # Joint with P24
    {"person_id": "P25", "zone_id": "Z3", "purpose": "Residential", "duration": 390, "ranking_in_day": 3, "joint_activity": 0},  # Back home solo

    {"person_id": "P26", "zone_id": "Z3", "purpose": "Residential", "duration": 480, "ranking_in_day": 1, "joint_activity": None},
    {"person_id": "P26", "zone_id": "Z3", "purpose": "Other", "duration": 300, "ranking_in_day": 2, "joint_activity": 0},  # Solo
    {"person_id": "P26", "zone_id": "Z3", "purpose": "Leisure", "duration": 120, "ranking_in_day": 3, "joint_activity": 1},  # Joint with P24
    {"person_id": "P26", "zone_id": "Z3", "purpose": "Residential", "duration": 360, "ranking_in_day": 4, "joint_activity": 1},  # Back home with P24
]


# Convert to DataFrame
travel_diaries_df = pd.DataFrame(travel_diaries)

In [8]:
start_time_of_day = 0  # Midnight (00:00 AM)

In [9]:
# Validate the timeline for each person to ensure no overlap and realistic activity sequences

# Compute actual start and end times for each activity
validated_travel_diaries = []
for person_id in travel_diaries_df["person_id"].unique():
    person_diary = travel_diaries_df[travel_diaries_df["person_id"] == person_id].sort_values("ranking_in_day")
    
    current_time = 0  # Midnight (00:00 AM)
    for index, row in person_diary.iterrows():
        start_time = current_time
        end_time = start_time + row["duration"]
        
        validated_travel_diaries.append({
            "person_id": row["person_id"],
            "zone_id": row["zone_id"],
            "purpose": row["purpose"],
            "duration": row["duration"],
            "ranking_in_day": row["ranking_in_day"],
            "joint_activity": row["joint_activity"],
            "start_time": start_time,
            "end_time": end_time
        })
        
        current_time = end_time  # Move forward in time

# Convert to DataFrame
validated_travel_diaries_df = pd.DataFrame(validated_travel_diaries)

# Check for any inconsistencies (e.g., exceeding 24 hours)
invalid_entries = validated_travel_diaries_df[validated_travel_diaries_df["end_time"] > 1440]  # 1440 min = 24 hours
invalid_entries

Unnamed: 0,person_id,zone_id,purpose,duration,ranking_in_day,joint_activity,start_time,end_time


In [11]:
zones_df.to_csv("data/zones.csv", index=False)
purposes_df.to_csv("data/purposes.csv", index=False)
households_df.to_csv("data/sample_households.csv", index=False)
people_df.to_csv("data/sample_people.csv", index=False)
travel_diaries_df.to_csv("data/sample_travel_diaries.csv", index=False)
to_predict_households_df.to_csv("data/to_predict_households.csv", index=False)
to_predict_people_df.to_csv("data/to_predict_people.csv", index=False)
od_matrix_df.to_csv("data/od_matrix.csv", index=False)