In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import gaussian_kde
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict
from tqdm import tqdm
import os
from pathlib import Path

# Anchor the working directory two levels above the directory the kernel
# started in (presumably the notebook's own folder — confirm if the notebook
# is moved). The starting directory is remembered in the kernel namespace so
# that re-running this cell does not climb a further two levels each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.abspath(os.path.join(_NOTEBOOK_START_DIR, "..", "..")))
os.getcwd()

'/home/bwool/RESEARCH/TRB-Home-Data-Quality-2025'

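# Simple HDA — Centroid Method (A1)

HDA = home detection algorithm. A1 estimates each user's home as the mean latitude/longitude of all of their nighttime (19:00–07:00) pings.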

In [2]:
def centroid_home_detection(data: pd.DataFrame, night_start: int = 19, night_end: int = 7) -> pd.DataFrame:
    """Estimate each user's home as the centroid of their nighttime pings.

    Parameters
    ----------
    data : pd.DataFrame
        Ping table with columns ``caid`` (user id), ``datetime_pdt``
        (datetime-like; hours are read as-is, so it is presumably already in
        local time), ``latitude`` and ``longitude``.
    night_start, night_end : int
        Hours (0-23) bounding the night window ``[night_start, night_end)``.
        When ``night_start > night_end`` the window wraps past midnight
        (default: 19:00 to 07:00).

    Returns
    -------
    pd.DataFrame
        One row per user that has at least one nighttime ping, with columns
        ``caid``, ``latitude``, ``longitude``. The columns are present even
        when no user qualifies, so results can always be concatenated or
        written with a header.
    """
    columns = ['caid', 'latitude', 'longitude']
    homes = []
    skipped_no_night = 0

    for caid, user_df in tqdm(data.groupby("caid"), desc="Running centroid HDA"):
        hour = user_df['datetime_pdt'].dt.hour
        if night_start > night_end:
            # Window wraps past midnight, e.g. 19:00 -> 07:00.
            is_night = (hour >= night_start) | (hour < night_end)
        else:
            # Window lies within a single day, e.g. 00:00 -> 06:00.
            is_night = (hour >= night_start) & (hour < night_end)
        night_df = user_df[is_night]

        if night_df.empty:
            skipped_no_night += 1
            continue

        homes.append({
            'caid': caid,
            'latitude': night_df['latitude'].mean(),
            'longitude': night_df['longitude'].mean()
        })

    print(f"Skipped {skipped_no_night} users with no nighttime observations.")
    # Passing `columns` keeps the schema stable when `homes` is empty.
    return pd.DataFrame(homes, columns=columns)

In [3]:
def run_centroid_hda_on_all_2019_data(
    input_folder="00_Data/02_Cleaned_Sample_Data/2019_Cleaned_Data",
    output_file="00_Data/04_HDA_Sample_Data/2019_all_users_centroid_home_locations.csv",
):
    """Run the centroid HDA on every cleaned parquet file and save one CSV.

    Parameters
    ----------
    input_folder : str
        Folder whose ``*.parquet`` files are processed (relative paths are
        resolved against the current working directory).
    output_file : str
        Destination CSV; its parent folder is created if needed.

    Raises
    ------
    ValueError
        If no file produced a result (no parquet files, or all failed).

    Notes
    -----
    A file that fails to load or process is reported and skipped
    (best-effort); a summary warning is printed if that happened so a partial
    output is not mistaken for a complete one.
    """
    # === Set input/output folders ===
    output_dir = os.path.dirname(output_file)
    if output_dir:  # os.makedirs("") raises, so skip for bare filenames
        os.makedirs(output_dir, exist_ok=True)

    # === List all parquet files ===
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the saved CSV is reproducible across runs and machines.
    cleaned_files = sorted(
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if f.endswith(".parquet")
    )

    # === Collect all home detections ===
    all_homes = []
    failed_files = []

    for filepath in tqdm(cleaned_files, desc="Processing files for HDA"):
        try:
            df = pd.read_parquet(filepath, engine="pyarrow")
            all_homes.append(centroid_home_detection(df))
        except Exception as e:
            failed_files.append(filepath)
            print(f"Error processing {filepath}: {e}")

    if not all_homes:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No home locations were produced from {input_folder!r} "
            f"({len(cleaned_files)} parquet files found, {len(failed_files)} failed)."
        )

    # === Concatenate all results and save ===
    final_df = pd.concat(all_homes, ignore_index=True)
    final_df.to_csv(output_file, index=False)
    print(f"Saved {len(final_df):,} home locations to: {output_file}")
    if failed_files:
        print(f"WARNING: {len(failed_files)} of {len(cleaned_files)} files failed; saved output is incomplete.")

# Run A1 over all cleaned 2019 parquet files; the output CSV path is printed on completion.
run_centroid_hda_on_all_2019_data()
# Recorded wall time for this run: 1m 27.4 s

Running centroid HDA: 100%|██████████| 15444/15444 [00:07<00:00, 1990.08it/s]
Processing files for HDA:  11%|█         | 1/9 [00:09<01:15,  9.45s/it]

Skipped 360 users with no nighttime observations.


Running centroid HDA: 100%|██████████| 15259/15259 [00:07<00:00, 1969.95it/s]
Processing files for HDA:  22%|██▏       | 2/9 [00:18<01:06,  9.44s/it]

Skipped 385 users with no nighttime observations.


Running centroid HDA: 100%|██████████| 15053/15053 [00:07<00:00, 1893.81it/s]
Processing files for HDA:  33%|███▎      | 3/9 [00:28<00:57,  9.51s/it]

Skipped 351 users with no nighttime observations.


Running centroid HDA: 100%|██████████| 15369/15369 [00:08<00:00, 1897.64it/s]
Processing files for HDA:  44%|████▍     | 4/9 [00:38<00:48,  9.61s/it]

Skipped 376 users with no nighttime observations.


Running centroid HDA: 100%|██████████| 15352/15352 [00:08<00:00, 1918.43it/s]
Processing files for HDA:  56%|█████▌    | 5/9 [00:47<00:38,  9.66s/it]

Skipped 372 users with no nighttime observations.


Running centroid HDA: 100%|██████████| 15220/15220 [00:08<00:00, 1889.03it/s]
Processing files for HDA:  67%|██████▋   | 6/9 [00:57<00:29,  9.71s/it]

Skipped 386 users with no nighttime observations.


Running centroid HDA: 100%|██████████| 15311/15311 [00:08<00:00, 1862.64it/s]
Processing files for HDA:  78%|███████▊  | 7/9 [01:07<00:19,  9.80s/it]

Skipped 312 users with no nighttime observations.


Running centroid HDA: 100%|██████████| 15293/15293 [00:08<00:00, 1901.40it/s]
Processing files for HDA:  89%|████████▉ | 8/9 [01:17<00:09,  9.78s/it]

Skipped 384 users with no nighttime observations.


Running centroid HDA: 100%|██████████| 15185/15185 [00:07<00:00, 1936.12it/s]
Processing files for HDA: 100%|██████████| 9/9 [01:27<00:00,  9.68s/it]


Skipped 342 users with no nighttime observations.
Saved 134,218 home locations to: 00_Data/04_HDA_Sample_Data/2019_all_users_centroid_home_locations.csv


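# Medium HDA — Grid Frequency Method (A2)

A2 bins each user's nighttime pings into grid cells (20 m by default) and takes the mean position of the pings that fall in the most frequently visited cell.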

In [4]:
def latlon_to_cell_id(lat, lon, cell_size_meters=20):
    """Map latitude/longitude to integer (row, column) indices of a grid.

    Rows are bands of constant latitude ``cell_size_meters`` tall. Within a
    row, every point shares the same longitude step, derived from the latitude
    of the row's centre, so cells are rectangles of roughly
    ``cell_size_meters`` x ``cell_size_meters``.

    Parameters
    ----------
    lat, lon : float, np.ndarray or pd.Series
        Coordinates in decimal degrees (same shape).
    cell_size_meters : float
        Edge length of a grid cell in metres.

    Returns
    -------
    (lat_bin, lon_bin)
        Integer row and column indices, same container type as the inputs.
    """
    # Approximate degree-per-meter conversion along a meridian
    lat_deg_per_m = 1 / 111320
    lat_step_deg = lat_deg_per_m * cell_size_meters

    lat_bin = np.floor(lat / lat_step_deg)

    # The longitude step must be constant within a row. Computing it from each
    # point's own latitude (rather than the row's) makes the column boundaries
    # drift with latitude inside the row, which shears the cells.
    row_center_lat = (lat_bin + 0.5) * lat_step_deg
    m_per_deg_lon = 40075000 * np.cos(np.radians(row_center_lat)) / 360
    lon_step_deg = cell_size_meters / m_per_deg_lon

    lon_bin = np.floor(lon / lon_step_deg)
    return lat_bin.astype(int), lon_bin.astype(int)

def grid_frequency_home_detection(data: pd.DataFrame, cell_size_meters=20, night_start: int = 19, night_end: int = 7) -> pd.DataFrame:
    """Estimate each user's home from their most-visited nighttime grid cell.

    For every user, nighttime pings are binned with ``latlon_to_cell_id``; the
    home is the mean position of the pings in the cell with the most pings.

    Parameters
    ----------
    data : pd.DataFrame
        Ping table with columns ``caid``, ``datetime_pdt`` (datetime-like),
        ``latitude`` and ``longitude``.
    cell_size_meters : float
        Grid cell edge length passed to ``latlon_to_cell_id``.
    night_start, night_end : int
        Hours (0-23) bounding the night window ``[night_start, night_end)``;
        wraps past midnight when ``night_start > night_end``.

    Returns
    -------
    pd.DataFrame
        Columns ``caid``, ``latitude``, ``longitude``, ``cell_id`` (formatted
        ``"<lat_bin>_<lon_bin>"``); one row per user with at least one
        nighttime ping. The columns are present even when the result is empty.
    """
    columns = ['caid', 'latitude', 'longitude', 'cell_id']
    homes = []
    skipped_no_night = 0

    for caid, user_df in tqdm(data.groupby("caid"), desc="Running grid-frequency HDA"):
        hour = user_df['datetime_pdt'].dt.hour
        if night_start > night_end:
            is_night = (hour >= night_start) | (hour < night_end)
        else:
            is_night = (hour >= night_start) & (hour < night_end)
        night_df = user_df[is_night]

        if night_df.empty:
            skipped_no_night += 1
            continue  # Skip users with no nighttime pings

        lat_bin, lon_bin = latlon_to_cell_id(
            night_df['latitude'].values,
            night_df['longitude'].values,
            cell_size_meters,
        )

        # Count pings per (lat_bin, lon_bin) cell and pick the most frequent.
        bin_counts = pd.Series(list(zip(lat_bin, lon_bin))).value_counts()
        top_bin = bin_counts.idxmax()

        # Vectorised membership test instead of a per-ping Python loop.
        in_top_bin = (lat_bin == top_bin[0]) & (lon_bin == top_bin[1])
        home_points = night_df.loc[in_top_bin, ['latitude', 'longitude']]

        homes.append({
            'caid': caid,
            # Mean of lat/lon within the most frequent bin
            'latitude': home_points['latitude'].mean(),
            'longitude': home_points['longitude'].mean(),
            'cell_id': f"{top_bin[0]}_{top_bin[1]}"
        })

    print(f"Skipped {skipped_no_night} users with no nighttime observations.")
    # Passing `columns` keeps the schema stable when `homes` is empty.
    return pd.DataFrame(homes, columns=columns)


In [5]:
def run_grid_frequency_hda_on_all_2019_data(
    cell_size_meters=20,
    input_folder="00_Data/02_Cleaned_Sample_Data/2019_Cleaned_Data",
    output_file=None,
):
    """Run the grid-frequency HDA on every cleaned parquet file and save one CSV.

    Parameters
    ----------
    cell_size_meters : float
        Grid cell size forwarded to ``grid_frequency_home_detection``.
    input_folder : str
        Folder whose ``*.parquet`` files are processed.
    output_file : str, optional
        Destination CSV. Defaults to a path under ``00_Data/04_HDA_Sample_Data``
        that embeds the cell size in the file name.

    Raises
    ------
    ValueError
        If no file produced a result (no parquet files, or all failed).

    Notes
    -----
    A file that fails to load or process is reported and skipped
    (best-effort); a summary warning is printed if that happened.
    """
    # === Set input/output paths ===
    if output_file is None:
        output_file = f"00_Data/04_HDA_Sample_Data/2019_all_users_gridfreq_home_locations_{cell_size_meters}m.csv"
    output_dir = os.path.dirname(output_file)
    if output_dir:  # os.makedirs("") raises, so skip for bare filenames
        os.makedirs(output_dir, exist_ok=True)

    # === List all parquet files ===
    # Sorted: os.listdir order is arbitrary, which would make the row order of
    # the saved CSV differ between runs.
    cleaned_files = sorted(
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if f.endswith(".parquet")
    )

    # === Collect all home detections ===
    all_homes = []
    failed_files = []

    for filepath in tqdm(cleaned_files, desc="Processing files for Grid-Freq HDA"):
        try:
            df = pd.read_parquet(filepath, engine="pyarrow")
            all_homes.append(grid_frequency_home_detection(df, cell_size_meters=cell_size_meters))
        except Exception as e:
            failed_files.append(filepath)
            print(f"Error processing {filepath}: {e}")

    if not all_homes:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No home locations were produced from {input_folder!r} "
            f"({len(cleaned_files)} parquet files found, {len(failed_files)} failed)."
        )

    # === Concatenate all results and save ===
    final_df = pd.concat(all_homes, ignore_index=True)
    final_df.to_csv(output_file, index=False)
    print(f"Saved {len(final_df):,} home locations to: {output_file}")
    if failed_files:
        print(f"WARNING: {len(failed_files)} of {len(cleaned_files)} files failed; saved output is incomplete.")

# Run A2 with the default 20 m cell size over all cleaned 2019 parquet files.
run_grid_frequency_hda_on_all_2019_data()
# Recorded wall time for this run: 2m 36.7s

Running grid-frequency HDA: 100%|██████████| 15444/15444 [00:15<00:00, 975.41it/s] 
Processing files for Grid-Freq HDA:  11%|█         | 1/9 [00:17<02:19, 17.45s/it]

Skipped 360 users with no nighttime observations.


Running grid-frequency HDA: 100%|██████████| 15259/15259 [00:15<00:00, 976.44it/s] 
Processing files for Grid-Freq HDA:  22%|██▏       | 2/9 [00:34<02:00, 17.28s/it]

Skipped 385 users with no nighttime observations.


Running grid-frequency HDA: 100%|██████████| 15053/15053 [00:15<00:00, 966.75it/s]
Processing files for Grid-Freq HDA:  33%|███▎      | 3/9 [00:51<01:43, 17.23s/it]

Skipped 351 users with no nighttime observations.


Running grid-frequency HDA: 100%|██████████| 15369/15369 [00:15<00:00, 998.23it/s] 
Processing files for Grid-Freq HDA:  44%|████▍     | 4/9 [01:08<01:25, 17.15s/it]

Skipped 376 users with no nighttime observations.


Running grid-frequency HDA: 100%|██████████| 15352/15352 [00:15<00:00, 1020.51it/s]
Processing files for Grid-Freq HDA:  56%|█████▌    | 5/9 [01:25<01:07, 16.95s/it]

Skipped 372 users with no nighttime observations.


Running grid-frequency HDA: 100%|██████████| 15220/15220 [00:16<00:00, 942.01it/s] 
Processing files for Grid-Freq HDA:  67%|██████▋   | 6/9 [01:43<00:51, 17.23s/it]

Skipped 386 users with no nighttime observations.


Running grid-frequency HDA: 100%|██████████| 15311/15311 [00:16<00:00, 908.79it/s] 
Processing files for Grid-Freq HDA:  78%|███████▊  | 7/9 [02:01<00:35, 17.64s/it]

Skipped 312 users with no nighttime observations.


Running grid-frequency HDA: 100%|██████████| 15293/15293 [00:15<00:00, 971.07it/s] 
Processing files for Grid-Freq HDA:  89%|████████▉ | 8/9 [02:19<00:17, 17.55s/it]

Skipped 384 users with no nighttime observations.


Running grid-frequency HDA: 100%|██████████| 15185/15185 [00:15<00:00, 961.69it/s]
Processing files for Grid-Freq HDA: 100%|██████████| 9/9 [02:36<00:00, 17.38s/it]


Skipped 342 users with no nighttime observations.
Saved 134,218 home locations to: 00_Data/04_HDA_Sample_Data/2019_all_users_gridfreq_home_locations_20m.csv


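# Complex HDA — Binned Clustering Method (A4)

A4 averages each user's nighttime pings within 30-minute time slots and runs a mean-shift search (250 m bandwidth) over those slot averages; the cluster centre with the most slot averages inside the bandwidth is taken as the home.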

In [8]:
class _BinSeededMeanShift:
    """Minimal mean-shift that reports only the best-supported mode.

    After ``fit``, ``cluster_center`` holds the converged centre (a tuple) of
    the seed that ended with the most points inside the bandwidth, and
    ``used_mean`` is True when no seed had any neighbour and the plain mean of
    the input was used instead.
    """

    def __init__(self, bandwidth, bin_seeding=True, min_bin_freq=2, max_iter=50):
        self.bandwidth = bandwidth
        self.bin_seeding = bin_seeding
        self.min_bin_freq = min_bin_freq
        self.max_iter = max_iter
        self.cluster_center = None
        self.used_mean = False

    @staticmethod
    def get_bin_seeds(X, bin_size, min_bin_freq=1):
        """Return centres of ``bin_size`` bins holding >= ``min_bin_freq`` points.

        Falls back to the points themselves when no bin is populated enough.
        """
        bin_sizes = defaultdict(int)
        for point in X:
            binned = np.round(point / bin_size)
            bin_sizes[tuple(binned)] += 1
        seeds = [np.array(point) * bin_size for point, freq in bin_sizes.items() if freq >= min_bin_freq]
        return np.array(seeds) if seeds else X

    @staticmethod
    def fit_single_seed(seed, X, nbrs, bandwidth, max_iter):
        """Shift one seed to the mean of its neighbours until it stops moving.

        Returns the final centre and the neighbour count of the last radius
        query (0 if the seed never had a neighbour).
        """
        stop_thresh = 1e-3 * bandwidth
        mean = seed
        for _ in range(max_iter):
            indices = nbrs.radius_neighbors([mean], bandwidth, return_distance=False)[0]
            if len(indices) == 0:
                break
            old_mean = mean
            mean = X[indices].mean(axis=0)
            if np.linalg.norm(mean - old_mean) < stop_thresh:
                break
        return tuple(mean), len(indices)

    def fit(self, X):
        """Run mean-shift from every seed and keep the best-supported centre."""
        if self.bin_seeding:
            seeds = self.get_bin_seeds(X, self.bandwidth, self.min_bin_freq)
        else:
            seeds = X

        nbrs = NearestNeighbors(radius=self.bandwidth).fit(X)
        results = [self.fit_single_seed(seed, X, nbrs, self.bandwidth, self.max_iter) for seed in seeds]

        # Seeds that never had a neighbour inside the bandwidth are discarded.
        clusters = {center: size for center, size in results if size > 0}

        if not clusters:
            self.cluster_center = tuple(X.mean(axis=0))
            self.used_mean = True
            return self

        self.cluster_center = max(clusters.items(), key=lambda x: x[1])[0]
        return self


def run_binned_hda_for_single_file(
    data: pd.DataFrame,
    slot_size: int = 30 * 60,
    radius_m: float = 250,
    night_start: int = 19,
    night_end: int = 7,
) -> pd.DataFrame:
    """Estimate homes by mean-shift clustering of time-binned nighttime pings.

    For each user, nighttime pings are averaged within fixed time slots
    ("superpings"), and a bin-seeded mean-shift is run over the superpings;
    the best-supported cluster centre is reported as the home.

    Parameters
    ----------
    data : pd.DataFrame
        Ping table with columns ``caid``, ``datetime_pdt`` (datetime-like),
        ``latitude`` and ``longitude``.
    slot_size : int
        Slot length in seconds (default 30 minutes).
    radius_m : float
        Mean-shift bandwidth in metres. It is converted to degrees of latitude
        and applied to both axes, so the east-west reach in metres is smaller
        by a factor of cos(latitude).
    night_start, night_end : int
        Hours (0-23) bounding the night window ``[night_start, night_end)``;
        wraps past midnight when ``night_start > night_end``.

    Returns
    -------
    pd.DataFrame
        Columns ``caid``, ``latitude``, ``longitude``; one row per user with
        usable nighttime data, ordered by ``caid``. The columns are present
        even when the result is empty.
    """
    columns = ['caid', 'latitude', 'longitude']
    bandwidth_deg = radius_m / 111320  # approx deg

    homes = []
    skipped_no_night = 0

    # groupby splits the frame once; filtering the whole frame per user
    # (data.loc[data['caid'] == caid]) rescans every row for every user.
    for caid, user_df in tqdm(data.groupby('caid'), desc="Estimating home locations"):
        hour = user_df['datetime_pdt'].dt.hour
        if night_start > night_end:
            is_night = (hour >= night_start) | (hour < night_end)
        else:
            is_night = (hour >= night_start) & (hour < night_end)
        night_df = user_df[is_night]

        if night_df.empty:
            skipped_no_night += 1
            continue

        # Slot index = whole slots elapsed since the epoch. Timedelta
        # arithmetic avoids assuming that the column stores nanoseconds.
        night_ts = night_df['datetime_pdt']
        epoch = pd.Timestamp(0, tz=night_ts.dt.tz)
        slots = (night_ts - epoch) // pd.Timedelta(seconds=slot_size)

        superpings = (
            night_df.assign(slot=slots)
            .groupby('slot')
            .agg({'latitude': 'mean', 'longitude': 'mean'})
            .dropna()
            .to_numpy()
        )

        if len(superpings) == 0:
            continue

        model = _BinSeededMeanShift(
            bandwidth=bandwidth_deg,
            bin_seeding=True,
            min_bin_freq=2,
            max_iter=50
        )
        model.fit(superpings)
        home_lat, home_lon = model.cluster_center

        homes.append({
            'caid': caid,
            'latitude': home_lat,
            'longitude': home_lon
        })

    print(f"Skipped {skipped_no_night} users with no nighttime observations.")
    # Passing `columns` keeps the schema stable when `homes` is empty.
    return pd.DataFrame(homes, columns=columns)

In [None]:
def run_binned_hda_on_all_2019_data(
    input_folder="00_Data/02_Cleaned_Sample_Data/2019_Cleaned_Data",
    output_folder="00_Data/04_HDA_Sample_Data/Binned_HDA",
):
    """Run the binned-clustering HDA on each parquet file, one CSV per file.

    The run is resumable: an input whose output CSV already exists is skipped.
    Each CSV is written to a temporary name and renamed only once complete, so
    an interrupted run cannot leave a truncated CSV that later runs would
    mistake for a finished one.

    Parameters
    ----------
    input_folder : str
        Folder whose ``*.parquet`` files are processed.
    output_folder : str
        Folder receiving ``binned_hda_<input name>.csv`` files; created if
        missing.

    Notes
    -----
    A file that fails to load or process is reported and skipped
    (best-effort).
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so files are processed in a reproducible order.
    parquet_files = sorted(
        f for f in os.listdir(input_folder)
        if f.endswith(".parquet")
    )

    for file in tqdm(parquet_files, desc="Processing files for Binned HDA"):
        input_path = os.path.join(input_folder, file)
        # Swap only the trailing extension (str.replace would also rewrite any
        # ".parquet" occurring earlier in the name).
        output_path = os.path.join(output_folder, f"binned_hda_{os.path.splitext(file)[0]}.csv")

        if os.path.exists(output_path):
            print(f"Already processed {file}, skipping.")
            continue

        partial_path = output_path + ".tmp"
        try:
            data = pd.read_parquet(input_path, engine="pyarrow")
            homes_df_binned = run_binned_hda_for_single_file(data)
            homes_df_binned.to_csv(partial_path, index=False)
            # Publish the result only after the write has fully succeeded.
            os.replace(partial_path, output_path)
            print(f"{file}: {len(homes_df_binned):,} users processed.")
        except Exception as e:
            print(f"Error processing {file}: {e}")

# Run A4 file by file; inputs whose output CSV already exists are skipped, so this cell can be re-run to resume.
run_binned_hda_on_all_2019_data()

Processing files for Binned HDA:   0%|          | 0/9 [00:00<?, ?it/s]
Estimating home locations:   4%|▎         | 542/15444 [02:44<1:13:01,  3.40it/s]

In [None]:
def load_all_binned_hda_results(folder_path: str) -> pd.DataFrame:
    """Load and stack every ``binned_hda_*.csv`` file found in a folder.

    Parameters
    ----------
    folder_path : str
        Folder containing the per-file binned HDA outputs.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of all readable files, in file-name order, with
        a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no matching file could be read.

    Notes
    -----
    A file that cannot be parsed is reported and skipped (best-effort).
    """
    # Sorted: os.listdir order is arbitrary, which would make the row order of
    # the combined table differ between runs and machines.
    csv_files = sorted(
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.startswith("binned_hda_") and f.endswith(".csv")
    )

    df_list = []
    for file in csv_files:
        try:
            df_list.append(pd.read_csv(file))
        except Exception as e:
            print(f"Failed to load {file}: {e}")

    if not df_list:
        raise ValueError("No valid binned HDA files found.")

    return pd.concat(df_list, ignore_index=True)

# Merge the per-file binned HDA CSVs into one table of home locations.
binned_hda_folder = "00_Data/04_HDA_Sample_Data/Binned_HDA"
all_binned_hda = load_all_binned_hda_results(binned_hda_folder)
print(f"Loaded {len(all_binned_hda):,} total users from binned HDA files.")

# Save the merged table next to the outputs of the other HDA methods
# (one level above the per-file folder, so it is not picked up on a re-load).
output_file = "00_Data/04_HDA_Sample_Data/2019_all_users_binned_home_locations.csv"
all_binned_hda.to_csv(output_file, index=False)
print(f"Saved combined HDA results to: {output_file}")
