In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import gaussian_kde
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict
from tqdm import tqdm
import os
from pathlib import Path

# Run the notebook from the project root so the relative data paths used further
# down (e.g. '00_Sample_Data/...') resolve. The directory check keeps this cell
# idempotent: a bare os.chdir("../..") climbs two more levels on every re-run.
if not Path("00_Sample_Data").is_dir():
    os.chdir("../..")
os.getcwd()

'/home/bwool/RESEARCH/TRB-Home-Data-Quality-2025'

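# Home inference

Estimate one home location per user: average each user's pings into 30-minute "superpings", then take the most populated mean-shift cluster of those superpings.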

In [None]:
# === Custom Meanshift ===
class MeanShift:
    """Mean-shift mode finder that keeps only the single most populated mode.

    Each seed is repeatedly moved to the mean of the points lying within
    ``bandwidth`` of it until it stops moving. After ``fit``:

    * ``cluster_center`` is a tuple with the coordinates of the converged seed
      that had the most neighbours, and
    * ``used_mean`` is True only when no seed ended with any neighbour and the
      plain mean of ``X`` was used instead.

    Parameters
    ----------
    bandwidth : float
        Neighbourhood radius, in the same units as the coordinates of ``X``.
    bin_seeding : bool, default True
        If True, seeds are the centres of grid cells of size ``bandwidth``
        holding at least ``min_bin_freq`` points; otherwise every point of
        ``X`` is a seed.
    min_bin_freq : int, default 2
        Minimum number of points a grid cell needs to become a seed.
    max_iter : int, default 50
        Maximum number of shift iterations per seed.
    """

    def __init__(self, bandwidth, bin_seeding=True, min_bin_freq=2, max_iter=50):
        self.bandwidth = bandwidth
        self.bin_seeding = bin_seeding
        self.min_bin_freq = min_bin_freq
        self.max_iter = max_iter
        # Set by fit(): coordinates of the winning mode, as a tuple.
        self.cluster_center = None
        # Set by fit(): True when the fallback to the mean of X was taken.
        self.used_mean = False

    @staticmethod
    def get_bin_seeds(X, bin_size, min_bin_freq=1):
        """Return grid-cell centres that contain at least ``min_bin_freq`` points.

        Points are snapped to a grid of spacing ``bin_size`` by rounding. Seeds
        are returned in order of first appearance of their cell. If no cell is
        populated enough, ``X`` itself is returned so every point acts as a seed.
        """
        points = np.asarray(X)
        bin_counts = defaultdict(int)
        for point in points:
            bin_counts[tuple(np.round(point / bin_size))] += 1
        seeds = [
            np.array(cell) * bin_size
            for cell, freq in bin_counts.items()
            if freq >= min_bin_freq
        ]
        return np.array(seeds) if seeds else points

    @staticmethod
    def fit_single_seed(seed, X, nbrs, bandwidth, max_iter):
        """Shift one seed towards a local density mode.

        Parameters
        ----------
        seed : ndarray
            Starting position.
        X : ndarray
            Data points indexed by the neighbour search.
        nbrs : object
            Fitted neighbour index exposing
            ``radius_neighbors(points, radius, return_distance=False)``.
        bandwidth : float
            Search radius around the current position.
        max_iter : int
            Maximum number of shift steps.

        Returns
        -------
        (tuple, int)
            Final position, and the number of neighbours found by the last
            radius query (0 if the seed had no neighbours or if ``max_iter``
            allowed no query at all).
        """
        stop_thresh = 1e-3 * bandwidth
        mean = seed
        # Bound before the loop so that max_iter <= 0 returns a size of 0
        # instead of failing on an unassigned local.
        n_within = 0
        for _ in range(max_iter):
            indices = nbrs.radius_neighbors([mean], bandwidth, return_distance=False)[0]
            n_within = len(indices)
            if n_within == 0:
                break
            old_mean = mean
            mean = X[indices].mean(axis=0)
            # Converged once a step moves less than 0.1% of the bandwidth.
            if np.linalg.norm(mean - old_mean) < stop_thresh:
                break
        return tuple(mean), n_within

    def fit(self, X):
        """Find the most populated mode of ``X`` and store it on the instance.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        MeanShift
            ``self``, with ``cluster_center`` and ``used_mean`` updated.

        Raises
        ------
        ValueError
            If ``X`` is empty or not two-dimensional.
        """
        X = np.asarray(X)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError(
                "X must be a non-empty 2-D array of shape (n_samples, n_features)"
            )

        # Clear results of any previous fit so a refit never reports stale state.
        self.cluster_center = None
        self.used_mean = False

        if self.bin_seeding:
            seeds = self.get_bin_seeds(X, self.bandwidth, self.min_bin_freq)
        else:
            seeds = X

        nbrs = NearestNeighbors(radius=self.bandwidth).fit(X)
        results = [
            self.fit_single_seed(seed, X, nbrs, self.bandwidth, self.max_iter)
            for seed in seeds
        ]

        # Seeds that converged to the exact same position collapse to one key.
        clusters = {center: size for center, size in results if size > 0}

        if not clusters:
            # No seed had any neighbour: fall back to the centroid of all points.
            self.cluster_center = tuple(X.mean(axis=0))
            self.used_mean = True
            return self

        # Ties are resolved in favour of the earliest seed.
        self.cluster_center = max(clusters.items(), key=lambda item: item[1])[0]
        return self


# === Load Data ===
data = pd.read_parquet('00_Sample_Data/2019_Pre_HDA_Data.parquet')

# === Prepare Superpings ===
slot_size = 30 * 60  # length of one superping time slot, in seconds (30 min)
radius_m = 250  # mean-shift search radius, in meters
mean_lat = data['latitude'].mean()
# Convert the radius to degrees using ~111.32 km per degree.
# NOTE(review): that factor holds for latitude (and for longitude only at the
# equator); a degree of longitude is shorter by cos(latitude). `mean_lat` is
# computed above but not applied here, so the effective radius is smaller
# east-west than north-south — confirm whether that is intended.
bandwidth_deg = radius_m / 111320

homes = []

# NOTE(review): `user_metrics` is not defined in the cells visible here; it has
# to be created by an earlier cell for this one to run on a fresh kernel.
valid_caid = set(user_metrics['caid'])

# Filter to the requested users once and split with a single groupby instead of
# re-scanning the whole frame with a boolean mask for every user.
valid_rows = data.loc[data['caid'].isin(valid_caid)]

# sort=True gives a reproducible row order in `homes_df` (iterating the set did not).
for caid, user_df in valid_rows.groupby('caid', sort=True):
    # Epoch seconds -> index of the 30-minute slot, using integer arithmetic only.
    # NOTE(review): assumes `datetime_pdt` is stored with nanosecond resolution;
    # verify after loading, since the parquet file may carry a coarser unit.
    t = user_df['datetime_pdt'].astype('int64') // 10**9
    slots = t // slot_size
    user_df = user_df.assign(slot=slots)

    # One "superping" per occupied slot: the mean position within that slot.
    superpings = (
        user_df.groupby('slot')
        .agg({'latitude': 'mean', 'longitude': 'mean'})
        .dropna()
        .to_numpy()
    )

    if len(superpings) == 0:
        continue

    model = MeanShift(
        bandwidth=bandwidth_deg,
        bin_seeding=True,
        min_bin_freq=2,
        max_iter=50
    )
    model.fit(superpings)
    # Columns of `superpings` are (latitude, longitude), so the centre unpacks the same way.
    home_lat, home_lon = model.cluster_center

    homes.append({
        'caid': caid,
        'latitude': home_lat,
        'longitude': home_lon,
        'used_mean': model.used_mean
    })

# Explicit columns keep the schema stable even when no user produced a home.
homes_df = pd.DataFrame(homes, columns=['caid', 'latitude', 'longitude', 'used_mean'])