In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import gaussian_kde
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict
from tqdm import tqdm
import os
from pathlib import Path

# Move two levels up from the starting working directory so the relative
# paths used later in the notebook resolve.
# NOTE(review): presumably this lands on the project root — confirm.
# Guarded so that re-running this cell does not climb two more directories
# each time (a bare relative chdir is not idempotent).
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = Path.cwd().resolve().parent.parent
    os.chdir(PROJECT_ROOT)
os.getcwd()

'/home/bwool/RESEARCH/TRB-Home-Data-Quality-2025'

In [None]:
# === Core metrics ===
# Builds one row of data-quality metrics per user (`caid`) from the ping-level
# `data` frame and writes the table to CSV.
# NOTE(review): `data` is not created in any cell shown here; this cell reads
# the columns `caid`, `is_iOS`, `datetime_pdt` (used through `.dt`) and
# `horizontal_accuracy` — confirm against the loading cell.
# Side effect: the helper columns `date`, `hour`, `minute`, `is_night`,
# `night_date`, `night_minute` and `night_bin` are added to `data` in place.
data['date'] = data['datetime_pdt'].dt.date

pings_by_user = data.groupby('caid')

total_records = pings_by_user.size().rename('total_records')

prop_ios = pings_by_user['is_iOS'].mean().rename('prop_ios').astype(float)

days_with_data = pings_by_user['date'].nunique().rename('days_with_data')

# Compare the whole column once, then average the booleans per user; this
# gives the same proportions as a per-group Python lambda without the overhead.
prop_high_accuracy = (
    data['horizontal_accuracy'].le(100)
    .groupby(data['caid'])
    .mean()
    .rename('prop_high_accuracy')
)

# === Night & bin labels ON MAIN DATA ===
data['hour'] = data['datetime_pdt'].dt.hour
data['minute'] = data['datetime_pdt'].dt.minute

# Night window: 19:00 through 06:59.
data['is_night'] = ((data['hour'] >= 19) | (data['hour'] < 7))

# Reuse the calendar dates computed above instead of running `.dt.date` again.
# Pings before 07:00 are attributed to the previous date, so a night that
# spans midnight is keyed by the date of its evening.
data['night_date'] = data['date'].copy()
data.loc[data['hour'] < 7, 'night_date'] -= pd.Timedelta(days=1)

# Only bin night pings
# `night_bin` is the half-hour index of the clock time: 0-13 for 00:00-06:59
# and 38-47 for 19:00-23:59, i.e. at most 24 distinct bins per night.
mask_night = data['is_night']
data.loc[mask_night, 'night_minute'] = data.loc[mask_night, 'hour'] * 60 + data.loc[mask_night, 'minute']
data.loc[mask_night, 'night_bin'] = data.loc[mask_night, 'night_minute'] // 30

# === Night metrics ===
temp = data.loc[mask_night, ['caid', 'night_date', 'night_bin']].copy()

# Total night pings & unique nights
night_counts = (
    temp.groupby('caid')
    .agg(
        total_night_pings=('night_date', 'count'),
        unique_nights=('night_date', 'nunique')
    )
)
night_counts['avg_night_pings_per_night'] = (
    night_counts['total_night_pings'] / night_counts['unique_nights']
)

# === Bin-level stats ===
# Distinct half-hour bins observed for each (user, night) pair
bins_per_night = (
    temp.groupby(['caid', 'night_date'])['night_bin']
    .nunique()
    .rename('bins_this_night')
).reset_index()

# Average bins per night (only nights with at least one night ping contribute)
avg_bins_per_night = (
    bins_per_night.groupby('caid')['bins_this_night']
    .mean()
    .rename('avg_bins_per_night')
)

# === Combine all ===
user_metrics = pd.concat([
    total_records,
    prop_ios,
    days_with_data,
    prop_high_accuracy
], axis=1).reset_index()

# Left joins keep users without any night ping; their night columns are NaN.
user_metrics = (
    user_metrics
    .merge(night_counts.reset_index(), on='caid', how='left')
    .merge(avg_bins_per_night.reset_index(), on='caid', how='left')
)

# === Save ===
filepath = '00_Sample_Data/2019_Pre_HDA_Metrics.csv'
# Create the output folder if needed so the export does not fail on a fresh checkout.
Path(filepath).parent.mkdir(parents=True, exist_ok=True)
user_metrics.to_csv(filepath, index=False)
#

### User Data Quality Feature Descriptions

| Field Name                | Description |
|---------------------------|-------------|
| caid                      | Unique user ID |
| total_records             | Total number of records (rows) per user |
| prop_ios                  | Proportion of records where device type is iOS |
| days_with_data            | Number of unique days with at least one record |
| prop_high_accuracy        | Proportion of records with horizontal accuracy ≤ 100 meters |
| total_night_pings         | Total number of night-time records (7 PM–7 AM) |
| unique_nights             | Number of unique nights with at least one night-time record (records before 7 AM count toward the night that began the previous evening) |
| avg_night_pings_per_night | Average number of night-time records per unique night |
| avg_bins_per_night        | Average number of distinct 30-minute night-time bins containing at least one record (maximum 24), taken over nights with data |

# Plotting

In [None]:
# Three side-by-side histograms of the per-user night metrics (log-scaled counts).
bin_count = 100
fig, axs = plt.subplots(1, 3, figsize=(18, 5))

# (column in user_metrics, panel title, x-axis label) for each panel, left to right
panels = [
    ('avg_night_pings_per_night', 'Avg Night Pings per Night', 'Avg Night Pings'),
    ('avg_bins_per_night', 'Avg 30-min Bins per Night', 'Avg Bins'),
    ('unique_nights', 'Unique Nights with Data', 'Unique Nights'),
]

for ax, (column, title, xlabel) in zip(axs, panels):
    # Users without night pings carry NaN here and are left out of the panel.
    ax.hist(user_metrics[column].dropna(), bins=bin_count)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Users')
    ax.grid(True)
    ax.set_yscale('log')

plt.tight_layout()
plt.show()


In [None]:
# Bin count shared by all three panels of this figure
bin_count = 50
# One row of three panels for the histogram + KDE overlays
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

# --- Helper to add KDE ---
def plot_hist_with_kde(ax, data, bins, title, xlabel):
    """Draw a histogram of ``data`` on ``ax`` and overlay a rescaled Gaussian KDE.

    Parameters
    ----------
    ax : object
        Axes-like object to draw on.
    data : array-like
        One-dimensional numeric values.
    bins : int or sequence
        Bin specification forwarded to ``ax.hist``.
    title : str
        Panel title.
    xlabel : str
        Label for the x-axis.

    Notes
    -----
    The KDE is a density, so it is rescaled to peak at the height of the
    tallest bar; only its shape is comparable to the histogram. The curve is
    skipped when the finite values contain fewer than two distinct numbers,
    because a Gaussian KDE cannot be fitted to such a sample.
    """
    # Histogram (the returned edges and patches are not needed)
    counts, _edges, _patches = ax.hist(data, bins=bins, alpha=0.6, label='Histogram')

    # KDE, fitted on the finite values only
    values = np.asarray(data, dtype=float)
    values = values[np.isfinite(values)]
    if values.size >= 2 and values.min() < values.max():
        kde = gaussian_kde(values)
        x_vals = np.linspace(values.min(), values.max(), 1000)
        kde_vals = kde(x_vals)
        # Scale KDE to histogram height
        kde_scaled = kde_vals * np.max(counts) / np.max(kde_vals)
        ax.plot(x_vals, kde_scaled, color='red', label='KDE')

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Users')
    ax.legend()
    ax.grid(True)
    ax.set_yscale('log')

# --- Submetric 1: KDE on clipped values (linear space) ---
# The values get their own name: rebinding `data` here would replace the
# ping-level DataFrame that the metrics cell reads, so that cell could not be
# re-run afterwards.
clip_max = 400  # cap huge outliers so they do not stretch the x-axis
avg_night_pings_clipped = (
    user_metrics['avg_night_pings_per_night']
    .dropna()
    .clip(lower=0, upper=clip_max)
)
plot_hist_with_kde(
    axs[0],
    avg_night_pings_clipped,
    bins=bin_count,
    title=f'Avg Night Pings per Night (clipped at {clip_max})',
    xlabel='Avg Night Pings'
)

# --- Submetric 2 ---
plot_hist_with_kde(
    axs[1],
    user_metrics['avg_bins_per_night'].dropna(),
    bins=bin_count,
    title='Avg 30-min Bins per Night',
    xlabel='Avg Bins'
)
# A night spans 12 hours, so there are at most 24 half-hour bins.
axs[1].set_xlim(0, 24)

# --- Submetric 3 ---
plot_hist_with_kde(
    axs[2],
    user_metrics['unique_nights'].dropna(),
    bins=bin_count,
    title='Unique Nights with Data',
    xlabel='Unique Nights'
)

plt.tight_layout()
plt.show()


In [None]:
# Column of `user_metrics` shown in this figure
feature = 'avg_night_pings_per_night'

fig, ax = plt.subplots()
ax.hist(user_metrics[feature], bins=100)
ax.set(
    title='Distribution of Average Night Pings per Night',
    xlabel='Average Night Pings per Night',
    ylabel='Count',
)

# Horizontal grid lines only, drawn underneath the bars
ax.grid(axis='y', zorder=0)
for patch in ax.patches:
    patch.set_zorder(2)

# Zoom to 0-1000 on x; counts on a log scale
ax.set_xlim(0, 1000)
ax.set_yscale('log')
plt.show()

In [None]:
# Summary statistics for the per-user average number of night pings per night
avg_night_pings = user_metrics['avg_night_pings_per_night']
avg_night_pings.describe()

In [None]:
# Night-related columns of `user_metrics`; each gets its own figure
features = [
    'total_night_pings',
    'unique_nights',
    'avg_night_pings_per_night',
    'avg_bins_per_night',
]

# One stand-alone 30-bin histogram per column
for feature in features:
    fig, ax = plt.subplots()
    ax.hist(user_metrics[feature], bins=30)
    ax.set(title=f'Distribution of {feature}', xlabel=feature, ylabel='Count')
    # Horizontal grid lines only, kept underneath the bars
    ax.grid(axis='y', zorder=0)
    for patch in ax.patches:
        patch.set_zorder(2)
    plt.show()