# Task 4.3: Location & Mobility Features

**Goal**: Engineer spatial behavioral features from GPS data.

**Metrics**:
- **Mobility**: Distance traveled, Location Variance.
- **Routine**: Significant Places (Clusters), Entropy.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Add src to path
sys.path.append('../../')

from src.features.location_features import process_location_features

## 1. Load Data

In [None]:
# Candidate locations of the processed dataset, tried in order of preference.
DATA_DIR = Path('../../data/processed')
TEST_DATA_DIR = Path('../../data/processed_test')

# Fixed seed for the synthetic fallback so that Restart & Run All reproduces
# the same dummy trajectory (and therefore the same features and plots).
SEED = 42

if (DATA_DIR / 'train.parquet').exists():
    df = pd.read_parquet(DATA_DIR / 'train.parquet')
    print("Loaded full training data")
elif (TEST_DATA_DIR / 'train.parquet').exists():
    df = pd.read_parquet(TEST_DATA_DIR / 'train.parquet')
    print("Loaded test subset data")
else:
    print("No data found. Generating dummy GPS data.")
    # Local generator: deterministic, and does not disturb the global
    # np.random state that other cells may rely on.
    rng = np.random.default_rng(SEED)
    dates = pd.date_range(start='2013-03-27', end='2013-06-05', freq='h')
    df = pd.DataFrame({'timestamp': dates})
    # Mock GPS: random walk (cumulative sum of small Gaussian steps) around a
    # fixed starting coordinate, one sample per hourly timestamp.
    df['gps_lat'] = 43.7 + np.cumsum(rng.standard_normal(len(df)) * 0.001)
    df['gps_lon'] = -72.3 + np.cumsum(rng.standard_normal(len(df)) * 0.001)
    df['participant_id'] = 'u00'

print(f"Shape: {df.shape}")

## 2. Feature Engineering

In [None]:
# Run the project's location feature pipeline on the loaded frame.
df_features = process_location_features(df)

# Show the first 10 rows of a selected subset of columns as the cell output.
preview_columns = ['timestamp', 'distance_traveled', 'location_variance', 'location_cluster']
df_features[preview_columns].head(10)

## 3. Visualization

### 3.1 Trajectory Plot
Visualizing movement patterns. Color indicates cluster (Significant Place); points labelled as noise (cluster `-1`) are drawn as small grey markers.

In [None]:
# Draw both scatter layers on one explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))

# Rows with cluster label -1 are treated as noise; all others belong to a cluster.
mask = df_features['location_cluster'] != -1
clustered_points = df_features[mask]
noise_points = df_features[~mask]

# Cluster members: larger markers, coloured by cluster label.
sns.scatterplot(
    data=clustered_points,
    x='gps_lon',
    y='gps_lat',
    hue='location_cluster',
    palette='tab10',
    s=50,
    ax=ax,
)

# Noise: small, faint grey markers under a single legend entry.
sns.scatterplot(
    data=noise_points,
    x='gps_lon',
    y='gps_lat',
    color='grey',
    s=10,
    alpha=0.3,
    label='Noise',
    ax=ax,
)

ax.set_title("GPS Trajectory & Significant Places (Clusters)")
plt.show()

### 3.2 Distance Traveled Distribution

In [None]:
# Histogram of the non-missing distance values on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 4))
distances = df_features['distance_traveled'].dropna()

# log_scale=(False, True): linear x axis, logarithmic y (count) axis.
sns.histplot(distances, bins=30, log_scale=(False, True), ax=ax)

ax.set_title("Hourly Distance Traveled (km) - Log Scale")
ax.set_xlabel("Distance (km)")
plt.show()