In [1]:
import pandas as pd
import numpy as np
import sys
import os

# Add the project root to the import path so the local `db` module resolves.
# The location can be overridden through the DATA_PROBLEM_SOLVING_ROOT
# environment variable; the original hard-coded path remains the default.
PROJECT_ROOT = os.environ.get("DATA_PROBLEM_SOLVING_ROOT", r"D:\data-problem-solving")
if PROJECT_ROOT not in sys.path:
    # Guard keeps repeated runs of this cell from stacking duplicate entries.
    sys.path.append(PROJECT_ROOT)

from db import load_table, engine

# `load_table` comes from the project's `db` module (not shown here); the
# result is used as a DataFrame by the later cells (`df.groupby(...)`).
df =  load_table('fitness_sessions')

In [2]:
# Haversine distance in kilometers
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point, in degrees.
    lat2, lon2 : latitude / longitude of the second point, in degrees.
        Inputs are passed through numpy ufuncs, so scalars and array-likes
        of matching (broadcastable) shape are both accepted.

    Returns
    -------
    Distance in kilometres, based on an Earth radius of 6371 km.
    """
    R = 6371  # Earth radius in km
    # The trigonometric functions below work in radians.
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R * np.arcsin(np.sqrt(a))

# Flat distance approximation (in km)
def flat_distance(lat1, lon1, lat2, lon2):
    """Planar distance approximation between two coordinate pairs.

    Treats the latitude and longitude differences as the two legs of a
    right triangle, takes the Euclidean length, and multiplies it by 111
    (a rough scale factor to kilometres). No correction is applied for
    latitude, so this is intentionally cruder than the haversine formula.
    """
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1
    planar_length = np.sqrt(delta_lat**2 + delta_lon**2)
    return planar_length * 111  # Rough scale factor

In [3]:
def _session_distance_row(session_id, steps):
    """Build one summary record for a session from its first and last step.

    The first/last step are the rows with the smallest and largest
    `step_id`. Both distances are measured between those two rows only.
    Note that the `avg_distance_*` keys hold this single per-session
    value; no averaging happens here.
    """
    first_step = steps.loc[steps['step_id'].idxmin()]
    last_step = steps.loc[steps['step_id'].idxmax()]

    endpoints = (first_step['latitude'], first_step['longitude'],
                 last_step['latitude'], last_step['longitude'])
    curved_km = haversine(*endpoints)
    flat_km = flat_distance(*endpoints)

    return {
        'session_id': session_id,
        'avg_distance_curved': round(curved_km, 4),
        'avg_distance_flat': round(flat_km, 4),
        'difference': round(abs(flat_km - curved_km), 4)
    }


# One record per session; sessions with a single step are skipped because
# they have no distinct start/end pair to measure.
result_rows = [
    _session_distance_row(session_id, steps)
    for session_id, steps in df.groupby('session_id')
    if steps.shape[0] > 1
]

result_df = pd.DataFrame(result_rows)
print(result_df)

   session_id  avg_distance_curved  avg_distance_flat  difference
0         101               0.0283             0.0314      0.0030
1         102               0.0144             0.0157      0.0013
2         201               0.0140             0.0157      0.0017
3         202               0.0131             0.0157      0.0026
4         301               0.0133             0.0157      0.0024
