In [4]:
import os
import numpy as np
from collections import Counter
from typing import Generator, Optional, Dict
from fitparse import FitFile, FitParseError
from dataclasses import dataclass
import traceback
import concurrent.futures
from threading import Lock
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import csv

In [18]:

def preprocess_fit_files(fit_directory: str, output_path: str = 'data/processed/activities.csv') -> None:
    """
    Process all FIT files in directory and save relevant features to CSV incrementally.
    """
    
    # Create output directory if it doesn't exist
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    
    # Get all fit files
    fit_files = [f for f in Path(fit_directory).glob('*.fit')]
    print(f"Found {len(fit_files)} FIT files")
    
    # Initialize counters
    skipped_files = 0
    processed_files = 0
    total_records = 0
    
    # Define CSV headers
    headers = ['activity_id', 'timestamp', 'sport', 'heart_rate', 'speed', 
              'cadence', 'power', 'latitude', 'longitude']
    
    # Create/open CSV file with headers
    with open(output_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        
        # Process each file
        for fit_path in tqdm(fit_files):
            try:
                fit_path_str = str(fit_path)
                fit_file = FitFile(fit_path_str)
                session_msg = next(fit_file.get_messages('session'))
                if session_msg.get_value('sport') not in ['running', 'cycling']:
                    continue
                if session_msg.get_value('sub_sport') in ['indoor_running', 'indoor_cycling', 'virtual_activity']:
                    continue
                records = list(fit_file.get_messages('record'))
                has_position_data = any(r.get_value('position_lat') is not None for r in records[:60])
                if not has_position_data:
                    continue
                
                # If we get here, process the activity records
                fit_file = FitFile(str(fit_path))
                activity_id = fit_path.stem  # filename without extension
                
                activity_records = []  # Buffer for this activity's records
                
                for record in fit_file.get_messages('record'):
                    try:
                        # Extract relevant features
                        record_dict = {
                            'activity_id': activity_id,
                            'timestamp': record.get_value('timestamp'),
                            'sport': session_msg.get_value('sport'),
                            'heart_rate': record.get_value('heart_rate'),
                            'speed': record.get_value('speed'),
                            'cadence': record.get_value('cadence'),
                            'power': record.get_value('power'),
                            'latitude': record.get_value('position_lat'),
                            'longitude': record.get_value('position_long')
                        }
                        
                        # Only add records that have at least heart rate and speed
                        if record_dict['heart_rate'] is not None and record_dict['speed'] is not None:
                            activity_records.append(record_dict)
                            
                    except Exception as e:
                        print(f"Error processing record in {fit_path}: {str(e)}")
                        continue
                
                # Write all records for this activity
                writer.writerows(activity_records)
                
                total_records += len(activity_records)
                processed_files += 1
                
                # Flush periodically to ensure data is written to disk
                if processed_files % 100 == 0:
                    f.flush()
                
            except Exception as e:
                print(f"Error processing file {fit_path}: {str(e)}")
                skipped_files += 1
                continue
    
    print(f"\nProcessing complete:")
    print(f"Processed {processed_files} files")
    print(f"Skipped {skipped_files} files")
    print(f"Total records: {total_records}")
    print(f"\nSaved to {output_path}")


In [17]:
preprocess_fit_files('./data/fit/', output_path='./data/records.csv')

Found 3090 FIT files


  0%|          | 1/3090 [00:00<15:43,  3.27it/s]

495122656 -973571951
495122656.0 -973571951.0
495122740 -973571812
495122740.0 -973571812.0
495122899 -973571640
495122899.0 -973571640.0
495123067 -973571365
495123067.0 -973571365.0
495123270 -973570973
495123270.0 -973570973.0
495123480 -973570693
495123480.0 -973570693.0
495123709 -973570202
495123709.0 -973570202.0
495123859 -973569831
495123859.0 -973569831.0
495123953 -973569446
495123953.0 -973569446.0
495124062 -973569032
495124062.0 -973569032.0
495124185 -973568690
495124185.0 -973568690.0
495124329 -973568175
495124329.0 -973568175.0
495124354 -973567714
495124354.0 -973567714.0
495124344 -973567267
495124344.0 -973567267.0
495124361 -973566885
495124361.0 -973566885.0
495124386 -973566406
495124386.0 -973566406.0
495124469 -973565921
495124469.0 -973565921.0
495124554 -973565359
495124554.0 -973565359.0
495124525 -973564897
495124525.0 -973564897.0
495124365 -973564603
495124365.0 -973564603.0
495124130 -973564486
495124130.0 -973564486.0
495123735 -973564364
495123735.0 -

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x10693de50>>
Traceback (most recent call last):
  File "/Users/agindin/code/KineticAI/.venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


495009935 -973313795
495009935.0 -973313795.0
495010219 -973313514
495010219.0 -973313514.0
495010540 -973313280
495010540.0 -973313280.0
495010860 -973313036
495010860.0 -973313036.0
495011111 -973312759
495011111.0 -973312759.0
495011417 -973312449
495011417.0 -973312449.0
495011762 -973312168
495011762.0 -973312168.0
495012199 -973311835
495012199.0 -973311835.0
495012504 -973311601
495012504.0 -973311601.0
495012764 -973311379
495012764.0 -973311379.0
495013039 -973311189
495013039.0 -973311189.0
495013334 -973310980
495013334.0 -973310980.0
495013556 -973310787
495013556.0 -973310787.0
495013905 -973310499
495013905.0 -973310499.0
495014152 -973310303
495014152.0 -973310303.0
495014428 -973310076
495014428.0 -973310076.0
495014815 -973309725
495014815.0 -973309725.0
495015117 -973309428
495015117.0 -973309428.0
495015428 -973309174
495015428.0 -973309174.0
495015735 -973308894
495015735.0 -973308894.0
495016066 -973308606
495016066.0 -973308606.0
495016357 -973308335
495016357.0 -

  0%|          | 3/3090 [00:03<1:01:14,  1.19s/it]

495125271 -973572451
495125271.0 -973572451.0
495125267 -973572456
495125267.0 -973572456.0
495125179 -973572881
495125179.0 -973572881.0
495125177 -973573390
495125177.0 -973573390.0
495125164 -973574078
495125164.0 -973574078.0
495125103 -973574874
495125103.0 -973574874.0
495124936 -973575655
495124936.0 -973575655.0
495124620 -973576369
495124620.0 -973576369.0
495124139 -973577045
495124139.0 -973577045.0
495123516 -973577682
495123516.0 -973577682.0
495122790 -973578276
495122790.0 -973578276.0
495122014 -973578803
495122014.0 -973578803.0
495121283 -973579385
495121283.0 -973579385.0
495120559 -973579950
495120559.0 -973579950.0
495119856 -973580383
495119856.0 -973580383.0
495119144 -973580616
495119144.0 -973580616.0
495118367 -973580612
495118367.0 -973580612.0
495117715 -973580239
495117715.0 -973580239.0
495117352 -973579502
495117352.0 -973579502.0
495117144 -973578561
495117144.0 -973578561.0
495117035 -973577564
495117035.0 -973577564.0
495117007 -973576579
495117007.0 -

  0%|          | 4/3090 [00:05<1:24:31,  1.64s/it]

567305999 -1458599477
567305999.0 -1458599477.0
567304375 -1458596702
567304375.0 -1458596702.0
567302947 -1458594182
567302947.0 -1458594182.0
567300799 -1458592848
567300799.0 -1458592848.0
567298645 -1458591657
567298645.0 -1458591657.0
567296436 -1458590104
567296436.0 -1458590104.0
567294307 -1458588711
567294307.0 -1458588711.0
567292498 -1458586639
567292498.0 -1458586639.0
567292132 -1458583481
567292132.0 -1458583481.0
567292968 -1458581885
567292968.0 -1458581885.0
567294253 -1458581837
567294253.0 -1458581837.0
567296348 -1458582687
567296348.0 -1458582687.0
567298794 -1458583676
567298794.0 -1458583676.0
567301128 -1458583871
567301128.0 -1458583871.0
567301903 -1458583015
567301903.0 -1458583015.0
567302771 -1458580659
567302771.0 -1458580659.0
567305233 -1458580868
567305233.0 -1458580868.0
567307655 -1458581076
567307655.0 -1458581076.0
567309702 -1458579593
567309702.0 -1458579593.0
567311616 -1458578146
567311616.0 -1458578146.0
567311811 -1458578064
567311811.0 -14585

  0%|          | 7/3090 [00:07<48:25,  1.06it/s]  

513827720 -852991083
513827720.0 -852991083.0
513828630 -852990278
513828630.0 -852990278.0
513829580 -852989469
513829580.0 -852989469.0
513829580 -852989469
513829580.0 -852989469.0
513831522 -852987780
513831522.0 -852987780.0
513832450 -852986786
513832450.0 -852986786.0
513833305 -852985727
513833305.0 -852985727.0
513834062 -852984526
513834062.0 -852984526.0
513834720 -852983218
513834720.0 -852983218.0
513835252 -852981909
513835252.0 -852981909.0
513835671 -852980633
513835671.0 -852980633.0
513836139 -852979373
513836139.0 -852979373.0
513836139 -852979373
513836139.0 -852979373.0
513836693 -852978197
513836693.0 -852978197.0
513837885 -852976467
513837885.0 -852976467.0
513838476 -852975878
513838476.0 -852975878.0
513838929 -852975449
513838929.0 -852975449.0
513839247 -852975132
513839247.0 -852975132.0
513839456 -852974872
513839456.0 -852974872.0
513839555 -852974660
513839555.0 -852974660.0
513839629 -852974440
513839629.0 -852974440.0
513839670 -852974146
513839670.0 -

  0%|          | 8/3090 [00:08<45:23,  1.13it/s]

 -973376256
495001696.0 -973376256.0
495001696 -973376256
495001696.0 -973376256.0
495001824 -973377088
495001824.0 -973377088.0
495001824 -973377088
495001824.0 -973377088.0
495002144 -973377920
495002144.0 -973377920.0
495002144 -973377920
495002144.0 -973377920.0
495002176 -973378688
495002176.0 -973378688.0
495002176 -973378688
495002176.0 -973378688.0
495002240 -973379520
495002240.0 -973379520.0
495002240 -973379520
495002240.0 -973379520.0
495002560 -973380352
495002560.0 -973380352.0
495002560 -973380352
495002560.0 -973380352.0
495002752 -973381056
495002752.0 -973381056.0
495002752 -973381056
495002752.0 -973381056.0
495003008 -973381824
495003008.0 -973381824.0
495003008 -973381824
495003008.0 -973381824.0
495003200 -973382656
495003200.0 -973382656.0
495003200 -973382656
495003200.0 -973382656.0
495003552 -973383296
495003552.0 -973383296.0
495003552 -973383296
495003552.0 -973383296.0
495003744 -973384000
495003744.0 -973384000.0
495003744 -973384000
495003744.0 -973384000

  0%|          | 9/3090 [00:08<36:22,  1.41it/s]

495246080 -973567864
495246080.0 -973567864.0
495245720 -973568043
495245720.0 -973568043.0
495245061 -973568348
495245061.0 -973568348.0
495244918 -973569064
495244918.0 -973569064.0
495244715 -973569550
495244715.0 -973569550.0
495244614 -973569975
495244614.0 -973569975.0
495244648 -973570238
495244648.0 -973570238.0
495244364 -973570371
495244364.0 -973570371.0
495244142 -973570642
495244142.0 -973570642.0
495244027 -973571000
495244027.0 -973571000.0
495243548 -973571143
495243548.0 -973571143.0
495243354 -973571485
495243354.0 -973571485.0
495243060 -973571832
495243060.0 -973571832.0
495242749 -973572160
495242749.0 -973572160.0
495242464 -973572416
495242464.0 -973572416.0
495242224 -973572578
495242224.0 -973572578.0
495242037 -973572642
495242037.0 -973572642.0
495241866 -973572782
495241866.0 -973572782.0
495241705 -973572993
495241705.0 -973572993.0
495241436 -973573130
495241436.0 -973573130.0
495241153 -973573238
495241153.0 -973573238.0
495240936 -973573189
495240936.0 -

  0%|          | 12/3090 [00:10<36:19,  1.41it/s]

495280045.0 -973575316.0
495280222 -973575336
495280222.0 -973575336.0
495280509 -973575334
495280509.0 -973575334.0
495280702 -973575284
495280702.0 -973575284.0
495280917 -973575189
495280917.0 -973575189.0
495281215 -973575126
495281215.0 -973575126.0
495281544 -973575016
495281544.0 -973575016.0
495281957 -973574800
495281957.0 -973574800.0
495282352 -973574591
495282352.0 -973574591.0
495282568 -973574390
495282568.0 -973574390.0
495282765 -973574153
495282765.0 -973574153.0
495282988 -973573895
495282988.0 -973573895.0
495283161 -973573640
495283161.0 -973573640.0
495283317 -973573369
495283317.0 -973573369.0
495283430 -973573095
495283430.0 -973573095.0
495283551 -973572765
495283551.0 -973572765.0
495283666 -973572398
495283666.0 -973572398.0
495283789 -973571957
495283789.0 -973571957.0
495283885 -973571489
495283885.0 -973571489.0
495283966 -973571009
495283966.0 -973571009.0
495284067 -973570502
495284067.0 -973570502.0
495284114 -973570116
495284114.0 -973570116.0
495284172

  0%|          | 13/3090 [00:13<1:01:21,  1.20s/it]

512788646 -852726726
512788646.0 -852726726.0
512788539 -852726333
512788539.0 -852726333.0
512788517 -852725930
512788517.0 -852725930.0
512788394 -852725523
512788394.0 -852725523.0
512788200 -852725070
512788200.0 -852725070.0
512788053 -852724697
512788053.0 -852724697.0
512787933 -852724328
512787933.0 -852724328.0
512787789 -852723905
512787789.0 -852723905.0
512787626 -852723492
512787626.0 -852723492.0
512787435 -852723125
512787435.0 -852723125.0
512787192 -852722756
512787192.0 -852722756.0
512786981 -852722406
512786981.0 -852722406.0
512786782 -852722156
512786782.0 -852722156.0
512786487 -852721838
512786487.0 -852721838.0
512786286 -852721526
512786286.0 -852721526.0
512786084 -852721227
512786084.0 -852721227.0
512785885 -852720889
512785885.0 -852720889.0
512785710 -852720528
512785710.0 -852720528.0
512785418 -852720103
512785418.0 -852720103.0
512785055 -852719671
512785055.0 -852719671.0
512784813 -852719257
512784813.0 -852719257.0
512784608 -852718824
512784608.0 -

  1%|          | 16/3090 [00:13<27:55,  1.83it/s]  

In [5]:
def convert_coordinates(input_path: str, output_path: str):
    """Convert coordinates from semicircles to degrees in place."""
    
    print("Converting coordinates from semicircles to degrees...")
    
    # Conversion constant
    SEMICIRCLES_TO_DEGREES = 180.0 / (2**31)
    
    # Read CSV in chunks to handle large files
    chunk_size = 100000
    first_chunk = True
    
    for chunk in pd.read_csv(input_path, chunksize=chunk_size):
        # Convert coordinates
        chunk['latitude'] = chunk['latitude'] * SEMICIRCLES_TO_DEGREES
        chunk['longitude'] = chunk['longitude'] * SEMICIRCLES_TO_DEGREES
        
        # Write to file
        if first_chunk:
            chunk.to_csv(output_path, index=False, mode='w')
            first_chunk = False
        else:
            chunk.to_csv(output_path, index=False, mode='a', header=False)
    
    print("Conversion complete!")
    
    # Verify a few values
    df_sample = pd.read_csv(output_path, nrows=5)
    print("\nSample of converted coordinates:")
    print(df_sample[['latitude', 'longitude']].head())

In [6]:
convert_coordinates(
    input_path='./data/records.csv',
    output_path='./data/records_converted.csv'
)

Converting coordinates from semicircles to degrees...
Conversion complete!

Sample of converted coordinates:
    latitude  longitude
0  41.500921 -81.603900
1  41.500920 -81.603901
2  41.500913 -81.603936
3  41.500913 -81.603979
4  41.500912 -81.604037


In [14]:
def verify_locations(input_path: str):
    """Sample one location per 30 minutes per activity and get city-level location data."""
    
    print("Loading and preprocessing data...")
    df = pd.read_csv(input_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    
    # Sample one location per 30 minutes per activity
    sampled_locations = []
    for activity_id, group in df.groupby('activity_id'):
        # Get location at each 30-minute mark
        activity_samples = group.groupby(pd.Grouper(key='timestamp', freq='60min')).agg({
            'latitude': 'first',
            'longitude': 'first'
        }).dropna()
        sampled_locations.append(activity_samples)
    
    locations_df = pd.concat(sampled_locations)
    print(f"Sampled {len(locations_df)} locations from {len(df['activity_id'].unique())} activities")
    
    # Get unique lat/lon pairs to minimize API calls
    unique_locations = locations_df.drop_duplicates(['latitude', 'longitude'])
    print(f"Found {len(unique_locations)} unique locations")
    
    # Query Nominatim for each unique location
    location_info = {}
    for idx, row in tqdm(unique_locations.iterrows(), total=len(unique_locations)):
        try:
            url = "https://nominatim.openstreetmap.org/reverse"
            params = {
                'lat': row['latitude'],
                'lon': row['longitude'],
                'format': 'jsonv2',
                'zoom': 10  # City level
            }
            
            response = requests.get(
                url, 
                params=params,
                headers={'User-Agent': 'KineticAI/1.0'}  # Required by Nominatim
            )
            response.raise_for_status()
            data = response.json()
            
            # Get city or nearest populated place
            city = (
                data['address'].get('city') or 
                data['address'].get('town') or 
                data['address'].get('village') or
                data['address'].get('suburb')
            )
            
            state = (
                data['address'].get('state') or
                data['address'].get('state_district')
            )
            
            country = data['address'].get('country')
            
            location_key = f"{city}, {state}, {country}" if state else f"{city}, {country}"
            location_info[(row['latitude'], row['longitude'])] = location_key
            
            # Respect rate limit
            time.sleep(1)
            
        except Exception as e:
            print(f"Error getting location info for {row['latitude']}, {row['longitude']}: {str(e)}")
            continue
    
    # Count activities per location
    location_counts = Counter()
    for idx, row in locations_df.iterrows():
        loc_key = location_info.get((row['latitude'], row['longitude']))
        if loc_key:
            location_counts[loc_key] += 1
    
    # Print results
    print("\nActivity distribution by location:")
    for location, count in sorted(location_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"{location}: {count} records")

In [15]:
verify_locations('./data/records_converted.csv')

Loading and preprocessing data...
Sampled 902 locations from 387 activities
Found 891 unique locations


 24%|██▍       | 217/891 [04:52<12:18,  1.10s/it]

Error getting location info for 0.0, 0.0: 'address'


100%|██████████| 891/891 [19:58<00:00,  1.35s/it]



Activity distribution by location:
Manchester, New Hampshire, United States: 292 records
Cleveland, Ohio, United States: 141 records
None, Virginia, United States: 37 records
City of Yonkers, New York, United States: 35 records
Cleveland Heights, Ohio, United States: 34 records
Goffstown, New Hampshire, United States: 24 records
Ludlow, Vermont, United States: 22 records
Bedford, New Hampshire, United States: 20 records
Hooksett, New Hampshire, United States: 19 records
Town of Bedford, New York, United States: 15 records
Shaker Heights, Ohio, United States: 15 records
Ann Arbor, Michigan, United States: 14 records
City of New York, New York, United States: 13 records
None, Hawaii, United States: 13 records
Pepper Pike, Ohio, United States: 12 records
Merrimack, New Hampshire, United States: 10 records
Dunbarton, New Hampshire, United States: 10 records
None, New York, United States: 9 records
Gates Mills, Ohio, United States: 9 records
None, California, United States: 9 records
Beach

In [22]:
import pandas as pd
import requests
from datetime import datetime, timedelta
import time
from tqdm import tqdm

def add_weather_data(input_path: str, output_path: str):
    """Add weather data to activity records, minimizing API calls by sampling every 30 minutes per activity."""
    
    print("Loading and preprocessing data...")
    df = pd.read_csv(input_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    
    # Filter out (0,0) coordinates
    df = df[~((df['latitude'] == 0) & (df['longitude'] == 0))]
    df = df.sort_values(['activity_id', 'timestamp'])
    
    # Create weather timestamp column (rounded to nearest 30 minutes)
    df['weather_timestamp'] = df['timestamp'].dt.floor('30min')
    df['lat_rounded'] = df['latitude'].round(2)
    df['lon_rounded'] = df['longitude'].round(2)
    
    # For each activity, sample location every 30 minutes
    weather_points = df.groupby(['activity_id', 'weather_timestamp']).agg({
        'lat_rounded': 'first',
        'lon_rounded': 'first'
    }).reset_index()
    
    # Get unique location-time combinations
    unique_queries = weather_points.drop_duplicates(['lat_rounded', 'lon_rounded', 'weather_timestamp'])
    print(f"Found {len(unique_queries)} unique location-time combinations")
    
    # Initialize weather data storage
    weather_cache = {}
    
    # Weather variables we want
    hourly_params = [
        'temperature_2m',
        'relative_humidity_2m',
        'dew_point_2m',
        'wind_speed_10m',
        'wind_direction_10m',
        'precipitation',
        'cloud_cover',
        'surface_pressure'
    ]
    
    print("Fetching weather data...")
    # Process in batches to avoid rate limits
    batch_size = 100
    for i in tqdm(range(0, len(unique_queries), batch_size)):
        batch = unique_queries.iloc[i:i+batch_size]
        
        for _, row in batch.iterrows():
            cache_key = (row['lat_rounded'], row['lon_rounded'], row['weather_timestamp'])
            
            if cache_key in weather_cache:
                continue
                
            # Get weather data for this location and hour
            start_date = row['weather_timestamp'].strftime('%Y-%m-%d')
            end_date = (row['weather_timestamp'] + timedelta(days=1)).strftime('%Y-%m-%d')
            
            url = 'https://archive-api.open-meteo.com/v1/archive'
            params = {
                'latitude': row['lat_rounded'],
                'longitude': row['lon_rounded'],
                'start_date': start_date,
                'end_date': end_date,
                'hourly': ','.join(hourly_params)
            }
            
            try:
                response = requests.get(url, params=params)
                response.raise_for_status()
                data = response.json()
                
                # Find the matching hour in the response
                target_hour = row['weather_timestamp']
                target_hour_str = target_hour.strftime('%Y-%m-%dT%H:00')
                
                try:
                    hour_index = data['hourly']['time'].index(target_hour_str)
                    
                    # Store weather data for this location-hour
                    weather_cache[cache_key] = {
                        param: data['hourly'][param][hour_index]
                        for param in hourly_params
                    }
                except ValueError:
                    print(f"Could not find hour {target_hour_str} in weather data")
                    continue
                
                # Respect rate limits
                time.sleep(0.1)
                
            except Exception as e:
                print(f"Error fetching weather data for {row['lat_rounded']}, {row['lon_rounded']}, {row['weather_timestamp']}: {str(e)}")
                continue
    
    print("Adding weather data to records...")
    # Create a new dataframe with weather data
    weather_data = []
    
    # Group by activity to ensure we use the right weather data within each activity
    for activity_id, group in df.groupby('activity_id'):
        # Get weather points for this activity
        activity_weather = weather_points[weather_points['activity_id'] == activity_id]
        
        # For each record in the activity
        for _, record in group.iterrows():
            # Find the closest weather timestamp for this record
            weather_matches = activity_weather[
                (activity_weather['weather_timestamp'] >= record['weather_timestamp'] - pd.Timedelta(minutes=30)) &
                (activity_weather['weather_timestamp'] <= record['weather_timestamp'] + pd.Timedelta(minutes=30))
            ]
            
            if len(weather_matches) > 0:
                # Use the closest weather point in time
                closest_weather = weather_matches.iloc[0]
                cache_key = (
                    closest_weather['lat_rounded'],
                    closest_weather['lon_rounded'],
                    closest_weather['weather_timestamp']
                )
                
                if cache_key in weather_cache:
                    weather_row = weather_cache[cache_key].copy()
                    weather_row['activity_id'] = record['activity_id']
                    weather_row['timestamp'] = record['timestamp']
                    weather_data.append(weather_row)
                    continue
            
            # If no match found or no weather data, add empty record
            weather_data.append({
                'activity_id': record['activity_id'],
                'timestamp': record['timestamp'],
                **{param: None for param in hourly_params}
            })
    
    weather_df = pd.DataFrame(weather_data)
    
    # Merge weather data with original data
    result = pd.merge(
        df.drop(['lat_rounded', 'lon_rounded', 'weather_timestamp'], axis=1),
        weather_df,
        on=['activity_id', 'timestamp'],
        how='left'
    )
    
    # Save to new CSV
    result.to_csv(output_path, index=False)
    print(f"Saved enriched data to {output_path}")
    
    # Print some stats
    print("\nWeather data statistics:")
    for param in hourly_params:
        missing = result[param].isna().sum()
        print(f"{param}: {missing} missing values ({missing/len(result)*100:.1f}%)")

In [23]:
add_weather_data(
    input_path='./data/records_converted.csv',
    output_path='./data/records_with_weather.csv'
)

Loading and preprocessing data...
Found 1409 unique location-time combinations
Fetching weather data...


100%|██████████| 15/15 [12:10<00:00, 48.73s/it]


Adding weather data to records...
Saved enriched data to ./data/records_with_weather.csv

Weather data statistics:
temperature_2m: 0 missing values (0.0%)
relative_humidity_2m: 0 missing values (0.0%)
dew_point_2m: 0 missing values (0.0%)
wind_speed_10m: 0 missing values (0.0%)
wind_direction_10m: 0 missing values (0.0%)
precipitation: 0 missing values (0.0%)
cloud_cover: 0 missing values (0.0%)
surface_pressure: 0 missing values (0.0%)
