# Building Test Dataset
---
---

# Imports

In [1]:
import pandas as pd
from pandas import json_normalize
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import time

# GPX file
import gpxpy
import gpxpy.gpx

# GPX file
from bs4 import BeautifulSoup
import lxml

# bearing
from geographiclib.geodesic import Geodesic

# distance
import haversine as hs

# API
import requests

%matplotlib inline
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter("ignore")

In [2]:
# Open .gpx file and parse xml
with open('../data/marin_century_classic_2022.gpx', 'r') as gpx_file:
    gpx = gpxpy.parse(gpx_file)

In [3]:
# check out .gpx file as xml
gpx.to_xml()[:2000]

'<?xml version="1.0" encoding="UTF-8"?>\n<gpx xmlns="http://www.topografix.com/GPX/1/1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.topografix.com/GPX/1/1 http://www.topografix.com/GPX/1/1/gpx.xsd" version="1.1" creator="StravaGPX">\n  <metadata>\n    <name>Marin Century Classic 2022</name>\n    <author>\n      <name>Adriana Machado</name>\n      <link href="https://www.strava.com/athletes/7321904">\n      </link>\n    </author>\n    <copyright author="OpenStreetMap contributors">\n      <year>2020</year>\n      <license>https://www.openstreetmap.org/copyright</license>\n    </copyright>\n    <link href="https://www.strava.com/routes/2951301224719649896">\n    </link>\n  </metadata>\n  <trk>\n    <name>Marin Century Classic 2022</name>\n    <link href="https://www.strava.com/routes/2951301224719649896">\n    </link>\n    <type>Ride</type>\n    <trkseg>\n      <trkpt lat="38.11272" lon="-122.65319000000001">\n        <ele>63.04</ele>\n      </trk

# GPX Import to Dataframe

In [4]:
# check length of tracks
print(f'Tracks: {len(gpx.tracks)}')
# check number of data points
print(f'No. of Track Points: {gpx.get_track_points_no()}')

# extract timestamp, lat, lon, and elevation from gpx file
route_info = []

for track in gpx.tracks:
    for segment in track.segments:
        for point in segment.points:
            route_info.append({
                'latitude': point.latitude,
                'longitude': point.longitude,
                'elevation': point.elevation,
            })

# create dataframe
test_df = pd.DataFrame(route_info)
test_df.head()

Tracks: 1
No. of Track Points: 2849


Unnamed: 0,latitude,longitude,elevation
0,38.11272,-122.65319,63.04
1,38.11343,-122.65308,63.4
2,38.1137,-122.65309,63.68
3,38.11425,-122.65358,64.76
4,38.11461,-122.65354,65.55


In [5]:
test_df.isnull().sum()

latitude     0
longitude    0
elevation    0
dtype: int64

## Create Bearing Column

In [6]:
def get_bearing(lat1, lat2, long1, long2):
    brng = Geodesic.WGS84.Inverse(lat1, long1, lat2, long2)['azi1']
    return brng

In [7]:
get_bearing(38.773795, 38.773860, -121.363652, -121.363689)

-24.017929532262528

In [8]:
test_df['bearing'] = 0
for i in range(1, len(test_df)):
    lat1 = test_df.latitude.iloc[i-1]
    lat2 = test_df.latitude.iloc[i]
    long1 = test_df.longitude.iloc[i-1]
    long2 = test_df.longitude.iloc[i]
    bearing = get_bearing(lat1, lat2, long1, long2)
    # https://stats.stackexchange.com/questions/283572/using-iloc-to-set-values
    test_df.bearing.iloc[[i]] = bearing

In [9]:
test_df.head()

Unnamed: 0,latitude,longitude,elevation,bearing
0,38.11272,-122.65319,63.04,0.0
1,38.11343,-122.65308,63.4,6.978612
2,38.1137,-122.65309,63.68,-1.676102
3,38.11425,-122.65358,64.76,-35.140713
4,38.11461,-122.65354,65.55,5.016818


# Create Elevation Columns

In [10]:
# ELEVATION CHANGE - METERS
test_df['ele_diff_m'] = 0
for i in range(1, len(test_df)-1):
    e1 = test_df.iloc[i]['elevation']
    e2 = test_df.iloc[i-1]['elevation']
    test_df.at[i, 'ele_diff_m'] = (e1 - e2)
test_df['total_ele_change_m'] = round(test_df['ele_diff_m'].cumsum(skipna = True), 4)
test_df.head()

Unnamed: 0,latitude,longitude,elevation,bearing,ele_diff_m,total_ele_change_m
0,38.11272,-122.65319,63.04,0.0,0.0,0.0
1,38.11343,-122.65308,63.4,6.978612,0.36,0.36
2,38.1137,-122.65309,63.68,-1.676102,0.28,0.64
3,38.11425,-122.65358,64.76,-35.140713,1.08,1.72
4,38.11461,-122.65354,65.55,5.016818,0.79,2.51


# Create Distance Columns

In [11]:
# DISTANCE - KILOMETERS
test_df['lat_lon'] = [(lat, lon) for lat, lon in zip(test_df['latitude'], test_df['longitude'])]
test_df['dist_diff_km'] = 0
for i in range(1, len(test_df)-1):
    loc1 = test_df.iloc[i]['lat_lon']
    loc2 = test_df.iloc[i-1]['lat_lon']
    test_df.at[i, 'dist_diff_km'] = round(hs.haversine(loc1, loc2), 4) # kilometers
test_df['total_dist_km'] = test_df['dist_diff_km'].cumsum(skipna = True)
test_df.head()

Unnamed: 0,latitude,longitude,elevation,bearing,ele_diff_m,total_ele_change_m,lat_lon,dist_diff_km,total_dist_km
0,38.11272,-122.65319,63.04,0.0,0.0,0.0,"(38.11272, -122.65319000000001)",0.0,0.0
1,38.11343,-122.65308,63.4,6.978612,0.36,0.36,"(38.11343, -122.65308000000002)",0.0795,0.0795
2,38.1137,-122.65309,63.68,-1.676102,0.28,0.64,"(38.1137, -122.65309)",0.03,0.1095
3,38.11425,-122.65358,64.76,-35.140713,1.08,1.72,"(38.114250000000006, -122.65358)",0.0747,0.1842
4,38.11461,-122.65354,65.55,5.016818,0.79,2.51,"(38.114610000000006, -122.65354)",0.0402,0.2244


In [12]:
test_df.tail()

Unnamed: 0,latitude,longitude,elevation,bearing,ele_diff_m,total_ele_change_m,lat_lon,dist_diff_km,total_dist_km
2844,38.11461,-122.65354,65.49,-152.817726,-0.18,2.45,"(38.114610000000006, -122.65354)",0.025,161.3451
2845,38.11425,-122.65358,64.76,-174.983157,-0.73,1.72,"(38.114250000000006, -122.65358)",0.0402,161.3853
2846,38.1137,-122.65309,63.68,144.858985,-1.08,0.64,"(38.1137, -122.65309)",0.0747,161.46
2847,38.11343,-122.65308,63.4,178.323892,-0.28,0.36,"(38.11343, -122.65308000000002)",0.03,161.49
2848,38.11299,-122.65314,63.22,-173.850794,0.0,0.36,"(38.11299, -122.65314000000001)",0.0,161.49


# Time and Elapsed Time

In [17]:
# determine start date and time
# model time_diff_s total_time_s
# add predictions and convert to dt's and timestamps from start time

# Get Weather Variables

In [13]:
def hist_weather(unix_datetime, lat, lon):
    '''
    Using the OpenWeather OneCall 3.0 API to pull historical daily data
    Input: UNIX datetime for day requesting
    Output: Pandas dataframe for that day's weather
    '''
    
    url_hist_point = f"https://api.openweathermap.org/data/3.0/onecall/timemachine?lat={lat}&lon={lon}&dt={unix_datetime}&appid=d43015e86fe9a04995afe46ef9ad42c4"
    hist_point_req = requests.get(url_hist_point)

    wd_hist = hist_point_req.json()

    wd_hist_df = json_normalize(wd_hist['data'])
    wd_hist_df = wd_hist_df[['dt', 'temp', 'feels_like', 'pressure', 'humidity',
           'dew_point', 'clouds', 'wind_speed', 'wind_deg']]

    
    return wd_hist_df

In [16]:
'''
t_weather = pd.DataFrame()

for i in range(len(test_df)): # iterate through each trackpoint
    # 600 calls per minute
    unix = test_df['dt'][i]
    lat = test_df['latitude'][i]
    lon = test_df['longitude'][i]
    weather = hist_weather(unix, lat, lon) # get historical weather for trackpoint
    t_weather = t_weather.append(weather) # append to weather dataframe
        
test_df = test_df.merge(t_weather, how = 'inner', on = 'dt') # merge activity dataframe with weather dataframe on unix time stamp
'''

"\nt_weather = pd.DataFrame()\n\nfor i in range(len(test_df)): # iterate through each trackpoint\n    # 600 calls per minute\n    unix = test_df['dt'][i]\n    lat = test_df['latitude'][i]\n    lon = test_df['longitude'][i]\n    weather = hist_weather(unix, lat, lon) # get historical weather for trackpoint\n    t_weather = t_weather.append(weather) # append to weather dataframe\n        \ntest_df = test_df.merge(t_weather, how = 'inner', on = 'dt') # merge activity dataframe with weather dataframe on unix time stamp\n"