# TRAIN DATA
*Uses code from notebooks `00_1_garmin_gpx.ipynb` and `00_2_openweather_api.ipynb` to build the two training datasets: female average-performance and female high-performance cycling. See those notebooks for the sources of the code.*

---
---

# Imports

In [1]:
import pandas as pd
from pandas import json_normalize
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import time

# GPX file
import gpxpy
import gpxpy.gpx

# GPX extension data (heart rate)
from bs4 import BeautifulSoup
import lxml

# bearing
from geographiclib.geodesic import Geodesic

# distance
import haversine as hs

# API
import requests

%matplotlib inline
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter("ignore")

In [1]:
import os

# SECURITY(review): an API key committed to source control should be treated as
# leaked. Prefer supplying it through the OPENWEATHER_API_KEY environment
# variable; the literal below is kept only as a backward-compatible fallback so
# existing runs keep working, and should be rotated and removed.
API_key = os.environ.get('OPENWEATHER_API_KEY') or 'd43015e86fe9a04995afe46ef9ad42c4'

# Functions
---

## Historical Weather

In [2]:
def hist_weather(unix_datetime, lat, lon, API_key = API_key, timeout = 30):
    '''
    Using the OpenWeather OneCall 3.0 API ("timemachine" endpoint) to pull
    historical weather for a single timestamp and location.
    Requires API key for at minimum Startup OpenWeather subscription

    Input:
        unix_datetime: UNIX timestamp (seconds) of the moment requested
        lat, lon: latitude / longitude of the location, in decimal degrees
        API_key: OpenWeather API key (defaults to the notebook-level key)
        timeout: seconds to wait for the HTTP response before giving up
    Output: Pandas dataframe for that historical datetime's weather with the
        columns dt, temp, feels_like, pressure, humidity, dew_point, clouds,
        wind_speed, wind_deg. No `units` parameter is sent, so values come
        back in the API's default units.
    Raises:
        requests.HTTPError: the API answered with a 4xx/5xx status
            (bad key, quota exceeded, ...)
        requests.Timeout: no response within `timeout` seconds
        KeyError: the response has no 'data' entry or lacks an expected column
    '''

    url_hist_point = f"https://api.openweathermap.org/data/3.0/onecall/timemachine?lat={lat}&lon={lon}&dt={unix_datetime}&appid={API_key}"

    # A timeout keeps a long per-trackpoint loop from hanging forever on one call
    hist_point_req = requests.get(url_hist_point, timeout = timeout)
    # Fail loudly on auth / rate-limit errors instead of a confusing KeyError below
    hist_point_req.raise_for_status()

    wd_hist = hist_point_req.json()

    wd_hist_df = json_normalize(wd_hist['data'])
    # keep only the weather fields used downstream
    wd_hist_df = wd_hist_df[['dt', 'temp', 'feels_like', 'pressure', 'humidity',
           'dew_point', 'clouds', 'wind_speed', 'wind_deg']]

    return wd_hist_df

## GPX to Dataframe

In [3]:
def gpx_to_df(filepath):
    '''
    Parse a Garmin .gpx file into a per-trackpoint Pandas dataframe.

    Input: path to a .gpx file whose track points carry an `ns3:hr`
        (heart rate) extension
    Output: Pandas dataframe with one row per track point and the columns
        timestamp, latitude, longitude, elevation, dt (UNIX seconds),
        heart_rate (text as found in the file), bearing (degrees, azimuth from
        the previous point), time_diff_s / total_time_s,
        ele_diff_m / total_ele_change_m, lat_lon (tuple),
        dist_diff_km / total_dist_km.
        All "diff" columns are relative to the previous track point and are 0
        for the first point.
    Raises:
        ValueError: the file contains no track points, or the number of heart
            rate readings does not match the number of track points
    Side effects: prints the file name, number of tracks and number of points.
    '''

    # Open .gpx file and parse xml
    with open(filepath, 'r') as gpx_file:
        gpx = gpxpy.parse(gpx_file)

    # check file
    print(f'File: {filepath}')
    # check length of tracks
    print(f'Tracks: {len(gpx.tracks)}')
    # check number of data points
    print(f'No. of Track Points: {gpx.get_track_points_no()}')

    # extract timestamp, lat, lon, and elevation from gpx file
    route_info = []

    for track in gpx.tracks:
        for segment in track.segments:
            for point in segment.points:
                route_info.append({
                    'timestamp': point.time,
                    'latitude': point.latitude,
                    'longitude': point.longitude,
                    'elevation': point.elevation,
                })

    # An empty frame has no columns, so fail here with a clear message
    if not route_info:
        raise ValueError(f'No track points found in {filepath}')

    # create dataframe
    route_df = pd.DataFrame(route_info)
    # create UNIX column for later
    # https://statisticsglobe.com/convert-datetime-to-unix-timestamp-python
    route_df['dt'] = route_df['timestamp'].apply(lambda z: int(datetime.datetime.timestamp(z)))

    #######################
    # HEART RATE - REMOVED CADENCE DUE TO LACK OF SENSOR DATA EARLIER THIS YEAR
    # add extension data with beautifulsoup
    xml = gpx.to_xml()
    soup = BeautifulSoup(xml, features = 'xml')

    # Use Beautiful Soup to find all heart rate tags and keep their text
    # https://stackoverflow.com/questions/69420686/bs4-element-resultset-elements-to-a-list
    hr_list = [hr_tag.get_text(strip = True) for hr_tag in soup.find_all('ns3:hr')]

    # Readings are matched to points purely by position, so the counts must agree
    if len(hr_list) != len(route_df):
        raise ValueError(
            f'{filepath}: found {len(hr_list)} heart rate readings '
            f'for {len(route_df)} track points'
        )

    # add column to dataset from extraction
    route_df['heart_rate'] = hr_list

    #######################
    # BEARING
    # get_bearing function
    def get_bearing(lat1, lat2, long1, long2):
        brng = Geodesic.WGS84.Inverse(lat1, long1, lat2, long2)['azi1']
        return brng

    # (lat, lon) pairs, reused for both bearing and distance
    coords = list(zip(route_df['latitude'], route_df['longitude']))

    # first point has no predecessor -> bearing 0; built as a list and assigned
    # once instead of chained .iloc assignment, which pandas does not guarantee
    # to write back to the frame
    bearings = [0.0]
    for (lat1, long1), (lat2, long2) in zip(coords, coords[1:]):
        bearings.append(get_bearing(lat1, lat2, long1, long2))
    route_df['bearing'] = bearings

    ########################
    # ELAPSED TIME - SECONDS
    route_df['timestamp'] = pd.to_datetime(route_df['timestamp'])
    # diff() covers every row including the last one; total_seconds() keeps
    # gaps longer than a day intact (timedelta.seconds would drop the days)
    time_diff = route_df['timestamp'].diff().dt.total_seconds()
    route_df['time_diff_s'] = time_diff.fillna(0).astype('int64')
    route_df['total_time_s'] = route_df['time_diff_s'].cumsum(skipna = True)

    #######################
    # ELEVATION CHANGE - METERS
    ele_diff = route_df['elevation'].astype(float).diff()
    # only the first row is forced to 0; gaps from missing elevations stay NaN
    ele_diff.iloc[0] = 0.0
    route_df['ele_diff_m'] = ele_diff
    route_df['total_ele_change_m'] = route_df['ele_diff_m'].cumsum(skipna = True).round(4)

    #######################
    # DISTANCE - KILOMETERS
    route_df['lat_lon'] = coords
    route_df['dist_diff_km'] = [0.0] + [
        round(hs.haversine(current, previous), 4) # kilometers
        for previous, current in zip(coords, coords[1:])
    ]
    route_df['total_dist_km'] = route_df['dist_diff_km'].cumsum(skipna = True)

    #######################
    route_df.reset_index(drop = True, inplace = True)

    return route_df

# Average Cycling Dataset

## Get GPX files and create dataframe

In [9]:
# NOTE(review): this cell is disabled - the code sits inside a bare string
# literal, so nothing below executes. When enabled it runs gpx_to_df on
# a_1.gpx ... a_12.gpx, concatenates the results (pd.concat on a dict adds the
# dict key as an extra index level, hence reset_index on levels 0 and 1),
# drops rows with missing values and previews the frame. The printed output
# that follows is presumably from an earlier run.
'''
#https://stackoverflow.com/questions/13603215/using-a-loop-in-python-to-name-variables
average = dict()
 
for x in range(1, 13): # number of files
    average[x] = gpx_to_df(f'../data/average/a_{x}.gpx')

a_df = pd.concat(average)
a_df.dropna(inplace = True)
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html
a_df.reset_index(inplace = True, level = [0,1], drop = True)

print()
print(f'Size of a_df: {a_df.shape}')
print()

a_df.head()
'''

File: ../data/average/a_1.gpx
Tracks: 1
No. of Track Points: 400
File: ../data/average/a_2.gpx
Tracks: 1
No. of Track Points: 3015
File: ../data/average/a_3.gpx
Tracks: 1
No. of Track Points: 853
File: ../data/average/a_4.gpx
Tracks: 1
No. of Track Points: 1333
File: ../data/average/a_5.gpx
Tracks: 1
No. of Track Points: 157
File: ../data/average/a_6.gpx
Tracks: 1
No. of Track Points: 2224
File: ../data/average/a_7.gpx
Tracks: 1
No. of Track Points: 1364
File: ../data/average/a_8.gpx
Tracks: 1
No. of Track Points: 1070
File: ../data/average/a_9.gpx
Tracks: 1
No. of Track Points: 739
File: ../data/average/a_10.gpx
Tracks: 1
No. of Track Points: 181
File: ../data/average/a_11.gpx
Tracks: 1
No. of Track Points: 1453
File: ../data/average/a_12.gpx
Tracks: 1
No. of Track Points: 2304

Size of a_df: (15093, 14)



Unnamed: 0,timestamp,latitude,longitude,elevation,dt,heart_rate,bearing,time_diff_s,total_time_s,ele_diff_m,total_ele_change_m,lat_lon,dist_diff_km,total_dist_km
0,2022-07-20 16:07:45+00:00,38.773466,-121.363686,35.799999,1658333265,78,0.0,0,0,0.0,0.0,"(38.77346634864807, -121.36368582956493)",0.0,0.0
1,2022-07-20 16:07:46+00:00,38.773542,-121.363672,35.599998,1658333266,79,8.292053,1,1,-0.200001,-0.2,"(38.77354153431952, -121.36367183178663)",0.0084,0.0084
2,2022-07-20 16:07:49+00:00,38.77363,-121.363682,35.200001,1658333269,82,-5.32118,3,4,-0.399998,-0.6,"(38.77363029867411, -121.36368239298463)",0.0099,0.0183
3,2022-07-20 16:07:51+00:00,38.773789,-121.363733,35.0,1658333271,83,-13.956066,2,6,-0.200001,-0.8,"(38.77378871664405, -121.36373268440366)",0.0181,0.0364
4,2022-07-20 16:07:52+00:00,38.773786,-121.363766,35.0,1658333272,83,-96.936537,1,7,0.0,-0.8,"(38.77378553152084, -121.36376612819731)",0.0029,0.0393


## Get historical weather and join with average dataframe

In [10]:
# Single-call check of hist_weather: the dt / lat / lon passed here are the
# values shown for row 0 of the a_df preview above.
hist_weather(1658333265, 38.77346634864807, -121.36368582956493)

Unnamed: 0,dt,temp,feels_like,pressure,humidity,dew_point,clouds,wind_speed,wind_deg
0,1658333265,297.65,297.17,1019,39,282.8,1,0.45,177


In [11]:
# NOTE(review): this cell is disabled - the code sits inside a bare string
# literal, so nothing below executes. When enabled it makes one hist_weather
# API call per row of a_df, stacks the results and inner-joins them back onto
# a_df by the UNIX timestamp column 'dt'.
# NOTE(review): DataFrame.append was removed in pandas 2.0; before re-enabling,
# collect the per-row frames in a list and pd.concat them once - TODO confirm
# the pandas version in use.
'''
a_weather = pd.DataFrame()

for i in range(len(a_df)): # iterate through each trackpoint
    # 600 calls per minute
    unix = a_df['dt'][i]
    lat = a_df['latitude'][i]
    lon = a_df['longitude'][i]
    weather = hist_weather(unix, lat, lon) # get historical weather for trackpoint
    a_weather = a_weather.append(weather) # append to weather dataframe
        
a_df = a_df.merge(a_weather, how = 'inner', on = 'dt') # merge activity dataframe with weather dataframe on unix time stamp
'''

In [12]:
# a_df

Unnamed: 0,timestamp,latitude,longitude,elevation,dt,heart_rate,bearing,time_diff_s,total_time_s,ele_diff_m,total_ele_change_m,lat_lon,dist_diff_km,total_dist_km,temp,feels_like,pressure,humidity,dew_point,clouds,wind_speed,wind_deg
0,2022-07-20 16:07:45+00:00,38.773466,-121.363686,35.799999,1658333265,78,0.000000,0,0,0.000000,0.0,"(38.77346634864807, -121.36368582956493)",0.0000,0.0000,297.65,297.17,1019,39,282.80,1,0.45,177
1,2022-07-20 16:07:46+00:00,38.773542,-121.363672,35.599998,1658333266,79,8.292053,1,1,-0.200001,-0.2,"(38.77354153431952, -121.36367183178663)",0.0084,0.0084,297.65,297.17,1019,39,282.80,1,0.45,177
2,2022-07-20 16:07:49+00:00,38.773630,-121.363682,35.200001,1658333269,82,-5.321180,3,4,-0.399998,-0.6,"(38.77363029867411, -121.36368239298463)",0.0099,0.0183,297.65,297.17,1019,39,282.80,1,0.45,177
3,2022-07-20 16:07:51+00:00,38.773789,-121.363733,35.000000,1658333271,83,-13.956066,2,6,-0.200001,-0.8,"(38.77378871664405, -121.36373268440366)",0.0181,0.0364,297.67,297.17,1019,38,282.43,1,0.45,177
4,2022-07-20 16:07:52+00:00,38.773786,-121.363766,35.000000,1658333272,83,-96.936537,1,7,0.000000,-0.8,"(38.77378553152084, -121.36376612819731)",0.0029,0.0393,297.67,297.17,1019,38,282.43,1,0.45,177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15088,2022-07-05 01:59:12+00:00,37.792068,-122.391728,0.800000,1656986352,132,74.344141,2,17183,0.000000,-4.4,"(37.792068449780345, -122.3917275108397)",0.0095,31.5135,292.05,292.19,1016,84,289.29,67,3.09,256
15089,2022-07-05 01:59:15+00:00,37.792000,-122.391614,1.200000,1656986355,131,127.396091,3,17186,0.400000,-4.0,"(37.791999550536275, -122.39161393605173)",0.0126,31.5261,292.05,292.19,1016,84,289.29,67,3.09,256
15090,2022-07-05 01:59:19+00:00,37.791908,-122.391498,1.600000,1656986359,130,135.017529,4,17190,0.400000,-3.6,"(37.79190776869655, -122.39149834960699)",0.0144,31.5405,292.05,292.19,1016,84,289.29,67,3.09,256
15091,2022-07-05 01:59:21+00:00,37.791908,-122.391492,1.800000,1656986361,127,88.125336,2,17192,0.200000,-3.4,"(37.79190793633461, -122.39149189554155)",0.0006,31.5411,292.05,292.19,1016,84,289.29,67,3.09,256


# High Cycling Dataset

## Get GPX files and create dataframe

In [16]:
# NOTE(review): this cell is disabled - the code sits inside a bare string
# literal, so nothing below executes. When enabled it runs gpx_to_df on
# h_1.gpx ... h_16.gpx, concatenates the results (pd.concat on a dict adds the
# dict key as an extra index level, hence reset_index on levels 0 and 1),
# drops rows with missing values and previews the frame. The printed output
# that follows is presumably from an earlier run.
'''
#https://stackoverflow.com/questions/13603215/using-a-loop-in-python-to-name-variables
high = dict()

for x in range(1, 17): # number of files
    high[x] = gpx_to_df(f'../data/high/h_{x}.gpx')

h_df = pd.concat(high)
h_df.dropna(inplace = True)
h_df.reset_index(inplace = True, level = [0,1], drop = True)
print()
print(f'Size of h_df: {h_df.shape}')
print()
h_df.head()
'''

File: ../data/high/h_1.gpx
Tracks: 1
No. of Track Points: 1341
File: ../data/high/h_2.gpx
Tracks: 1
No. of Track Points: 818
File: ../data/high/h_3.gpx
Tracks: 1
No. of Track Points: 5334
File: ../data/high/h_4.gpx
Tracks: 1
No. of Track Points: 3541
File: ../data/high/h_5.gpx
Tracks: 1
No. of Track Points: 4800
File: ../data/high/h_6.gpx
Tracks: 1
No. of Track Points: 2054
File: ../data/high/h_7.gpx
Tracks: 1
No. of Track Points: 546
File: ../data/high/h_8.gpx
Tracks: 1
No. of Track Points: 6551
File: ../data/high/h_9.gpx
Tracks: 1
No. of Track Points: 314
File: ../data/high/h_10.gpx
Tracks: 1
No. of Track Points: 1185
File: ../data/high/h_11.gpx
Tracks: 1
No. of Track Points: 1139
File: ../data/high/h_12.gpx
Tracks: 1
No. of Track Points: 1890
File: ../data/high/h_13.gpx
Tracks: 1
No. of Track Points: 821
File: ../data/high/h_14.gpx
Tracks: 1
No. of Track Points: 5056
File: ../data/high/h_15.gpx
Tracks: 1
No. of Track Points: 1264
File: ../data/high/h_16.gpx
Tracks: 1
No. of Track Po

Unnamed: 0,timestamp,latitude,longitude,elevation,dt,heart_rate,bearing,time_diff_s,total_time_s,ele_diff_m,total_ele_change_m,lat_lon,dist_diff_km,total_dist_km
0,2022-04-02 01:03:00+00:00,38.773809,-121.36425,36.400002,1648861380,84,0.0,0,0,0.0,0.0,"(38.7738086655736, -121.36424959637225)",0.0,0.0
1,2022-04-02 01:03:02+00:00,38.773917,-121.364357,36.400002,1648861382,84,-37.730123,2,2,0.0,0.0,"(38.77391695976257, -121.36435663327575)",0.0152,0.0152
2,2022-04-02 01:03:03+00:00,38.773931,-121.364338,36.400002,1648861383,84,46.140647,1,3,0.0,0.0,"(38.77393095754087, -121.3643380254507)",0.0022,0.0174
3,2022-04-02 01:03:06+00:00,38.77383,-121.363695,36.200001,1648861386,87,101.328638,3,6,-0.200001,-0.2,"(38.773830039426684, -121.36369454674423)",0.0569,0.0743
4,2022-04-02 01:03:13+00:00,38.774046,-121.363505,35.799999,1648861393,89,34.489431,7,13,-0.400002,-0.6,"(38.77404646016657, -121.36350461281836)",0.0292,0.1035


## Get historical weather and join with high dataframe

In [17]:
# NOTE(review): this cell is disabled - the code sits inside a bare string
# literal, so nothing below executes. When enabled it makes one hist_weather
# API call per row of h_df, stacks the results and inner-joins them back onto
# h_df by the UNIX timestamp column 'dt'.
# NOTE(review): DataFrame.append was removed in pandas 2.0; before re-enabling,
# collect the per-row frames in a list and pd.concat them once - TODO confirm
# the pandas version in use.
'''
h_weather = pd.DataFrame()

for i in range(len(h_df)): # iterate through each trackpoint
    # 600 calls per minute
    unix = h_df['dt'][i]
    lat = h_df['latitude'][i]
    lon = h_df['longitude'][i]
    weather = hist_weather(unix, lat, lon) # get historical weather for trackpoint
    h_weather = h_weather.append(weather) # append to weather dataframe
        
h_df = h_df.merge(h_weather, how = 'inner', on = 'dt') # merge activity dataframe with weather dataframe on unix time stamp
'''

In [19]:
# h_df.head()

Unnamed: 0,timestamp,latitude,longitude,elevation,dt,heart_rate,bearing,time_diff_s,total_time_s,ele_diff_m,total_ele_change_m,lat_lon,dist_diff_km,total_dist_km,temp,feels_like,pressure,humidity,dew_point,clouds,wind_speed,wind_deg
0,2022-04-02 01:03:00+00:00,38.773809,-121.36425,36.400002,1648861380,84,0.0,0,0,0.0,0.0,"(38.7738086655736, -121.36424959637225)",0.0,0.0,297.94,297.1,1011,24,276.02,0,2.57,170
1,2022-04-02 01:03:02+00:00,38.773917,-121.364357,36.400002,1648861382,84,-37.730123,2,2,0.0,0.0,"(38.77391695976257, -121.36435663327575)",0.0152,0.0152,298.2,297.39,1010,24,276.24,0,2.57,170
2,2022-04-02 01:03:03+00:00,38.773931,-121.364338,36.400002,1648861383,84,46.140647,1,3,0.0,0.0,"(38.77393095754087, -121.3643380254507)",0.0022,0.0174,298.2,297.39,1010,24,276.24,0,2.57,170
3,2022-04-02 01:03:06+00:00,38.77383,-121.363695,36.200001,1648861386,87,101.328638,3,6,-0.200001,-0.2,"(38.773830039426684, -121.36369454674423)",0.0569,0.0743,297.95,297.11,1011,24,276.03,0,2.57,170
4,2022-04-02 01:03:13+00:00,38.774046,-121.363505,35.799999,1648861393,89,34.489431,7,13,-0.400002,-0.6,"(38.77404646016657, -121.36350461281836)",0.0292,0.1035,297.94,297.1,1011,24,276.02,0,2.57,170


# Save CSV

In [18]:
# a_df.to_csv('../data/average/a_df.csv', index = False)
# h_df.to_csv('../data/high/h_df.csv', index = False)