# In this notebook, I will merge all Strava and NRC activities in one single DataFrame

In [1]:
import pandas as pd
import numpy as np
import numpy_ext
import gpxpy
import geopy.distance
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [18]:
directory = 'all_gpx_files'
 
# iterate over files in
# that directory
def gpx_iterate(directory):
    gpx_file_list=[]
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        # checking if it is a file
        if os.path.isfile(f):
            gpx_file_list.append(f)
    return gpx_file_list

## Great, that's how I'll iterate over every gpx file : )

In [21]:
file_extensions = gpx_iterate(directory)

In [23]:
file_extensions[0:10]

['all_gpx_files/6239779520.gpx',
 'all_gpx_files/1056004315.gpx',
 'all_gpx_files/0b0a39b7-9362-4b1d-9895-6e2c38046bc3.gpx',
 'all_gpx_files/1511363289.gpx',
 'all_gpx_files/1460279536.gpx',
 'all_gpx_files/3468966238.gpx',
 'all_gpx_files/575a84eb-24bc-4241-9651-224c31be8582.gpx',
 'all_gpx_files/6133203641.gpx',
 'all_gpx_files/2950351388.gpx',
 'all_gpx_files/70a4ee20-02f7-4ab7-a400-b425cd759a92.gpx']

In [36]:
df_list=['time','latitude','longitude','elevation','distance','cumulative_distance',\
                           'duration','cumulative_duration','pace_metric']

In [37]:
df_list

['time',
 'latitude',
 'longitude',
 'elevation',
 'distance',
 'cumulative_distance',
 'duration',
 'cumulative_duration',
 'pace_metric']

In [73]:
all_gpx_df = pd.DataFrame(columns={'time','latitude','longitude','elevation','distance','cumulative_distance',\
                           'duration','cumulative_duration','pace_metric'})
all_gpx_df = all_gpx_df[['time','latitude','longitude','elevation','distance','cumulative_distance',\
                           'duration','cumulative_duration','pace_metric']]
all_gpx_df

Unnamed: 0,time,latitude,longitude,elevation,distance,cumulative_distance,duration,cumulative_duration,pace_metric


In [74]:
test=file_extensions[0:10]

In [77]:
# all_gpx_df=pd.DataFrame()
for file in file_extensions:
    gpx_path = file
    with open(gpx_path) as f:
        gpx = gpxpy.parse(f)
        points = []
        for segment in gpx.tracks[0].segments:
            for p in segment.points:
                points.append({
                    'time': p.time,
                    'latitude': p.latitude,
                    'longitude': p.longitude,
                    'elevation': p.elevation,
                })
        gpx_df = pd.DataFrame.from_records(points)
        # Cumulative distance.
        coords = [(p.latitude, p.longitude) for p in gpx_df.itertuples()]
        gpx_df['distance'] = [0] + [geopy.distance.distance(from_, to).m for from_, to in zip(coords[:-1], coords[1:])]
        gpx_df['cumulative_distance'] = gpx_df.distance.cumsum()
        # Timing.
        gpx_df['duration'] = gpx_df.time.diff().dt.total_seconds().fillna(0)
        gpx_df['cumulative_duration'] = gpx_df.duration.cumsum()
        gpx_df['pace_metric'] = pd.Series((gpx_df.duration / 60) / (gpx_df.distance / 1000)).bfill()
    all_gpx_df = pd.concat([all_gpx_df,gpx_df])
    

In [78]:
all_gpx_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1577238 entries, 0 to 993
Data columns (total 9 columns):
 #   Column               Non-Null Count    Dtype                        
---  ------               --------------    -----                        
 0   time                 1577238 non-null  datetime64[ns, SimpleTZ("Z")]
 1   latitude             1577238 non-null  float64                      
 2   longitude            1577238 non-null  float64                      
 3   elevation            1575282 non-null  float64                      
 4   distance             1577238 non-null  float64                      
 5   cumulative_distance  1577238 non-null  float64                      
 6   duration             1577238 non-null  float64                      
 7   cumulative_duration  1577238 non-null  float64                      
 8   pace_metric          1577237 non-null  float64                      
dtypes: datetime64[ns, SimpleTZ("Z")](1), float64(8)
memory usage: 120.3 MB


## Cracked it : )

In [79]:
all_gpx_df.to_csv('GPX_DATA.csv')

# The above file is 1.5 million rows, and is lagging Tableau. Considering reducing my gpx data points by a factor of possibly 10

In [None]:
all_gpx_df = pd.read_csv('GPX_DATA.csv')

In [3]:
all_gpx_df.head()

Unnamed: 0.1,Unnamed: 0,time,latitude,longitude,elevation,distance,cumulative_distance,duration,cumulative_duration,pace_metric
0,0,2021-11-10 18:45:08+00:00,29.426778,-98.489987,197.1,0.0,0.0,0.0,0.0,2.629462
1,1,2021-11-10 18:45:09+00:00,29.426796,-98.489925,197.1,6.338432,6.338432,1.0,1.0,2.629462
2,2,2021-11-10 18:45:10+00:00,29.426848,-98.48989,197.1,6.690013,13.028445,1.0,2.0,2.491276
3,3,2021-11-10 18:45:11+00:00,29.426876,-98.489894,197.1,3.127776,16.156221,1.0,3.0,5.328599
4,4,2021-11-10 18:45:12+00:00,29.426863,-98.4899,197.1,1.554134,17.710355,1.0,4.0,10.724088


## If you use the time as the index, resampling could work.
- or something like 
    - df1 = df[df.index % 3 != 0]  # Excludes every 3rd row starting from 0
    - df2 = df[df.index % 3 == 0]  # Selects every 3rd raw starting from 0

In [50]:
# just testing here

# Load gpx.
gpx_path = 'all_gpx_files/70a4ee20-02f7-4ab7-a400-b425cd759a92.gpx'
with open(gpx_path) as f:
    gpx = gpxpy.parse(f)

In [51]:
gpx

GPX(tracks=[GPXTrack(name='RUN 2020-05-08T23:27:41.313Z', segments=[GPXTrackSegment(points=[...])])])