# Exploratory Data Analysis for Machine Learning

## About Dataset

The data contains running activity data from March 2022 to December 2023. The data has been downloaded using the Strava API.

The data consists of 105 rows and 19 variables: 4 float variables, 8 integer variables, and 7 categorical (object) variables.

In [41]:
from IPython.display import clear_output

In [55]:
%pip install kagglehub
%pip install pandas
%pip install openpyxl
%pip install seaborn
%pip install plotly
%pip install scikit-learn
%pip install geopandas
%pip install "nbformat>=4.2.0"
clear_output() 

In [56]:
import shutil
from pathlib import Path

import geopandas as gpd
import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from scipy.stats import skew, kurtosis

In [2]:
# Download the latest version of the dataset; kagglehub returns the folder it
# downloaded the files into.
path = kagglehub.dataset_download("ajitjadhav1/strava-running-activity-data")

# Build the destination with pathlib so the path separators are correct on
# every OS (a raw "data\..." string is only a nested path on Windows).
# The destination folder carries the same name as the workbook inside it.
dataset_dir = Path("data") / "Strava Running Data.xlsx"
dataset_file = dataset_dir / "Strava Running Data.xlsx"

if not dataset_dir.exists():
    # Create "data/" first: moving into a missing parent folder would fail.
    dataset_dir.parent.mkdir(parents=True, exist_ok=True)
    # shutil.move also works when the download folder and the project live on
    # different drives/filesystems, where Path.rename raises OSError.
    shutil.move(path, str(dataset_dir))

# Always report where the workbook lives, not only when it already existed.
print("Path to dataset files:", dataset_file)

Path to dataset files: data\Strava Running Data.xlsx\Strava Running Data.xlsx


In [60]:
# Build the workbook path with pathlib so the separators are correct on every
# OS; the raw "data\..." string only resolves to a nested path on Windows.
# The workbook sits inside a folder that carries the same name as the file.
excel_path = Path("data") / "Strava Running Data.xlsx" / "Strava Running Data.xlsx"
data = pd.read_excel(excel_path)
data

Unnamed: 0,Sr. no.,start_date_local,type,distance,moving_time,elapsed_time,total_elevation_gain,start_latlng,end_latlng,sport_type,start_date,timezone,achievement_count,kudos_count,comment_count,athlete_count,photo_count,average_speed,max_speed
0,1,2023-12-09T09:09:19Z,Run,10879.7,4023,4617,91.4,"[40.70327935740352, -73.99619171395898]","[40.69340907968581, -73.97922154515982]",Run,2023-12-09T14:09:19Z,(GMT-05:00) America/New_York,4,4,0,3,0,2.704,6.228
1,2,2023-12-07T17:31:50Z,Run,1304.4,722,62993,0.0,"[40.7220459356904, -74.03641730546951]","[40.719722136855125, -74.03257705271244]",Run,2023-12-07T22:31:50Z,(GMT-05:00) America/New_York,0,0,0,1,0,1.807,4.044
2,3,2023-12-03T09:18:13Z,Run,17503.0,7370,7462,68.4,"[40.74991073459387, -73.98751585744321]","[40.73538766242564, -73.9793517999351]",Run,2023-12-03T14:18:13Z,(GMT-05:00) America/New_York,2,5,0,4,0,2.375,6.778
3,4,2023-12-02T09:41:14Z,Run,3457.8,1791,2170,3.9,"[40.72203076444566, -74.03641068376601]","[40.71398631669581, -74.03902692720294]",Run,2023-12-02T14:41:14Z,(GMT-05:00) America/New_York,0,2,0,3,0,1.931,4.522
4,5,2023-12-01T17:06:05Z,Run,10108.2,4128,4221,6.3,"[40.72190687991679, -74.03624514117837]","[40.71669920347631, -74.03264125809073]",Run,2023-12-01T22:06:05Z,(GMT-05:00) America/New_York,4,5,0,1,0,2.449,5.462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,101,2022-06-12T09:20:44Z,Run,7857.2,3900,4113,116.6,"[39.27252494730055, -76.69631998054683]","[39.27245143800974, -76.69639013707638]",Run,2022-06-12T13:20:44Z,(GMT-05:00) America/New_York,6,0,0,1,0,2.015,3.326
101,102,2022-05-28T11:11:38Z,Run,6067.2,3334,3555,88.2,"[39.272494772449136, -76.69640866108239]","[39.27246367558837, -76.69636884704232]",Run,2022-05-28T15:11:38Z,(GMT-05:00) America/New_York,3,0,0,1,0,1.820,3.172
102,103,2022-05-22T07:13:30Z,Run,4587.6,2263,2478,56.6,"[39.272621758282185, -76.69633297249675]","[39.27250734530389, -76.69635468162596]",Run,2022-05-22T11:13:30Z,(GMT-05:00) America/New_York,5,0,0,1,0,2.027,3.557
103,104,2022-05-21T07:27:40Z,Run,4313.6,2182,2426,55.0,"[39.27257808856666, -76.69626507908106]","[39.27250818349421, -76.69635669328272]",Run,2022-05-21T11:27:40Z,(GMT-05:00) America/New_York,4,0,0,1,0,1.977,3.874


In [4]:
# Print the frame summary: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Sr. no.               105 non-null    int64  
 1   start_date_local      105 non-null    object 
 2   type                  105 non-null    object 
 3   distance              105 non-null    float64
 4   moving_time           105 non-null    int64  
 5   elapsed_time          105 non-null    int64  
 6   total_elevation_gain  105 non-null    float64
 7   start_latlng          105 non-null    object 
 8   end_latlng            105 non-null    object 
 9   sport_type            105 non-null    object 
 10  start_date            105 non-null    object 
 11  timezone              105 non-null    object 
 12  achievement_count     105 non-null    int64  
 13  kudos_count           105 non-null    int64  
 14  comment_count         105 non-null    int64  
 15  athlete_count         1

In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,Sr. no.,distance,moving_time,elapsed_time,total_elevation_gain,achievement_count,kudos_count,comment_count,athlete_count,photo_count,average_speed,max_speed
count,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0,105.0
mean,53.0,7067.378095,3554.019048,4753.238095,31.277143,3.07619,0.866667,0.028571,1.628571,0.0,2.197362,4.988343
std,30.454885,5785.399832,2627.200905,6956.141991,50.915702,3.657698,1.092515,0.167398,3.000824,0.0,0.797553,2.8577
min,1.0,0.0,85.0,217.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,27.0,3181.3,1880.0,2042.0,0.0,0.0,0.0,0.0,1.0,0.0,2.015,3.8
50%,53.0,5266.2,2564.0,3058.0,5.0,2.0,0.0,0.0,1.0,0.0,2.375,4.586
75%,79.0,10137.9,4768.0,5181.0,55.0,4.0,2.0,0.0,1.0,0.0,2.566,5.714
max,105.0,32192.3,14536.0,62993.0,290.4,17.0,5.0,1.0,27.0,0.0,6.085,18.281


In [6]:
# Parse the start_date timestamps once, then copy each calendar component
# into its own column.
start_index = pd.DatetimeIndex(data["start_date"])
for part in ("year", "month", "day"):
    data[part] = getattr(start_index, part)

## Initial Plan for the data

Each continuous variable will be plotted over time and aggregated per activity type.

In [8]:
# Numeric columns analysed as continuous variables throughout the notebook.
cont_cols = (
    "distance",
    "moving_time",
    "elapsed_time",
    "total_elevation_gain",
    "kudos_count",
    "average_speed",
    "max_speed",
)

# Two figures per variable. Each one is titled so the 14 charts can be told
# apart when the notebook is skimmed.
for col in cont_cols:
    # Time series of the variable, one line per activity type.
    fig1 = px.line(
        data,
        "start_date",
        col,
        color="type",
        title=f"{col} over time by activity type",
    )
    fig1.show()
    # With both x and y given, px.histogram sums y within each x category.
    fig2 = px.histogram(
        data,
        "type",
        col,
        title=f"Sum of {col} per activity type",
    )
    fig2.show()

Check for skewness and kurtosis. Skewness is the measure of asymmetry of the probability distribution of a continuous variable. A skewness of -1 to 1 is considered acceptable. Kurtosis is the measure of tailedness of the probability distribution of a continuous variable. A kurtosis value of -2 to 2 is considered acceptable.

In [10]:
# Report skewness and kurtosis of every continuous column on the raw scale.
for col in cont_cols:
    values = data[col]
    print(f"Skewness of {col}:", skew(values))
    print(f"Kurtosis of {col}:", kurtosis(values))

Skewness of distance: 1.5934895889435066
Kurtosis of distance: 3.252392540915764
Skewness of moving_time: 1.404366354542567
Kurtosis of moving_time: 2.260747478757996
Skewness of elapsed_time: 6.360598252087873
Kurtosis of elapsed_time: 47.5401833619346
Skewness of total_elevation_gain: 2.1649192823144796
Kurtosis of total_elevation_gain: 5.7073612532609665
Skewness of kudos_count: 1.4218115145156738
Kurtosis of kudos_count: 2.390504995033231
Skewness of average_speed: -0.3281567753891057
Kurtosis of average_speed: 6.647197383202823
Skewness of max_speed: 1.5477080073010587
Kurtosis of max_speed: 5.548894957555968


Apply Log Transformation to evaluate impact on skewness and kurtosis.

In [21]:
# Same statistics after a natural-log transform.
for col in cont_cols:
    # The log is undefined for values <= 0, so only strictly positive rows are kept.
    positive_values = data.loc[data[col] > 0, col]
    log_values = np.log(positive_values)
    print(f"Skewness of {col}:", skew(log_values))
    print(f"Kurtosis of {col}:", kurtosis(log_values))

Skewness of distance: -0.732391837424972
Kurtosis of distance: 1.3243355403890913
Skewness of moving_time: -0.9826159268824273
Kurtosis of moving_time: 2.326464261467911
Skewness of elapsed_time: -0.09438798360714662
Kurtosis of elapsed_time: 2.0047473049324616
Skewness of total_elevation_gain: 0.14670236014215177
Kurtosis of total_elevation_gain: -1.5431234170410872
Skewness of kudos_count: 0.5757576562408753
Kurtosis of kudos_count: -0.2764166416483054
Skewness of average_speed: -1.0039616666331685
Kurtosis of average_speed: 7.357739216433007
Skewness of max_speed: 1.1536706878469605
Kurtosis of max_speed: 1.6751444422215576


Apply Square Root Transformation to evaluate impact on skewness and kurtosis.

In [22]:
# Same statistics after a square-root transform.
for col in cont_cols:
    # NOTE(review): zeros are excluded here although sqrt(0) is defined, so the
    # sample differs from the untransformed one — confirm this is intended.
    positive_values = data.loc[data[col] > 0, col]
    sqrt_values = np.sqrt(positive_values)
    print(f"Skewness of {col}:", skew(sqrt_values))
    print(f"Kurtosis of {col}:", kurtosis(sqrt_values))

Skewness of distance: 0.6475068973500417
Kurtosis of distance: 0.5824194631416924
Skewness of moving_time: 0.48035441982506044
Kurtosis of moving_time: 0.043862820834660976
Skewness of elapsed_time: 2.962151072940719
Kurtosis of elapsed_time: 14.806947773639155
Skewness of total_elevation_gain: 0.7202721241797174
Kurtosis of total_elevation_gain: -0.5952934016173193
Skewness of kudos_count: 1.1438995920738122
Kurtosis of kudos_count: 1.446491249020875
Skewness of average_speed: 0.6081591254042092
Kurtosis of average_speed: 9.612991003243007
Skewness of max_speed: 1.8381537009549378
Kurtosis of max_speed: 4.3718317187733575


Categorical variables will be analyzed by checking their unique values. Plots of the categorical variables will include histograms and boxplots.

In [23]:
# List the distinct values of each categorical column of interest.
for col in ("type", "sport_type", "timezone"):
    # Only object-dtype (string) columns are reported.
    if data[col].dtype != "object":
        continue
    print(col, data[col].unique())

type ['Run' 'Hike' 'Workout' 'Walk']
sport_type ['Run' 'Hike' 'Tennis' 'Walk' 'Workout']
timezone ['(GMT-05:00) America/New_York' '(GMT-04:00) America/Anguilla'
 '(GMT+00:00) GMT']


Histogram plots of sport_type show that running is by far the most popular activity. The majority of activities occur in the GMT-5:00 timezone.

In [29]:
# Number of activities per sport type.
px.histogram(data_frame=data, x="sport_type")

In [30]:
# Number of activities per timezone.
px.histogram(data_frame=data, x="timezone")

Boxplots of sport_type against elapsed_time and distance are shown. Running has the most varied workouts in terms of time and distance. There are a couple of extreme data points. Running is also the activity most likely to receive kudos.

In [34]:
# Spread of elapsed_time within each sport type.
px.box(data_frame=data, x="sport_type", y="elapsed_time")

In [35]:
# Spread of distance within each sport type.
px.box(data_frame=data, x="sport_type", y="distance")

In [36]:
# Spread of kudos_count within each sport type.
px.box(data_frame=data, x="sport_type", y="kudos_count")

## Data Cleaning and Feature Engineering

The start_date is used to extract the Day, Month, and Year. Each column is confirmed to have no null values.

The distance variables are converted to miles, the time variables are converted to minutes, the speed variables are converted to miles per hour.

A pace variable is added by dividing moving_time_minutes by distance_miles.

In [13]:
# Per-column flag: True if the column holds at least one missing value.
data.isna().any()

Sr. no.                 False
start_date_local        False
type                    False
distance                False
moving_time             False
elapsed_time            False
total_elevation_gain    False
start_latlng            False
end_latlng              False
sport_type              False
start_date              False
timezone                False
achievement_count       False
kudos_count             False
comment_count           False
athlete_count           False
photo_count             False
average_speed           False
max_speed               False
year                    False
month                   False
day                     False
dtype: bool

In [31]:
# Conversion factors. Assumes the raw columns use metres, seconds and
# metres/second — TODO confirm against the data source.
METERS_TO_MILES = 0.000621371
MPS_TO_MPH = 2.23694
SECONDS_PER_MINUTE = 60

data["distance_miles"] = data["distance"] * METERS_TO_MILES

# Derive the minute columns straight from the raw second counts. The raw
# moving_time / elapsed_time columns are deliberately left as integers:
# overwriting them with timedeltas makes later integer comparisons on
# elapsed_time match nothing and makes this cell non-idempotent.
data["moving_time_minutes"] = data["moving_time"] / SECONDS_PER_MINUTE
data["elapsed_time_minutes"] = data["elapsed_time"] / SECONDS_PER_MINUTE

data["average_speed_mph"] = data["average_speed"] * MPS_TO_MPH
data["max_speed_mph"] = data["max_speed"] * MPS_TO_MPH

# Pace in minutes per mile. Activities with zero distance get NaN rather than
# an infinite pace.
data["pace"] = data["moving_time_minutes"] / data["distance_miles"].replace(0, np.nan)

In [47]:
# Distance against elapsed time, coloured by activity type.
px.scatter(data_frame=data, x="elapsed_time", y="distance", color="type")

### Outliers

There are 2 data points with extreme elapsed_time values (33,260 and 62,993 seconds). These data points will be removed.

In [44]:
# Elapsed times (in seconds) of the two activities identified as outliers.
OUTLIER_ELAPSED_SECONDS = [33260, 62993]

# elapsed_time may hold plain second counts or, if an earlier cell converted
# it, timedeltas. Comparing a timedelta column with an integer matches nothing,
# so normalise to seconds before looking for the outliers.
elapsed_seconds = data["elapsed_time"]
if pd.api.types.is_timedelta64_dtype(elapsed_seconds):
    elapsed_seconds = elapsed_seconds.dt.total_seconds()

is_outlier = elapsed_seconds.isin(OUTLIER_ELAPSED_SECONDS)
data = data.drop(index=data.index[is_outlier])
data

Unnamed: 0,Sr. no.,start_date_local,type,distance,moving_time,elapsed_time,total_elevation_gain,start_latlng,end_latlng,sport_type,start_date,timezone,achievement_count,kudos_count,comment_count,athlete_count,photo_count,average_speed,max_speed
0,1,2023-12-09T09:09:19Z,Run,10879.7,4023,4617,91.4,"[40.70327935740352, -73.99619171395898]","[40.69340907968581, -73.97922154515982]",Run,2023-12-09T14:09:19Z,(GMT-05:00) America/New_York,4,4,0,3,0,2.704,6.228
2,3,2023-12-03T09:18:13Z,Run,17503.0,7370,7462,68.4,"[40.74991073459387, -73.98751585744321]","[40.73538766242564, -73.9793517999351]",Run,2023-12-03T14:18:13Z,(GMT-05:00) America/New_York,2,5,0,4,0,2.375,6.778
3,4,2023-12-02T09:41:14Z,Run,3457.8,1791,2170,3.9,"[40.72203076444566, -74.03641068376601]","[40.71398631669581, -74.03902692720294]",Run,2023-12-02T14:41:14Z,(GMT-05:00) America/New_York,0,2,0,3,0,1.931,4.522
4,5,2023-12-01T17:06:05Z,Run,10108.2,4128,4221,6.3,"[40.72190687991679, -74.03624514117837]","[40.71669920347631, -74.03264125809073]",Run,2023-12-01T22:06:05Z,(GMT-05:00) America/New_York,4,5,0,1,0,2.449,5.462
5,6,2023-11-29T17:22:51Z,Run,6694.7,2913,2983,9.8,"[40.72185751050711, -74.03604313731194]","[40.719508565962315, -74.03692323714495]",Run,2023-11-29T22:22:51Z,(GMT-05:00) America/New_York,1,2,1,1,0,2.298,4.972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,101,2022-06-12T09:20:44Z,Run,7857.2,3900,4113,116.6,"[39.27252494730055, -76.69631998054683]","[39.27245143800974, -76.69639013707638]",Run,2022-06-12T13:20:44Z,(GMT-05:00) America/New_York,6,0,0,1,0,2.015,3.326
101,102,2022-05-28T11:11:38Z,Run,6067.2,3334,3555,88.2,"[39.272494772449136, -76.69640866108239]","[39.27246367558837, -76.69636884704232]",Run,2022-05-28T15:11:38Z,(GMT-05:00) America/New_York,3,0,0,1,0,1.820,3.172
102,103,2022-05-22T07:13:30Z,Run,4587.6,2263,2478,56.6,"[39.272621758282185, -76.69633297249675]","[39.27250734530389, -76.69635468162596]",Run,2022-05-22T11:13:30Z,(GMT-05:00) America/New_York,5,0,0,1,0,2.027,3.557
103,104,2022-05-21T07:27:40Z,Run,4313.6,2182,2426,55.0,"[39.27257808856666, -76.69626507908106]","[39.27250818349421, -76.69635669328272]",Run,2022-05-21T11:27:40Z,(GMT-05:00) America/New_York,4,0,0,1,0,1.977,3.874


In [46]:
# Same scatter as before, redrawn after the outlier removal step.
px.scatter(data_frame=data, x="elapsed_time", y="distance", color="type")

### Graph location of activities.

In [127]:
# Columns that hold the parsed coordinates.
COORD_COLS = ["start_lat", "start_long", "end_lat", "end_long"]

# Turn each "[lat, lon]" string into separate latitude / longitude columns.
for source_col, lat_col, lon_col in (
    ("start_latlng", "start_lat", "start_long"),
    ("end_latlng", "end_lat", "end_long"),
):
    # regex=False treats the brackets as literal characters on every pandas
    # version ("[" on its own is not a valid regular expression).
    data[source_col] = (
        data[source_col]
        .str.replace("[", "", regex=False)
        .str.replace("]", "", regex=False)
    )
    data[[lat_col, lon_col]] = data[source_col].str.split(",", expand=True)

# Keep only activities with a start position. An empty coordinate list leaves
# an empty string in start_lat.
# .copy() makes geo_df an independent frame, so the conversion below neither
# writes into a slice of `data` nor triggers SettingWithCopyWarning.
geo_df = data[data["start_lat"] != ""].copy()

# Replacing the columns as a whole (instead of .loc[:, col] = ...) actually
# changes their dtype to float; in-place .loc assignment keeps them as object.
geo_df[COORD_COLS] = geo_df[COORD_COLS].astype(float)

In [128]:
# Two-stop colour scale handed to color_continuous_scale below.
color_scale = [(0, "orange"), (1, "red")]

# One marker per activity start, coloured by activity type and sized by distance.
fig = px.scatter_mapbox(
    geo_df,
    lat="start_lat",
    lon="start_long",
    color="type",
    color_continuous_scale=color_scale,
    size="distance",
    zoom=8,
    height=800,
    width=800,
)

# Overlay a red straight segment from each activity's start to its end point.
for _, activity in geo_df.iterrows():
    segment = px.line_mapbox(
        lat=[activity["start_lat"], activity["end_lat"]],
        lon=[activity["start_long"], activity["end_long"]],
        color_discrete_sequence=["red"],
    )
    fig.add_trace(segment.data[0])

# OpenStreetMap tiles, no outer margin.
fig.update_layout(
    mapbox_style="open-street-map",
    margin=dict(r=0, t=0, l=0, b=0),
)
fig.show()