In [None]:
# Import Packages
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

# Show All Columns and Rows for Dataframes
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Load in Data

## Load in Acute Training Load Data

In [None]:
# Load in Acute Training Load Data
atl_1_main = pd.read_json('Raw Data/Acute_Training_Load/MetricsAcuteTrainingLoad_20231103_20240211_117832404.json')
atl_2_main = pd.read_json('Raw Data/Acute_Training_Load/MetricsAcuteTrainingLoad_20240211_20240521_117832404.json')
atl_3_main = pd.read_json('Raw Data/Acute_Training_Load/MetricsAcuteTrainingLoad_20240521_20240829_117832404.json')

# Make Copies of Dataframes
atl_1 = atl_1_main.copy()
atl_2 = atl_2_main.copy()
atl_3 = atl_3_main.copy()

# View Imported File
atl_1
#atl_2
#atl_3

In [None]:
# Assess atl_1 characteristcs
print(atl_1.shape)
print(atl_1.dtypes)

In [None]:
# Clean atl_1: keep only the rows that carry a real ACWR status, i.e. drop every
# row whose 'acwrStatus' is the literal string "NONE".
has_acwr_status = atl_1['acwrStatus'].ne("NONE")
atl_1_cleaned = atl_1.loc[has_acwr_status]

# Preview the filtered frame
atl_1_cleaned.head()

# Notes recorded from the original run:
#   - atl_1_cleaned.shape was (170, 10): 34 of 204 records were dropped.
#   - The "NONE" records appear to date from when the watch was first acquired.
#   - atl_2 and atl_3 had no "NONE" values for 'acwrStatus'; this was checked with
#       print(atl_2[atl_2['acwrStatus'] == "NONE"].shape)
#       print(atl_3[atl_3['acwrStatus'] == "NONE"].shape)

#### Assess Null Values in Acute Training Load Data

In [None]:
# Count null values in each column
null_counts_1 = atl_1.isna().sum()
null_counts_2 = atl_2.isna().sum()
null_counts_3 = atl_3.isna().sum()

# Display the null counts
print(null_counts_1)
print(null_counts_2)
print(null_counts_3)

In [None]:
# Assess Datatypes
atl_1_cleaned.dtypes

### Combine 3 Acute Training Load Dataframes into 1 Dataframe

In [None]:
# Stack the three export windows into one frame with a fresh 0..n-1 index.
# Only the first window uses its filtered copy (atl_1_cleaned); atl_2 and atl_3
# are concatenated as loaded.
atl_frames = [atl_1_cleaned, atl_2, atl_3]
combined_atl = pd.concat(atl_frames, ignore_index=True)

# Overwrite 'calendarDate' with the calendar date taken from 'timestamp'.
# Parsing the raw column directly, i.e.
#     pd.to_datetime(combined_atl['calendarDate'])
# was tried and did not work as intended because the raw values are in an unusual format.
# NOTE(review): .dt.date stores plain Python date objects (object dtype), while the
# other datasets in this notebook keep 'calendarDate' as datetime64 -- confirm the
# key types line up before joining on this column.
# NOTE(review): if 'timestamp' is recorded in GMT its date may differ from the
# device-local calendar date -- TODO confirm.
parsed_timestamps = pd.to_datetime(combined_atl['timestamp'])
combined_atl['calendarDate'] = parsed_timestamps.dt.date

combined_atl.head()

In [None]:
# Drop 'deviceId'.
# errors='ignore' makes this cell safe to re-run: once the column is gone, a plain
# drop() raises KeyError, which is why the cell previously had to be commented out
# after its first execution.
combined_atl = combined_atl.drop(columns='deviceId', errors='ignore')

# View Results
combined_atl.head()

# Check for null values --> None
## combined_atl.isna().sum()

### Assess Null Values in Combined Dataframe

In [None]:
# Count null values in each column
null_counts = combined_atl.isna().sum()

# Display the null counts
print(null_counts)

In [None]:
# Keep only the rows that actually have a 'dailyAcuteChronicWorkloadRatio' value
combined_atl_cleaned = combined_atl.dropna(subset=['dailyAcuteChronicWorkloadRatio'])
combined_atl_cleaned

### PROBLEM: There are multiple records for each 'calendarDate'. I only want 1 record for the MAX 'timestamp'

### Filter to 1 record for each 'calendarDate'.
#### If there are multiple records for 1 Date, then keep the record with the maximum timestamp for that day

In [None]:
# Group by 'calendarDate' and get the index of the row with the greatest 'timestamp' for each day
max_timestamp_idx = combined_atl_cleaned.groupby('calendarDate')['timestamp'].idxmax()

# Select the rows with the maximum 'timestamp' for each day
combined_atl_cleaned = combined_atl_cleaned.loc[max_timestamp_idx]

# Verify the result
combined_atl_cleaned

In [None]:
# Drop 'userProfilePK', 'acwrStatusFeedback' and 'timestamp': these columns do not
# provide any value for the ML model.
# errors='ignore' keeps the cell re-runnable; without it a second execution raises
# KeyError because the columns have already been removed.
combined_atl_cleaned = combined_atl_cleaned.drop(
    columns=['userProfilePK', 'acwrStatusFeedback', 'timestamp'],
    errors='ignore',
)
combined_atl_cleaned.head()

### Save Pre-Processed Acute Training Load Data into a csv

In [None]:
# Save the DataFrame to a CSV file
combined_atl_cleaned.to_csv('Processed_Data/Acute_Training_Load_Cleaned.csv', index=False)

## Load in Bio Metrics Data

In [None]:
## Load in Bio Metric Data -> Not going to use for initial model.

# bio_profile_main = pd.read_json('Raw Data/Bio_Metrics/117832404_userBioMetricProfileData.json')  ### Not Useful
# bio_metrics_main = pd.read_json('Raw Data/Bio_Metrics/117832404_userBioMetrics.json')  ### Not Useful

## Make Copies of Dataframes
# bio_profile = bio_profile_main.copy()
# bio_metrics = bio_metrics_main.copy()

# # View Imported File
# bio_profile.head() ### 1 Row of Useless Data for ML Model
# bio_metrics.head() ### Useless Data for ML Model

# # Convert the dictionary-like values in 'allDayStress' and specify suffixes to avoid column overlap
# bio_metrics = bio_metrics.join(bio_metrics['metaData'].apply(pd.Series), rsuffix='_meta')

# combined_uds.head()

# # Drop Columns
# bio_metrics = bio_metrics.drop(['allDayStress','calendarDate_stress','userProfilePK_stress'], axis=1)

# # View Results
# bio_metrics.head()

In [None]:
# Assess Bio Metrics Data characteristcs

## print(bio_metrics.shape)
## print(bio_metrics.dtypes)

## Load in Fitness Age Data

In [None]:
## Load in Fitness Age Data -> Not going to use for initial model. Maybe be valuable to bring in current age vs fitness age
# fit_age_main = pd.read_json('Raw Data/Fitness Age and Heart Rate Zones/117832404_fitnessAgeData.json')

## Make Copies of Dataframes
# fit_age = fit_age_main.copy()

## View Imported File
# fit_age

In [None]:
# Assess fit_age characteristcs
## print(fit_age.shape)
## print(fit_age.dtypes)

## Load in Heart Rate Zone Data

In [None]:
# Load in HR Zone Data
hr_zones_main = pd.read_json('Raw Data/Fitness Age and Heart Rate Zones/117832404_heartRateZones.json')

# Make Copy of Dataframe
hr_zones = hr_zones_main.copy()

# View Imported File
hr_zones

# Preliminary Analysis

### Do I need to build an archive for this?
### I can create a HR_Zone variable in MASTER table that takes the average HR and places the run in one of the zones. 
### It would be nice to see the % I speed in each zone in each run.
### I'm not sure if that is available.

In [None]:
# Assess hr_zones characteristics
print(hr_zones.shape)
print(hr_zones.dtypes)

## Load in Hydration Data

In [None]:
# Load in Hydration Data -> Not going to use for initial model.
## hydro_1 = pd.read_json('Raw Data/Hydration_Data/HydrationLogFile_2023-10-14_2024-01-22.json')
## hydro_2 = pd.read_json('Raw Data/Hydration_Data/HydrationLogFile_2024-01-22_2024-05-01.json')
## hydro_3 = pd.read_json('Raw Data/Hydration_Data/HydrationLogFile_2024-05-01_2024-08-09.json')

# View Imported File
## hydro_1

In [None]:
# Assess hydro_1 characteristcs
## print(hydro_1.shape)
## print(hydro_1.dtypes)

## Load in Max Met Data

In [None]:
# Load in Max Met Data
maxmet_1_main = pd.read_json('Raw Data/Max_Met_Data/MetricsMaxMetData_20231103_20240211_117832404.json')
maxmet_2_main = pd.read_json('Raw Data/Max_Met_Data/MetricsMaxMetData_20240211_20240521_117832404.json')
maxmet_3_main = pd.read_json('Raw Data/Max_Met_Data/MetricsMaxMetData_20240521_20240829_117832404.json')

# Make Copies of Dataframes
maxmet_1 = maxmet_1_main.copy()
maxmet_2 = maxmet_2_main.copy()
maxmet_3 = maxmet_3_main.copy()

# View Imported File
maxmet_1
## maxmet_2
## maxmet_3

In [None]:
# Assess maxmet_1 characteristcs
print(maxmet_1.shape)
print(maxmet_1.dtypes)

### Combine 3 Maxmet Dataframes into 1 Dataframe

In [None]:
# Combine the three DataFrames into one
combined_maxmet = pd.concat([maxmet_1, maxmet_2, maxmet_3], ignore_index=True)

# Convert 'calendarDate' to Datetime
combined_maxmet['calendarDate'] = pd.to_datetime(combined_maxmet['calendarDate'])

combined_maxmet
## combined_maxmet

In [None]:
print(combined_maxmet.shape)
print(combined_maxmet.dtypes)

### Assess Null Values for Maxmet Data

In [None]:
# Count null values in each column
null_counts = combined_maxmet.isna().sum()

# Display the null counts
print(null_counts)

### Drop Low-Value Columns from Maxmet Data

In [None]:
# View Dataframe
## combined_maxmet

# Specify Columns to drop
columns_to_drop = ['deviceId', 'subSport', 'maxMetCategory', 'calibratedData']

maxmet_cleaned = combined_maxmet.drop(columns_to_drop, axis=1)
maxmet_cleaned

### Filter to 1 record for each 'calendarDate'.
#### If there are multiple records for 1 Date, then keep the record with the maximum updateTimestamp for that day

In [None]:
# Group by 'calendarDate' and get the index of the row with the greatest 'updateTimestamp' for each day
max_timestamp_idx = maxmet_cleaned.groupby('calendarDate')['updateTimestamp'].idxmax()

# Select the rows with the maximum 'timestamp' for each day
maxmet_cleaned = maxmet_cleaned.loc[max_timestamp_idx]

# Verify the result
maxmet_cleaned

### Add Records to Maxmet
#### If there is a record dated 08-01 and the next record is dated 08-07, I want to duplicate the 08-01 record as 08-02, 08-03, etc. to fill in the gap until 08-07
#### This will allow me to have a row for every date when I join on 'calendarDate' later on to create my ML MASTER TBL

In [None]:
# Build one entry per calendar day between the first and last recorded dates
first_day = maxmet_cleaned['calendarDate'].min()
last_day = maxmet_cleaned['calendarDate'].max()
date_range = pd.date_range(start=first_day, end=last_day)

# Re-index on the full daily range (days without a record become empty rows),
# forward-fill so each gap day copies the previous day's record, then move the
# dates back out of the index into a 'calendarDate' column.
maxmet_cleaned_2 = (
    maxmet_cleaned
    .set_index('calendarDate')
    .reindex(date_range)
    .ffill()
    .reset_index()
    .rename(columns={'index': 'calendarDate'})
)

# Verify the result
maxmet_cleaned_2

In [None]:
# Drop 'updateTimestamp' and 'sport' as these columns do not provide any value for ML model
maxmet_cleaned_2 = maxmet_cleaned_2.drop(['userProfilePK','updateTimestamp','sport'],axis=1)
maxmet_cleaned_2.head()

In [None]:
# Fill missing 'vo2MaxValue' and 'maxMet' values with the value from the preceding row
for metric in ('vo2MaxValue', 'maxMet'):
    maxmet_cleaned_2.loc[:, metric] = maxmet_cleaned_2[metric].ffill()

### Save Pre-Processed Max Met Data into a csv

In [None]:
# Save the DataFrame to a CSV file
maxmet_cleaned_2.to_csv('Processed_Data/MaxMet_Cleaned.csv', index=False)

## Load in Race Prediction Data

In [None]:
# Load in Race Prediction Data
racepred_1_main = pd.read_json('Raw Data/Race_Predictions/RunRacePredictions_20231103_20240211_117832404.json')
racepred_2_main = pd.read_json('Raw Data/Race_Predictions/RunRacePredictions_20240211_20240521_117832404.json')
racepred_3_main = pd.read_json('Raw Data/Race_Predictions/RunRacePredictions_20240521_20240829_117832404.json')

# Make Copies of Dataframes
racepred_1 = racepred_1_main.copy()
racepred_2 = racepred_2_main.copy()
racepred_3 = racepred_3_main.copy()

# View Imported File
racepred_1.head()

# Preliminary Analysis

### There are multiple rows per Day. It may be best to take the average for the day. 
### Well maybe not because if the garmin algorithm is causing it to change intra-day, 
### then our algorithm should do the same thing. 
### Maybe start with a daily average and then progress from there

In [None]:
# Assess racepred_1 characteristcs
print(racepred_1.shape)
print(racepred_1.dtypes)

### Combine 3 Race Prediction Dataframes into 1 Dataframe

In [None]:
# Combine the three DataFrames into one
combined_racepred = pd.concat([racepred_1, racepred_2, racepred_3], ignore_index=True)

# Convert 'calendarDate' to Datetime
combined_racepred['calendarDate'] = pd.to_datetime(combined_racepred['calendarDate'])

# View Dataframe
combined_racepred.head()

### Clean up remaining data types for Race Prediction Data

In [None]:
# Define a function to convert seconds to a timedelta
def to_timedelta(seconds):
    """Convert a duration given in seconds into a pandas Timedelta.

    Parameters
    ----------
    seconds : int or float
        Number of seconds. A missing value (NaN) is accepted.

    Returns
    -------
    pandas.Timedelta
        The equivalent duration, or NaT when `seconds` is NaN.
    """
    # pd.to_timedelta maps NaN to NaT, whereas pd.Timedelta(seconds=NaN) raises,
    # so a single missing prediction no longer aborts the whole conversion.
    return pd.to_timedelta(seconds, unit='s')

# Race-time columns whose values are passed to to_timedelta as a number of seconds
columns_to_convert = ['raceTime5K', 'raceTime10K', 'raceTimeHalf', 'raceTimeMarathon']

# Replace each of those columns with its element-wise Timedelta equivalent
for column in columns_to_convert:
    combined_racepred[column] = combined_racepred[column].map(to_timedelta)

In [None]:
combined_racepred##.head()

In [None]:
combined_racepred.dtypes

### Drop Low-Value Columns from Race Prediction Data

In [None]:
# Specify Columns to drop
columns_to_drop = ['deviceId'] ##, 'timestamp']

racepred_cleaned = combined_racepred.drop(columns_to_drop, axis=1)

## racepred_cleaned.shape (917,7)

### Group by calendarDate and select the MIN race time for each category

In [None]:
# For every 'calendarDate' keep the smallest (fastest) predicted time per race distance
race_time_columns = ['raceTime5K', 'raceTime10K', 'raceTimeHalf', 'raceTimeMarathon']
min_race_times = (
    racepred_cleaned
    .groupby('calendarDate', as_index=False)[race_time_columns]
    .min()
)

# View Results
min_race_times

## min_race_times['calendarDate'].unique().value_counts()
## min_race_times.shape (280,5)

### Save Pre-Processed Race Prediction Data into a csv

In [None]:
# Save the DataFrame to a CSV file
min_race_times.to_csv('Processed_Data/RacePredictions_Cleaned.csv', index=False)

### Load in Sleep Data

In [None]:
# Load in Sleep Data
sleep_1_main = pd.read_json('Raw Data/Sleep_Data/2023-10-15_2024-01-23_117832404_sleepData.json')
sleep_2_main = pd.read_json('Raw Data/Sleep_Data/2024-01-23_2024-05-02_117832404_sleepData.json')
sleep_3_main = pd.read_json('Raw Data/Sleep_Data/2024-05-02_2024-08-10_117832404_sleepData.json')

# Make Copies of Dataframes
sleep_1 = sleep_1_main.copy()
sleep_2 = sleep_2_main.copy()
sleep_3 = sleep_3_main.copy()

# View Imported File
sleep_1.head()

# Assess sleep_1 characteristcs
## print(sleep_1.shape)
## print(sleep_1.dtypes)

### Combine 3 Sleep Data Dataframes into 1 Dataframe

In [None]:
# Combine the three DataFrames into one
combined_sleep = pd.concat([sleep_1, sleep_2, sleep_3], ignore_index=True)

# Convert 'calendarDate' to Datetime
combined_sleep['calendarDate'] = pd.to_datetime(combined_sleep['calendarDate'])

combined_sleep.head()

In [None]:
# Convert the dictionary like values in 'sleepScores' column into separate columns
combined_sleep = combined_sleep.join(combined_sleep['sleepScores'].apply(pd.Series))

combined_sleep.head()

In [None]:
# Drop 'sleepScores' now that I have extracted information into other columns
combined_sleep = combined_sleep.drop('sleepScores',axis=1)
combined_sleep.head()

### Assess Null Values for Sleep Data

In [None]:
# Count null values in each column
null_counts = combined_sleep.isna().sum()

# Display the null counts
print(null_counts)

In [None]:
# View Dataframe
combined_sleep

In [None]:
# Parse the sleep start and end columns into datetimes
for boundary in ('sleepStartTimestampGMT', 'sleepEndTimestampGMT'):
    combined_sleep[boundary] = pd.to_datetime(combined_sleep[boundary])

# Sleep duration = end - start
sleep_span = combined_sleep['sleepEndTimestampGMT'] - combined_sleep['sleepStartTimestampGMT']
combined_sleep['sleepDuration'] = sleep_span

# The same duration expressed as float hours, rounded to one decimal place
combined_sleep['sleepDurationHours'] = sleep_span.dt.total_seconds().div(3600).round(1)

# View Dataframe
combined_sleep.head()


In [None]:
# Reorder Columns: place 'sleepDurationHours' and 'sleepDuration' directly after 'calendarDate'
columns = combined_sleep.columns.to_list()
calendar_date_pos = columns.index('calendarDate')

# Take the two duration columns out of their current positions ...
for duration_col in ('sleepDurationHours', 'sleepDuration'):
    columns.remove(duration_col)

# ... and splice them back in, in this order, immediately behind 'calendarDate'
insert_at = calendar_date_pos + 1
columns[insert_at:insert_at] = ['sleepDurationHours', 'sleepDuration']

# Apply the new column order
combined_sleep = combined_sleep[columns]

# Drop the raw start/end timestamps (already folded into the duration columns)
# and the confirmation-type column
cols_to_drop = ['sleepStartTimestampGMT','sleepEndTimestampGMT','sleepWindowConfirmationType']
combined_sleep = combined_sleep.drop(columns=cols_to_drop)

# View Dataframe
combined_sleep.head()

In [None]:
# Drop the last column by position using iloc
# NOTE(review): this is a positional drop, so which column is removed depends on the
# current column order, and every re-run of this cell removes one more column.
# Presumably the target is a leftover column produced when 'sleepScores' was
# expanded -- TODO confirm and drop it by name instead.
combined_sleep = combined_sleep.iloc[:, :-1]
combined_sleep.head()

In [None]:
# Count null values in each column
null_counts = combined_sleep.isna().sum()

# Display the null counts
print(null_counts)

In [None]:
# Count null values in each column
null_counts = combined_sleep.isna().sum()

# Display the null counts
print(null_counts)

In [None]:
# View Rows in df where 'remSleepSeconds' is null
combined_sleep[combined_sleep['remSleepSeconds'].isna()]

In [None]:
# View Data types
combined_sleep.dtypes

In [None]:
# Mean-impute the listed columns of a DataFrame
def fill_null_with_mean(df, columns):
    """Replace the nulls in each listed column with that column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Its columns are reassigned on the passed object,
        so the caller's frame is modified.
    columns : iterable of column labels
        Columns to impute; each must support `.mean()`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    for name in columns:
        column_mean = df[name].mean()
        df[name] = df[name].fillna(column_mean)
    return df

# Collect the labels of every float64 column
float_columns = combined_sleep.select_dtypes(include='float64').columns.to_list()

# Replace the nulls in those columns with each column's mean
combined_sleep = fill_null_with_mean(combined_sleep, float_columns)

# View the DataFrame
combined_sleep

In [None]:
# Count null values in each column
null_counts = combined_sleep.isna().sum()

# Display the null counts
print(null_counts)

In [None]:
# View rows where 'calendarDate' isna()
## combined_sleep[combined_sleep['calendarDate'].isna()]

# Manually assign the correct dates to the specific rows where 'calendarDate' is NaT.
# The row labels are hard-coded, so each assignment is guarded:
#   - a missing label raises instead of letting .loc silently append a new,
#     otherwise-empty row (pandas "setting with enlargement");
#   - a row that already has a date is left untouched, so a changed export
#     cannot have valid data overwritten.
manual_calendar_dates = {
    109: pd.Timestamp('2024-03-14'),
    257: pd.Timestamp('2024-08-09'),
}
for row_label, corrected_date in manual_calendar_dates.items():
    if row_label not in combined_sleep.index:
        raise KeyError(
            f"Row {row_label} not found in combined_sleep; re-check the manual 'calendarDate' fixes"
        )
    if pd.isna(combined_sleep.at[row_label, 'calendarDate']):
        combined_sleep.loc[row_label, 'calendarDate'] = corrected_date
combined_sleep

In [None]:
# Count null values in each column
null_counts = combined_sleep.isna().sum()

# Display the null counts
print(null_counts)

In [None]:
# Backfill missing 'sleepDuration' values from 'sleepDurationHours'.
# Vectorised replacement for the previous row-by-row apply: rows that already have a
# duration are left untouched, and rows where 'sleepDurationHours' is also missing
# stay NaT (pd.to_timedelta maps NaN to NaT).
# Float hours may differ from the row-wise conversion by a few nanoseconds;
# 'sleepDuration' is rounded to whole seconds later in the notebook.
hours_as_duration = pd.to_timedelta(combined_sleep['sleepDurationHours'], unit='h')
combined_sleep['sleepDuration'] = combined_sleep['sleepDuration'].fillna(hours_as_duration)

combined_sleep

#### Re-format 'sleepDuration' so that the field can be converted to a duration dtype in PBI

In [None]:
# Convert timedelta to string and remove '0 days '
combined_sleep['sleepDurationFormatted'] = combined_sleep['sleepDuration'].apply(lambda x: str(x).split(' ')[-1])

# Check the result
print(combined_sleep[['sleepDuration', 'sleepDurationFormatted']])

In [None]:
# Reorder Columns
columns = combined_sleep.columns.to_list()

# Get the index of the 'sleepDuration' column
sleep_duration_index = columns.index('sleepDuration')

# Remove 'sleepDurationFormatted' column from the list
columns.remove('sleepDurationFormatted')

# Insert 'sleepDurationFormatted' right after 'sleepDuration'
columns.insert(sleep_duration_index + 1, 'sleepDurationFormatted')

# Reassign the new column order to the DataFrame
combined_sleep = combined_sleep[columns]

# View Dataframe
combined_sleep.head()

In [None]:
combined_sleep

In [None]:
# Round sleepDuration to seconds to remove microseconds and nanoseconds
combined_sleep['sleepDuration'] = combined_sleep['sleepDuration'].dt.round('s')  # Use 's' instead of 'S'

# If you want to format the duration as 'hh:mm:ss' without nanoseconds
combined_sleep['sleepDurationFormatted'] = combined_sleep['sleepDuration'].apply(lambda x: str(x).split(' ')[-1])

combined_sleep

In [None]:
# Replace NaN values in 'insight' and in 'feedback' with "NONE"
for label_col in ('insight', 'feedback'):
    combined_sleep[label_col] = combined_sleep[label_col].fillna("NONE")

# View value counts for 'feedback' categories
## combined_sleep['feedback'].value_counts()

# View Dataframe
combined_sleep

### Convert columns LIKE '%Seconds%' to hours

In [None]:
combined_sleep_cleaned = combined_sleep.copy()

# Convert seconds to hours (1 hour = 3600 seconds)
def seconds_to_hours(seconds):
    """Return `seconds` expressed in hours, rounded to one decimal place."""
    hours = seconds / 3600
    return round(hours, 1)

# Identify columns that contain 'Seconds' in their name
columns_to_convert = [col for col in combined_sleep_cleaned.columns if 'Seconds' in col]

# Apply the conversion function to these columns
for col in columns_to_convert:
    combined_sleep_cleaned[col] = combined_sleep_cleaned[col].apply(seconds_to_hours)

combined_sleep_cleaned.head()

In [None]:
combined_sleep_cleaned[combined_sleep_cleaned['sleepDuration'].isna()]

### Rename Columns

In [None]:
def rename_seconds_to_hours(df):
    """Return `df` with 'Seconds' replaced by 'Hours' in its column labels.

    Only labels containing 'Seconds' are renamed; values are not converted.
    The result comes from DataFrame.rename, so the passed frame keeps its
    original labels.
    """
    column_map = {}
    for label in df.columns:
        if 'Seconds' in label:
            column_map[label] = label.replace('Seconds', 'Hours')
    return df.rename(columns=column_map)

# Apply the function to your DataFrame
combined_sleep_cleaned = rename_seconds_to_hours(combined_sleep_cleaned)

combined_sleep_cleaned.head()

# Confirm that there is 1 row per CalendarDate
## combined_sleep['calendarDate'].unique().value_counts()

In [None]:
combined_sleep_cleaned[combined_sleep_cleaned['sleepDuration'].isna()] # No NaT values anymore

### Drop insignificant columns for ML Model

In [None]:
# Create a list of columns to drop
cols_to_drop_sleep = ['averageRespiration','lowestRespiration','highestRespiration','retro'
                      ### Dropping all columns LIKE 'Hours' because the Corresponding Sleep Score should be sufficient
                      ,'deepSleepHours','lightSleepHours','remSleepHours','awakeSleepHours','unmeasurableHours','awakeCount'
                      ,'restlessMomentCount'
                     ]
combined_sleep_cleaned_ML = combined_sleep_cleaned.drop(cols_to_drop_sleep,axis=1)

In [None]:
# Convert Floats to Int.
# Round to the nearest whole number first: astype('int') on its own truncates toward
# zero, so a mean-imputed value such as 78.6 would be stored as 78 instead of 79.
# NOTE(review): 'sleepDurationHours' is a float column too, so its fractional hours
# are lost by this conversion -- confirm that is intended for the ML dataset.
float_cols = combined_sleep_cleaned_ML.select_dtypes(include='float').columns
combined_sleep_cleaned_ML = (
    combined_sleep_cleaned_ML
    .round(dict.fromkeys(float_cols, 0))
    .astype(dict.fromkeys(float_cols, 'int'))
)
combined_sleep_cleaned_ML.head()


In [None]:
# Convert Floats to Int
# combined_sleep_cleaned = combined_sleep_cleaned.astype({col: 'int' for col in combined_sleep_cleaned.select_dtypes(include='float').columns})
# combined_sleep_cleaned.head()

# Drop Columns
cols_to_drop = ['retro','napList']
combined_sleep_cleaned = combined_sleep_cleaned.drop(cols_to_drop,axis=1)

# View Dataframe
combined_sleep_cleaned.head()

### Save Pre-Processed Sleep Data into a csv

In [None]:
# Save the DataFrame to a CSV file
combined_sleep_cleaned_ML.to_csv('Processed_Data/Sleep_Cleaned_ML.csv', index=False) #For ML Model
combined_sleep_cleaned.to_csv('PBI Data/Sleep_Cleaned_PBI.csv', index=False) #For PBI


### Load in Summarized Activity Data
#### Choosing to ignore this data for initial Model
#### See ML_Data_Prep for data transformation on this dataset

In [None]:
# Load in Summarized Activity Data

## activities = pd.read_json('Raw Data/Summarized_Activities/summarizedActivities.json')
## activities.head() ### PROBLEM!!! This is a 1x1 data frame
### NOTE: Can I manually break the text by a delimiter or something to fix this? YES!!!

activities_main = pd.read_json('Raw Data/Summarized_Activities/ahearnzach3@gmail.com_0_summarizedActivities_Cleaned.txt')

# Make Copy of Dataframe
activities = activities_main.copy()

# View Imported File
activities ### PROBLEM!!! This is a 1x1 data frame - RESOLVED. Remove first part of json file

# Assess activities characteristcs
## print(activities.shape)
## activities.dtypes

### Load in Training History Data!

In [None]:
# Load in Training History Data
training_hist_1_main = pd.read_json('Raw Data/Training_History/TrainingHistory_20231103_20240211_117832404.json')
training_hist_2_main = pd.read_json('Raw Data/Training_History/TrainingHistory_20240211_20240521_117832404.json')
training_hist_3_main = pd.read_json('Raw Data/Training_History/TrainingHistory_20240521_20240829_117832404.json')

# Make Copies of Dataframes
training_hist_1 = training_hist_1_main.copy()
training_hist_2 = training_hist_2_main.copy()
training_hist_3 = training_hist_3_main.copy()

# View Imported File
training_hist_3

# Preliminary Analysis

### There are multiple records per day as well. Need to factor these changes in.
### Maybe it is better to get a daily prediction, and then I can circle back to intra day updates.


### Combine 3 Training History Dataframes into 1 Dataframe

In [None]:
# Combine the three DataFrames into one
combined_training_hist = pd.concat([training_hist_1, training_hist_2, training_hist_3], ignore_index=True)

# Convert 'calendarDate' to Datetime
combined_training_hist['calendarDate'] = pd.to_datetime(combined_training_hist['calendarDate'])

combined_training_hist

In [None]:
# Assess combined_training_hist characteristcs
print(combined_training_hist.shape)
print(combined_training_hist.dtypes)

### Group By 'calendarDate' and Select the Last Training Status of Each Day

In [None]:
# Group by 'calendarDate' and get the index of rows with the maximum timestamp
idx = combined_training_hist.groupby('calendarDate')['timestamp'].idxmax()

# Select the rows with the maximum timestamp for each day
combined_training_hist_cleaned = combined_training_hist.loc[idx]

# View Results
combined_training_hist_cleaned

# Confirm that there is only 1 row per 'calendarDate'
## combined_training_hist_cleaned['calendarDate'].unique().value_counts()

# Assess combined_training_hist_cleaned characteristics
## print(combined_training_hist_cleaned.shape) (280,9)
## print(combined_training_hist_cleaned.dtypes)

In [None]:
# Count null values in each column
null_counts = combined_training_hist_cleaned.isna().sum()

# Display the null counts
print(null_counts)

In [None]:
# Drop records where 'trainingStatus' is 'NO_STATUS'
combined_training_hist_cleaned = combined_training_hist_cleaned[combined_training_hist_cleaned['trainingStatus'] != 'NO_STATUS']
combined_training_hist_cleaned

In [None]:
combined_training_hist_cleaned = combined_training_hist_cleaned.drop(['sport'
                                                                      ,'subSport'
                                                                      ,'deviceId'
                                                                      ,'timestamp'
                                                                      ,'trainingStatus2FeedbackPhrase'
                                                                      ,'userProfilePK']
                                                                      , axis=1)
combined_training_hist_cleaned.head()

In [None]:
# Count null values in each column
null_counts = combined_training_hist_cleaned.isna().sum()

# Display the null counts
print(null_counts)

### Save Pre-Processed Training History Data into a csv

In [None]:
# Save the DataFrame to a CSV file
combined_training_hist_cleaned.to_csv('Processed_Data/Training_History_Cleaned.csv', index=False)

### Load in UDS Data

In [None]:
# Load in UDS Data
uds_1_main = pd.read_json('Raw Data/UDS_Data/UDSFile_2023-10-14_2024-01-22.json')
uds_2_main = pd.read_json('Raw Data/UDS_Data/UDSFile_2024-01-22_2024-05-01.json')
uds_3_main = pd.read_json('Raw Data/UDS_Data/UDSFile_2024-05-01_2024-08-09.json')

# Make Copies of Dataframes
uds_1 = uds_1_main.copy()
uds_2 = uds_2_main.copy()
uds_3 = uds_3_main.copy()

# View Imported File
uds_1.head()

# Assess uds_1 characteristcs
## print(uds_1.shape)
## print(uds_1.dtypes)

### Combine 3 UDS Dataframes into 1 Dataframe

In [None]:
# Combine the three DataFrames into one
combined_uds = pd.concat([uds_1, uds_2, uds_3], ignore_index=True)

# Convert 'calendarDate' to Datetime
combined_uds['calendarDate'] = pd.to_datetime(combined_uds['calendarDate'])

combined_uds

In [None]:
# Assess combined_uds characteristcs
print(combined_uds.shape)
print(combined_uds.dtypes)

In [None]:
# Count null values in each column
null_counts = combined_uds.isna().sum()

# Display the null counts
print(null_counts)

### Extract data from dictionary type columns

In [None]:
combined_uds

In [None]:
# Convert the dictionary-like values in 'allDayStress' and specify suffixes to avoid column overlap
combined_uds = combined_uds.join(combined_uds['allDayStress'].apply(pd.Series), rsuffix='_stress')

## combined_uds.head()

# Drop Columns
combined_uds = combined_uds.drop(['allDayStress','calendarDate_stress','userProfilePK_stress'], axis=1)

# View Results
combined_uds.head()

In [None]:
combined_uds.shape

In [None]:
# Normalize each list of dictionaries in place without exploding the DataFrame
# (pd.json_normalize returns a new frame; combined_uds itself is not modified here)
expanded_df = pd.json_normalize(combined_uds['aggregatorList'])

# Combine the new columns back into the original dataframe without altering row count
# NOTE(review): concat(axis=1) aligns rows by index label, so this relies on
# combined_uds and expanded_df sharing the same index -- presumably both are still
# 0..n-1 at this point; TODO confirm no rows were filtered beforehand.
combined_uds = pd.concat([combined_uds, expanded_df], axis=1)

# Drop the original list column if necessary
# NOTE(review): after this drop, re-running the cell raises KeyError on
# 'aggregatorList'.
combined_uds = combined_uds.drop(columns=['aggregatorList'])

combined_uds.head()

In [None]:
# Drop Last 2 columns as the 3rd to last contains all necessary information
# NOTE(review): positional drop -- the columns removed depend on the current column
# order, and each re-run of this cell strips two more columns. Presumably these are
# the trailing columns added from 'aggregatorList' -- TODO confirm.
combined_uds = combined_uds.iloc[:, :-2]

# View Results
combined_uds.head()

In [None]:
combined_uds.shape

In [None]:
# Rename the last column (expected to be the '0' column) to 'Total_stress_data'
# NOTE(review): the column is picked by position, not by name, so this only renames
# the intended column if it is still last when the cell runs.
combined_uds = combined_uds.rename(columns={combined_uds.columns[-1]: 'Total_stress_data'})
combined_uds.head()

In [None]:
# Convert the dictionary like values in 'Total_stress_data' column into separate columns
# (new columns whose names clash with existing ones get the '_stress' suffix)
combined_uds = combined_uds.join(combined_uds['Total_stress_data'].apply(pd.Series), rsuffix='_stress')

# Drop 'Total_stress_data' Column
combined_uds = combined_uds.drop(['Total_stress_data'], axis=1)

combined_uds.head()
## combined_uds.shape (259,68)

In [None]:
# Convert the dictionary like values in 'bodyBattery' column into separate columns
# (new columns whose names clash with existing ones get the '_battery' suffix)
combined_uds = combined_uds.join(combined_uds['bodyBattery'].apply(pd.Series), rsuffix='_battery')

# Drop 'bodyBattery' Column
combined_uds = combined_uds.drop(['bodyBattery'], axis=1)

combined_uds.head()

In [None]:
# Drop Other Duplicate Columns
combined_uds = combined_uds.drop(['userProfilePK_battery','calendarDate_battery'], axis=1)
combined_uds.head()

### Don't worry about using this for now

In [None]:
# # Normalize each list of dictionaries in place without exploding the DataFrame
# expanded_df = pd.json_normalize(combined_uds['bodyBatteryStatList'])

# # Combine the new columns back into the original dataframe without altering row count
# combined_uds = pd.concat([combined_uds, expanded_df], axis=1)

# # Drop the original list column if necessary
# combined_uds = combined_uds.drop(columns=['bodyBatteryStatList'])

# combined_uds.head()
# ## combined_uds.shape (259,78)

In [None]:
# Convert the dictionary like values in 'respiration' column into separate columns
## combined_uds = combined_uds.join(combined_uds['respiration'].apply(pd.Series))

## combined_uds.head()

In [None]:
# Convert the dictionary like values in 'hydration' column into separate columns
## combined_uds = combined_uds.join(combined_uds['hydration'].apply(pd.Series))

## combined_uds.head()

In [None]:
# combined_uds.isna().sum()

In [None]:
# Drop the '0' column
## combined_uds = combined_uds.drop(columns=[combined_uds.columns[-1]])
# combined_uds.head()

In [None]:
# Assess the number of unique values in each column
# for col in combined_uds.columns:
#     print(f"{col}: {combined_uds[col].nunique()}")

In [None]:
# Drop other insignificant columns
combined_uds = combined_uds.drop(['uuid'
                                  ,'userProfilePK'
                                  ,'wellnessStartTimeGmt'
                                  ,'wellnessEndTimeGmt'
                                  ,'wellnessStartTimeLocal'
                                  ,'wellnessEndTimeLocal'
                                  ,'includesWellnessData'
                                  ,'includesActivityData'
                                  ,'includesCalorieConsumedData'
                                  ,'includesSingleMeasurement'
                                  ,'includesContinuousMeasurement'
                                  ,'includesAllDayPulseOx'
                                  ,'includesSleepPulseOx'
                                  ,'source'
                                  ,'userFloorsAscendedGoal'
                                 ], axis=1)
combined_uds.head()

In [None]:
combined_uds = combined_uds.drop(['durationInMilliseconds'
                                  ,'wellnessKilocalories'
                                  ,'remainingKilocalories' # Same Values as 'wellnessKilocalories'
                                  ,'wellnessTotalKilocalories' # Same Values as 'wellnessKilocalories'
                                  ,'wellnessActiveKilocalories'
                                  ,'dailyStepGoal'
                                  ,'wellnessDistanceMeters'
                                  ,'userIntensityMinutesGoal'
                                  ,'minAvgHeartRate'
                                  ,'maxAvgHeartRate'
                                  ,'version'
                                  ,'restingCaloriesFromActivity'
                                  ,'restingHeartRateTimestamp'
                                 # ,'hydration'
                                  ,'dailyTotalFromEpochData'
                                  ,'type'
                                  ,'uncategorizedDuration'
                                  ,'totalDuration'
                                  ,'lowDuration'
                                  ,'bodyBatteryVersion'
                                 ],axis=1)
                                  
                                  
combined_uds.head()   

## combined_uds.shape (259, 38)

In [None]:
# Convert the dictionary like values in 'respiration' column into separate columns
respiration = combined_uds.join(combined_uds['respiration'].apply(pd.Series), rsuffix='_respiration')
respiration.head()

### I don't see much value in these columns. Going to drop 'respiration' from df without using any of the dictionary columns

# Drop 'respiration' Column
combined_uds = combined_uds.drop(['respiration'], axis=1)

combined_uds.head()

In [None]:
# Convert the dictionary like values in 'hydration' column into separate columns
hydration = combined_uds.join(combined_uds['hydration'].apply(pd.Series), rsuffix='_hydration')
hydration.head()

### I don't see much value in these columns. Going to drop 'hydration' from df without using any of the dictionary columns

# Drop 'hydration' Column
combined_uds = combined_uds.drop(['hydration'], axis=1)

combined_uds.head()

In [None]:
# Convert the dictionary like values in 'bodyBatteryFeedback' column into separate columns
bodyBatteryFeedback = combined_uds.join(combined_uds['bodyBatteryFeedback'].apply(pd.Series), rsuffix='_bodyBatteryFeedback')
bodyBatteryFeedback

### I don't see much value in these columns for intial model.
### Going to drop 'bodyBatteryFeedback' from df without using any of the dictionary columns
### May be more value in 'bodyBatteryFeedback' for later models

# Drop 'bodyBatteryFeedback' Column
combined_uds = combined_uds.drop(['bodyBatteryFeedback'], axis=1)

# Drop 'bodyBatteryStatList' Column
combined_uds = combined_uds.drop(['bodyBatteryStatList'], axis=1)
combined_uds.head()

In [None]:
# Replace NaN values with 0.0 in the 'isVigorousDay' column
combined_uds['isVigorousDay'] = combined_uds['isVigorousDay'].fillna(0.0)

# Verify the changes
combined_uds['isVigorousDay'].value_counts()

In [None]:
combined_uds.isna().sum()

In [None]:
# Drop the last column of the DataFrame
# NOTE(review): positional drop -- which column is removed depends on the current
# column order, and each re-run of this cell removes one more column. TODO confirm
# the intended column and drop it by name.
combined_uds = combined_uds.iloc[:, :-1]
combined_uds

In [None]:
combined_uds.isna().sum()

In [None]:
combined_uds[combined_uds['activityDuration'].isna()]

In [None]:
# Impute missing values in each numeric column with that column's mean
column_means = combined_uds.mean(numeric_only=True)
combined_uds = combined_uds.fillna(column_means)
combined_uds

# Verify that the missing values have been filled
#combined_uds.isna().sum() # This should show 0 for columns that had missing values

In [None]:
# Convert all 'float' type columns to 'int'.
# Round first: astype('int') truncates toward zero, so a mean-imputed value such as 61.8
# would otherwise be stored as 61 instead of 62. This matches the .round().astype(int)
# pattern used for the elevation columns of the running data.
float_cols = combined_uds.select_dtypes(include='float').columns
combined_uds = (
    combined_uds
    .round({col: 0 for col in float_cols})
    .astype({col: 'int' for col in float_cols})
)
combined_uds

### Save Pre-Processed UDS Data into a csv

In [None]:
# Persist the cleaned UDS frame as CSV, leaving the row index out of the file
combined_uds.to_csv(path_or_buf='Processed_Data/UDS_Cleaned.csv', index=False)

### Load Running Data

In [None]:
# Read the exported workout data
running_main = pd.read_csv(filepath_or_buffer="Workout_Data_20240804.csv")
## running_main.head()

# Work on a deep copy so the raw import stays untouched
running = running_main.copy(deep=True)
running.head()

In [None]:
# Report the dimensions of the running data, followed by its column names
print(running.shape, running.columns, sep='\n')

### 1. Running Data Cleaning and Preprocessing

In [None]:
# Remove the columns that contain no data at all
running_cleaned = running.dropna(axis='columns', how='all')

# Compare the shape before and after dropping the all-null columns
print(running.shape)
print(running_cleaned.shape)

running_cleaned.head()

In [None]:
# Replace the spaces in the column headers with underscores
underscored_columns = running_cleaned.columns.str.replace(pat=' ', repl='_')
running_cleaned.columns = underscored_columns
running_cleaned.head()

In [None]:
# Strip every character that is not a letter, a digit or an underscore from the column names
sanitized_columns = running_cleaned.columns.str.replace(pat=r'[^A-Za-z0-9_]+', repl='', regex=True)
running_cleaned.columns = sanitized_columns

## df_cleaned[['Normalized_Power_NP']]
running_cleaned.head()

### Feature Engineering

In [None]:
# Create a Distance_Group column that groups runs into mileage buckets.
# With right=False every bucket includes its lower edge and excludes its upper edge.
bins = [0, 3, 5, 7, 10, 13, float('inf')]
labels = [
    '0-3 miles',
    '3-5 miles',
    '5-7 miles',
    '7-10 miles',
    '10-13 miles',
    '13+ miles',
]

# Assign each run to its bucket based on the Distance value
## running_cleaned.loc[:, 'Distance Group'] = pd.cut(df_cleaned['Distance'], bins=bins, labels=labels, right=False) -> This was causing an error message
distance_group = pd.cut(running_cleaned['Distance'], bins=bins, labels=labels, right=False)
running_cleaned = running_cleaned.assign(Distance_Group=distance_group)

# View Results
print(running_cleaned[['Distance', 'Distance_Group']].head())

In [None]:
# Drop the 'Favorite' column, as that field has not been kept up to date.
# errors='ignore' makes the cell safe to re-run: once the column is gone, a second
# execution is a no-op instead of raising a KeyError (which previously required
# commenting this line in and out by hand).
running_cleaned = running_cleaned.drop(columns='Favorite', errors='ignore')
running_cleaned.head()

In [None]:
# Move 'Distance_Group' directly after 'Distance'
columns = running_cleaned.columns.to_list()

# Take 'Distance_Group' out of the list first, so that the index looked up below is
# correct wherever 'Distance_Group' currently sits (looking the index up before the
# removal is off by one when 'Distance_Group' precedes 'Distance').
columns.remove('Distance_Group')

# Get the index of the 'Distance' column
distance_index = columns.index('Distance')

# Insert 'Distance_Group' right after 'Distance'
columns.insert(distance_index + 1, 'Distance_Group')

# Update running_cleaned with the new column order
running_cleaned = running_cleaned[columns]

running_cleaned.head()

In [None]:
# Convert 'Date' to datetime with the time set to 00:00:00, then rename it to 'calendarDate'.
# The guard keeps the cell re-runnable: after the rename there is no 'Date' column left,
# so an unguarded second run raises a KeyError. The chain returns a new frame instead of
# mutating with inplace=True.
if 'Date' in running_cleaned.columns:
    running_cleaned = (
        running_cleaned
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']).dt.normalize())
        .rename(columns={'Date': 'calendarDate'})
    )

print(running_cleaned.dtypes.head())
running_cleaned.head()

In [None]:
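# Create Week of Year Field (disabled; 'Date' was renamed to 'calendarDate' in the previous cell, so update the column name before re-enabling the lines in this cell)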
## running_cleaned['Week_of_Year'] = running_cleaned['Date'].dt.isocalendar().week

# Create Month Field
## running_cleaned['Month'] = running_cleaned['Date'].dt.month

# Create Year Field
## running_cleaned['Year'] = running_cleaned['Date'].dt.year

# View New Fields and Data Types
## print(running_cleaned[['Date','Week_of_Year','Month','Year']].dtypes)
## running_cleaned[['Date','Week_of_Year','Month','Year']].head()

In [None]:
# Count the missing values in every column
running_cleaned.isnull().sum()  ### No missing values

In [None]:
# Print the number of distinct values in each column of running_cleaned ### USE THIS BEFORE JOINING ALL DATAFRAMES TOGETHER
for col, unique_count in running_cleaned.nunique().items():
    print(f"{col}: {unique_count}")

In [None]:
# Remove the 'Decompression' and 'Training_Stress_Score' columns
running_cleaned = running_cleaned.drop(columns=['Decompression', 'Training_Stress_Score'])
running_cleaned.head()

In [None]:
# Display the running_cleaned DataFrame for a visual check
# (all rows are rendered, because display.max_rows is set to None at the top of the notebook)
running_cleaned

In [None]:
# Replace the '--' placeholder with 0 in the 'Total_Ascent' and 'Total_Descent' columns
# NOTE(review): only the placeholder is swapped; the columns are not cast to a numeric
# dtype here - confirm whether a later step relies on them being numeric.
ascent_descent_cols = ['Total_Ascent', 'Total_Descent']
running_cleaned[ascent_descent_cols] = running_cleaned[ascent_descent_cols].replace(to_replace='--', value=0)

# Verify the changes
running_cleaned[ascent_descent_cols].head()
## running_cleaned[running_cleaned['Total_Ascent']== '--']

In [None]:
# Parse the elevation columns as numbers, treating the '--' placeholder as missing.
# NaN plus pd.to_numeric is used instead of .replace('--', pd.NA).astype(float), because
# casting an object column that contains pd.NA to float raises a TypeError as soon as
# one of the selected rows holds the placeholder.
# NOTE(review): any other non-numeric text still raises here, as it did before.
max_elevation = pd.to_numeric(running_cleaned['Max_Elevation'].replace('--', np.nan))
min_elevation = pd.to_numeric(running_cleaned['Min_Elevation'].replace('--', np.nan))

# Calculate the averages for 'Max_Elevation' and 'Min_Elevation' where Title == 'Charlotte Running'
charlotte_running_mask = running_cleaned['Title'] == 'Charlotte Running'
avg_max_elevation = max_elevation[charlotte_running_mask].mean()
avg_min_elevation = min_elevation[charlotte_running_mask].mean()

# Fill every missing elevation (in all rows, not only the Charlotte runs) with the
# calculated averages and round to the nearest integer
running_cleaned['Max_Elevation'] = max_elevation.fillna(avg_max_elevation).round().astype(int)
running_cleaned['Min_Elevation'] = min_elevation.fillna(avg_min_elevation).round().astype(int)

# Verify the changes
running_cleaned[['Max_Elevation', 'Min_Elevation']].head()

### For Training Status and Race Predictor purposes, I can remove a fair number of columns
#### Many of these columns will be running specific metrics that are either very similar throughout the column or columns that I am using domain knowledge to remove

In [None]:
# Columns left out of the Training Status / Race Predictor dataset
cols_to_drop = [
    'Avg_Run_Cadence', 'Max_Run_Cadence', 'Avg_Stride_Length',
    'Avg_Vertical_Ratio', 'Avg_Vertical_Oscillation', 'Avg_Ground_Contact_Time',
    'Avg_GAP', 'Normalized_Power_NP', 'Avg_Power', 'Max_Power',
    'Best_Lap_Time', 'Number_of_Laps', 'Moving_Time', 'Elapsed_Time',
    'Title',
]
running_cleaned = running_cleaned.drop(columns=cols_to_drop)
running_cleaned.head()

In [None]:
# Check run count per day
# (not rendered: a notebook cell only displays its final expression)
running_cleaned['calendarDate'].value_counts()

# For each calendarDate, keep only the record with the longest run (max Distance)
longest_run_index = running_cleaned.groupby('calendarDate')['Distance'].idxmax()
running_cleaned = running_cleaned.loc[longest_run_index]
running_cleaned.head()

### Save Pre-Processed Running Data into a csv

In [None]:
# Persist the cleaned running frame as CSV, leaving the row index out of the file
running_cleaned.to_csv(path_or_buf='Processed_Data/Running_Cleaned.csv', index=False)