In [1]:
import pandas as pd
import numpy as np
import os
import re

In [2]:
## Some other info from livestrong.com
# If weight = 72.5 kg, 30 mins of weight training burns about 115 cal... quite modest!!

## Predefined Exercise Types from Samsung SDK

##Exercise_name	Exercise_type
#----------------------------
#Other		0 

#Walking	1001

#Running	1002
#Basketball, general	4003
#Volleyball, general, 6~9 member team, non-competitive	5001
#Beach volleyball	5002
#Badminton, competitive	6003
#Yoga	9002
#Circuit training, moderate effort	10007
#Plank	10025
#Cycling	11007
#Swimming, general, leisurely, not lap swimming	14001
#Step machine	15001
#Weight machine	15002
#Exercise bike, Moderate to vigorous effort (90-100 watts)	15003
#Rowing machine	15004

#Treadmill, combination of jogging and walking	15005

#Elliptical trainer, moderate effort	15006

In [3]:
## Samsung Health exercise_type codes used to split the export by activity.
## The values come from the predefined exercise type table listed above.
ex_type_walking = 1001      # Walking
ex_type_running = 1002      # Running
ex_type_treadmill = 15005   # Treadmill, combination of jogging and walking

## Weight training records are selected by either "Other" (0) or "Weight machine" (15002)
ex_type_weight = [0, 15002]

In [4]:
## Path for files downloaded from Samsung Health.
## The SHEALTH_DATA_DIR environment variable, when set, overrides the default location.
data_dir = os.environ.get("SHEALTH_DATA_DIR", "/persistentStorage/SHealth_DataSet/Shealth/")

exercise_data_file_prefix = "com.samsung.health.exercise"

## Locate the exercise data file.
## os.listdir() returns names in arbitrary order, so the candidates are sorted to make
## the choice deterministic when several files match (the last one in sorted order wins).
matching_files = sorted(name for name in os.listdir(data_dir)
                        if (exercise_data_file_prefix in name) and name.endswith(".csv"))
file_from_disk = matching_files[-1] if matching_files else None

## Raise instead of print + exit(), so a missing file is reported as a normal
## exception with a traceback and the cells below do not run on missing data.
if (file_from_disk is None):
    raise FileNotFoundError("Exercise Data File Not Located!")

## Create exercise data dataframe.
## header=1: the column names are read from the second line of the file.
pd_exercise_data = pd.read_csv(filepath_or_buffer=os.path.join(data_dir, file_from_disk),
                               header=1)

## Unnecessary columns
cols_to_remove = ["max_rpm", "mean_rpm", "max_caloricburn_rate", "additional", "deviceuuid", "datauuid",
                  "mean_caloricburn_rate", "create_time", "time_offset", "custom", "end_time", "max_power",
                  "pkg_name", "mean_power", "exercise_custom_type", "update_time"]

pd_exercise_data = pd_exercise_data.drop(columns=cols_to_remove)

In [5]:
## Helper functions and constants

## Regular expression for an unsigned decimal number at the start of a string, e.g. "5" or "2.5".
## Written as a raw string so that "\d" and "\." are not treated as (invalid) string escapes.
re_float = re.compile(r"\d+(\.\d+)?")

## Convert one comment field to a float, or return None when it is not a usable number.
def _parse_treadmill_number (text):
    ## re_float.match() only anchors at the start, so "5km" passes the regex check;
    ## float() is therefore guarded as well instead of being allowed to raise.
    if (not re_float.match(text)):
        return None
    try:
        return float(text)
    except ValueError:
        return None

## Extract distance, milestone_distance and milestone_time from treadmill comments.
def extract_dist_milestone_time_from_comment (str_to_process):
    """Parse a treadmill comment of the form "Tot distance, Milestone distance, Milestone time".

    Parameters:
        str_to_process: the comment text. Spaces are ignored and fields are separated
            by ",". Anything that is not a string (for example the NaN pandas uses for
            a missing comment) is reported as an incorrect total distance.

    Returns:
        A tuple (dist, mdist, mtime). Every parsed field is multiplied by 1000
        (distances -> meters, time -> millisecs). A field that is missing or malformed
        is left at 0 and a message is printed. The two milestone fields are only read
        when the comment has exactly three fields.
    """
    dist = mdist = mtime = 0

    ## A missing comment arrives as a float NaN, which has no .replace()
    if (not isinstance(str_to_process, str)):
        print ("Incorrect Tot. Distance format in treadmill!!")
        return dist, mdist, mtime

    ## Remove spaces and then split on ","
    items = str_to_process.replace(" ", "").split(",")

    value = _parse_treadmill_number(items[0])
    if (value is not None):
        dist = value * 1000 ## meters
    else:
        print ("Incorrect Tot. Distance format in treadmill!!")

    if (len(items) == 3):
        value = _parse_treadmill_number(items[1])
        if (value is not None):
            mdist = value * 1000 ## meters
        else:
            print ("Incorrect Milestone Distance format in treadmill!!")

        value = _parse_treadmill_number(items[2])
        if (value is not None):
            mtime = value * 1000 ## millisecs
        else:
            print ("Incorrect Milestone Time format in treadmill!!")

    return dist, mdist, mtime

## Fill in a missing mean_speed (stored as 0.0) from the distance and duration
def update_mean_speed_from_dist_dur (df_row):
    """Return the mean speed to store for one exercise row.

    Parameters:
        df_row: row-like object indexable by "mean_speed", "duration" and "distance".

    Returns:
        The existing mean_speed when it is not 0.0. Otherwise distance divided by
        (duration / 1000) when duration is positive, or 0 (after printing a warning)
        when it is not.
    """
    current_speed = df_row["mean_speed"]

    ## A recorded speed is kept untouched
    if (current_speed != 0.0):
        return current_speed

    ## duration is scaled by 1/1000 before dividing (presumably millisecs -> secs)
    elapsed = df_row["duration"]
    if (elapsed > 0):
        return df_row["distance"] / (elapsed / 1000)

    print ("Warning!! There are entries where duration is 0!!")
    return 0

## Update missing max_speed based on mean_speed
def update_max_speed_from_dist_dur (df_row):
    """Return the max speed to store for one exercise row.

    A max_speed counts as missing when it is 0.0, NaN, or a string placeholder
    (the extract functions in this notebook fill missing max_speed with "NA").
    Previously only an exact 0.0 was recognised, so "NA" and NaN were never replaced.

    Parameters:
        df_row: row-like object indexable by "max_speed" and "mean_speed".

    Returns:
        The existing max_speed when it is present; otherwise mean_speed, or 0 when
        mean_speed is 0.0 as well.
    """
    max_speed = df_row["max_speed"]

    ## The string check comes first so that a placeholder is never compared with a number
    is_missing = (isinstance(max_speed, str) or
                  pd.isna(max_speed) or
                  max_speed == 0.0)
    if (not is_missing):
        return max_speed

    mean_speed = df_row["mean_speed"]
    return mean_speed if (mean_speed != 0.0) else 0
    
## Update Heart Rate for missing data
def update_heart_rate (df):
    """Fill in missing heart-rate values from rows with a similar distance.

    A row is treated as missing when its mean_heart_rate is 0.0. Its max, min and
    mean heart rate are replaced by the averages over all rows whose distance lies
    strictly within +-100 of its own distance and whose mean_heart_rate is > 0.
    Rows filled in earlier in the pass count as references for later ones.
    Rows for which no reference exists are dropped.

    Parameters:
        df: dataframe with "distance", "max_heart_rate", "min_heart_rate" and
            "mean_heart_rate" columns. It is modified in place.

    Returns:
        The same dataframe object, after the updates and drops.
    """
    hr_cols = ["max_heart_rate", "min_heart_rate", "mean_heart_rate"]

    ## Collected across ALL missing rows. This list used to be reset on every loop
    ## iteration, so only the last row's entry survived, and it was undefined
    ## (NameError at the drop below) when no row was missing a heart rate.
    rows_to_drop = []

    for index in df.index[df["mean_heart_rate"] == 0.0]:
        dist = df.loc[index, "distance"]

        ## Reference rows: similar distance and a usable heart rate
        similar = df[(df["distance"] > dist - 100) &
                     (df["distance"] < dist + 100) &
                     (df["mean_heart_rate"] > 0)]

        if (similar.empty):
            rows_to_drop.append(index)
            continue

        for col in hr_cols:
            df.loc[index, col] = similar[col].sum() / len(similar)

    df.drop(index=rows_to_drop, inplace=True)

    return df

## Replace outlier heart-rate readings (attributed to a misfitted Gear Fit2)
def correct_heart_rate (df):
    """Overwrite outlier heart rates with averages taken from comparable rows.

    A row is an outlier when its max_heart_rate is >= 186. For each outlier, the
    reference rows are those with max_heart_rate below 186, distance within +-100
    and mean_speed within +-0.3 of the outlier (bounds inclusive). When at least one
    reference exists, the outlier's max, min and mean heart rate are replaced by the
    reference averages; otherwise the row is left unchanged.

    Parameters:
        df: dataframe with "distance", "mean_speed", "max_heart_rate",
            "min_heart_rate" and "mean_heart_rate" columns. Modified in place.

    Returns:
        The same dataframe object.
    """
    max_reference_hr = 186

    ## Outliers are identified once, before any value is corrected
    outliers = [(idx, df.loc[idx, "distance"], df.loc[idx, "mean_speed"])
                for idx in df.index[df["max_heart_rate"] >= max_reference_hr]]

    for idx, ref_dist, ref_speed in outliers:
        matches = sum_max = sum_min = sum_mean = 0.0

        dist_low, dist_high = ref_dist - 100, ref_dist + 100
        speed_low, speed_high = ref_speed - 0.3, ref_speed + 0.3

        ## Columns are re-read for every outlier so earlier corrections are visible
        candidates = zip(df["distance"], df["mean_speed"], df["max_heart_rate"],
                         df["min_heart_rate"], df["mean_heart_rate"])

        for dist, speed, max_hr, min_hr, mean_hr in candidates:
            if (not (dist_low <= dist <= dist_high)):
                continue
            if (not (max_hr < max_reference_hr)):
                continue
            if (not (speed_low <= speed <= speed_high)):
                continue

            matches  += 1
            sum_max  += max_hr
            sum_min  += min_hr
            sum_mean += mean_hr

        if (matches > 0):
            df.loc[idx, "max_heart_rate"]  = sum_max / matches
            df.loc[idx, "min_heart_rate"]  = sum_min / matches
            df.loc[idx, "mean_heart_rate"] = sum_mean / matches

    return df

def update_hr_for_weight_train (df):
    """Clean weight-training records and fill in their missing heart rates.

    Steps:
      1. Rows with calorie < 30 are dropped.
      2. Every row whose mean_heart_rate is 0 is processed in ascending calorie order.
         Its max, min and mean heart rate are replaced by the averages over rows that
         have mean_heart_rate > 0 and a calorie value within a tolerance of its own
         (bounds inclusive): +-30 when the row burnt fewer than 150 calories,
         +-100 otherwise. Rows filled in earlier count as references for later ones.
      3. Rows for which no reference exists are dropped.

    Previously the averages were computed but never written back, so every matched
    row kept its 0 heart rate.

    NOTE(review): the earlier comment described a +-20 calorie window and a +-10 min
    duration window; the code has always used calorie windows of +-30 and +-100,
    and those are kept here.

    Parameters:
        df: dataframe with "calorie", "max_heart_rate", "min_heart_rate" and
            "mean_heart_rate" columns. It is modified in place.

    Returns:
        The same dataframe object, after the updates and drops.
    """
    hr_cols = ["max_heart_rate", "min_heart_rate", "mean_heart_rate"]

    df.drop(index=df.index[df["calorie"] < 30], inplace=True)

    ## (index, calorie) of the rows to fill, lowest calorie first
    zero_hr_rows = sorted(((index, df.loc[index, "calorie"])
                           for index in df.index[df["mean_heart_rate"] == 0]),
                          key=lambda pair: pair[1])

    nt_fd_indx = []
    for index, calorie in zero_hr_rows:
        tolerance = 30 if (calorie < 150) else 100

        similar = df[(df["calorie"] >= calorie - tolerance) &
                     (df["calorie"] <= calorie + tolerance) &
                     (df["mean_heart_rate"] > 0.0)]

        if (similar.empty):
            nt_fd_indx.append(index)
            continue

        for col in hr_cols:
            df.loc[index, col] = similar[col].sum() / len(similar)

    df.drop(index=nt_fd_indx, inplace=True)

    return (df)

In [6]:
#################Temp Test Block########################

def check_nulls (df):
    """Print a null-value report for a dataframe.

    Prints the number of rows, then one "<column> <null count>" line for every column
    that contains at least one null. For the "comment" column the non-null comments
    are printed as well. Returns None.
    """
    print (len(df))

    for col in df.columns:
        missing = df[col].isnull()
        if (not missing.any()):
            continue

        print(col, missing.sum())

        if (col == "comment"):
            print (df.loc[~missing, "comment"])

    return

def temp_block_of_code (df):
    """Scratch placeholder: accepts a dataframe, does nothing and returns None."""
    return None

In [7]:
## Extract data from the exercise data file

## Extract Walking or Running Data
def extract_walk_run_data(ex_type):
    """Build the cleaned dataframe for walking or running records.

    Parameters:
        ex_type: ex_type_walking or ex_type_running.

    Returns:
        A dataframe holding the records of that exercise type with the unused columns
        removed, NaNs filled, zero-duration rows removed and missing speed / heart-rate
        values filled in. None when ex_type is neither walking nor running (a message
        is printed) or when no row is left.
    """
    if (ex_type not in (ex_type_walking, ex_type_running)):
        print ("Incorrect ex type!!!!!")
        return

    df = pd_exercise_data[pd_exercise_data["exercise_type"] == ex_type].copy()

    ## Count type = stride is valid for walking exercise and cadence indicates the stride or steps/min.
    ### However this data is missing for the walking exercise type, so remove count_type and cadence columns.
    ## Count is missing for most of the data, so remove this column.
    ## Comments are blank. Remove this as well.
    additional_cols_to_remove = ["count_type",   "max_cadence",
                                 "mean_cadence", "count",
                                 "comment"]

    df.drop(columns=additional_cols_to_remove, inplace=True)

    ## Lets fill up some of the NaN columns.
    ## max_speed is filled with 0.0 (not "NA") so that it stays numeric and the
    ## "== 0.0" test in update_max_speed_from_dist_dur recognises it as missing.
    nan_col_values_dict = {"live_data"        : "NA"
                           , "altitude_gain"    : 0.0
                           , "location_data"    : "NA"
                           , "decline_distance" : 0.0
                           , "altitude_loss"    : 0.0
                           , "incline_distance" : 0.0
                           , "max_altitude"     : 0.0
                           , "min_altitude"     : 0.0
                           , "mean_altitude"    : 0.0
                           , "mean_speed"       : 0.0
                           , "max_speed"        : 0.0
                           , "mean_heart_rate"  : 0.0
                           , "min_heart_rate"   : 0.0
                           , "max_heart_rate"   : 0.0
                          }

    df.fillna(value=nan_col_values_dict, inplace=True)

    ## Some entries related to duration seem to be 0. These are removed from the dataset.
    ### This also helps, so mean_speed calculation does not cause exception.
    ## .copy() makes the filtered frame independent, so the assignments below do not
    ## write into a slice of another frame.
    df = df[df["duration"] > 0].copy()

    ## Nothing left to process; apply(axis=1) on an empty frame does not yield a column
    if (df.empty):
        return None

    ## Mean_speed could be NaN for manual entries. Calculate this from the distance and duration.
    ### Not overwriting mean_speed for all rows, because it seems that mean_speed is not the result of
    ### distance(m) / duration(s).
    ## max_speed is derived after mean_speed because it falls back to the updated mean_speed.
    df["mean_speed"] = df.apply(update_mean_speed_from_dist_dur, axis=1)
    df["max_speed"]  = df.apply(update_max_speed_from_dist_dur, axis=1)

    ## Update the mean_heart_rate for missing data. If average cannot be found, delete the rows.
    df = update_heart_rate(df)
    return (df if len(df) > 0 else None)

## Extract Treadmill Data
def extract_treadmill_data():
    """Build the cleaned dataframe for treadmill records.

    Returns:
        A dataframe holding the treadmill records with the unused columns removed,
        NaNs filled, distance / milestone / milestone_time taken from the comment
        column, zero-duration rows removed and missing or outlier speed / heart-rate
        values corrected. None when no row is available or left.
    """
    df = pd_exercise_data[pd_exercise_data["exercise_type"] == ex_type_treadmill].copy()

    ## No treadmill records at all: the row-wise assignments below need at least one row
    if (df.empty):
        return None

    ## Altitude related features are not really relevant for treadmill data.
    ## Remove the cadence related features since count and count type data is not available
    additional_cols_to_remove = ["altitude_gain", "max_altitude",
                                 "min_altitude",  "altitude_loss",
                                 "count_type",    "max_cadence",
                                 "mean_cadence",  "count"]
    df.drop(columns=additional_cols_to_remove, inplace=True)

    ## Lets fill up some of the NaN columns.
    ## max_speed is filled with 0.0 (not "NA") so that it stays numeric and the
    ## "== 0.0" test in update_max_speed_from_dist_dur recognises it as missing.
    nan_col_values_dict = {"live_data"          : "NA"
                           , "location_data"    : "NA"
                           , "decline_distance" : 0.0
                           , "incline_distance" : 0.0
                           , "mean_speed"       : 0.0
                           , "max_speed"        : 0.0
                           , "mean_heart_rate"  : 0.0
                           , "min_heart_rate"   : 0.0
                           , "max_heart_rate"   : 0.0
                          }

    df.fillna(value=nan_col_values_dict, inplace=True)

    ## Update information from comments into treadmill data
    ## Format of data in comments - Tot distance, Milestone distance, Milestone time
    ## Tot distance has to be manually entered for treadmill and there is no column as of now to store this info
    ## -- Tot distance value will be copied to the existing distance column.
    ## Milestone distance is the maximum of 2.5, 5, 10, 15, 20....
    ## -- Needs a new column
    ## Milestone time is time taken to complete the milestone distance
    ## -- Needs a new column.
    df["milestone"] = 0.0
    df["milestone_time"] = 0.0
    parsed_comments = [extract_dist_milestone_time_from_comment(comment)
                       for comment in df["comment"]]
    df.loc[:, ["distance", "milestone", "milestone_time"]] = parsed_comments

    ## Some entries related to duration seem to be 0. These are removed from the dataset.
    ### This also helps, so mean_speed calculation does not cause exception.
    ## .copy() makes the filtered frame independent, so the assignments below do not
    ## write into a slice of another frame.
    df = df[df["duration"] > 0].copy()

    if (df.empty):
        return None

    ## Mean_speed could be NaN for manual entries. Calculate this from the distance and duration.
    ## max_speed is derived after mean_speed because it falls back to the updated mean_speed.
    df["mean_speed"] = df.apply(update_mean_speed_from_dist_dur, axis=1)
    df["max_speed"]  = df.apply(update_max_speed_from_dist_dur, axis=1)

    ## Update the mean_heart_rate for missing data. If average cannot be found, delete the rows.
    df = update_heart_rate(df)
    df = correct_heart_rate(df)

    return (df if len(df) > 0 else None)

## Extract Weight training data
def extract_weight_train ():
    """Build the cleaned dataframe for weight-training records.

    Selects the rows whose exercise_type is listed in ex_type_weight, fills the NaNs
    of the text and heart-rate columns, removes the columns that carry no information
    for this exercise type and hands the result to update_hr_for_weight_train.

    Returns:
        The dataframe returned by update_hr_for_weight_train.
    """
    is_weight_session = pd_exercise_data["exercise_type"].isin(ex_type_weight)
    df = pd_exercise_data[is_weight_session].copy()

    ## Placeholders for the NaN cells of the columns that are kept
    placeholders = dict(live_data="NA",
                        location_data="NA",
                        mean_heart_rate=0.0,
                        min_heart_rate=0.0,
                        max_heart_rate=0.0)
    df = df.fillna(value=placeholders)

    ## Columns that are empty all together or having 0.0 as values
    unused_cols = ["altitude_gain", "max_altitude", "min_altitude", "altitude_loss",
                   "count_type", "max_cadence", "mean_cadence", "count",
                   "max_speed", "comment", "distance",
                   "decline_distance", "incline_distance", "mean_speed"]
    df = df.drop(columns=unused_cols)

    return update_hr_for_weight_train(df)

In [8]:
## Weight Training Data
pd_weight = extract_weight_train()
pd_weight.to_pickle("weight_ex_dataframe.pkl")

## Sum of the calorie column. Kept as the LAST expression of the cell so that the
## notebook displays it; in the middle of the cell the value was silently discarded.
pd_weight["calorie"].sum()

In [9]:
## Treadmill Data
pd_treadmill = extract_treadmill_data()

## extract_treadmill_data() returns None when no usable row is left; stop with a clear
## message instead of failing on None["distance"].
if (pd_treadmill is None):
    raise ValueError("No usable treadmill records found in the exercise data")

pd_treadmill.to_pickle("treadmill_ex_dataframe.pkl")

## Sum of the distance column divided by 1000. Kept as the LAST expression of the cell
## so that the notebook displays it; mid-cell the value was silently discarded.
pd_treadmill["distance"].sum() / 1000

In [10]:
## Running Data
pd_running = extract_walk_run_data(ex_type_running)

## extract_walk_run_data() returns None when no usable row is left; stop with a clear
## message instead of failing on None["distance"].
if (pd_running is None):
    raise ValueError("No usable running records found in the exercise data")

pd_running.to_pickle("running_ex_dataframe.pkl")

## Sum of the distance column divided by 1000. Kept as the LAST expression of the cell
## so that the notebook displays it; mid-cell the value was silently discarded.
pd_running["distance"].sum() / 1000

In [11]:
## Walking Data
pd_walking = extract_walk_run_data (ex_type_walking)

## extract_walk_run_data() returns None when no usable row is left; stop with a clear
## message instead of failing on None["distance"].
if (pd_walking is None):
    raise ValueError("No usable walking records found in the exercise data")

pd_walking.to_pickle("walking_ex_dataframe.pkl")

## Sum of the distance column divided by 1000. Kept as the LAST expression of the cell
## so that the notebook displays it; mid-cell the value was silently discarded.
pd_walking["distance"].sum() / 1000