In [1]:
import pandas as pd
import os
import re

data_dir = "/persistentStorage/SHealth_DataSet/Shealth/"

## Load the exercise-type lookup csv into a dataframe.
## header=0 together with names=[...] means the file's own first row is
## treated as a header and replaced by the two column names given here.
## Rows with a missing value in either column are discarded.
file_name = data_dir + "exercise_type.csv"

pd_exercise_type = pd.read_csv(
    file_name,
    header=0,
    names=["exercise_name", "exercise_type"],
).dropna()
#pd_exercise_type

In [2]:
## Function to extract file name from the full file name maintained by Samsung
def get_short_file_name (full_file_name):
    """Return the short data-set name embedded in an exported file name.

    The leading "com.samsung.<...>health." prefix is removed and everything
    from the first remaining "." onwards is discarded, e.g.
    "com.samsung.health.exercise.201811.csv" -> "exercise".

    Parameters
    ----------
    full_file_name : str
        File name as found in the export directory.

    Returns
    -------
    str
        The text between the prefix and the next ".". If no "." remains
        after removing the prefix, the whole remaining text is returned.
    """
    # Dots are escaped so that they only match literal dots in the prefix.
    name = re.sub(r"com\.samsung\..*health\.", "", full_file_name)

    # partition() keeps the full string when there is no "." left; slicing
    # with the result of str.find() would use -1 and drop the last character.
    return name.partition(".")[0]

## Function to extract the full file name, from the list of files.
def get_full_file_name (list_to_search, str_to_search):
    """Look up a full file name by (part of) its short name.

    Parameters
    ----------
    list_to_search : iterable of (full_name, short_name) pairs
    str_to_search : str
        Text that must occur somewhere inside the short name.

    Returns
    -------
    The full name of the first pair whose short name contains
    ``str_to_search``, or None when no pair matches.
    """
    candidates = (
        full_name
        for full_name, short_name in list_to_search
        if str_to_search in short_name
    )
    # Only the first hit is of interest; None signals "not found".
    return next(candidates, None)

## Function to get exercise type from the exercise name
def get_exercise_name_type (item_to_search, ex_type_df=pd_exercise_type):
    """Translate between an exercise name and its exercise type.

    The direction of the lookup depends on the type of ``item_to_search``:

    * str   -> case-insensitive substring search in the "exercise_name"
               column; the matching row's "exercise_type" is returned.
    * other -> equality comparison against the "exercise_type" column;
               the matching row's "exercise_name" is returned.

    Parameters
    ----------
    item_to_search : str or other
        Name fragment, or exercise type value, to look for.
    ex_type_df : DataFrame
        Lookup table with "exercise_name" and "exercise_type" columns.
        Note: the default is bound to ``pd_exercise_type`` as it exists
        when this ``def`` is executed.

    Returns
    -------
    The value from the first matching row, or None when nothing matches.
    """
    rows = zip(ex_type_df["exercise_name"], ex_type_df["exercise_type"])

    # Name -> type lookup.
    if type(item_to_search) is str:
        for row_name, row_type in rows:
            if item_to_search.lower() in row_name.lower():
                return row_type
        return None

    # Type -> name lookup.
    for row_name, row_type in rows:
        if item_to_search == row_type:
            return row_name
    return None

In [3]:
## Build the list of (full file name, short name) pairs for the data directory
## 1. Keep only names containing ".csv" (substring check, not a suffix check)
## 2. Skip names containing ".photo" or ".user_profile"
file_list = [
    (entry, get_short_file_name (entry))
    for entry in os.listdir (data_dir)
    if ".csv" in entry
    and not any(marker in entry for marker in (".photo", ".user_profile"))
]
#file_list

In [4]:
## Read Exercise csv into dataframe
## get_full_file_name() returns None when no short name contains "exercise";
## fail with a clear message instead of a TypeError from `data_dir + None`.
exercise_file = get_full_file_name(file_list, "exercise")
if exercise_file is None:
    raise FileNotFoundError("No exercise csv file found in " + data_dir)

file_name = data_dir + exercise_file

## header=1: the column names are taken from the second line of the file;
## the first line is skipped.
pd_exercise = pd.read_csv(file_name, header=1)

#pd_exercise[pd_exercise["count_type"] == 30004]["exercise_type"].unique()

In [5]:
## Columns that are not used in the rest of the notebook.
cols_to_remove = [
    "max_rpm", "mean_rpm", "max_caloricburn_rate", "additional",
    "deviceuuid", "datauuid", "mean_caloricburn_rate", "create_time",
    "time_offset", "custom", "end_time", "max_power",
    "pkg_name", "mean_power", "exercise_custom_type", "update_time",
]

## Like the in-place drop, this raises KeyError if any listed column is absent
## (for example when this cell is executed a second time on the same frame).
pd_exercise = pd_exercise.drop(columns=cols_to_remove)
#pd_exercise.columns

In [6]:
## Get Walking Data
ex_type = get_exercise_name_type(item_to_search="walk")

## Raise instead of print + exit(): an exception stops this cell right here.
## NOTE(review): under a Jupyter kernel exit() asks the kernel to shut down
## rather than raising at the call site, so it is not a reliable guard.
if ex_type is None:
    raise LookupError("Walking Data Not Found!!")

walk_data = pd_exercise[pd_exercise["exercise_type"] == ex_type].copy()
print (walk_data["distance"].sum())

## Show the walks whose distance / 1000 exceeds 10
## (presumably metres -> km; confirm the unit of the "distance" column).
walk_km = walk_data["distance"] / 1000
walk_km[walk_km > 10]

556598.04538


0      12.652310
237    23.251703
239    11.550666
312    12.670000
315    10.110000
372    12.148380
430    12.610000
681    20.000000
718    11.462000
852    10.020000
976    10.600000
Name: distance, dtype: float64

In [8]:
## Get Treadmill Data
ex_type = get_exercise_name_type(item_to_search="tread")

## Raise instead of print + exit(): an exception stops this cell right here.
## NOTE(review): under a Jupyter kernel exit() asks the kernel to shut down
## rather than raising at the call site, so it is not a reliable guard.
if ex_type is None:
    raise LookupError("Tread Data Not Found!!")

tread_data = pd_exercise[pd_exercise["exercise_type"] == ex_type].copy()

## Update information from comments into treadmill data
## Format of data in comments - Tot distance, Milestone distance, Milestone time
## For this info, 2 new columns are added to the treadmill dataframe
## and the existing distance column is overwritten.
## Tot distance has to be manually entered for treadmill and there is no column as of now to store this info
## -- Tot distance value will be copied to the existing distance column.
## Milestone distance is the maximum of 2.5, 5, 10, 15, 20....
## -- Needs a new column
## Milestone time is time taken to complete the milestone distance
## -- Needs a new column.

## Every treadmill row must carry a comment, otherwise it cannot be parsed.
if tread_data["comment"].isnull().any():
    raise ValueError("NULL Comment found in raw treadmill data. This needs to be manually corrected!!")

## Compile a regular expression for an unsigned decimal number ("12", "10.9").
## Raw string so that \d and \. reach the regex engine unchanged.
re_float = re.compile(r"\d+(\.\d+)?")

def extract_dist_time (item_from_df):
    """Parse a treadmill comment into (distance, milestone distance, milestone time).

    Accepted formats (whitespace around each field is ignored):

    * "<dist>"                    -> (dist, 0.0, 0.0)
    * "<dist>, <mdist>, <mtime>"  -> (dist, mdist, mtime)

    Every field must be an unsigned decimal number as described by
    ``re_float``.

    Parameters
    ----------
    item_from_df : str
        Raw comment text of one treadmill row.

    Returns
    -------
    tuple of float
        (dist, mdist, mtime)

    Raises
    ------
    ValueError
        If the comment has a different number of fields or any field is
        not a plain decimal number.
    """
    # "10.9, 10, 3601" splits into fields with leading blanks; strip them so
    # the pattern check and float() see the bare number.
    items = [part.strip() for part in item_from_df.split(",")]

    # fullmatch + all(): every field must be a number in its entirety.
    # (A bare list of match objects is always truthy, and match() alone
    # would accept trailing garbage such as "6.4abc".)
    fields_ok = all(re_float.fullmatch(part) for part in items)

    if fields_ok and len(items) == 1:
        return float(items[0]), 0.0, 0.0

    if fields_ok and len(items) == 3:
        dist, mdist, mtime = (float(part) for part in items)
        return dist, mdist, mtime

    raise ValueError("Incorrect Data format in treadmill!! Got: %r" % (item_from_df,))

## Parse every comment once into a float64 frame aligned on tread_data's index.
## Building the columns as float from the start avoids creating int columns
## (value=0) and then writing floats into them, whose resulting dtype depends
## on the pandas version and on the values being written.
milestone_cols = ["distance", "milestone_distance", "milestone_time"]
parsed_comments = pd.DataFrame(
    [extract_dist_time(item) for item in tread_data["comment"]],
    columns=milestone_cols,
    index=tread_data.index,
    dtype="float64",
)

## "distance" already exists and is overwritten in place; the two milestone
## columns are appended at the end of the frame, in this order.
for col in milestone_cols:
    tread_data[col] = parsed_comments[col]

tread_data[["comment"] + milestone_cols]


Unnamed: 0,comment,distance,milestone_distance,milestone_time
8,6.4,6.40,0.0,0.0
15,5.4,5.40,0.0,0.0
24,1.5,1.50,0.0,0.0
29,2.85,2.85,0.0,0.0
39,"10.9, 10, 3601",10.90,10.0,3601.0
43,3.3,3.30,0.0,0.0
50,"12, 10, 3608",12.00,10.0,3608.0
54,1.5,1.50,0.0,0.0
55,5.5,5.50,0.0,0.0
66,3.45,3.45,0.0,0.0
