In [1]:
import pandas as pd
import numpy as np
import os
import json
from pprint import pprint

In [2]:
## Path for files downloaded from Samsung Health
data_dir  = "/persistentStorage/SHealth_DataSet/Shealth/"
json_path = "/persistentStorage/SHealth_DataSet/Shealth/jsons/"

sleep_file_prefix      = "com.samsung.shealth.sleep"
sleep_data_file_prefix = "com.samsung.shealth.sleep_data"

sleep_file = None
sleep_data_file = None

## os.listdir() returns names in arbitrary order; sorting makes the choice
## deterministic (the lexicographically last match wins) when several exports exist.
for fname in sorted(os.listdir(data_dir)):
    if not fname.endswith(".csv"):
        continue

    ## Require the "." that follows the prefix so that other exports which merely
    ## share the "com.samsung.shealth.sleep" prefix are not picked up by mistake.
    if fname.startswith(sleep_data_file_prefix + "."):
        sleep_data_file = fname
    elif fname.startswith(sleep_file_prefix + "."):
        sleep_file = fname

## Fail here with a clear message instead of letting the next cell fail on `data_dir + None`.
if sleep_file is None or sleep_data_file is None:
    raise FileNotFoundError(
        "File not found: expected '%s.*.csv' and '%s.*.csv' in %s"
        % (sleep_file_prefix, sleep_data_file_prefix, data_dir))

print (sleep_file, sleep_data_file)

com.samsung.shealth.sleep.201809031104.csv com.samsung.shealth.sleep_data.201809031104.csv


In [3]:
## Load the sleep summary export, discard unused columns and shorten the remaining names
cols_to_remove = ["com.samsung.health.sleep.custom", "extra_data", "com.samsung.health.sleep.comment",
                  "original_efficiency", "original_bed_time", "original_wake_up_time",
                  "com.samsung.health.sleep.pkg_name", "com.samsung.health.sleep.deviceuuid",
                  "com.samsung.health.sleep.create_time", "com.samsung.health.sleep.update_time"]

col_rename_dict = {"com.samsung.health.sleep.datauuid"   : "sleep_uuid",
                   "com.samsung.health.sleep.time_offset": "sleep_time_offset",
                   "com.samsung.health.sleep.end_time"   : "sleep_end_time",
                   "com.samsung.health.sleep.start_time" : "sleep_start_time"}

## header=1: the column names are taken from the second line of the file
pd_sleep_1 = (
    pd.read_csv(data_dir+sleep_file, header=1)
      .drop(columns=cols_to_remove)
      .rename(columns=col_rename_dict)
)

## Report, per column, how many values are missing
for col, null_count in pd_sleep_1.isnull().sum().items():
    if null_count:
        print ("Null values in %s is %d" %(col, null_count))
    else:
        print ("No null values in %s" %(col))

pd_sleep_1.shape

No null values in sleep_uuid
No null values in efficiency
Null values in has_sleep_data is 152
No null values in sleep_time_offset
No null values in sleep_end_time
Null values in quality is 202
No null values in sleep_start_time


(649, 7)

In [4]:
## Load the sleep status export and keep only the columns that are not listed below
cols_to_remove = ["start_time", "pkg_name", "update_time", "create_time", "time_offset", "datauuid",
                  "comment", "deviceuuid", "json_version"]

## header=1: the column names are taken from the second line of the file
pd_sleep_2 = pd.read_csv(data_dir+sleep_data_file, header=1).drop(columns=cols_to_remove)

print (pd_sleep_2.columns)
pd_sleep_2.shape

Index(['sleep_status', 'sleep_uuid'], dtype='object')


(498, 2)

In [5]:
## Left join on sleep_uuid: every row of pd_sleep_1 is kept, and sleep_status is
## filled in wherever pd_sleep_2 has a row with the same sleep_uuid
pd_sleep_data = pd_sleep_1.merge(pd_sleep_2, how="left", on="sleep_uuid")
pd_sleep_data.shape

(649, 8)

In [6]:
## Show the column names of the merged frame
pd_sleep_data.columns

Index(['sleep_uuid', 'efficiency', 'has_sleep_data', 'sleep_time_offset',
       'sleep_end_time', 'quality', 'sleep_start_time', 'sleep_status'],
      dtype='object')

In [7]:
def convert_ms_to_date_time_with_offset (df_series, tz="Europe/Helsinki"):
    """Convert epoch-millisecond values to timezone-aware local timestamps.

    Parameters
    ----------
    df_series : pandas.Series
        Values interpreted as milliseconds since the Unix epoch (UTC).
    tz : str or tzinfo, optional
        Target time zone for the result. Defaults to "Europe/Helsinki",
        which was previously hard-coded.

    Returns
    -------
    pandas.Series
        Timezone-aware datetimes expressed in `tz`.

    Notes
    -----
    One zone is applied to the whole series; per-row offsets (such as the
    `sleep_time_offset` column) are not taken into account.
    """
    
    ## Convert to UTC Date+Time
    df_series = pd.to_datetime(arg=df_series, unit="ms")
    
    ## Timezone conversion to local time
    df_series = df_series.dt.tz_localize("UTC").dt.tz_convert(tz)
    
    return df_series

## Replace both epoch-millisecond columns with their converted timestamps
for time_col in ("sleep_start_time", "sleep_end_time"):
    pd_sleep_data[time_col] = convert_ms_to_date_time_with_offset(pd_sleep_data[time_col])

In [8]:
## Display the rows ordered from earliest to latest sleep start (result is not stored)
pd_sleep_data.sort_values(by="sleep_start_time")

Unnamed: 0,sleep_uuid,efficiency,has_sleep_data,sleep_time_offset,sleep_end_time,quality,sleep_start_time,sleep_status
375,91d4823e-90be-4c7a-8370-8961697a4c3d,0.000000,,UTC+0300,2017-02-27 05:30:00+02:00,50004.0,2017-02-26 23:00:00+02:00,
273,6aeb5517-0af4-456f-a60c-e0b6c5565884,0.000000,,UTC+0300,2017-02-28 05:30:00+02:00,50003.0,2017-02-27 22:30:00+02:00,
72,18a908df-30b6-4e54-9430-cbf1e5ddef0e,0.000000,,UTC+0300,2017-03-01 05:30:00+02:00,50003.0,2017-02-28 23:30:00+02:00,
280,6e2c5424-ae9f-4797-8cb3-c786c4c8e502,0.000000,,UTC+0300,2017-03-02 05:30:00+02:00,50004.0,2017-03-01 22:30:00+02:00,
522,d126088f-b435-45f7-b5e3-edc8ced42707,0.000000,,UTC+0300,2017-03-03 05:30:00+02:00,50004.0,2017-03-02 22:30:00+02:00,
37,0de90c28-29f7-4edc-820a-48a93dcf3223,0.000000,,UTC+0300,2017-03-04 08:20:00+02:00,50005.0,2017-03-03 23:00:00+02:00,
46,100c4f58-05a4-4245-9467-8036d4cbc3ae,0.000000,,UTC+0300,2017-03-05 08:30:00+02:00,50005.0,2017-03-05 00:00:00+02:00,
39,0e37c478-25ce-4b8a-ad6f-934509ee64d2,0.000000,,UTC+0300,2017-03-06 05:30:00+02:00,50003.0,2017-03-05 22:30:00+02:00,
68,168d63f0-5e26-40b5-8233-7bb0eef285b9,0.000000,,UTC+0300,2017-03-07 05:40:00+02:00,50003.0,2017-03-06 22:30:00+02:00,
299,750df087-2abe-4bfc-b9e5-c77ff8b6e493,0.000000,,UTC+0300,2017-03-08 05:30:00+02:00,50002.0,2017-03-07 22:30:00+02:00,


In [9]:
sleep_json_dir = json_path + sleep_data_file_prefix + "/"

## Rows without a status file are marked with the literal string "None"
pd_sleep_data["sleep_status"] = pd_sleep_data["sleep_status"].fillna("None")

## One entry per row of pd_sleep_data: None when the row has no status file (or the
## file is empty), otherwise a list of {"stat", "dur"} dicts in first-seen order
sleep_stat_dur = []
for json_file in pd_sleep_data["sleep_status"]:
    if json_file == "None":
        sleep_stat_dur.append(None)
        continue

    with open(sleep_json_dir + json_file) as f:
        samples = sorted(json.load(f), key=lambda x: x["start_time"])

    ## Accumulated duration per status; dict insertion order keeps first-seen order
    totals = {}
    last_pos = len(samples) - 1
    for pos, sample in enumerate(samples):
        stat = sample["status"]

        ## A sample lasts until the next sample starts; the final sample has no
        ## successor and therefore contributes a duration of 0
        if pos < last_pos:
            dur = samples[pos + 1]["start_time"] - sample["start_time"]
        else:
            dur = 0

        if stat in totals:
            totals[stat] += dur
        else:
            totals[stat] = dur

    summary = [{"stat": stat, "dur": dur} for stat, dur in totals.items()]
    sleep_stat_dur.append(summary if summary else None)

In [10]:
## Col name list: one "stat_<code>" name per distinct status seen in any row, ascending
seen_stats = {entry["stat"]
              for entries in sleep_stat_dur if entries is not None
              for entry in entries}

col_list = ["stat_" + str(code) for code in sorted(seen_stats)]
col_list

['stat_0',
 'stat_10',
 'stat_20',
 'stat_30',
 'stat_40',
 'stat_50',
 'stat_60',
 'stat_70',
 'stat_80',
 'stat_90',
 'stat_100']

In [11]:
## Inspect the per-row status summaries (None where a row has no status file)
sleep_stat_dur

[None,
 [{'stat': 60, 'dur': 600000},
  {'stat': 30, 'dur': 600000},
  {'stat': 100, 'dur': 1200000},
  {'stat': 90, 'dur': 1200000},
  {'stat': 20, 'dur': 0}],
 [{'stat': 70, 'dur': 1200000},
  {'stat': 100, 'dur': 8400000},
  {'stat': 90, 'dur': 3600000},
  {'stat': 80, 'dur': 3000000},
  {'stat': 40, 'dur': 0}],
 [{'stat': 70, 'dur': 2400000},
  {'stat': 100, 'dur': 10200000},
  {'stat': 90, 'dur': 5400000},
  {'stat': 80, 'dur': 1200000},
  {'stat': 50, 'dur': 0}],
 None,
 [{'stat': 60, 'dur': 600000},
  {'stat': 90, 'dur': 6600000},
  {'stat': 100, 'dur': 6600000},
  {'stat': 80, 'dur': 3600000},
  {'stat': 70, 'dur': 600000},
  {'stat': 20, 'dur': 0}],
 [{'stat': 60, 'dur': 1200000},
  {'stat': 90, 'dur': 5400000},
  {'stat': 100, 'dur': 13200000},
  {'stat': 70, 'dur': 3000000},
  {'stat': 80, 'dur': 1200000},
  {'stat': 50, 'dur': 0}],
 None,
 [{'stat': 60, 'dur': 1200000},
  {'stat': 100, 'dur': 18000000},
  {'stat': 90, 'dur': 5400000},
  {'stat': 80, 'dur': 1800000},
  {'sta

In [12]:
## One float column per status code 0, 10, ..., 100, appended on the right and initialised to 0.0
stat_col_names = ["stat_" + str(code) for code in range(0, 101, 10)]

for col_name in stat_col_names:
    pd_sleep_data.insert(loc=len(pd_sleep_data.columns), column=col_name, value=0.0)

print (pd_sleep_data.columns)

Index(['sleep_uuid', 'efficiency', 'has_sleep_data', 'sleep_time_offset',
       'sleep_end_time', 'quality', 'sleep_start_time', 'sleep_status',
       'stat_0', 'stat_10', 'stat_20', 'stat_30', 'stat_40', 'stat_50',
       'stat_60', 'stat_70', 'stat_80', 'stat_90', 'stat_100'],
      dtype='object')


In [13]:
## Copy each row's accumulated durations into the matching stat_<code> column.
## sleep_stat_dur is indexed with the row's index label, so the frame's index
## labels must line up with list positions.
for ind in pd_sleep_data.index:
    entries = sleep_stat_dur[ind]
    if entries is None:
        continue

    for entry in entries:
        pd_sleep_data.loc[ind, "stat_" + str(entry["stat"])] = entry["dur"]

In [14]:
## Columns for the summary view: the per-status durations plus efficiency and start/end times.
## A new list is built instead of aliasing stat_col_names, so stat_col_names is not
## mutated and re-running this cell does not append duplicate entries.
col_list = stat_col_names + ["efficiency", "sleep_start_time", "sleep_end_time"]

In [15]:
pd_temp = pd_sleep_data[col_list].copy()

## Every column other than efficiency and the two timestamps is rescaled
duration_cols = [col for col in pd_temp.columns
                 if col not in ("efficiency", "sleep_start_time", "sleep_end_time")]

## Only strictly positive entries are divided by 1000*60; all others stay as they are
durations = pd_temp[duration_cols]
pd_temp[duration_cols] = durations.mask(durations > 0, durations / (1000*60))

In [16]:
## Display the rescaled view with the most recent sleep start first (result is not stored)
pd_temp.sort_values("sleep_start_time", ascending=False)

Unnamed: 0,stat_0,stat_10,stat_20,stat_30,stat_40,stat_50,stat_60,stat_70,stat_80,stat_90,stat_100,efficiency,sleep_start_time,sleep_end_time
52,0.0,0.0,0.0,0.0,0.0,10.0,50.0,10.0,50.0,70.0,250.0,89.438200,2018-09-02 22:30:00+03:00,2018-09-03 05:54:00+03:00
195,0.0,0.0,0.0,0.0,0.0,0.0,20.0,30.0,60.0,120.0,300.0,92.350746,2018-09-01 22:35:00+03:00,2018-09-02 07:31:00+03:00
421,0.0,0.0,0.0,0.0,0.0,0.0,10.0,20.0,20.0,40.0,150.0,92.561980,2018-09-01 02:59:00+03:00,2018-09-01 07:00:00+03:00
42,0.0,0.0,0.0,0.0,0.0,10.0,10.0,0.0,20.0,0.0,110.0,91.250000,2018-09-01 00:02:00+03:00,2018-09-01 02:41:00+03:00
648,0.0,0.0,0.0,0.0,0.0,10.0,0.0,10.0,20.0,30.0,70.0,88.666664,2018-08-31 03:36:00+03:00,2018-08-31 06:05:00+03:00
139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,10.0,60.0,140.0,95.045050,2018-08-30 23:43:00+03:00,2018-08-31 03:24:00+03:00
126,0.0,0.0,0.0,0.0,0.0,10.0,0.0,30.0,30.0,110.0,250.0,92.643680,2018-08-29 22:49:00+03:00,2018-08-30 06:03:00+03:00
636,0.0,0.0,0.0,0.0,0.0,10.0,10.0,0.0,50.0,120.0,280.0,93.067230,2018-08-28 22:07:00+03:00,2018-08-29 06:02:00+03:00
401,0.0,0.0,0.0,0.0,0.0,10.0,0.0,20.0,40.0,90.0,250.0,93.078760,2018-08-27 23:10:00+03:00,2018-08-28 06:08:00+03:00
343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,50.0,140.0,190.0,92.195120,2018-08-26 23:02:00+03:00,2018-08-27 06:03:00+03:00


In [17]:
## stat_100 = Motionless_sleep
## stat_90 - stat_70 = Light sleep
## stat_60 - stat_00 = Restless
light_stat_cols    = ["stat_90", "stat_80", "stat_70"]
## stat_0 is included so the sum matches the mapping above; it was previously
## left out of the sum and then dropped, which discarded its durations.
restless_stat_cols = ["stat_60", "stat_50", "stat_40", "stat_30", "stat_20", "stat_10", "stat_0"]

pd_sleep_data.rename(columns={"stat_100": "motionless_sleep_duration"}, inplace=True)

## Column-wise sums replace the row-by-row .loc writes into int-typed placeholder columns.
## skipna=False keeps the NaN propagation of the original `+` arithmetic.
## NOTE(review): durations here are not rescaled like pd_temp (presumably still milliseconds) - confirm.
pd_sleep_data["light_sleep_duration"] = pd_sleep_data[light_stat_cols].sum(axis=1, skipna=False)
pd_sleep_data["restless_duration"]    = pd_sleep_data[restless_stat_cols].sum(axis=1, skipna=False)

## The individual status columns are no longer needed once aggregated
pd_sleep_data.drop(columns=light_stat_cols + restless_stat_cols, inplace=True)

In [18]:
## Persist the processed frame as a pickle; the relative path resolves against the current working directory
pd_sleep_data.to_pickle("sleep_dataframe.pkl")