# Wearable Device data preprocessing

## Explanations on Device and Data Variables

* Wearable Device = Samsung Galaxy Watch 4 Classic 
* ~.device_profile = Device information and ID.
* ~.ecg = Electrocardiogram value. It only contains mean heart rate value.
* ~.floors_climbed = Number of floors you've climbed. (Inaccurate)
* ~.food_info = Information of the food you take. (Inaccurate)
* ~.food_intake = Food intake information including variables like calories. (Inaccurate)
* ~.height = User's height. The user must enter this in the Samsung Health application.
* ~.nutrition = Food intake nutrition information. (Inaccurate)
* ~.sleep_stage = Sleep stages, including REM sleep. Calculated automatically when the user sleeps with the watch on.
* ~.user_profile = User profiles like birth, nationality, etc. User should put the information.
* ~.water_intake = Glasses of water consumed. The user must enter this every time.
* ~.weight = User's weight. User should put the information.
* ~.activity.day_summary = Summary of user's activity information. Contains step count, active time, distance, run time, and walk time.
* ~.activity.goal = Not sure.
* ~.activity.level = Not sure.
* ~.alered_heart_rate = Not sure. Maybe alerted history, when user's heart rate is unusually high or low.
* ~.best_records = Not sure.
* ~.breathing = Breathing training history.
* ~.calories_burned.details = Not sure.
* ~.exercise = User's exercise information. Contains exercise distance, calories, etc.
* ~.exercise.weather = Not sure.
* ~.food_frequent = Not sure.
* ~.goal_history = Not sure.
* ~.insight.milestones = Not sure.
* ~.permission = Not sure.
* ~.preferences = Not sure.
* ~.program.sleep_coaching_mission = Counting sleep coaching program. 
* ~.program.sleep_coaching.session = Not sure.
* ~.report = Not sure.
* ~.rewards = Not sure.
* ~.sleep = Data collected during sleep. 
* ~.sleep_combined = Data collected during sleep. (Inaccurate)
* ~.sleep_goal = Not sure.
* ~.sleep_snoring = Not sure. Maybe information about snoring during sleep. Stored in samsung health application.
* ~.social.public_challenge.extra = Not sure.
* ~.social.service_status = Not sure.
* ~.stand_day_summary = Not sure.
* ~.step_daily_trend = Not sure.
* ~.stress = Daily stress tracking value. 
* ~.stress.histogram = Not sure.
* ~.tracker.heart_rate = Daily heart rate tracking value. Contains the stress score between 1 and 100.
* ~.tracker.oxygen_saturation = Oxygen saturation value. Measured only when user activates.
* ~.tracker.pedometer_day_summary = Daily step counts, based on pedometer. But data must be organized by date.
* ~.tracker.pedometer_recommendation = Not sure.
* ~.tracker.pedometer_step_count = Daily step counts, based on pedometer. But data must be organized by date. 
* /files/ecg = ECG measurements in PDF format. Each file shows a 30-second ECG recording.

## Data file list that we need for Analysis

* ecg (both numerical csv file and pdf file) -- 
* activity_day_summary 
* exercise (maybe. for tracking what kind of exercise user did and for how long)
* sleep 
* stress
* stress.histogram (maybe. check json file)
* heart_rate
* oxygen saturation (maybe. have to search for more references)
* pedometer_day_summary (have to check the columns)
* tracker_pedometer_step_count

## Extracting data that we need

In [None]:
import json
import os
import os.path
import sqlite3
import numpy as np
import pandas as pd

In [None]:
from functools import reduce
from glob import glob

In [None]:
# Base locations for the exported wearable data.
# csv_paths starts as an empty list so that functions declaring
# `global csv_paths` can read it before data_extraction() has filled it in
# (a module-level `global` statement does not create the name).
csv_paths = []
wearable_path = 'E:/RESEARCH/wearable/'  # adjust wearable data path

In [None]:
# Lookup table from the numeric code stored in the 'exercise_type' column
# to a human-readable activity name.
# NOTE(review): label casing is mixed ('walking' vs 'Swimming'); left as-is
# because downstream code may compare against these exact strings.
exercise_type = {
    1001: 'walking',
    1002: 'running',
    14001: 'Swimming',
    11007: 'Cycling',
    0: 'other'
}

In [None]:
# Definition for converting the data collected time.
def conv_date(x, col_name=''):
    """Add a datetime ``day_time`` column to the DataFrame ``x`` in place.

    The source column is located by substring match on the column names:

    * the first column whose name contains ``'day_time'`` is parsed as
      epoch milliseconds;
    * if a column whose name contains ``'start_time'`` exists, it is parsed
      without a unit and overwrites the value from the previous step.

    Args:
        x: DataFrame to modify in place.
        col_name: Kept for backward compatibility. The value passed in is
            not used; the column is always chosen by the rules above.

    Returns:
        None. When neither kind of column exists, a diagnostic message and
        the available column names are printed and ``x`` is left unchanged.
    """
    try:
        day_cols = [col for col in x.columns if 'day_time' in col]
        if day_cols:
            x['day_time'] = pd.to_datetime(x[day_cols[0]], unit='ms')

        start_cols = [col for col in x.columns if 'start_time' in col]
        if start_cols:
            x['day_time'] = pd.to_datetime(x[start_cols[0]])

        # Raises KeyError when neither branch above created 'day_time'.
        x['day_time'] = pd.to_datetime(x['day_time'])

    # A missing column surfaces as KeyError; RuntimeError is still caught so
    # anything the previous handler covered keeps being reported, not raised.
    except (KeyError, RuntimeError) as e:
        print(e)
        print("Can't find one of these columns")
        print("Column names", x.columns)

In [None]:
# Galaxy watch data contains json files. Definition for calling json files.
def get_json_file(s):
    """Load the first JSON file whose name starts with ``s``.

    The search is recursive and relative to the current working directory,
    using the glob pattern ``**/<s>*.json``.

    Args:
        s: File-name prefix to look for.

    Returns:
        The parsed JSON content of the first match returned by ``glob``.

    Raises:
        IndexError: If no file matches the pattern.
    """
    pattern = os.path.join('**', s + '*.json')
    json_paths = glob(pattern, recursive=True)
    # Context manager so the handle is closed even if parsing fails.
    with open(json_paths[0]) as json_file:
        return json.load(json_file)

In [None]:
# Merging the data frame csv files that we need for the analysis.
def df_meged():
    """Declare which CSV exports are selected for merging.

    Only the list of selected export names is built here; nothing is read,
    merged, or returned.
    """
    global csv_paths

    # NOTE(review): assigned but not used yet - the merge step looks unfinished.
    selected_csv = ['com.samsung.shealth.activity.day_summary']

In [None]:
# extracting heart rate data (most important)
def heart_rate_data(base_path=None):
    """Read the JSON heart rate files from the wearable export into one frame.

    Every ``*.json`` file found under a
    ``com.samsung.shealth.tracker.heart_rate`` directory below ``base_path``
    is loaded and the results are concatenated row-wise.

    Args:
        base_path: Root directory to search. Defaults to the module-level
            ``wearable_path`` when omitted.

    Returns:
        DataFrame with at least the columns ``heart_rate``,
        ``heart_rate_max``, ``heart_rate_min``, ``start_time``, ``end_time``
        and a ``day_time`` column holding ``start_time`` converted from epoch
        milliseconds. The index is not reset, so labels may repeat across
        files.
    """
    if base_path is None:
        base_path = wearable_path
    hr_path = os.path.join(
        base_path, '**', 'com.samsung.shealth.tracker.heart_rate', '**', '*.json')
    hr_file_paths = glob(hr_path, recursive=True)

    # The empty frame goes first so the expected columns exist (and lead the
    # column order) even when no file could be read.
    frames = [pd.DataFrame(
        columns=['heart_rate', 'heart_rate_max', 'heart_rate_min', 'start_time', 'end_time'])]

    for f in hr_file_paths:
        try:
            with open(f) as file:
                frames.append(pd.DataFrame(json.load(file)))
        except ValueError as e:
            # Malformed JSON or an unexpected layout: report and skip the file.
            print(e)

    # Concatenate once instead of growing the frame inside the loop.
    hr_df = pd.concat(frames)
    hr_df['day_time'] = pd.to_datetime(hr_df['start_time'], unit='ms')
    return hr_df

In [None]:
# Extracting heart rate data by hourly time period
def hourly_heart_rate(hr_df=None):
    """Average the heart rate data per calendar date and hour of day.

    Args:
        hr_df: DataFrame with a datetime ``day_time`` column. When omitted,
            the frame returned by ``heart_rate_data()`` is used.

    Returns:
        DataFrame with one row per (hour, date) group, containing:

        * ``day_time`` - timestamp of the start of that hour,
        * ``level_1`` - the calendar date of the group,
        * the mean of every numeric column of the input.
    """
    if hr_df is None:
        hr_df = heart_rate_data()
    times = pd.DatetimeIndex(hr_df.day_time)

    # numeric_only=True keeps the datetime 'day_time' column (and any text
    # columns) out of the aggregation; otherwise reset_index() below would
    # collide with the 'day_time' group level on current pandas versions.
    hr_df = hr_df.groupby([times.hour, times.date]).mean(numeric_only=True)

    # The hour level inherits the name 'day_time'; the unnamed date level
    # becomes 'level_1'.
    hr_df.reset_index(inplace=True)

    # Rebuild the hourly timestamp as date + hour offset.
    hr_df['day_time'] = (
        pd.to_datetime(hr_df['level_1'])
        + pd.to_timedelta(hr_df['day_time'], unit='h'))

    return hr_df

## Extracting Final Data

In [None]:
def data_extraction():
    """Locate the first data dump and write the extracted frames to CSV.

    Steps:
        1. List the entries directly under ``wearable_path`` and take the
           first one as the dump directory.
        2. Store the paths of that dump's ``*.csv`` files in the global
           ``csv_paths``.
        3. Fetch the heart rate frame (sorted by ``day_time``) and the
           exercise frame.
        4. Write each frame in the export list to ``./<name>.csv``; a failure
           to write is printed and does not stop the run.

    Raises:
        IndexError: If nothing is found under ``wearable_path``.
    """
    global csv_paths
    root = os.path.join(wearable_path)

    found_dumps = glob(os.path.join(root, '*'))
    first_dump = os.path.basename(found_dumps[0])
    print(len(found_dumps), 'dumps found, taking first:', first_dump)

    csv_pattern = os.path.join(root, first_dump, '*.csv')
    csv_paths = glob(csv_pattern)
    print(len(csv_paths), 'csvs found')

    # NOTE(review): hr_raw() and get_exercise_data() are not defined in the
    # visible part of this file - confirm they exist (hr_raw may have been
    # meant to be heart_rate_data).
    raw_hr = hr_raw()
    hr_data = raw_hr.sort_values(by='day_time', ascending=True)
    exercise_data = get_exercise_data()  # fetched but not exported yet

    exports = [('hr_data', hr_data)]

    for name, frame in exports:
        try:
            frame.to_csv('./{}.csv'.format(name))
        except Exception as e:
            print(e)


# Run the extraction when executed as a script / notebook cell.
if __name__ == "__main__":
    data_extraction()

## Individual file preprocessing

### 1. Demographic + Labeling data Coding File

* 

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the merged coding sheet (demographic + labeling data) and preview
# its first rows.
coded = pd.read_csv('E:/RESEARCH/Datasets/wearable/AI_coded_merged.csv', sep=',')
coded.head()

Unnamed: 0,sub,name,class,exam_date,age,sex,residence,edu,marriage,job,...,PSQI_K_v2,PSQI_K_v3,STAI_X2_v1,PHQ_9_v1,PHQ_9_v2,PHQ_9_v3,GAD_7_v1,STAI_X1_v1,RAS_v1,RSE_v1
0,1.0,BKJ,3.0,2021-05-10,47.0,2.0,1.0,16.0,2.0,3.0,...,2.0,3.0,23.0,0.0,0.0,0.0,0.0,20.0,60.0,40.0
1,2.0,KHM,3.0,2021-05-12,34.0,2.0,1.0,18.0,2.0,2.0,...,5.0,1.0,30.0,0.0,0.0,0.0,0.0,31.0,60.0,37.0
2,3.0,YEJ,3.0,2021-05-18,34.0,2.0,1.0,16.0,2.0,2.0,...,4.0,5.0,32.0,0.0,0.0,0.0,0.0,31.0,46.0,33.0
3,4.0,SYJ,3.0,2021-05-24,29.0,2.0,1.0,17.0,2.0,3.0,...,4.0,4.0,32.0,0.0,2.0,8.0,0.0,32.0,54.0,38.0
4,5.0,CDH,3.0,2021-05-25,35.0,1.0,1.0,16.0,2.0,2.0,...,1.0,1.0,25.0,0.0,0.0,0.0,0.0,30.0,48.0,37.0


In [3]:
# (rows, columns) of the coding sheet.
coded.shape

(144, 74)

In [4]:
# List every column name available in the coding sheet.
coded.columns

Index(['sub', 'name', 'class', 'exam_date', 'age', 'sex', 'residence', 'edu',
       'marriage', 'job', 'religion', 'height_v1', 'height_v2', 'height_v3',
       'weight_v1', 'weight_v2', 'weight_v3', 'SBP_v1', 'SBP_v2', 'SBP_v3',
       'DBP_v1', 'DBP_v2', 'DBP_v3', 'HR_v1', 'HR_v2', 'HR_v3', 'BT_v1',
       'BT_v2', 'BT_v3', 'alcohol', 'alcohol_freq', 'drink_amount',
       'drink_7up', 'smoke', 'exercise_hour', 'exercise_intensity', 'walking',
       'SSS_v1', 'NEO_v1', 'MDQ_v1', 'MINI_suicidality_v1',
       'MINI_suicidality_v2', 'MINI_suicidality_v3', 'MINI_danger_v1',
       'MINI_danger_v2', 'MINI_danger_v3', 'HAMD_v1', 'HAMD_v2', 'HAMD_v3',
       'HAMA_v1', 'HAMA_v2', 'HAMA_v3', 'CGI-S_v1', 'CGI-S_v3', 'CGI-I_v3',
       'WHOQOL-BREF_v1', 'BIS_v1', 'BIS_v2', 'BIS_v3', 'BHS_v1', 'BHS_v2',
       'BHS_v3', 'K_MAIA_v1', 'PSQI_K_v1', 'PSQI_K_v2', 'PSQI_K_v3',
       'STAI_X2_v1', 'PHQ_9_v1', 'PHQ_9_v2', 'PHQ_9_v3', 'GAD_7_v1',
       'STAI_X1_v1', 'RAS_v1', 'RSE_v1'],
      dtype='object')

In [None]:
# Columns of the coding sheet, organized into groups.
# The _v1/_v2/_v3 suffixes follow the column names of the sheet itself.
col_label = ['class']  # 1=dp, 2=si, 3=nor
col_SSS = ['SSS_v1']
col_NEO = ['NEO_v1']
col_MINI = [
    'MINI_suicidality_v1', 'MINI_suicidality_v2', 'MINI_suicidality_v3',
    'MINI_danger_v1', 'MINI_danger_v2', 'MINI_danger_v3',
]
col_HAMD = ['HAMD_v1', 'HAMD_v2', 'HAMD_v3']
col_HAMA = ['HAMA_v1', 'HAMA_v2', 'HAMA_v3']
col_PHQ9 = ['PHQ_9_v1', 'PHQ_9_v2', 'PHQ_9_v3']
col_BIS = ['BIS_v1', 'BIS_v2', 'BIS_v3']
col_BHS = ['BHS_v1', 'BHS_v2', 'BHS_v3']
col_basic = ['name', 'age', 'sex', 'residence', 'edu', 'marriage', 'job', 'religion']
# 'HR_v2' was previously missing and 'HR_v3' was listed twice.
col_body = [
    'height_v1', 'height_v2', 'height_v3',
    'weight_v1', 'weight_v2', 'weight_v3',
    'SBP_v1', 'SBP_v2', 'SBP_v3',
    'DBP_v1', 'DBP_v2', 'DBP_v3',
    'HR_v1', 'HR_v2', 'HR_v3',
    'BT_v1', 'BT_v2', 'BT_v3',
]
col_health = [
    'alcohol', 'alcohol_freq', 'drink_amount', 'drink_7up', 'smoke',
    'exercise_hour', 'exercise_intensity', 'walking',
]

In [None]:
# Remove the variables that are not needed via drop (or select the needed
# label variables instead).
# errors='ignore' skips labels that are absent from the sheet; without it a
# single missing label makes drop() raise KeyError.
cols_to_drop = ['name', 'disorder', 'VISIT', 'HAMD', 'HAMA', 'PDSS', 'ASI',
                'APPQ', 'PSWQ', 'SPI', 'PSS', 'BIS', 'SSI']
coded_drop = coded.drop(cols_to_drop, axis=1, errors='ignore')
coded_drop.head()