## SSP(Sleep Sensor Prediction)_JY Data Preprocessing
* Preprocessing

In [3]:
# !pip install pandas numpy matplotlib seaborn scikit-learn torch xgboost lightgbm catboost pyarrow fastparquet py7zr

In [44]:
import warnings
warnings.filterwarnings('ignore')

import os
import random
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Plot defaults: use the NanumGothicCoding font family and render the minus
# sign with an ASCII hyphen. The original set axes.unicode_minus twice (via
# rcParams and via plt.rc); a single update covers the same keys.
plt.rcParams.update({
    'font.family': 'NanumGothicCoding',
    'axes.unicode_minus': False,
})

from datetime import datetime

from argparse import ArgumentParser

# Notebook configuration: declared through argparse so the defaults live in
# one place, then flattened into the CFG dict used by the cells below.
parser = ArgumentParser(description="SSP_JY")

## DATA
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--valid_path", type=str, default="./data/valid_data")
parser.add_argument("--test_path", type=str, default="./data/test_data")

# Parse an empty argument list so the defaults are always used and the
# notebook kernel's own sys.argv is never consulted.
args = parser.parse_args([])

CFG = dict(
    SEED=args.seed,
    VALID_PATH=args.valid_path,
    TEST_PATH=args.test_path,
)

def seed_everything(SEED):
    """Seed the random number sources used by this notebook.

    Seeds NumPy's global RNG and Python's ``random`` module with SEED, and
    exports SEED as the ``PYTHONHASHSEED`` environment variable.

    NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting
    it here presumably only influences processes launched afterwards.
    """
    np.random.seed(SEED)
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)

# Fix all seeds from the configured value before any data work.
seed_everything(CFG['SEED'])

# Run identifier: "<parser description>_<seed>".
idx = "_".join([parser.description, str(CFG['SEED'])])
idx

'SSP_JY_42'

## preprocess.py

In [50]:
## Load the per-subject label dates
# The validation labels are used as the "train" labels in this notebook.
# Read them from CFG['VALID_PATH'] like every other validation file, so that
# changing the configured path moves this file too (the default path is
# unchanged: ./data/valid_data/val_label.csv).
train_label = pd.read_csv(os.path.join(CFG['VALID_PATH'], 'val_label.csv'))
train_label['date'] = pd.to_datetime(train_label['date'])

# The answer template provides the (subject_id, date) pairs for the test set.
test_label = pd.read_csv('./data/answer_sample.csv')
test_label['date'] = pd.to_datetime(test_label['date'])

In [51]:
# Raw sensor tables. `df_*` frames come from the validation directory (used as
# training data below) and `ts_*` frames from the test directory.
_valid_dir = CFG['VALID_PATH']
_test_dir = CFG['TEST_PATH']

# Validation split
df_activity = pd.read_parquet(os.path.join(_valid_dir, 'ch2024_val__m_activity.parquet.gzip'))
df_gps = pd.read_parquet(os.path.join(_valid_dir, 'ch2024_val__m_gps.parquet.gzip'))
df_m_light = pd.read_parquet(os.path.join(_valid_dir, 'ch2024_val__m_light.parquet.gzip'))
df_pedo = pd.read_parquet(os.path.join(_valid_dir, 'ch2024_val__w_pedo.parquet.gzip'))
df_heart_rate = pd.read_parquet(os.path.join(_valid_dir, 'ch2024_val__w_heart_rate.parquet.gzip'))
df_w_light = pd.read_parquet(os.path.join(_valid_dir, 'ch2024_val__w_light.parquet.gzip'))
df_usage = pd.read_parquet(os.path.join(_valid_dir, 'ch2024_val__m_usage_stats.parquet.gzip'))
df_ambience = pd.read_parquet(os.path.join(_valid_dir, 'ch2024_val__m_ambience.parquet.gzip'))

# Test split
ts_activity = pd.read_parquet(os.path.join(_test_dir, 'ch2024_test_m_activity.parquet.gzip'))
ts_gps = pd.read_parquet(os.path.join(_test_dir, 'ch2024_test_m_gps.parquet.gzip'))
ts_m_light = pd.read_parquet(os.path.join(_test_dir, 'ch2024_test_m_light.parquet.gzip'))
ts_pedo = pd.read_parquet(os.path.join(_test_dir, 'ch2024_test_w_pedo.parquet.gzip'))
ts_heart_rate = pd.read_parquet(os.path.join(_test_dir, 'ch2024_test_w_heart_rate.parquet.gzip'))
ts_w_light = pd.read_parquet(os.path.join(_test_dir, 'ch2024_test_w_light.parquet.gzip'))
ts_usage = pd.read_parquet(os.path.join(_test_dir, 'ch2024_test_m_usage_stats.parquet.gzip'))
ts_ambience = pd.read_parquet(os.path.join(_test_dir, 'ch2024_test_m_ambience.parquet.gzip'))

#### m_activity

In [52]:
## Re-map the activity codes so that they are ordered by activity level
# Keys are the raw codes and values the new rank, both as strings.
# NOTE(review): Series.map turns any code that is not a key here into NaN;
# confirm the raw column only contains these six string codes.
activity_map = {
    '4': '0',
    '0': '1',
    '3': '2',
    '7': '3',
    '1': '4',
    '8': '5',
}

df_activity['m_activity'] = df_activity['m_activity'].map(activity_map)
ts_activity['m_activity'] = ts_activity['m_activity'].map(activity_map)

In [53]:
## train

ids = [1, 2, 3, 4]
activity = {
    'subject_id': [],
    'hour': [],
    'activity': []
}

# One row per (subject, hour) for every labelled day; the feature is the
# maximum m_activity value recorded in that hour ('0' when nothing was logged).
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Labelled days for this subject, truncated to midnight.
    label_days = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = df_activity.loc[df_activity.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        ## Apply Yunji's insight
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]
            act = in_hour.m_activity.max() if not in_hour.empty else '0'

            activity['subject_id'].append(subject_id)
            activity['hour'].append(hour)
            activity['activity'].append(act)

train_activity = pd.DataFrame(activity)
# train_activity

In [54]:
## test

ids = [5, 6, 7, 8]
activity = {
    'subject_id': [],
    'hour': [],
    'activity': []
}

# One row per (subject, hour) for every test day; the feature is the maximum
# m_activity value recorded in that hour ('0' when nothing was logged).
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Days to predict for this subject, truncated to midnight.
    label_days = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = ts_activity.loc[ts_activity.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]
            act = in_hour.m_activity.max() if not in_hour.empty else '0'

            activity['subject_id'].append(subject_id)
            activity['hour'].append(hour)
            activity['activity'].append(act)

test_activity = pd.DataFrame(activity)
# test_activity

#### m_gps

In [55]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Uses the haversine formula with an Earth radius of 6371 km. Inputs go
    through NumPy ufuncs only, so scalars and array-likes are both accepted.
    """
    earth_radius_km = 6371.0

    # Convert every coordinate from degrees to radians.
    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine term and the central angle it corresponds to.
    a = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2
    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return earth_radius_km * central_angle

def hourly_distance(df):
    """Return the total distance (km) travelled along the rows of `df`.

    Each row is compared with the row before it using `haversine` on the
    'latitude' / 'longitude' columns, and the step distances are summed.
    The first row has no predecessor, so its NaN step is counted as 0.

    NOTE(review): rows are used in their existing order; this assumes `df`
    is already sorted by time - confirm against the source data.
    """
    prev_lat = df['latitude'].shift()
    prev_lon = df['longitude'].shift()
    steps = haversine(prev_lat, prev_lon, df['latitude'], df['longitude'])
    # Summing the per-step distances gives the distance covered in the hour.
    return steps.fillna(0).sum()

In [56]:
## train

ids = [1, 2, 3, 4]
gps = {
    'subject_id': [],
    'hour': [],
    'distance': []
}

# One row per (subject, hour) for every labelled day; the feature is the
# distance travelled in that hour (0.0 when there are no GPS samples).
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Labelled days for this subject, truncated to midnight.
    label_days = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = df_gps.loc[df_gps.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]
            distance = hourly_distance(in_hour) if not in_hour.empty else 0.0

            gps['subject_id'].append(subject_id)
            gps['hour'].append(hour)
            gps['distance'].append(distance)

train_gps = pd.DataFrame(gps)
# train_gps

In [57]:
## test

ids = [5, 6, 7, 8]
gps = {
    'subject_id': [],
    'hour': [],
    'distance': []
}

# One row per (subject, hour) for every test day; the feature is the distance
# travelled in that hour (0.0 when there are no GPS samples).
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Days to predict for this subject, truncated to midnight.
    label_days = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = ts_gps.loc[ts_gps.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]
            distance = hourly_distance(in_hour) if not in_hour.empty else 0.0

            gps['subject_id'].append(subject_id)
            gps['hour'].append(hour)
            gps['distance'].append(distance)

test_gps = pd.DataFrame(gps)
# test_gps

#### m_light

In [58]:
## train

ids = [1, 2, 3, 4]
avg_m_light = {
    'subject_id': [],
    'hour': [],
    'max_light': [],
    'mean_light': [],
}

# One row per (subject, hour) for every labelled day; features are the max and
# mean of m_light in that hour (both 0.0 when there are no samples).
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Labelled days for this subject, truncated to midnight.
    label_days = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = df_m_light.loc[df_m_light.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]

            if in_hour.empty:
                max_light = 0.0
                mean_light = 0.0
            else:
                max_light = in_hour.m_light.max()
                mean_light = in_hour.m_light.mean()

            avg_m_light['subject_id'].append(subject_id)
            avg_m_light['hour'].append(hour)
            avg_m_light['max_light'].append(max_light)
            avg_m_light['mean_light'].append(mean_light)

train_m_light = pd.DataFrame(avg_m_light)
# train_m_light

In [59]:
## test

ids = [5, 6, 7, 8]
avg_m_light = {
    'subject_id': [],
    'hour': [],
    'max_light': [],
    'mean_light': [],
}

# One row per (subject, hour) for every test day; features are the max and
# mean of m_light in that hour (both 0.0 when there are no samples).
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Days to predict for this subject, truncated to midnight.
    label_days = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = ts_m_light.loc[ts_m_light.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]

            if in_hour.empty:
                max_light = 0.0
                mean_light = 0.0
            else:
                max_light = in_hour.m_light.max()
                mean_light = in_hour.m_light.mean()

            avg_m_light['subject_id'].append(subject_id)
            avg_m_light['hour'].append(hour)
            avg_m_light['max_light'].append(max_light)
            avg_m_light['mean_light'].append(mean_light)

test_m_light = pd.DataFrame(avg_m_light)
# test_m_light

#### w_pedo

In [60]:
## Apply Geonhyeok's insight
'''
:Drop Columns: ['step_frequency', 'walking_steps','distance', 'speed']
'''

# Keep only the identifying columns plus the three pedometer measures that the
# hourly aggregation below uses; the same selection is applied to both splits.
pedo_keep_cols = ['subject_id', 'timestamp', 'burned_calories', 'running_steps', 'steps']
df_pedo = df_pedo[pedo_keep_cols]
ts_pedo = ts_pedo[pedo_keep_cols]

In [61]:
## train

ids = [1, 2, 3, 4]
avg_pedo = {
    'subject_id': [],
    'hour': []
}
mean_cols = ['mean_burned_calories', 'mean_running_steps', 'mean_steps']
sum_cols = ['sum_burned_calories', 'sum_running_steps', 'sum_steps']
# Source columns, in the same order as mean_cols / sum_cols.
value_cols = ['burned_calories', 'running_steps', 'steps']

# Per-hour results are collected in plain lists and turned into DataFrames
# once at the end; concatenating a one-row frame per hour is quadratic.
mean_rows = []
sum_rows = []

# One row per (subject, hour) for every labelled day; features are the mean
# and the sum of each pedometer measure in that hour (zeros when no samples).
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Labelled days for this subject, truncated to midnight.
    label_days = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = df_pedo.loc[df_pedo.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]

            if in_hour.empty:
                mean_rows.append([0.0] * len(mean_cols))
                sum_rows.append([0.0] * len(sum_cols))
            else:
                hour_values = in_hour[value_cols]
                mean_rows.append(hour_values.mean().tolist())
                sum_rows.append(hour_values.sum().tolist())

            avg_pedo['subject_id'].append(subject_id)
            avg_pedo['hour'].append(hour)

user_avg_pedo = pd.DataFrame(mean_rows, columns=mean_cols)
user_sum_pedo = pd.DataFrame(sum_rows, columns=sum_cols)

# All three frames share the same default 0..n-1 index, so a column-wise
# concat lines the rows up positionally.
avg_pedo = pd.DataFrame(avg_pedo)
train_pedo = pd.concat([avg_pedo, user_avg_pedo, user_sum_pedo], axis=1)
# train_pedo.head()

In [62]:
## test

ids = [5, 6, 7, 8]
avg_pedo = {
    'subject_id': [],
    'hour': []
}
mean_cols = ['mean_burned_calories', 'mean_running_steps', 'mean_steps']
sum_cols = ['sum_burned_calories', 'sum_running_steps', 'sum_steps']
# Source columns, in the same order as mean_cols / sum_cols.
value_cols = ['burned_calories', 'running_steps', 'steps']

# Per-hour results are collected in plain lists and turned into DataFrames
# once at the end; concatenating a one-row frame per hour is quadratic.
mean_rows = []
sum_rows = []

# One row per (subject, hour) for every test day; features are the mean and
# the sum of each pedometer measure in that hour (zeros when no samples).
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Days to predict for this subject, truncated to midnight.
    label_days = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = ts_pedo.loc[ts_pedo.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]

            if in_hour.empty:
                mean_rows.append([0.0] * len(mean_cols))
                sum_rows.append([0.0] * len(sum_cols))
            else:
                hour_values = in_hour[value_cols]
                mean_rows.append(hour_values.mean().tolist())
                sum_rows.append(hour_values.sum().tolist())

            avg_pedo['subject_id'].append(subject_id)
            avg_pedo['hour'].append(hour)

user_avg_pedo = pd.DataFrame(mean_rows, columns=mean_cols)
user_sum_pedo = pd.DataFrame(sum_rows, columns=sum_cols)

# All three frames share the same default 0..n-1 index, so a column-wise
# concat lines the rows up positionally.
avg_pedo = pd.DataFrame(avg_pedo)
test_pedo = pd.concat([avg_pedo, user_avg_pedo, user_sum_pedo], axis=1)
# test_pedo.head()

#### m_usage_stats

In [63]:
## train

ids = [1, 2, 3, 4]
avg_m_usage = {
    'subject_id': [],
    'hour': [],
    'app_total_use_time': [],
    'app_mean_use_time': []
}

# One row per (subject, hour) for every labelled day:
#   app_total_use_time - sum of every 'total_time' entry logged in the hour,
#                        converted from milliseconds to minutes
#   app_mean_use_time  - that total divided by the number of log rows in the
#                        hour (0.0 when there are none)
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Labelled days for this subject, truncated to midnight.
    label_days = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = df_usage.loc[df_usage.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]

            # Bug fix: the accumulator must start at zero for every hour. It
            # used to be reset once per day and divided by 60000 inside the
            # hour loop, so each hour carried over the previous hours'
            # already-converted totals and mixed minutes with milliseconds.
            total_ms = 0
            for data in in_hour['m_usage_stats']:
                if len(data):
                    for item in data:
                        if 'total_time' in item:
                            total_ms += item['total_time']

            ## milliseconds to minutes
            total_times = total_ms / 60000

            avg_m_usage['subject_id'].append(subject_id)
            avg_m_usage['hour'].append(hour)
            avg_m_usage['app_total_use_time'].append(total_times)

            if len(in_hour):
                avg_m_usage['app_mean_use_time'].append(total_times / len(in_hour))
            else:
                avg_m_usage['app_mean_use_time'].append(0.0)

train_m_usage = pd.DataFrame(avg_m_usage)
# train_m_usage.head()

In [64]:
## test

ids = [5, 6, 7, 8]
avg_m_usage = {
    'subject_id': [],
    'hour': [],
    'app_total_use_time': [],
    'app_mean_use_time': []
}

# One row per (subject, hour) for every test day:
#   app_total_use_time - sum of every 'total_time' entry logged in the hour,
#                        converted from milliseconds to minutes
#   app_mean_use_time  - that total divided by the number of log rows in the
#                        hour (0.0 when there are none)
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Days to predict for this subject, truncated to midnight.
    label_days = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = ts_usage.loc[ts_usage.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]

            # Bug fix: the accumulator must start at zero for every hour. It
            # used to be reset once per day and divided by 60000 inside the
            # hour loop, so each hour carried over the previous hours'
            # already-converted totals and mixed minutes with milliseconds.
            total_ms = 0
            for data in in_hour['m_usage_stats']:
                if len(data):
                    for item in data:
                        if 'total_time' in item:
                            total_ms += item['total_time']

            ## milliseconds to minutes
            total_times = total_ms / 60000

            avg_m_usage['subject_id'].append(subject_id)
            avg_m_usage['hour'].append(hour)
            avg_m_usage['app_total_use_time'].append(total_times)

            if len(in_hour):
                avg_m_usage['app_mean_use_time'].append(total_times / len(in_hour))
            else:
                avg_m_usage['app_mean_use_time'].append(0.0)

test_m_usage = pd.DataFrame(avg_m_usage)
# test_m_usage.head()

#### m_ambience

In [18]:
# ## Taking only the first element of each item still yields the same set of unique values
# ambient_list = []
# ambient_df = pd.concat([df_ambience, ts_ambience], axis=0).reset_index(drop=True)
# for i in range(len(ambient_df)):
#     for item in ambient_df.ambience_labels[i]:
#         # ambient_name = item[0]
#         name = ''
#         ambient_name = [(name + i) for i in item[:-1]]
#         ambient_name = ambient_name[0].replace(',', '_')
#         ambient_name = ambient_name.replace(' ', '_')
#         ambient_name = ambient_name.replace('__', '_')
#         ambient_list.append(ambient_name)

# abmient_unique = set(ambient_list)
# len(abmient_unique)

In [19]:
# ## Except for 395 samples, every row has top-10 results

# cnt = {
#     '0':0,
#     '10':0,
# }
# for item in df_ambience.ambience_labels:
#     if len(item) == 0:
#         cnt['0'] += 1
#     else:
#         cnt['10'] += 1
# cnt, len(df_ambience)

In [41]:
# ## train (currently being revised)
# ## extract the top-10 results

# ids = [1,2,3,4]
# avg_m_ambience = {
#     'subject_id': [],
#     'hour': [],
# }

# ambience_df = pd.DataFrame()

# for id in ids:
#     user_timestamp = train_label.loc[train_label.subject_id == id, 'date'].dt.floor('D')
    
#     for day in user_timestamp:

#         day_start = pd.to_datetime(day)
#         day_end = day_start + pd.Timedelta(days=1)
#         hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')
        
#         for hour in hours:
#             user_ambience = df_ambience.loc[(df_ambience.subject_id == id) & (df_ambience.timestamp.dt.floor('H') == hour), :]
#             ambience_dict = []
#             for index, row in user_ambience.iterrows():
#                 data = row['ambience_labels']

#                 if len(data):
#                     for item in data:
#                         ambience_dict.append((item[0], float(item[-1])))

#                 else:
#                     pass

#                 sorted_labels = sorted(ambience_dict, key=lambda x: x[1], reverse=True)[:10]
#                 sorted_labels = [x[0] for x in sorted_labels]
#                 ambience_df = pd.concat([ambience_df, pd.DataFrame(sorted_labels).T], axis=0)

#             avg_m_ambience['subject_id'].append(id)
#             avg_m_ambience['hour'].append(hour)

# train_m_ambience = pd.DataFrame(avg_m_ambience)
# ambience_df.columns = [f'ambience_{i}' for i in range(10)]
# ambience_df.reset_index(drop=True, inplace=True)
# train_m_ambience = pd.concat([train_m_ambience, ambience_df], axis=1)
# train_m_ambience.shape

(74558, 12)

In [136]:
# ## test (currently being revised)
# ## extract the top-10 results

# ids = [5,6,7,8]
# avg_m_ambience = {
#     'subject_id': [],
#     'hour': [],
# }

# ambience_df = pd.DataFrame()

# for id in ids:
#     user_timestamp = test_label.loc[test_label.subject_id == id, 'date'].dt.floor('D')
    
#     for day in user_timestamp:

#         day_start = pd.to_datetime(day)
#         day_end = day_start + pd.Timedelta(days=1)
#         hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')
        
#         for hour in hours:
#             user_ambience = ts_ambience.loc[(ts_ambience.subject_id == id) & (ts_ambience.timestamp.dt.floor('H') == hour), :]
#             ambience_dict = {}
#             for index, row in user_ambience.iterrows():
#                 data = row['ambience_labels']

#                 if len(data):
#                     for item in data:
#                         ambience_dict[item[0]] = float(item[-1])

#                 else:
#                     ambience_dict['None'] = 0.0
#                     pass
            
#             sorted_labels = sorted(ambience_dict.items(), key=lambda x: x[1], reverse=True)[:10]
#             sorted_labels = [x[0] for x in sorted_labels]
            
#             if len(ambience_dict) == 0:
#                 sorted_labels = ['None'] * 10 
#                 ambience_df = pd.concat([ambience_df, pd.DataFrame(sorted_labels).T], axis=0)
            
#             else:
#                 ambience_df = pd.concat([ambience_df, pd.DataFrame(sorted_labels).T], axis=0)

#             avg_m_ambience['subject_id'].append(id)
#             avg_m_ambience['hour'].append(hour)

# test_m_ambience = pd.DataFrame(avg_m_ambience)
# ambience_df.columns = [f'ambience_{i}' for i in range(10)]
# ambience_df.reset_index(drop=True, inplace=True)
# test_m_ambience = pd.concat([test_m_ambience, ambience_df], axis=1)
# test_m_ambience.shape

(2760, 12)

In [69]:
## train
## top 1의 결과를 뽑음

ids = [1, 2, 3, 4]
avg_m_ambience = {
    'subject_id': [],
    'hour': [],
    'max_ambience_cls': []
}

# One row per (subject, hour) for every labelled day; the feature is the
# ambience label with the highest score seen in that hour, or the string
# 'None' when the hour has no labels (or no score above 0).
# NOTE(review): each entry of `ambience_labels` is read as a sequence whose
# first element is the label and whose last element is its score - confirm.
#
# Fixes relative to the previous version:
#   * the winning label is reset for every hour; before, an hour without data
#     reused the label of an earlier hour (or raised NameError if it was the
#     very first hour processed)
#   * a row with an empty label list no longer overwrites a label already
#     found in the same hour with 'None'
#   * the unused `ambience_df = pd.DataFrame` (the class itself, not an
#     instance) was dropped
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Labelled days for this subject, truncated to midnight.
    label_days = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = df_ambience.loc[df_ambience.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]

            ambience_rate = 0
            ambience_name = 'None'
            for data in in_hour['ambience_labels']:
                # An empty `data` simply contributes nothing.
                for item in data:
                    score = float(item[-1])
                    if ambience_rate < score:
                        ambience_rate = score
                        ambience_name = item[0]

            avg_m_ambience['subject_id'].append(subject_id)
            avg_m_ambience['hour'].append(hour)
            avg_m_ambience['max_ambience_cls'].append(ambience_name)

train_m_ambience = pd.DataFrame(avg_m_ambience)
# train_m_ambience.head()

In [70]:
## test
ids = [5, 6, 7, 8]
avg_m_ambience = {
    'subject_id': [],
    'hour': [],
    'max_ambience_cls': [],
}

# One row per (subject, hour) for every test day; the feature is the ambience
# label with the highest score seen in that hour, or the string 'None' when
# the hour has no labels (or no score above 0).
# NOTE(review): each entry of `ambience_labels` is read as a sequence whose
# first element is the label and whose last element is its score - confirm.
#
# Fixes relative to the previous version:
#   * the winning label is reset for every hour; before, an hour without data
#     reused the label of an earlier hour (or raised NameError if it was the
#     very first hour processed)
#   * a row with an empty label list no longer overwrites a label already
#     found in the same hour with 'None'
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Days to predict for this subject, truncated to midnight.
    label_days = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = ts_ambience.loc[ts_ambience.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]

            ambience_rate = 0
            ambience_name = 'None'
            for data in in_hour['ambience_labels']:
                # An empty `data` simply contributes nothing.
                for item in data:
                    score = float(item[-1])
                    if ambience_rate < score:
                        ambience_rate = score
                        ambience_name = item[0]

            avg_m_ambience['subject_id'].append(subject_id)
            avg_m_ambience['hour'].append(hour)
            avg_m_ambience['max_ambience_cls'].append(ambience_name)

test_m_ambience = pd.DataFrame(avg_m_ambience)
# test_m_ambience.head()

#### w_heart_rate

In [65]:
## train

ids = [1, 2, 3, 4]
avg_hr = {
    'subject_id': [],
    'hour': [],
    'max_hr': [],
    'mean_hr': []
}

# One row per (subject, hour) for every labelled day; features are the max and
# mean heart_rate in that hour (both 0.0 when there are no samples).
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Labelled days for this subject, truncated to midnight.
    label_days = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = df_heart_rate.loc[df_heart_rate.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]

            if in_hour.empty:
                hr_max = 0.0
                hr_mean = 0.0
            else:
                hr_max = in_hour.heart_rate.max()
                hr_mean = in_hour.heart_rate.mean()

            avg_hr['subject_id'].append(subject_id)
            avg_hr['hour'].append(hour)
            avg_hr['max_hr'].append(hr_max)
            avg_hr['mean_hr'].append(hr_mean)

train_hr = pd.DataFrame(avg_hr)
# train_hr.head()

In [66]:
## test

ids = [5, 6, 7, 8]
avg_hr = {
    'subject_id': [],
    'hour': [],
    'max_hr': [],
    'mean_hr': []
}

# One row per (subject, hour) for every test day; features are the max and
# mean heart_rate in that hour (both 0.0 when there are no samples).
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Days to predict for this subject, truncated to midnight.
    label_days = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = ts_heart_rate.loc[ts_heart_rate.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]

            if in_hour.empty:
                hr_max = 0.0
                hr_mean = 0.0
            else:
                hr_max = in_hour.heart_rate.max()
                hr_mean = in_hour.heart_rate.mean()

            avg_hr['subject_id'].append(subject_id)
            avg_hr['hour'].append(hour)
            avg_hr['max_hr'].append(hr_max)
            avg_hr['mean_hr'].append(hr_mean)

test_hr = pd.DataFrame(avg_hr)
# test_hr.head()

#### w_light

In [67]:
## train

ids = [1, 2, 3, 4]
avg_w_light = {
    'subject_id': [],
    'hour': [],
    'max_light': [],
    'mean_light': []
}

# One row per (subject, hour) for every labelled day; features are the max and
# mean of w_light in that hour (both 0.0 when there are no samples).
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Labelled days for this subject, truncated to midnight.
    label_days = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = df_w_light.loc[df_w_light.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]

            if in_hour.empty:
                max_light = 0.0
                mean_light = 0.0
            else:
                max_light = in_hour.w_light.max()
                mean_light = in_hour.w_light.mean()

            avg_w_light['subject_id'].append(subject_id)
            avg_w_light['hour'].append(hour)
            avg_w_light['max_light'].append(max_light)
            avg_w_light['mean_light'].append(mean_light)

train_w_light = pd.DataFrame(avg_w_light)
# train_w_light.head()

In [68]:
## test

ids = [5, 6, 7, 8]
avg_w_light = {
    'subject_id': [],
    'hour': [],
    'max_light': [],
    'mean_light': []
}

# One row per (subject, hour) for every test day; features are the max and
# mean of w_light in that hour (both 0.0 when there are no samples).
for subject_id in ids:  # not `id`, which would shadow the builtin
    # Days to predict for this subject, truncated to midnight.
    label_days = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once, instead of
    # re-flooring the whole frame for every single hour.
    subject_rows = ts_w_light.loc[ts_w_light.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in label_days:

        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        # 24 hourly bins: [00:00, 01:00, ..., 23:00]; next midnight excluded.
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            in_hour = subject_rows.loc[subject_hours == hour]

            if in_hour.empty:
                max_light = 0.0
                mean_light = 0.0
            else:
                max_light = in_hour.w_light.max()
                mean_light = in_hour.w_light.mean()

            avg_w_light['subject_id'].append(subject_id)
            avg_w_light['hour'].append(hour)
            avg_w_light['max_light'].append(max_light)
            avg_w_light['mean_light'].append(mean_light)

test_w_light = pd.DataFrame(avg_w_light)
# test_w_light.head()

In [71]:
# Left-join every hourly feature table onto the activity table, one after the
# other, on (subject_id, hour).
# NOTE: the phone-light and watch-light tables both carry columns named
# 'max_light' / 'mean_light', so pandas disambiguates them with its default
# '_x' (phone, merged first) and '_y' (watch) suffixes.
train_data = train_activity
for feature_table in (
    train_gps,
    train_m_light,
    train_pedo,
    train_hr,
    train_w_light,
    train_m_usage,
    train_m_ambience,
):
    train_data = pd.merge(train_data, feature_table, on=['subject_id', 'hour'], how='left')

train_data.shape
# train_data.head()

(2520, 19)

In [72]:
# Left-join every hourly feature table onto the activity table, one after the
# other, on (subject_id, hour).
# NOTE: the phone-light and watch-light tables both carry columns named
# 'max_light' / 'mean_light', so pandas disambiguates them with its default
# '_x' (phone, merged first) and '_y' (watch) suffixes.
test_data = test_activity
for feature_table in (
    test_gps,
    test_m_light,
    test_pedo,
    test_hr,
    test_w_light,
    test_m_usage,
    test_m_ambience,
):
    test_data = pd.merge(test_data, feature_table, on=['subject_id', 'hour'], how='left')

test_data.shape
# test_data.head()

(2760, 19)

In [73]:
# Persist the merged hourly feature tables next to their source data
# (row index omitted).
train_csv_path = os.path.join(CFG['VALID_PATH'], 'train_data2.csv')
test_csv_path = os.path.join(CFG['TEST_PATH'], 'test_data2.csv')

train_data.to_csv(train_csv_path, index=False)
test_data.to_csv(test_csv_path, index=False)

In [74]:
# Reload the saved feature tables from disk.
# NOTE: no parse_dates is passed, so 'hour' comes back as plain strings
# rather than timestamps.
# NOTE(review): depending on the pandas version, the literal 'None' in
# 'max_ambience_cls' may be parsed as a missing value - confirm.
train_csv_path = os.path.join(CFG['VALID_PATH'], 'train_data2.csv')
test_csv_path = os.path.join(CFG['TEST_PATH'], 'test_data2.csv')

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

train_data.shape, test_data.shape

((2520, 19), (2760, 19))