## 율동공원팀

Use python 3.11 in WSL  
Nvidia GTX 1080 Ti  
- torch==2.0.1  
- lightning==2.0.1  
- wandb==0.17.0  
- matplotlib==3.6.3  
- numpy==1.24.0  
- pandas==1.5.3  
- scikit_learn==1.2.2  
- seaborn==0.11.2

In [None]:
# !pip install pandas numpy matplotlib seaborn scikit-learn torch pyarrow fastparquet py7zr lightning wandb

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import random
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
plt.rc('font', family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False
plt.rc('axes', unicode_minus=False)

from datetime import datetime

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary

import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

from sklearn.preprocessing import OneHotEncoder

# import wandb
# from lightning.pytorch.loggers import WandbLogger

from argparse import ArgumentParser

parser = ArgumentParser(description="SSP_JY")

# Each option is declared as (flag, add_argument keyword arguments) and
# registered in the order listed, so the parser behaves exactly as if every
# add_argument call were written out by hand.
_ARGUMENT_SPECS = (
    ## DATA
    ("--train_path", dict(default="./data/valid_data", type=str)),
    ("--test_path", dict(default="./data/test_data", type=str)),
    ('--window_size', dict(default=24, type=int)),  # chosen with sleep duration in mind
    ('--stride_size', dict(default=1, type=int)),  # windows are viewed in 1-hour steps
    ## MHA
    ('--num_head', dict(default=8, type=int)),
    ('--hid_dim', dict(default=128, type=int)),
    ## TRAIN
    ('--optimizer', dict(default="adamw", type=str)),
    ("--learning_rate", dict(default=1e-4, type=float)),
    ("--weight_decay", dict(default=0, type=float)),
    ('--scheduler', dict(default="step", type=str)),
    ('--batch_size', dict(default=16, type=int)),
    ('--epochs', dict(default=1000, type=int)),
    ('--patience', dict(default=100, type=int)),
    ## MISC
    ('--seed', dict(default=42, type=int)),
    ('--mixed_precision', dict(default=32, type=int)),
    ('--device', dict(nargs='+', default=[0], type=int)),
    ('--num_workers', dict(default=0, type=int)),
)

for _flag, _options in _ARGUMENT_SPECS:
    parser.add_argument(_flag, **_options)

# Parse an empty command line so every option takes its default value
# (the notebook is not launched with real CLI arguments).
args = parser.parse_args('')

# wandb.init(config=args, name='SSP_JY(GAG)', project="ETRI_Baseline")
# wandb_logger = WandbLogger(name='SSP_JY(GAG)', project="ETRI_Baseline")
# wandb.config.update(args)

# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# CFG key -> name of the parsed argument it mirrors.
# Note that "VALID_PATH" is filled from --train_path.
_CFG_SOURCES = (
    ("WINDOW_SIZE", "window_size"),
    ("STRIDE_SIZE", "stride_size"),
    ("BATCH_SIZE", "batch_size"),
    ("EPOCHS", "epochs"),
    ("PATIENCE", "patience"),
    ("SEED", "seed"),
    ("VALID_PATH", "train_path"),
    ("TEST_PATH", "test_path"),
)
CFG = {key: getattr(args, attr) for key, attr in _CFG_SOURCES}

def seed_everything(SEED):
    """Seed every random number source this notebook relies on.

    Exports PYTHONHASHSEED, seeds `random`, NumPy and PyTorch (including
    `torch.cuda.manual_seed`), sets cuDNN to deterministic with benchmarking
    disabled, and finally calls Lightning's own `seed_everything`.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)

    # All of these seeders take the seed as their only argument.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(SEED)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    L.seed_everything(SEED)

# Set PyTorch's float32 matmul precision mode to 'high', then seed all RNGs.
torch.set_float32_matmul_precision('high')
seed_everything(CFG['SEED'])

# Run identifier: "<parser description>_<device>"; displayed as the cell output.
idx = "_".join([parser.description, str(device)])
idx

## preprocess.py

In [None]:
## Load the per-subject label tables and parse their `date` column
train_label = pd.read_csv('./data/valid_data/val_label.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# NOTE(review): the dataloader section reads answer_sample.csv from
# CFG['TEST_PATH'] instead of ./data - confirm which location is correct.
test_label = pd.read_csv('./data/answer_sample.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

In [None]:
def _read_valid(file_name):
    """Read one parquet file from the CFG['VALID_PATH'] directory."""
    return pd.read_parquet(os.path.join(CFG['VALID_PATH'], file_name))


def _read_test(file_name):
    """Read one parquet file from the CFG['TEST_PATH'] directory."""
    return pd.read_parquet(os.path.join(CFG['TEST_PATH'], file_name))


# Sensor tables used to build the training features (subjects 1-4 below).
df_activity   = _read_valid('ch2024_val__m_activity.parquet.gzip')
df_gps        = _read_valid('ch2024_val__m_gps.parquet.gzip')
df_m_light    = _read_valid('ch2024_val__m_light.parquet.gzip')
df_pedo       = _read_valid('ch2024_val__w_pedo.parquet.gzip')
df_heart_rate = _read_valid('ch2024_val__w_heart_rate.parquet.gzip')
df_w_light    = _read_valid('ch2024_val__w_light.parquet.gzip')
df_usage      = _read_valid('ch2024_val__m_usage_stats.parquet.gzip')
df_ambience   = _read_valid('ch2024_val__m_ambience.parquet.gzip')

# Matching sensor tables for the test subjects.
ts_activity   = _read_test('ch2024_test_m_activity.parquet.gzip')
ts_gps        = _read_test('ch2024_test_m_gps.parquet.gzip')
ts_m_light    = _read_test('ch2024_test_m_light.parquet.gzip')
ts_pedo       = _read_test('ch2024_test_w_pedo.parquet.gzip')
ts_heart_rate = _read_test('ch2024_test_w_heart_rate.parquet.gzip')
ts_w_light    = _read_test('ch2024_test_w_light.parquet.gzip')
ts_usage      = _read_test('ch2024_test_m_usage_stats.parquet.gzip')
ts_ambience   = _read_test('ch2024_test_m_ambience.parquet.gzip')

#### m_acc

In [None]:
## Load the accelerometer data; the part file is switched by hand between runs
df_m_acc = pd.read_parquet(os.path.join(CFG['VALID_PATH'],'ch2024_val__m_acc_part_4.parquet.gzip'))

In [None]:
# Magnitude of the acceleration vector: sqrt(x^2 + y^2 + z^2)
acc_x, acc_y, acc_z = (df_m_acc[axis] for axis in ('x', 'y', 'z'))
df_m_acc['magnitude'] = np.sqrt(np.square(acc_x) + np.square(acc_y) + np.square(acc_z))

In [None]:
# Hourly mean acceleration (x, y, z and magnitude) for one subject.
macc = {
    'subject_id': [],
    'hour': [],
    'average_x': [],
    'average_y': [],
    'average_z': [],
    'average_magnitude': []
}

# Only one subject is processed per run; it is selected here.
subject_id = 4 ################## change only this

# Select the subject's rows and floor their timestamps to the hour once;
# both are invariant across the day/hour loops below.
subject_acc = df_m_acc.loc[df_m_acc.subject_id == subject_id]
subject_hours = subject_acc.timestamp.dt.floor('H')
user_timestamp = subject_acc['timestamp'].dt.date.unique()

for day in user_timestamp:
    day_start = pd.to_datetime(day)
    day_end = day_start + pd.Timedelta(days=1)
    # `inclusive='left'` matches the other cells; the older `closed` keyword
    # of date_range is deprecated.
    hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

    for hour in hours:
        user_data = subject_acc.loc[subject_hours == hour, :]

        if not user_data.empty:
            avg_x = user_data['x'].mean()
            avg_y = user_data['y'].mean()
            avg_z = user_data['z'].mean()
            avg_magnitude = user_data['magnitude'].mean()
        else:
            avg_x = avg_y = avg_z = avg_magnitude = np.nan  # hours without data are stored as NaN

        macc['subject_id'].append(subject_id)
        macc['hour'].append(hour)
        macc['average_x'].append(avg_x)
        macc['average_y'].append(avg_y)
        macc['average_z'].append(avg_z)
        macc['average_magnitude'].append(avg_magnitude)

test_macc= pd.DataFrame(macc)
# test_macc.head()

In [None]:
# Save the hourly accelerometer means of the subject processed above.
# NOTE(review): the file is written to the current working directory, while the
# dataloader section reads train_macc_{id}-2.csv from CFG['VALID_PATH'] -
# presumably the files are moved by hand; confirm.
csv_file_name = "train_macc_4-2.csv" ################### change the file name for each run
test_macc.to_csv(csv_file_name, index=False)

#### m_activity

In [None]:
## Re-map the raw activity codes to an ordinal activity level

# Raw codes listed from the lowest level (0) up to the highest (5).
# NOTE(review): the keys are strings, so this assumes `m_activity` holds string
# codes; codes that are not listed here map to NaN - confirm both are intended.
codes_by_level = ['4', '0', '3', '7', '1', '8']
activity_map = {code: level for level, code in enumerate(codes_by_level)}

for frame in (df_activity, ts_activity):
    frame['m_activity'] = frame['m_activity'].map(activity_map)

In [None]:
## train
# Hourly activity level: the highest mapped `m_activity` value observed in each
# hour of every labelled day of the training subjects.

ids  = [1,2,3,4]
activity = {
    'subject_id': [],
    'hour': [],
    'activity': []
}

for subject_id in ids:
    user_timestamp = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.date

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = df_activity.loc[df_activity.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_activity = subject_rows.loc[subject_hours == hour, :]

            # Hours without any record fall back to level 0.
            act = user_activity.m_activity.max() if not user_activity.empty else 0

            activity['subject_id'].append(subject_id)
            activity['hour'].append(hour)
            activity['activity'].append(act)

train_activity = pd.DataFrame(activity)
# train_activity

In [None]:
## test
# Hourly activity level: the highest mapped `m_activity` value observed in each
# hour of every day listed for the test subjects.

ids = [5,6,7,8]
activity = {
    'subject_id': [],
    'hour': [],
    'activity': []
}

for subject_id in ids:
    user_timestamp = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = ts_activity.loc[ts_activity.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_activity = subject_rows.loc[subject_hours == hour, :]

            # Hours without any record fall back to level 0.
            act = user_activity.m_activity.max() if not user_activity.empty else 0

            activity['subject_id'].append(subject_id)
            activity['hour'].append(hour)
            activity['activity'].append(act)

test_activity = pd.DataFrame(activity)
# test_activity

#### m_gps

In [None]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Coordinates are given in degrees. Scalars, NumPy arrays and pandas Series
    are accepted and combined element-wise; NaN inputs yield NaN outputs.
    """
    R = 6371.0  # Earth radius (km)
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can leave `a` marginally outside [0, 1] (for
    # example for near-antipodal points), which would make sqrt(1 - a) NaN.
    # np.minimum/np.maximum keep genuine NaN inputs as NaN.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

def hourly_distance(df):
    """Sum the haversine distance between consecutive rows of `df`.

    `df` must provide `latitude` and `longitude` columns. The first row has no
    predecessor, so its step is NaN and is counted as 0.
    """
    previous_lat = df['latitude'].shift()
    previous_lon = df['longitude'].shift()
    step_km = haversine(previous_lat, previous_lon, df['latitude'], df['longitude'])
    # Summing the per-step distances gives the distance travelled within the frame.
    return step_km.fillna(0).sum()

In [None]:
## train
# Hourly GPS features for every labelled day of the training subjects:
# travelled distance plus mean altitude / latitude / longitude / speed.

ids = [1,2,3,4]
gps = {
    'subject_id': [],
    'hour': [],
    'distance': [],
    'mean_altitude': [],
    'mean_latitude': [],
    'mean_longitude': [],
    'mean_speed': []
}

for subject_id in ids:
    user_timestamp = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = df_gps.loc[df_gps.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_gps = subject_rows.loc[subject_hours == hour, :]

            distance = hourly_distance(user_gps) if not user_gps.empty else 0.0

            # For an hour without GPS rows these means are NaN, whereas
            # `distance` falls back to 0.0.
            mean_altitude = user_gps.altitude.mean()
            mean_latitude = user_gps.latitude.mean()
            mean_longitude = user_gps.longitude.mean()
            mean_speed = user_gps.speed.mean()

            gps['subject_id'].append(subject_id)
            gps['hour'].append(hour)
            gps['distance'].append(distance)
            gps['mean_altitude'].append(mean_altitude)
            gps['mean_latitude'].append(mean_latitude)
            gps['mean_longitude'].append(mean_longitude)
            gps['mean_speed'].append(mean_speed)

train_gps = pd.DataFrame(gps)
# train_gps

In [None]:
## test
# Hourly GPS features for every day listed for the test subjects:
# travelled distance plus mean altitude / latitude / longitude / speed.

ids = [5,6,7,8]
gps = {
    'subject_id': [],
    'hour': [],
    'distance': [],
    'mean_altitude': [],
    'mean_latitude': [],
    'mean_longitude': [],
    'mean_speed': []
}

for subject_id in ids:
    user_timestamp = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = ts_gps.loc[ts_gps.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_gps = subject_rows.loc[subject_hours == hour, :]

            distance = hourly_distance(user_gps) if not user_gps.empty else 0.0

            # For an hour without GPS rows these means are NaN, whereas
            # `distance` falls back to 0.0.
            mean_altitude = user_gps.altitude.mean()
            mean_latitude = user_gps.latitude.mean()
            mean_longitude = user_gps.longitude.mean()
            mean_speed = user_gps.speed.mean()

            gps['subject_id'].append(subject_id)
            gps['hour'].append(hour)
            gps['distance'].append(distance)
            gps['mean_altitude'].append(mean_altitude)
            gps['mean_latitude'].append(mean_latitude)
            gps['mean_longitude'].append(mean_longitude)
            gps['mean_speed'].append(mean_speed)

test_gps = pd.DataFrame(gps)
# test_gps

#### m_light

In [None]:
## train
# Hourly max / mean of the `m_light` readings for every labelled day of the
# training subjects.

ids = [1,2,3,4]
avg_m_light = {
    'subject_id': [],
    'hour': [],
    'max_light': [],
    'mean_light': [],
}

for subject_id in ids:
    user_timestamp = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = df_m_light.loc[df_m_light.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_m_light = subject_rows.loc[subject_hours == hour, :]

            if not user_m_light.empty:
                max_light = user_m_light.m_light.max()
                mean_light = user_m_light.m_light.mean()
            else:
                # No reading in this hour: record zeros.
                max_light = 0.0
                mean_light = 0.0

            avg_m_light['subject_id'].append(subject_id)
            avg_m_light['hour'].append(hour)
            avg_m_light['max_light'].append(max_light)
            avg_m_light['mean_light'].append(mean_light)

train_m_light = pd.DataFrame(avg_m_light)
# train_m_light

In [None]:
## test
# Hourly max / mean of the `m_light` readings for every day listed for the
# test subjects.

ids = [5,6,7,8]
avg_m_light = {
    'subject_id': [],
    'hour': [],
    'max_light': [],
    'mean_light': [],
}

for subject_id in ids:
    user_timestamp = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = ts_m_light.loc[ts_m_light.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_m_light = subject_rows.loc[subject_hours == hour, :]

            if not user_m_light.empty:
                max_light = user_m_light.m_light.max()
                mean_light = user_m_light.m_light.mean()
            else:
                # No reading in this hour: record zeros.
                max_light = 0.0
                mean_light = 0.0

            avg_m_light['subject_id'].append(subject_id)
            avg_m_light['hour'].append(hour)
            avg_m_light['max_light'].append(max_light)
            avg_m_light['mean_light'].append(mean_light)

test_m_light = pd.DataFrame(avg_m_light)
# test_m_light

#### w_pedo

In [None]:
## Apply Geonhyeok's insight
'''
:Drop Columns: ['step_frequency', 'walking_steps','distance', 'speed']
'''

# Keep the identifying columns plus the three pedometer measures used later.
pedo_columns = ['subject_id', 'timestamp', 'burned_calories', 'running_steps', 'steps']
df_pedo = df_pedo.loc[:, pedo_columns]
ts_pedo = ts_pedo.loc[:, pedo_columns]

In [None]:
## train
# Hourly mean and sum of the pedometer measures for every labelled day of the
# training subjects.

ids = [1,2,3,4]
avg_pedo = {
    'subject_id': [],
    'hour': []
}
# Measures are addressed by name so the result does not depend on the column
# order of df_pedo.
value_cols = ['burned_calories', 'running_steps', 'steps']
mean_cols = ['mean_burned_calories', 'mean_running_steps', 'mean_steps']
sum_cols = ['sum_burned_calories', 'sum_running_steps', 'sum_steps']

# One row per hour is collected in plain lists and turned into a DataFrame once,
# instead of growing a DataFrame with pd.concat on every iteration.
mean_rows = []
sum_rows = []

for subject_id in ids:
    user_timestamp = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = df_pedo.loc[df_pedo.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_pedo = subject_rows.loc[subject_hours == hour, value_cols]

            if not user_pedo.empty:
                mean_rows.append(user_pedo.mean().tolist())
                sum_rows.append(user_pedo.sum().tolist())
            else:
                # No pedometer record in this hour: record zeros.
                mean_rows.append([0.0] * len(mean_cols))
                sum_rows.append([0.0] * len(sum_cols))

            avg_pedo['subject_id'].append(subject_id)
            avg_pedo['hour'].append(hour)

user_avg_pedo = pd.DataFrame(mean_rows, columns=mean_cols)
user_sum_pedo = pd.DataFrame(sum_rows, columns=sum_cols)

avg_pedo = pd.DataFrame(avg_pedo)
# All three frames share the same 0..n-1 index, so they align row by row.
train_pedo = pd.concat([avg_pedo, user_avg_pedo, user_sum_pedo], axis=1)
# train_pedo.head()

In [None]:
## test
# Hourly mean and sum of the pedometer measures for every day listed for the
# test subjects.

ids = [5,6,7,8]
avg_pedo = {
    'subject_id': [],
    'hour': []
}
# Measures are addressed by name so the result does not depend on the column
# order of ts_pedo.
value_cols = ['burned_calories', 'running_steps', 'steps']
mean_cols = ['mean_burned_calories', 'mean_running_steps', 'mean_steps']
sum_cols = ['sum_burned_calories', 'sum_running_steps', 'sum_steps']

# One row per hour is collected in plain lists and turned into a DataFrame once,
# instead of growing a DataFrame with pd.concat on every iteration.
mean_rows = []
sum_rows = []

for subject_id in ids:
    user_timestamp = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = ts_pedo.loc[ts_pedo.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end   = day_start + pd.Timedelta(days=1)
        hours     = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_pedo = subject_rows.loc[subject_hours == hour, value_cols]

            if not user_pedo.empty:
                mean_rows.append(user_pedo.mean().tolist())
                sum_rows.append(user_pedo.sum().tolist())
            else:
                # No pedometer record in this hour: record zeros.
                mean_rows.append([0.0] * len(mean_cols))
                sum_rows.append([0.0] * len(sum_cols))

            avg_pedo['subject_id'].append(subject_id)
            avg_pedo['hour'].append(hour)

user_avg_pedo = pd.DataFrame(mean_rows, columns=mean_cols)
user_sum_pedo = pd.DataFrame(sum_rows, columns=sum_cols)

avg_pedo = pd.DataFrame(avg_pedo)
# All three frames share the same 0..n-1 index, so they align row by row.
test_pedo = pd.concat([avg_pedo, user_avg_pedo, user_sum_pedo], axis=1)
# test_pedo.head()

#### m_usage_stats

In [None]:
## train
# Hourly app usage for every labelled day of the training subjects: total
# minutes of use, and that total divided by the number of usage rows in the hour.

ids = [1,2,3,4]
avg_m_usage = {
    'subject_id': [],
    'hour': [],
    'app_total_use_time': [],
    'app_mean_use_time': []
}

for subject_id in ids:
    user_timestamp = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = df_usage.loc[df_usage.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_app = subject_rows.loc[subject_hours == hour, :]

            # The accumulator restarts at every hour. Carrying it across hours
            # would fold earlier hours' (already converted) minutes into this
            # hour's millisecond sum.
            total_times = 0
            for data in user_app['m_usage_stats']:
                # Each cell holds a sequence of per-app records; only records
                # that carry a 'total_time' entry contribute.
                total_times += sum(item['total_time'] for item in data if 'total_time' in item)

            ## milliseconds to minutes
            total_times /= 60000
            avg_m_usage['subject_id'].append(subject_id)
            avg_m_usage['hour'].append(hour)
            avg_m_usage['app_total_use_time'].append(total_times)

            if len(user_app):
                avg_m_usage['app_mean_use_time'].append(total_times / len(user_app))
            else:
                avg_m_usage['app_mean_use_time'].append(0.0)

train_m_usage = pd.DataFrame(avg_m_usage)
# train_m_usage.head()

In [None]:
## test
# Hourly app usage for every day listed for the test subjects: total minutes of
# use, and that total divided by the number of usage rows in the hour.

ids = [5,6,7,8]
avg_m_usage = {
    'subject_id': [],
    'hour': [],
    'app_total_use_time': [],
    'app_mean_use_time': []
}

for subject_id in ids:
    user_timestamp = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = ts_usage.loc[ts_usage.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_app = subject_rows.loc[subject_hours == hour, :]

            # The accumulator restarts at every hour. Carrying it across hours
            # would fold earlier hours' (already converted) minutes into this
            # hour's millisecond sum.
            total_times = 0
            for data in user_app['m_usage_stats']:
                # Each cell holds a sequence of per-app records; only records
                # that carry a 'total_time' entry contribute.
                total_times += sum(item['total_time'] for item in data if 'total_time' in item)

            ## milliseconds to minutes
            total_times /= 60000
            avg_m_usage['subject_id'].append(subject_id)
            avg_m_usage['hour'].append(hour)
            avg_m_usage['app_total_use_time'].append(total_times)

            if len(user_app):
                avg_m_usage['app_mean_use_time'].append(total_times / len(user_app))
            else:
                avg_m_usage['app_mean_use_time'].append(0.0)

test_m_usage = pd.DataFrame(avg_m_usage)
# test_m_usage.head()

#### m_ambience

In [None]:
## train
## Keep the top-1 ambience label of each hour (the label with the highest score)

ids = [1,2,3,4]
avg_m_ambience = {
    'subject_id': [],
    'hour': [],
    'max_ambience_cls': []
}

# NOTE(review): this binds the DataFrame class itself (no call) and is not used
# anywhere in the visible code - presumably a leftover; confirm and delete.
ambience_df = pd.DataFrame

for subject_id in ids:
    user_timestamp = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = df_ambience.loc[df_ambience.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_ambience = subject_rows.loc[subject_hours == hour, :]

            # Both trackers restart at every hour, so an hour without any label
            # yields None instead of inheriting the previous hour's label, and
            # the best label always corresponds to the best score.
            ambience_rate = 0
            ambience_name = None

            for data in user_ambience['ambience_labels']:
                # Each item is indexed as (label name, ..., score); the score is
                # converted with float() before comparing.
                for item in data:
                    score = float(item[-1])
                    if ambience_rate < score:
                        ambience_rate = score
                        ambience_name = item[0]

            avg_m_ambience['subject_id'].append(subject_id)
            avg_m_ambience['hour'].append(hour)
            avg_m_ambience['max_ambience_cls'].append(ambience_name)

train_m_ambience = pd.DataFrame(avg_m_ambience)
# train_m_ambience.head()

In [None]:
## test
## Keep the top-1 ambience label of each hour (the label with the highest score)
ids = [5,6,7,8]
avg_m_ambience = {
    'subject_id': [],
    'hour': [],
    'max_ambience_cls': [],
}

for subject_id in ids:
    user_timestamp = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = ts_ambience.loc[ts_ambience.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_ambience = subject_rows.loc[subject_hours == hour, :]

            # Both trackers restart at every hour, so an hour without any label
            # yields None instead of inheriting the previous hour's label, and
            # the best label always corresponds to the best score.
            ambience_rate = 0
            ambience_name = None

            for data in user_ambience['ambience_labels']:
                # Each item is indexed as (label name, ..., score); the score is
                # converted with float() before comparing.
                for item in data:
                    score = float(item[-1])
                    if ambience_rate < score:
                        ambience_rate = score
                        ambience_name = item[0]

            avg_m_ambience['subject_id'].append(subject_id)
            avg_m_ambience['hour'].append(hour)
            avg_m_ambience['max_ambience_cls'].append(ambience_name)

test_m_ambience = pd.DataFrame(avg_m_ambience)
# test_m_ambience.head()

#### w_heart_rate

In [None]:
## train
# Hourly max / mean heart rate for every labelled day of the training subjects.

ids = [1,2,3,4]
avg_hr = {
    'subject_id': [],
    'hour': [],
    'max_hr': [],
    'mean_hr': []
}

for subject_id in ids:
    user_timestamp = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = df_heart_rate.loc[df_heart_rate.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_hr = subject_rows.loc[subject_hours == hour, :]

            if not user_hr.empty:
                hr_max = user_hr.heart_rate.max()
                hr_mean = user_hr.heart_rate.mean()
            else:
                # No heart-rate sample in this hour: record zeros.
                hr_max = 0.0
                hr_mean = 0.0

            avg_hr['subject_id'].append(subject_id)
            avg_hr['hour'].append(hour)
            avg_hr['max_hr'].append(hr_max)
            avg_hr['mean_hr'].append(hr_mean)

train_hr = pd.DataFrame(avg_hr)
# train_hr.head()

In [None]:
## test
# Hourly max / mean heart rate for every day listed for the test subjects.

ids = [5,6,7,8]
avg_hr = {
    'subject_id': [],
    'hour': [],
    'max_hr': [],
    'mean_hr': []
}

for subject_id in ids:
    user_timestamp = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = ts_heart_rate.loc[ts_heart_rate.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_hr = subject_rows.loc[subject_hours == hour, :]

            if not user_hr.empty:
                hr_max = user_hr.heart_rate.max()
                hr_mean = user_hr.heart_rate.mean()
            else:
                # No heart-rate sample in this hour: record zeros.
                hr_max = 0.0
                hr_mean = 0.0

            avg_hr['subject_id'].append(subject_id)
            avg_hr['hour'].append(hour)
            avg_hr['max_hr'].append(hr_max)
            avg_hr['mean_hr'].append(hr_mean)

test_hr = pd.DataFrame(avg_hr)
# test_hr.head()

#### w_light

In [None]:
## train
# Hourly max / mean of the `w_light` readings for every labelled day of the
# training subjects.

ids = [1,2,3,4]
avg_w_light = {
    'subject_id': [],
    'hour': [],
    'max_light': [],
    'mean_light': []
}

for subject_id in ids:
    user_timestamp = train_label.loc[train_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = df_w_light.loc[df_w_light.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_w_light = subject_rows.loc[subject_hours == hour, :]

            if not user_w_light.empty:
                max_light = user_w_light.w_light.max()
                mean_light = user_w_light.w_light.mean()
            else:
                # No reading in this hour: record zeros.
                max_light = 0.0
                mean_light = 0.0

            avg_w_light['subject_id'].append(subject_id)
            avg_w_light['hour'].append(hour)
            avg_w_light['max_light'].append(max_light)
            avg_w_light['mean_light'].append(mean_light)

train_w_light = pd.DataFrame(avg_w_light)
# train_w_light.head()

In [None]:
## test
# Hourly max / mean of the `w_light` readings for every day listed for the
# test subjects.

ids = [5,6,7,8]
avg_w_light = {
    'subject_id': [],
    'hour': [],
    'max_light': [],
    'mean_light': []
}

for subject_id in ids:
    user_timestamp = test_label.loc[test_label.subject_id == subject_id, 'date'].dt.floor('D')

    # Select this subject's rows and floor their timestamps once per subject;
    # both are invariant across the day/hour loops below.
    subject_rows = ts_w_light.loc[ts_w_light.subject_id == subject_id]
    subject_hours = subject_rows.timestamp.dt.floor('H')

    for day in user_timestamp:
        day_start = pd.to_datetime(day)
        day_end = day_start + pd.Timedelta(days=1)
        hours = pd.date_range(start=day_start, end=day_end, freq='H', inclusive='left')

        for hour in hours:
            user_w_light = subject_rows.loc[subject_hours == hour, :]

            if not user_w_light.empty:
                max_light = user_w_light.w_light.max()
                mean_light = user_w_light.w_light.mean()
            else:
                # No reading in this hour: record zeros.
                max_light = 0.0
                mean_light = 0.0

            avg_w_light['subject_id'].append(subject_id)
            avg_w_light['hour'].append(hour)
            avg_w_light['max_light'].append(max_light)
            avg_w_light['mean_light'].append(mean_light)

test_w_light = pd.DataFrame(avg_w_light)
# test_w_light.head()

In [None]:
# Left-join every hourly feature table onto the activity table, one at a time.
# The order is significant: m_light and w_light both carry max_light/mean_light,
# so m_light's columns end up with the `_x` suffix and w_light's with `_y`.
train_data = train_activity
for feature_frame in (
    train_gps,
    train_m_light,
    train_pedo,
    train_hr,
    train_w_light,
    train_m_usage,
    train_m_ambience,
):
    train_data = pd.merge(train_data, feature_frame, on=['subject_id', 'hour'], how='left')

train_data.shape
# train_data.head()

In [None]:
# Left-join every hourly feature table onto the activity table, one at a time.
# The order is significant: m_light and w_light both carry max_light/mean_light,
# so m_light's columns end up with the `_x` suffix and w_light's with `_y`.
test_data = test_activity
for feature_frame in (
    test_gps,
    test_m_light,
    test_pedo,
    test_hr,
    test_w_light,
    test_m_usage,
    test_m_ambience,
):
    test_data = pd.merge(test_data, feature_frame, on=['subject_id', 'hour'], how='left')

test_data.shape
# test_data.head()

In [None]:
# Persist both merged hourly feature tables (without the index column).
for frame, directory, file_name in (
    (train_data, CFG['VALID_PATH'], 'train_data2.csv'),
    (test_data, CFG['TEST_PATH'], 'test_data2.csv'),
):
    frame.to_csv(os.path.join(directory, file_name), index=False)

In [None]:
# Reload the merged feature tables from the CSV files written above.
train_data, test_data = (
    pd.read_csv(os.path.join(CFG[path_key], file_name))
    for path_key, file_name in (
        ('VALID_PATH', 'train_data2.csv'),
        ('TEST_PATH', 'test_data2.csv'),
    )
)

train_data.shape, test_data.shape

## dataloader.py

In [None]:
# Load the label tables from the configured directories.
train_label = pd.read_csv(os.path.join(CFG['VALID_PATH'],'val_label.csv'))
test_label  = pd.read_csv(os.path.join(CFG['TEST_PATH'],'answer_sample.csv'))

# Parse the `date` column of both tables into datetimes.
for label_frame in (train_label, test_label):
    label_frame['date'] = pd.to_datetime(label_frame['date'])

In [None]:
# Load the merged hourly feature tables produced by the preprocessing section.
train_data, test_data = (
    pd.read_csv(os.path.join(CFG[path_key], file_name))
    for path_key, file_name in (
        ('VALID_PATH', 'train_data2.csv'),
        ('TEST_PATH', 'test_data2.csv'),
    )
)

train_data.shape, test_data.shape

In [None]:
# Per-subject hourly accelerometer tables, keyed by "<split>_macc_<subject id>".
train_data_dict = {
    f'train_macc_{subject}': pd.read_csv(os.path.join(CFG['VALID_PATH'], f'train_macc_{subject}-2.csv'))
    for subject in [1, 2, 3, 4]
}

test_data_dict = {
    f'test_macc_{subject}': pd.read_csv(os.path.join(CFG['TEST_PATH'], f'test_macc_{subject}-2.csv'))
    for subject in [5, 6, 7, 8]
}

In [None]:
# Stack the per-subject accelerometer tables in dictionary order. The empty
# seed frame keeps the result defined even when a dictionary has no entries.
keys = train_data_dict.keys()
train_macc = pd.concat([pd.DataFrame(), *(train_data_dict[key] for key in keys)], axis=0)

keys = test_data_dict.keys()
test_macc = pd.concat([pd.DataFrame(), *(test_data_dict[key] for key in keys)], axis=0)

# Hours without accelerometer data were stored as NaN; replace them with 0.
train_macc = train_macc.fillna(0)
test_macc = test_macc.fillna(0)

train_macc.shape, test_macc.shape

In [None]:
# Attach the hourly accelerometer features to the merged feature tables.
macc_keys = ['subject_id', 'hour']
train_data = pd.merge(train_data, train_macc, on=macc_keys, how='left')
test_data  = pd.merge(test_data, test_macc, on=macc_keys, how='left')

In [None]:
## Categorical feature handling

def Info2Idx(df, cat_feat):
    """Build a value -> index lookup for each categorical column.

    Args:
        df: DataFrame containing every column named in ``cat_feat``.
        cat_feat: iterable of column names to encode.

    Returns:
        dict mapping each column name to a dict that assigns every distinct
        value of that column (in order of first appearance, as returned by
        ``Series.unique``) an integer index starting at 1.
    """
    return {
        column: {
            value: position
            for position, value in enumerate(df[column].unique(), start=1)
        }
        for column in cat_feat
    }

In [None]:
# Parse the 'hour' column into datetimes and derive the hour-of-day as a float
# feature named 'time', for both tables.
for _frame in (train_data, test_data):
    _frame['hour'] = pd.to_datetime(_frame['hour'])
    _frame['time'] = _frame['hour'].dt.hour.astype(float)

In [None]:
# Calendar features derived from the parsed 'hour' timestamps:
# 'month' is the calendar month and 'day' is the day of week, both as floats.
_train_dt = train_data['hour'].dt
_test_dt = test_data['hour'].dt

train_data['month'] = _train_dt.month.astype(float)
train_data['day'] = _train_dt.dayofweek.astype(float)

test_data['month'] = _test_dt.month.astype(float)
test_data['day'] = _test_dt.dayofweek.astype(float)

In [None]:
# Encode the categorical columns as integer indices. The lookup is built from
# train and test together so both tables share one index per value.
cat_feat = ['activity', 'month', 'max_ambience_cls']
total_cat = pd.concat([train_data[cat_feat], test_data[cat_feat]], axis=0)

info2idx = Info2Idx(total_cat, cat_feat)


def _encode_categories(frame):
    """Return the categorical columns of ``frame`` mapped through ``info2idx``."""
    return frame[cat_feat].apply(lambda column: column.map(info2idx[column.name]))


train_data[cat_feat] = _encode_categories(train_data)
test_data[cat_feat] = _encode_categories(test_data)

In [None]:
def make_dataset(X, Y, window_size=CFG['WINDOW_SIZE'], stride=CFG['STRIDE_SIZE'], for_train=True):
    """Cut per-subject, per-day feature rows into fixed-length windows.

    For every subject and every labelled date, the rows of ``X`` recorded on
    that date are sliced into windows of ``window_size`` rows, advancing by
    ``stride``. Each window is paired with the label rows of ``Y`` for the same
    subject and date. Days with fewer than ``window_size`` rows yield no window.

    Args:
        X: feature frame with ``subject_id``, a datetime ``hour`` column and
            the feature columns listed below.
        Y: label frame with ``subject_id``, a datetime ``date`` column and the
            seven label columns.
        window_size: number of rows per window.
        stride: step between consecutive window starts.
        for_train: when true, subjects 1-4 and the module-level ``train_label``
            dates are used, and the first labelled date of each subject is
            held out as validation data; otherwise subjects 5-8 and
            ``test_label`` dates are used with no hold-out.

    Returns:
        ``(windows, labels, valid_windows, valid_labels)`` as numpy arrays when
        ``for_train`` is true, else ``(windows, labels)``.
    """
    feature_cols = [
        'activity', 'distance', 'mean_light_x', 'mean_burned_calories', 'mean_running_steps', 'mean_steps',
        'mean_hr', 'mean_light_y', 'average_x', 'average_y', 'average_z', 'average_magnitude',
        'app_total_use_time', 'max_ambience_cls',
    ]
    target_cols = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4']

    # The list of dates to iterate comes from the module-level label tables.
    if for_train:
        subject_ids, label_frame = [1, 2, 3, 4], train_label.copy()
    else:
        subject_ids, label_frame = [5, 6, 7, 8], test_label.copy()

    main_windows, main_targets = [], []
    held_out_windows, held_out_targets = [], []

    for subject in subject_ids:
        subject_dates = label_frame.loc[label_frame.subject_id == subject, 'date'].dt.date

        for position, day in enumerate(subject_dates):
            day_features = X.loc[(X.subject_id == subject) & (X.hour.dt.date == day), feature_cols].values
            day_targets = Y.loc[(Y.subject_id == subject) & (Y.date.dt.date == day), target_cols].values

            # Only the first labelled day of a training subject goes to validation.
            if for_train and position == 0:
                windows, targets = held_out_windows, held_out_targets
            else:
                windows, targets = main_windows, main_targets

            first_invalid_start = len(day_features) - window_size + 1
            for start in range(0, first_invalid_start, stride):
                windows.append(day_features[start : start + window_size, :])
                targets.append(day_targets)

    if not for_train:
        return np.array(main_windows), np.array(main_targets)
    return np.array(main_windows), np.array(main_targets), np.array(held_out_windows), np.array(held_out_targets)

In [None]:
# Build the windowed arrays: train/validation from the training subjects,
# and a single set from the test subjects.
window_kwargs = dict(window_size=CFG['WINDOW_SIZE'], stride=CFG['STRIDE_SIZE'])

(
    train_window_data,
    train_window_labels,
    valid_window_data,
    valid_window_labels,
) = make_dataset(train_data, train_label, for_train=True, **window_kwargs)

test_window_data, test_window_labels = make_dataset(test_data, test_label, for_train=False, **window_kwargs)

# Notebook display: shapes of all six arrays.
(
    train_window_data.shape, train_window_labels.shape,
    valid_window_data.shape, valid_window_labels.shape,
    test_window_data.shape, test_window_labels.shape,
)

In [None]:
class CustomDataset(Dataset):
    """Index-based dataset over feature windows and optional labels.

    Items are converted to ``float32`` tensors on access. When ``Y`` is
    ``None`` only the feature tensor is returned.
    """

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        features = torch.tensor(self.X[index], dtype=torch.float32)
        if self.Y is None:
            return features

        target = torch.tensor(self.Y[index], dtype=torch.float32)
        return features, target

## model.py

In [None]:
class GRUWithMultiHeadAttention(nn.Module):
    """Bidirectional GRU encoder, self-attention over time, and a one-step GRU decoder.

    Args:
        hidden_size: feature width of the input and of every internal layer.
            Must be divisible by ``num_heads`` (required by ``nn.MultiheadAttention``).
        num_heads: number of attention heads.

    Input is ``(batch, time, hidden_size)``; output is ``(batch, 1, hidden_size)``.
    """

    def __init__(self, hidden_size, num_heads):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_heads = num_heads

        self.gru_in    = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc        = nn.Linear(hidden_size * 2, hidden_size)
        # batch_first=True is required: every tensor in forward() is laid out as
        # (batch, time, hidden). With the default (batch_first=False) the module
        # would treat the batch axis as the sequence axis and attend across
        # different samples of the batch instead of across time steps.
        self.attention = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.gru_out   = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, x):
        """Encode ``x`` and return the decoder output for the last attended step."""
        outputs, hidden = self.gru_in(x)                                # BxTx(2*H), 2xBxH

        # Merge the final forward and backward states into one initial decoder state.
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))).unsqueeze(0)  # 1xBxH
        outputs = self.fc(outputs)                                      # BxTxH

        context_vec, _ = self.attention(outputs, outputs, outputs)      # BxTxH

        # Only the context vector of the last time step is fed to the decoder.
        gru_input = context_vec[:, -1:, :]                              # Bx1xH
        gru_output, _ = self.gru_out(gru_input, hidden)                 # Bx1xH

        return gru_output

In [None]:
class BaseModel(nn.Module):
    """Conv1d front-end, GRU/attention block and a sigmoid head with 7 outputs.

    Args:
        args: run configuration; ``args.num_head`` sets the attention head count.
        input_size: number of input features per time step.
        hidden_size: base width; the conv and recurrent layers use half of it.
    """

    def __init__(self, args, input_size=8, hidden_size=args.hid_dim):
        super().__init__()
        half_width = hidden_size // 2

        # Temporal convolution (kernel 5, no padding, so the time axis shrinks by 4).
        self.conv_block = nn.Sequential(
            nn.Conv1d(input_size, half_width, kernel_size=5, stride=1, padding=0),
            nn.BatchNorm1d(half_width),
            nn.ReLU(),
        )

        # Bidirectional GRU with multi-head attention.
        self.gru_attention = GRUWithMultiHeadAttention(half_width, num_heads=args.num_head)

        # One sigmoid probability per label.
        self.fc = nn.Sequential(
            nn.Linear(half_width, 7),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Map a ``(batch, time, features)`` tensor to per-label probabilities."""
        conv_out = self.conv_block(x.permute(0, 2, 1))     # BxTxF -> BxFxT -> BxH'xT'
        sequence = conv_out.permute(0, 2, 1)               # BxT'xH'

        attended = self.gru_attention(sequence)            # last axis is H'
        probabilities = self.fc(attended[:, -1, :])        # Bx7

        # squeeze() drops every size-1 axis, so a batch of one comes back as shape (7,).
        return probabilities.squeeze()

In [None]:
class BaseClassifier(L.LightningModule):
    """Lightning wrapper that trains a backbone with binary cross-entropy.

    Args:
        backbone: module mapping an input batch to per-label probabilities.
        args: run configuration. ``configure_optimizers`` reads ``optimizer``,
            ``learning_rate``, ``weight_decay`` and ``scheduler`` from it, and
            ``epochs`` when the cosine scheduler is selected.
    """

    def __init__(self, backbone, args):
        super().__init__()
        self.backbone = backbone
        # Keep the configuration on the module so optimizer setup uses the
        # object passed in rather than a module-level `args`.
        self.args = args

    def forward(self, x):
        return self.backbone(x)

    def step(self, batch):
        """Run the backbone on one batch and return ``(loss, y, y_hat)``."""
        x, y = batch
        y_hat = self.backbone(x)
        # Align the target with the prediction's shape explicitly; a bare
        # squeeze() on the target only matches when both sides happen to drop
        # the same size-1 axes.
        loss = nn.BCELoss()(y_hat, y.reshape(y_hat.shape))
        return loss, y, y_hat

    def training_step(self, batch, batch_idx):
        loss, _, _ = self.step(batch)
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, _, _ = self.step(batch)
        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        loss, _, _ = self.step(batch)
        # NOTE(review): the key says "mae" but the logged value is the BCE loss;
        # the key is kept unchanged for anything that reads it.
        self.log("test_mae", loss, on_step=False, on_epoch=True, prog_bar=True)

    def predict_step(self, batch, batch_idx):
        # Batches are (features, labels) pairs; labels are ignored here.
        x, _ = batch
        return self.forward(x)

    def configure_optimizers(self):
        """Build the optimizer and optional LR scheduler named in the configuration.

        Raises:
            ValueError: if the optimizer or scheduler name is not supported.
        """
        args = self.args

        if args.optimizer == "sgd":
            optimizer = torch.optim.SGD(self.parameters(), lr=args.learning_rate, momentum=0.9)
        elif args.optimizer == "adam":
            optimizer = torch.optim.Adam(self.parameters(), lr=args.learning_rate)
        elif args.optimizer == "adamw":
            optimizer = torch.optim.AdamW(self.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
        else:
            raise ValueError(f"Unsupported optimizer: {args.optimizer!r}")

        if args.scheduler == "none":
            return optimizer
        if args.scheduler == "step":
            scheduler = StepLR(
                optimizer=optimizer,
                step_size=250,
                gamma=0.05,
            )
            return [optimizer], [scheduler]
        if args.scheduler == "cosine":
            scheduler = CosineAnnealingLR(
                optimizer=optimizer,
                T_max=args.epochs,
                eta_min=1e-6,
            )
            return [optimizer], [scheduler]
        if args.scheduler == "plateau":
            scheduler = ReduceLROnPlateau(
                optimizer=optimizer,
                mode="min",
                factor=0.1,
                patience=2,
                verbose=False,
            )
            # The plateau scheduler needs a monitored metric to step on.
            return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}

        # Returning nothing here would make Lightning proceed without an optimizer.
        raise ValueError(f"Unsupported scheduler: {args.scheduler!r}")

In [None]:
# Wrap the windowed arrays in datasets and loaders; only the training loader is shuffled.
train_dataset = CustomDataset(train_window_data, train_window_labels)
valid_dataset = CustomDataset(valid_window_data, valid_window_labels)
test_dataset  = CustomDataset(test_window_data, test_window_labels)

train_loader  = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
valid_loader  = DataLoader(valid_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)
test_loader   = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

## Model: the input feature count is taken from the last axis of the training windows.
model   = BaseClassifier(BaseModel(args, input_size=train_window_data.shape[2]), args)

# NOTE(review): early stopping and checkpoint selection both monitor "train_loss"
# even though a validation loader is passed to fit() — confirm this is intended
# rather than "val_loss".
early_stop_callback = EarlyStopping(
    monitor="train_loss", 
    patience=CFG['PATIENCE'], 
    mode="min"
    )

# Keeps only the single best checkpoint (lowest monitored value) under ./saved.
checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(os.getcwd(),'saved'),
    save_top_k=1,
    verbose=False,
    monitor='train_loss',
    mode='min',
    )

trainer = L.Trainer(
    max_epochs=CFG["EPOCHS"], accelerator="auto", 
    enable_progress_bar=False,
    enable_model_summary=False,
    callbacks=[checkpoint_callback, early_stop_callback],
    devices=args.device#, logger=wandb_logger,
)

trainer.fit(model, train_loader, valid_loader)
# NOTE(review): this expression is neither the last one in the cell nor assigned,
# so the path is not displayed and the best checkpoint is not loaded; the calls
# below are given `model` directly with no ckpt_path — presumably they run with
# the weights left in memory when training stopped. TODO confirm.
checkpoint_callback.best_model_path

# validate() returns a list with one metrics dict per dataloader.
eval_dict = trainer.validate(model, dataloaders=valid_loader)[0]
valid_loss = eval_dict["val_loss"]

# Per-batch prediction tensors for the validation and test loaders.
y_valid_preds = trainer.predict(model, dataloaders=valid_loader)
y_preds = trainer.predict(model, dataloaders=test_loader)

print(f"val_loss: {valid_loss}")
# wandb.log({'val_loss': valid_loss})

In [None]:
# Stack the per-batch validation predictions and threshold them at 0.5 into 0/1 labels.
valid_probabilities = torch.vstack(y_valid_preds).cpu().numpy()
final_valid_pred = (valid_probabilities > 0.5).astype(int)
final_valid_pred.shape

In [None]:
from sklearn.metrics import f1_score

# Drop the singleton middle axis so the labels line up column-for-column with the predictions.
fina_valid_real = valid_window_labels.squeeze(1)

# Every label column is weighted 1.5 except the third one (index 2), which is weighted 1.0.
label_weights = [1.0 if column == 2 else 1.5 for column in range(7)]

total_f1 = 0
for i, weight in enumerate(label_weights):
    # The printed per-label value is the weighted score.
    f1 = f1_score(fina_valid_real[:, i], final_valid_pred[:, i]) * weight
    total_f1 += f1
    print(f"f1_score_{i+1}: {f1}")

print(f"total_f1: {total_f1}")
# wandb.log({'total_f1': total_f1})

In [None]:
# Stack the per-batch test predictions and threshold them at 0.5 into 0/1 labels.
final_preds = np.where(torch.vstack(y_preds).cpu().numpy() > 0.5, 1, 0)
final_preds.shape

In [None]:
# Compare the label rates in the training labels with those of the test predictions.
x = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4']
for_barplot = pd.DataFrame(final_preds, columns=x)

# NOTE(review): 105 and 115 are presumably the row counts of the train and test
# label tables — confirm against the data.
train_positive_rate = train_label.iloc[:, 2:].sum(0) / 105
pred_positive_rate = for_barplot.sum(0) / 115

plt.figure(figsize=(10, 5))
sns.barplot(train_positive_rate, alpha=0.3, label='train', color='blue')
sns.barplot(pred_positive_rate, alpha=0.3, label='pred', color='red')

plt.grid()
plt.legend()
plt.show()

In [None]:
# Attach the subject of each test row, then print the number of predicted positives per subject.
for_barplot['subject_id'] = test_label['subject_id'].values
per_subject_totals = for_barplot.groupby('subject_id').sum()
print(per_subject_totals)

In [None]:
# Notebook display: column sums divided by 115, followed by the raw column sums.
column_totals = for_barplot.sum(0)
(column_totals / 115, column_totals)

In [None]:
# wandb.finish()

In [None]:
# Timestamp pieces for the output file name, with '-' and ':' replaced by '_'.
now = datetime.now()
date = str(now.date()).replace('-', '_')
time = str(now.time()).replace(':', '_')

# Fill the answer template: every column from the third onward receives the predictions.
submmit = pd.read_csv(os.path.join(CFG['TEST_PATH'], 'answer_sample.csv'))
submmit.iloc[:, 2:] = final_preds

# Create the output folder first; to_csv does not create missing directories.
os.makedirs('./submission', exist_ok=True)
# NOTE(review): `idx` is not assigned anywhere in this section — it must already
# exist as a global from an earlier cell, otherwise this line raises NameError.
submmit.to_csv(f'./submission/submission_{idx}_{date}_{time}.csv', index=False)

In [None]:
# Notebook display: the filled-in submission frame.
submmit