In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
import random
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
plt.rc('font', family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False
plt.rc('axes', unicode_minus=False)

from datetime import datetime

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary

import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

# import wandb
# from lightning.pytorch.loggers import WandbLogger

from argparse import ArgumentParser

parser = ArgumentParser(description="SSP_Yoonji")

## DATA
parser.add_argument("--valid_path", type=str, default="/home/yoonji/ictc2024/dataset/val dataset")
parser.add_argument("--test_path", type=str, default="/home/yoonji/ictc2024/dataset/test dataset")
parser.add_argument("--window_size", type=int, default=24)  # chosen with sleep duration in mind
parser.add_argument("--stride_size", type=int, default=1)   # the data is viewed in 1-hour units

## MHA
parser.add_argument("--num_head", type=int, default=8)
parser.add_argument("--hid_dim", type=int, default=128)

## TRAIN
parser.add_argument("--optimizer", type=str, default="adamw")
parser.add_argument("--learning_rate", type=float, default=1e-4)
parser.add_argument("--weight_decay", type=float, default=0)
parser.add_argument("--scheduler", type=str, default="step")
parser.add_argument("--batch_size", type=int, default=16)
parser.add_argument("--epochs", type=int, default=1000)
parser.add_argument("--patience", type=int, default=100)

parser.add_argument("--cv", type=int, default=5)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--mixed_precision", type=int, default=32)
parser.add_argument("--device", type=int, nargs="+", default=[0])
parser.add_argument("--num_workers", type=int, default=0)

# Parse an empty argument list: inside the notebook every option takes its default.
args = parser.parse_args([])

# wandb.init(config=args, name='SSP_JY(GAG)', project="ETRI_Baseline")
# wandb_logger = WandbLogger(name='SSP_JY(GAG)', project="ETRI_Baseline")
# wandb.config.update(args)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Upper-cased view of the parsed options that the rest of the notebook reads.
CFG = {
    option.upper(): getattr(args, option)
    for option in (
        "window_size",
        "stride_size",
        "batch_size",
        "epochs",
        "patience",
        "cv",
        "seed",
        "valid_path",
        "test_path",
    )
}

def seed_everything(SEED):
    """Seed every random number generator this notebook touches.

    Sets ``PYTHONHASHSEED``, seeds ``random``, NumPy and PyTorch (CPU and CUDA),
    puts cuDNN into deterministic mode with benchmarking disabled, and finally
    calls Lightning's own ``seed_everything``.

    Args:
        SEED: Seed value passed unchanged to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)

    # Python, NumPy and PyTorch generators all receive the same seed.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(SEED)

    # Trade cuDNN autotuning for repeatable kernels.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    L.seed_everything(SEED)

# Select the 'high' float32 matmul precision setting, then seed all RNGs.
torch.set_float32_matmul_precision('high')
seed_everything(CFG['SEED'])

# Run identifier: the parser description joined with the active device.
# It is reused later in the submission file name.
idx = "{}_{}".format(parser.description, device)
idx

Seed set to 42


'SSP_Yoonji_cuda'

#### 데이터

In [3]:
# Note the naming: `train_label` is read from val_label.csv and `test_label` from
# answer_sample.csv. The 'date' column of each is parsed into datetimes on load.
train_label = pd.read_csv('/home/yoonji/ictc2024/dataset/val_label.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
test_label = pd.read_csv('/home/yoonji/ictc2024/dataset/answer_sample.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

In [4]:
# Hourly feature tables, one file per split, located under the configured directories.
train_csv = os.path.join(CFG['VALID_PATH'], 'train_data2.csv')
test_csv = os.path.join(CFG['TEST_PATH'], 'test_data2.csv')

train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

(train_data.shape, test_data.shape)

((2520, 19), (2760, 19))

In [5]:
# Read the per-file "macc" tables for each split into dicts keyed by table name.
# Dict comprehensions keep the loop variable local, so the builtin `id` is no
# longer shadowed in the notebook's global namespace.
train_data_dict = {
    f'train_macc_{file_no}': pd.read_csv(os.path.join(CFG['VALID_PATH'], f'train_macc_{file_no}-2.csv'))
    for file_no in [1, 2, 3, 4]
}

test_data_dict = {
    f'test_macc_{file_no}': pd.read_csv(os.path.join(CFG['TEST_PATH'], f'test_macc_{file_no}-2.csv'))
    for file_no in [5, 6, 7, 8]
}

In [6]:
# Stack the per-file macc tables row-wise with a single concat per split (in dict
# insertion order) instead of growing a frame inside a loop, then replace missing
# values with 0. Building a new frame keeps the cell idempotent on re-run.
train_macc = pd.concat(list(train_data_dict.values()), axis=0).fillna(0)
test_macc = pd.concat(list(test_data_dict.values()), axis=0).fillna(0)

train_macc.shape, test_macc.shape

((2520, 6), (2760, 6))

In [7]:
# Left-join the macc features onto the hourly table by subject and hour, so every
# row of the hourly table is kept. Re-running this cell would merge a second time.
train_data = pd.merge(train_data, train_macc, how='left', on=['subject_id', 'hour'])
test_data = pd.merge(test_data, test_macc, how='left', on=['subject_id', 'hour'])

In [8]:
## categorical feature 처리

def Info2Idx(df, cat_feat):
    """Build a value-to-index lookup table for each categorical column.

    Args:
        df: DataFrame containing every column named in ``cat_feat``.
        cat_feat: Iterable of column names to encode.

    Returns:
        Dict mapping each column name to ``{value: index}``. Indices start at 1
        and follow the order in which values first appear in the column.
    """
    return {
        column: {value: code for code, value in enumerate(df[column].unique(), start=1)}
        for column in cat_feat
    }

In [9]:
# Parse the 'hour' timestamps and derive calendar features as floats,
# applying the same steps to both splits.
for frame in (train_data, test_data):
    frame['hour'] = pd.to_datetime(frame['hour'])
    stamps = frame['hour'].dt
    frame['time'] = stamps.hour.astype(float)       # hour of day, extracted separately
    frame['month'] = stamps.month.astype(float)     # month, extracted separately
    frame['day'] = stamps.dayofweek.astype(float)   # day of week (not day of month)

In [10]:
cat_feat = ['activity', 'month', 'max_ambience_cls']

# Build the lookup from train and test together so both splits share the same codes.
total_cat = pd.concat([train_data[cat_feat], test_data[cat_feat]], axis=0)
info2idx = Info2Idx(total_cat, cat_feat)  # convert each categorical feature to indices


def _encode_column(column):
    """Replace every value of ``column`` with its integer code from ``info2idx``."""
    return column.map(info2idx[column.name])


train_data[cat_feat] = train_data[cat_feat].apply(_encode_column)
test_data[cat_feat]  = test_data[cat_feat].apply(_encode_column)

# args.f_sizes = [len(info2idx[i])+1 for i in cat_feat]

In [12]:
# 'subject_id' and 'hour' have served as merge keys and feature sources above;
# remove them so they are not part of the windowed model inputs.
train_data = train_data.drop(columns=['subject_id', 'hour'])
test_data = test_data.drop(columns=['subject_id', 'hour'])

In [14]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, Dataset

# 시퀀스 데이터 생성

def create_sequences(data, labels, time_steps=24):
    """Cut ``data`` into consecutive, non-overlapping windows and pair each with a label.

    Window ``k`` covers rows ``k*time_steps`` up to ``(k+1)*time_steps`` and is paired
    with ``labels[k]``. Rows left over after the last full window are dropped.

    Args:
        data: Row-sliceable sequence of samples (supports ``len`` and ``data[a:b]``).
        labels: Sequence indexed by window number.
        time_steps: Number of rows per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays holding the windows and their labels.
    """
    n_windows = len(data) // time_steps
    windows = [
        data[start:start + time_steps]
        for start in range(0, n_windows * time_steps, time_steps)
    ]
    targets = [labels[k] for k in range(n_windows)]
    return np.array(windows), np.array(targets)


time_steps = 24

# Columns from position 2 onward of each label frame are used as the targets.
X_train, y_train = create_sequences(train_data, train_label.iloc[:, 2:].values, time_steps=time_steps)
X_test, y_test = create_sequences(test_data, test_label.iloc[:, 2:].values, time_steps=time_steps)

In [15]:
class TimeSeriesDataset(Dataset):
    """Dataset that serves ``(window, label)`` pairs as float32 tensors.

    Args:
        data: Array-like of input windows, converted with ``torch.FloatTensor``.
        labels: Array-like of targets, indexed in parallel with ``data``.
    """

    def __init__(self, data, labels):
        self.data, self.labels = torch.FloatTensor(data), torch.FloatTensor(labels)

    def __len__(self):
        # Number of samples is the length of the first axis of the data tensor.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target
    
def seed_worker(worker_id):
    """Seed NumPy and ``random`` from torch's initial seed, reduced to 32 bits.

    Intended as a ``DataLoader`` ``worker_init_fn``.

    Args:
        worker_id: Worker index supplied by the caller; not used.
    """
    seed32 = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed):
        reseed(seed32)

train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)

# Options shared by both loaders; only the shuffling differs between the splits.
_loader_opts = dict(batch_size=24, worker_init_fn=seed_worker)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_opts)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_opts)


#### 모델

In [16]:
# Define the LSTM model
class LSTMModel(nn.Module):
    """LSTM followed by a linear head applied to the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of values produced per sequence.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the head's output for the final time step of each sequence.

        Args:
            x: Batch-first input of shape ``(batch, time, input_dim)``.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        # Size the zero initial state from the module's own configuration and create
        # it on the input's device/dtype, rather than reading the notebook-level
        # `hidden_dim` and `device` globals.
        state_shape = (self.lstm.num_layers, x.size(0), self.lstm.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)
        x, _ = self.lstm(x, (h0, c0))
        # Only the last time step feeds the linear head.
        return self.output(x[:, -1, :])

# Device and layer sizes used to rebuild the models (checkpoints are loaded in a later cell)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_dim = X_train.shape[2]  # size of the last axis of the windowed training array
hidden_dim = 50  # presumably the hidden size the saved checkpoints were trained with — confirm
output_dim = 1  # each metric is predicted by its own model, so output_dim is set to 1

In [21]:
# Load the models
def _load_lstm(checkpoint_path):
    """Rebuild an ``LSTMModel``, load its weights from ``checkpoint_path``, and set eval mode."""
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    net = LSTMModel(input_dim, hidden_dim, output_dim)
    net.load_state_dict(torch.load(checkpoint_path, map_location=device))
    net.to(device)
    net.eval()
    return net


# One checkpoint per model, numbered 1 through 7.
models = [
    _load_lstm(f'/home/yoonji/ictc2024/trained_models/model_{i}_20240627_173253.pth')
    for i in range(1, 8)
]

In [23]:
from sklearn.metrics import f1_score

num_labels = y_test.shape[1]
pred_rows = [[] for _ in range(num_labels)]  # one list of per-sample outputs per label
y_true = []

# Inference only: gradients are not needed.
with torch.no_grad():
    for data, labels in test_loader:
        data, labels = data.to(device), labels.to(device)
        y_true.extend(labels.cpu().numpy())

        # Model j scores label j for the current batch.
        for j in range(num_labels):
            outputs = models[j](data)
            pred_rows[j].extend(outputs.cpu().numpy())

y_pred = [np.array(rows) for rows in pred_rows]
y_true = np.array(y_true)

# Macro F1 per label
# NOTE(review): the models return the raw linear-layer output (no sigmoid in forward);
# confirm that 0.5 is the intended cut-off for these scores.
f1_macros = []
for j, scores in enumerate(y_pred):
    f1_macro = f1_score(y_true[:, j], (scores > 0.5).astype(int), average='macro')
    f1_macros.append(f1_macro)
    print(f'F1 Macro for label {j}: {f1_macro:.4f}')

# Mean of the macro F1 scores across all labels
average_f1_macro = np.mean(f1_macros)
print(f'Average F1 Macro over all labels: {average_f1_macro:.4f}')

F1 Macro for label 0: 0.4341
F1 Macro for label 1: 0.4868
F1 Macro for label 2: 0.5248
F1 Macro for label 3: 0.4840
F1 Macro for label 4: 0.3556
F1 Macro for label 5: 0.4962
F1 Macro for label 6: 0.5800
Average F1 Macro over all labels: 0.4802


In [24]:
# Each entry of y_pred holds the scores of one label. Flatten each entry and stack
# them as columns so final_pred has shape (n_samples, n_labels), matching the label
# columns of test_label it is written into. np.array(y_pred) would instead put the
# label axis first (and keep a trailing singleton axis), which does not fit that slice.
final_pred = (np.stack([np.ravel(scores) for scores in y_pred], axis=1) > 0.5).astype(int)

In [None]:
now = datetime.now()

# Overwrite the label columns (position 2 onward) of the answer template with the predictions.
test_label.iloc[:, 2:] = final_pred

# The file name embeds the run identifier plus the current date and time.
submission_path = f'./dataset/submission_{idx}_{now.date()}_{now.time()}.csv'
test_label.to_csv(submission_path, index=False)