### Library Import

In [1]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go


### Data Load

In [10]:
# Directory (relative to this notebook) that holds train.csv, test.csv and the hourly feature CSVs.
data_path: str = "../../../data"

In [11]:
# Resolve the two input files once so every reader below uses the same path.
train_csv_path = os.path.join(data_path, "train.csv")
test_csv_path = os.path.join(data_path, "test.csv")

# Tag each row with its origin so the splits can be separated again after the
# external features have been merged onto the combined frame.
train_df: pd.DataFrame = pd.read_csv(train_csv_path).assign(_type="train")
test_df: pd.DataFrame = pd.read_csv(test_csv_path).assign(_type="test")

# Untouched copy of the test file, used later as the submission template.
submission_df: pd.DataFrame = pd.read_csv(test_csv_path)

# Stack train on top of test (row-wise).
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [12]:
# Collect the CSV files whose names start with "HOURLY_" and contain "ALL".
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and 'ALL' in f and f.endswith(".csv")
]
file_names.append('HOURLY_MARKET-DATA_COINBASE-PREMIUM-INDEX.csv')

# Also collect every CSV whose name contains 'NETWORK'.
network_files = [
    f for f in os.listdir(data_path) if 'NETWORK' in f and f.endswith(".csv")
]
file_names.extend(network_files)

# A file may satisfy more than one rule above; keep the first occurrence only
# (order preserved) so each file is read from disk exactly once.
file_names = list(dict.fromkeys(file_names))

# Map "file name without extension" -> DataFrame.
# splitext removes only the trailing extension, unlike str.replace which would
# also strip a ".csv" appearing in the middle of a name.
file_dict: Dict[str, pd.DataFrame] = {
    os.path.splitext(f)[0]: pd.read_csv(os.path.join(data_path, f)) for f in file_names
}

# Upper bound (inclusive) for the timestamps kept from each feature file.
# NOTE(review): if test.csv contains IDs later than this date, the left merge
# below leaves every merged feature NaN for those test rows — confirm the
# date range of test.csv before relying on these features at inference time.
filter_date = "2023-12-31 23:00:00"

# The merge key only needs converting once; doing it inside the loop repeated
# the same conversion for every file.
df['ID'] = pd.to_datetime(df['ID'])

for _file_name, _df in tqdm(file_dict.items()):
    # Parse the timestamp column and keep rows up to filter_date.
    _df['datetime'] = pd.to_datetime(_df['datetime'])
    _df = _df[_df['datetime'] <= filter_date]

    # Prefix every column with the lower-cased file name to avoid collisions
    # between files; the timestamp column becomes the merge key "ID".
    _rename_rule = {
        col: f"{_file_name.lower()}_{col.lower()}" if col != "datetime" else "ID"
        for col in _df.columns
    }
    _df = _df.rename(_rename_rule, axis=1)

    # Left merge keeps every train/test row even when a file has no match.
    df = df.merge(_df, on="ID", how="left")


100%|██████████| 43/43 [00:02<00:00, 18.44it/s]


### EDA (Exploratory Data Analysis)

In [13]:
# Split the merged frame back into train and test using the _type tag, then
# discard the tag since it is no longer needed.
is_train_row = df["_type"] == "train"
is_test_row = df["_type"] == "test"

train_df = df.loc[is_train_row].drop("_type", axis=1)
test_df = df.loc[is_test_row].drop("_type", axis=1)

In [16]:
from sklearn.preprocessing import PowerTransformer, StandardScaler

def preprocess_data(df, outlier_threshold=5.0):
    """Impute, outlier-filter, power-transform and standardise the feature columns.

    Every column other than ``ID``, ``target`` and ``_type`` is treated as a
    feature. Those three columns are optional and are passed through unchanged
    (an all-NaN ``target`` is kept, not dropped).

    Steps:
      1. Drop feature columns that are entirely NaN; fill remaining NaNs with
         the column mean.
      2. For each feature column in turn, compute the 1.5*IQR fences on the rows
         still kept; if more than ``outlier_threshold`` percent of those rows
         fall outside the fences, drop those rows.
      3. Apply a Yeo-Johnson power transform, then standard scaling.

    Args:
        df: Input frame. It is not modified.
        outlier_threshold: Percentage (0-100) of outlying rows a column must
            exceed before its outlying rows are removed.

    Returns:
        A new DataFrame holding only the rows that survived step 2, with the
        original index labels and column order, and the feature columns
        replaced by their transformed values. Because rows can be removed, the
        result may be shorter than ``df``.
    """
    # Identifier/label columns are optional so the function also works on
    # frames from which `_type` (or `target`) has already been removed.
    meta_cols = [col for col in ("ID", "target", "_type") if col in df.columns]

    # Missing values: only feature columns take part, so an all-NaN `target`
    # survives and `ID` is never included in the mean computation.
    features = df.drop(columns=meta_cols).dropna(axis=1, how='all')
    features = features.fillna(features.mean())

    # Track surviving rows by position rather than by label, so the result is
    # correct even when the index is non-default or contains duplicates.
    keep_pos = np.arange(len(features))

    # Outlier removal (quantiles are recomputed on the rows kept so far).
    for col in features.columns:
        values = features[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        is_outlier = ((values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)).to_numpy()
        outlier_percentage = is_outlier.mean() * 100 if len(is_outlier) else 0.0

        if outlier_percentage > outlier_threshold:
            features = features[~is_outlier]
            keep_pos = keep_pos[~is_outlier]

    # Keep the original column order, minus the all-NaN feature columns.
    kept_cols = set(meta_cols) | set(features.columns)
    ordered_cols = [col for col in df.columns if col in kept_cols]
    result = df.iloc[keep_pos][ordered_cols].copy()

    # Nothing to transform when no feature columns or no rows are left.
    if features.empty:
        return result

    # Yeo-Johnson is fitted independently per feature, so transforming the
    # whole frame at once matches transforming one column at a time.
    transformed = PowerTransformer(method='yeo-johnson').fit_transform(features)

    # Standard scaling, kept from the original pipeline.
    scaled = StandardScaler().fit_transform(transformed)

    # Write the values back positionally: `result` and `scaled` have the same
    # row order, so no index alignment is involved. The original rebuilt the
    # scaled frame with a fresh 0..n-1 index and used DataFrame.update, which
    # aligned the values onto the wrong rows.
    for position, col in enumerate(features.columns):
        result[col] = scaled[:, position]

    return result

In [17]:
# Hold out 20% of the training rows for validation, stratified on the integer
# class label so both splits have the same class proportions.
train_features = train_df.drop(["target", "ID"], axis = 1)
train_target = train_df["target"].astype(int)

x_train, x_valid, y_train, y_valid = train_test_split(
    train_features,
    train_target,
    test_size=0.2,
    random_state=42,
    stratify=train_target,
)

# Report the size of each split.
for split_name, split in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_valid", x_valid),
    ("y_valid", y_valid),
):
    print(f"{split_name} length:", len(split))


x_train length: 7008
y_train length: 7008
x_valid length: 1752
y_valid length: 1752


### Model Training

In [18]:
# Wrap both splits as LightGBM datasets; the validation set references the
# training set.
train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)

# Booster hyper-parameters: 4-class softmax objective evaluated with
# multiclass log-loss.
params = dict(
    boosting_type="gbdt",
    objective="multiclass",
    metric="multi_logloss",
    num_class=4,
    num_leaves=50,  # maximum leaves per tree (higher = more complex trees)
    learning_rate=0.05,
    n_estimators=30,
    random_state=42,
    verbose=0,
)

# Fit on the training split while monitoring the validation split.
lgb_model = lgb.train(params, train_data, valid_sets=[valid_data])

# Per-class probabilities for the validation rows, and the most likely class.
y_valid_pred = lgb_model.predict(x_valid)
y_valid_pred_class = np.argmax(y_valid_pred, axis=1)

# Validation metrics: plain accuracy and one-vs-rest AUROC.
accuracy = accuracy_score(y_valid, y_valid_pred_class)
auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

print(f"acc: {accuracy}, auroc: {auroc}")



acc: 0.464041095890411, auroc: 0.6407336297995615


In [19]:
# After checking validation performance, retrain on the FULL training set.
# `train_data` only holds the 80% split, so reusing it here would leave the
# validation rows out of the final model; build a dataset from all of train_df.
full_train_data = lgb.Dataset(
    train_df.drop(["target", "ID"], axis = 1),
    label=train_df["target"].astype(int),
)
lgb_model = lgb.train(
    params=params,
    train_set=full_train_data,
)



### Inference

In [20]:
# Display the validation-set class probabilities assigned in the Model Training cell
# (one row per validation sample, one column per class).
y_valid_pred

array([[0.03345759, 0.35758091, 0.56550803, 0.04345347],
       [0.09095249, 0.28688127, 0.56554945, 0.05661679],
       [0.19330374, 0.3229138 , 0.34461212, 0.13917034],
       ...,
       [0.05811356, 0.56967077, 0.26904339, 0.10317228],
       [0.067821  , 0.31912057, 0.44577179, 0.16728664],
       [0.03076975, 0.40286064, 0.517877  , 0.04849261]])

In [21]:
# Score the test rows: drop the label/key columns, predict per-class
# probabilities, then take the most likely class for each row.
test_features = test_df.drop(columns=["target", "ID"])
y_test_pred = lgb_model.predict(test_features)
y_test_pred_class = np.argmax(y_test_pred, axis=1)

### Output File Save

In [16]:
# Attach the predicted classes to the submission template and write it to disk
# without the index column.
output_path = "output.csv"
submission_df = submission_df.assign(target=y_test_pred_class)
submission_df.to_csv(output_path, index=False)