# 02) 피처 추출 & 정규화
- GH Archive raw(.json.gz)에서 이벤트별 피처를 추출 → 원-핫 인코딩 → 결측/스케일 정리.
- RobustScaler → MinMaxScaler(0~1).
- 산출물: `features_raw.parquet`(스케일링 전), `features.parquet`(스케일링 후), `robust_*.joblib`, `minmax_*.joblib`, `features_*.txt`

In [4]:

# Notebook configuration.
# BASE_DIR: project root directory handed to get_paths() by the next cell.
BASE_DIR = r"C:\Users\EL040\Desktop\MS_3rd-Project\basemodel"
# SPLIT: which data split to process -- "train" or "test".
SPLIT = "train"
# USE_FILTERED: when True, only *.filtered.json.gz inputs are used.
USE_FILTERED = False
TAG = SPLIT

# Fail fast on a mistyped split: the following cells only compare against
# "train" and would silently treat any other value as the test split.
if SPLIT not in ("train", "test"):
    raise ValueError(f"SPLIT must be 'train' or 'test', got {SPLIT!r}")


In [2]:

import os, gzip
import pandas as pd
from tqdm import tqdm

from common import get_paths, parse_json_gz, extract_features_from_event, build_dataframe, scale_and_save

def _iter_event_features(file_paths):
    """Yield every non-None feature record extracted from the given files."""
    for path in file_paths:
        for event in parse_json_gz(path):
            record = extract_features_from_event(event)
            if record is None:
                # The extractor returned nothing for this event; skip it.
                continue
            yield record


# Resolve the input, output and scaler directories for the selected split.
paths = get_paths(BASE_DIR)
if SPLIT == "train":
    raw_dir = paths["data_train_raw"]
    out_dir = paths["data_train_feat"]
else:
    raw_dir = paths["data_test_raw"]
    out_dir = paths["data_test_feat"]
scaler_dir = paths["model_dir"]

# Collect the input archives in sorted order. With USE_FILTERED set, only
# "*.filtered.json.gz" names qualify; otherwise every "*.json.gz" name does.
# NOTE(review): the unfiltered mode also matches "*.filtered.json.gz" names --
# confirm raw and filtered archives never share a directory.
files = sorted(
    os.path.join(raw_dir, name)
    for name in os.listdir(raw_dir)
    if name.endswith(".filtered.json.gz" if USE_FILTERED else ".json.gz")
)

print("Input files:", len(files))

# Accumulate one feature record per event, with a progress bar over files.
feats = []
feats.extend(_iter_event_features(tqdm(files, ncols=30, desc="parse")))

df = build_dataframe(feats)
print("Raw feature frame:", df.shape)
if df.empty:
    raise SystemExit("No features extracted. Check input files or filters.")


Input files: 84


parse: 100%|█| 84/84 [16:06<00


Raw feature frame: (11660754, 48)


In [3]:
# Write the unscaled feature frame next to the scaled output (optional copy).
raw_feat_path = os.path.join(out_dir, "features_raw.parquet")
df.to_parquet(raw_feat_path, index=False)
print("Saved raw features:", raw_feat_path)

from common import scale_and_save, transform_with_scalers, load_scalers

# Tag that identifies the base model's scalers on disk.
BASE_TAG = "base"
base_scaler_path = os.path.join(scaler_dir, f"robust_{BASE_TAG}.joblib")
# NOTE(review): only the robust scaler file is probed here; presumably the
# min-max scaler is always saved alongside it -- confirm in common.py.
is_base_scaler_exists = os.path.exists(base_scaler_path)

if is_base_scaler_exists or SPLIT != "train":
    # Reuse path: taken for the test split, or for train data processed
    # after the base scalers were already saved.
    print(f"Loading existing base scalers (tag: '{BASE_TAG}') to transform data...")
    if not is_base_scaler_exists:
        raise SystemExit(f"ERROR: Base scaler ('{BASE_TAG}') not found. Please run with a large initial training set first.")
    X_scaled, cols = transform_with_scalers(df, scaler_dir, BASE_TAG)
    print(f"Transformation complete. Scaled shape: {X_scaled.shape}, Num features: {len(cols)}")
else:
    # First-time path: train split with no saved base scaler yet, so
    # scale_and_save is called with the 'base' tag.
    print(f"Base scaler not found. Creating and saving new scalers with tag '{BASE_TAG}'...")
    X_scaled, robust, mm, cols = scale_and_save(df, scaler_dir, BASE_TAG)
    print(f"Saved base scalers. Scaled shape: {X_scaled.shape}, Num features: {len(cols)}")

# Store the scaled matrix as a parquet file with the feature column names.
scaled_feat_path = os.path.join(out_dir, "features.parquet")
pd.DataFrame(X_scaled, columns=cols).to_parquet(
    scaled_feat_path,
    index=False,
)
print("Saved scaled features:", scaled_feat_path)

Saved raw features: C:\Users\EL040\Desktop\MS_3rd-Project\basemodel\data\train\features_raw.parquet
Base scaler not found. Creating and saving new scalers with tag 'base'...
Saved base scalers. Scaled shape: (11660754, 48), Num features: 48
Saved scaled features: C:\Users\EL040\Desktop\MS_3rd-Project\basemodel\data\train\features.parquet
