# 02) 피처 추출 & 정규화
- GH Archive raw(.json.gz)에서 이벤트별 피처를 추출 → 원-핫 인코딩 → 결측/스케일 정리.
- RobustScaler → MinMaxScaler(0~1).
- 산출물: `features_raw.parquet`(스케일 전 원본 피처), `features.parquet`(스케일 후), `robust_*.joblib`, `minmax_*.joblib`, `features_*.txt`

In [None]:

# --- Notebook configuration -------------------------------------------------
# Project root handed to get_paths() to resolve the data/model directories.
BASE_DIR = r"C:\Users\EL040\Desktop\MS_3rd-Project\basemodel"

# When True, only *.filtered.json.gz input files are used.
USE_FILTERED = False

# Which dataset split to process: "train" or "test".
SPLIT = "train"

# Tag passed to scale_and_save(); it simply mirrors the split name.
TAG = SPLIT


In [None]:

import os, gzip
import pandas as pd
from tqdm import tqdm

from common import get_paths, parse_json_gz, extract_features_from_event, build_dataframe, scale_and_save

# Feature-extraction pipeline for one dataset split:
#   1. resolve input/output directories for SPLIT,
#   2. parse every selected *.json.gz file and collect per-event feature dicts,
#   3. build a DataFrame and save it unscaled,
#   4. scale it via scale_and_save() and save the scaled frame.

# SPLIT is documented as "train" or "test". Reject anything else so that a
# typo (e.g. "Train") cannot silently fall through to the test directories
# and end up being used as the scaler tag.
if SPLIT not in ("train", "test"):
    raise ValueError(f'SPLIT must be "train" or "test", got {SPLIT!r}')

paths = get_paths(BASE_DIR)
raw_dir = paths["data_train_raw"] if SPLIT == "train" else paths["data_test_raw"]
out_dir = paths["data_train_feat"] if SPLIT == "train" else paths["data_test_feat"]
scaler_dir = paths["model_dir"]

# Create the destination folders up front so a missing directory fails fast
# instead of after the (potentially long) parse loop below.
# scaler_dir is presumably where scale_and_save() writes — creating it is a
# harmless no-op if it already exists.
os.makedirs(out_dir, exist_ok=True)
os.makedirs(scaler_dir, exist_ok=True)

# ".filtered.json.gz" also ends with ".json.gz", so a single suffix check is
# equivalent to the original two-part condition.
# NOTE(review): with USE_FILTERED=False, any *.filtered.json.gz files sitting
# next to the raw files are read as well — confirm this cannot double-count
# events.
suffix = ".filtered.json.gz" if USE_FILTERED else ".json.gz"
files = sorted(
    os.path.join(raw_dir, name)
    for name in os.listdir(raw_dir)
    if name.endswith(suffix)
)

print("Input files:", len(files))
feats = []
for fp in tqdm(files, ncols=30, desc="parse"):
    for evt in parse_json_gz(fp):
        fd = extract_features_from_event(evt)
        # Events for which the extractor returns None are skipped.
        if fd is not None:
            feats.append(fd)

df = build_dataframe(feats)
print("Raw feature frame:", df.shape)
if df.empty:
    raise SystemExit("No features extracted. Check input files or filters.")

# Save raw (unscaled) features (optional)
raw_feat_path = os.path.join(out_dir, "features_raw.parquet")
df.to_parquet(raw_feat_path, index=False)
print("Saved raw features:", raw_feat_path)

# Scale and save scalers
# NOTE(review): this is called for both splits with TAG == SPLIT. If
# scale_and_save() fits fresh scalers, the test split would be scaled with
# its own statistics rather than the train-fitted ones — confirm in common.py.
X_scaled, robust, mm, cols = scale_and_save(df, scaler_dir, TAG)
print("Scaled shape:", X_scaled.shape, "Num features:", len(cols))

# Persist scaled features
scaled_feat_path = os.path.join(out_dir, "features.parquet")
pd.DataFrame(X_scaled, columns=cols).to_parquet(scaled_feat_path, index=False)
print("Saved scaled features:", scaled_feat_path)
