In [2]:
import pandas as pd
df_train=pd.read_csv('train.csv')
df_train

Unnamed: 0,time,pv_id,appr_temp,ceiling,cloud_b,dew_point,precip_1h,pressure,real_feel_temp,real_feel_temp_shade,...,temp_a,temp_max,temp_min,wind_dir_a,wind_spd_a,coord1,coord2,type,energy,nins
0,2024-08-01 00:05:00+09:00,PV_ID_0,,,,,,,,,...,,,,,,-2.018131,-0.172021,train,,0.0
1,2024-08-01 00:10:00+09:00,PV_ID_0,,,,,,,,,...,,,,,,-2.018131,-0.172021,train,,0.0
2,2024-08-01 00:15:00+09:00,PV_ID_0,,,,,,,,,...,,,,,,-2.018131,-0.172021,train,0.0,0.0
3,2024-08-01 00:20:00+09:00,PV_ID_0,,,,,,,,,...,,,,,,-2.018131,-0.172021,train,,0.0
4,2024-08-01 00:25:00+09:00,PV_ID_0,,,,,,,,,...,,,,,,-2.018131,-0.172021,train,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19236943,2025-07-31 23:40:00+09:00,PV_ID_209,,,,,,,,,...,,,,,,0.971977,0.318426,train,,0.0
19236944,2025-07-31 23:45:00+09:00,PV_ID_209,,,,,,,,,...,,,,,,0.971977,0.318426,train,0.0,0.0
19236945,2025-07-31 23:50:00+09:00,PV_ID_209,,,,,,,,,...,,,,,,0.971977,0.318426,train,,0.0
19236946,2025-07-31 23:55:00+09:00,PV_ID_209,,,,,,,,,...,,,,,,0.971977,0.318426,train,,0.0


In [6]:
# =========================================================
# 표준 유의미 변수 선별 (이론 드롭 없음) - 호환성 패치
#  - pandas: CategoricalDtype 체크 방식 수정
#  - lightgbm: fit()의 verbose 인자 제거 + 콜백으로 로그 끄기
# =========================================================
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from scipy.stats import spearmanr, kendalltau
import lightgbm as lgb

def select_features_standard(
    df: pd.DataFrame,
    target_col: str = "nins",
    time_col: str = "time",
    id_col: str = "pv_id",

    # 1) 상관 1차 필터
    corr_threshold_first: float = 0.15,
    use_kendall: bool = True,
    min_samples_for_corr: int = 100,

    # 2) 속도/메모리 옵션
    sample_every_n: int = 6,
    daytime_only: bool = True,

    # 3) LGBM 경량 파라미터
    lgb_n_estimators: int = 300,
    lgb_learning_rate: float = 0.05,
    lgb_max_depth: int = 6,
    lgb_subsample: float = 0.7,
    lgb_colsample_bytree: float = 0.7,
    lgb_min_child_samples: int = 100,

    # 4) 스플릿/후처리
    valid_frac: float = 0.2,
    min_valid_size: int = 2000,
    top_k_after_gain: int = 22,
    dedupe_corr_threshold: float = 0.90,

    random_state: int = 42,
):
    df = df.copy()

    # 0) 정렬 + 야간 제외
    if time_col in df.columns:
        df = df.sort_values(time_col)
    if daytime_only and target_col in df.columns:
        df = df[df[target_col].fillna(0) > 0]

    # 1) 다운캐스트 + pv_id 카테고리(신규 방식)
    for c in df.columns:
        if c in [time_col, id_col, target_col]:
            continue
        if pd.api.types.is_numeric_dtype(df[c]):
            df[c] = pd.to_numeric(df[c], errors="coerce", downcast="float")
    if id_col in df.columns and not isinstance(df[id_col].dtype, CategoricalDtype):
        df[id_col] = df[id_col].astype("category")

    # 2) pv_id별 간격 샘플
    if sample_every_n and (time_col in df.columns) and (id_col in df.columns):
        df = (
            df.groupby(id_col, observed=True, sort=False)
              .nth(slice(None, None, sample_every_n))
              .reset_index()
        )

    # 3) 피처 풀 구성 (target/time/type/energy 제외)
    feats = []
    for c in df.columns:
        if c in [target_col, time_col, "type", "energy"]:
            continue
        if c == id_col:
            feats.append(c)
        elif pd.api.types.is_numeric_dtype(df[c]):
            feats.append(c)

    # pv_id 코드화
    if id_col in feats:
        df[id_col + "_code"] = df[id_col].cat.codes.astype("int32")
        feats = [f for f in feats if f != id_col] + [id_col + "_code"]

    y = df[target_col].astype("float32")

    # 4) Spearman(+선택 Kendall)
    spearman_scores, kendall_scores = {}, {}
    for c in feats:
        x = df[c]
        m = ~(x.isna() | y.isna())
        if m.sum() < min_samples_for_corr:
            continue
        rho, _ = spearmanr(x[m], y[m])
        spearman_scores[c] = 0.0 if np.isnan(rho) else float(rho)
        if use_kendall:
            try:
                tau, _ = kendalltau(x[m], y[m])
                kendall_scores[c] = 0.0 if np.isnan(tau) else float(tau)
            except Exception:
                kendall_scores[c] = 0.0
        else:
            kendall_scores[c] = 0.0

    sp_sorted = sorted(spearman_scores.items(), key=lambda kv: abs(kv[1]), reverse=True)
    sp_candidates = [c for c, v in sp_sorted if abs(v) >= corr_threshold_first]
    if len(sp_candidates) == 0:
        sp_candidates = [c for c, _ in sp_sorted[:30]]

    # 5) LightGBM gain (시간순 80/20)
    cols = [time_col, target_col] + sp_candidates
    cols = [c for c in cols if c in df.columns]
    df_small = df[cols].dropna(subset=[target_col]).copy()

    if len(df_small) < (min_valid_size + 100):
        split_idx = int(len(df_small) * (1 - valid_frac))
    else:
        split_idx = max(len(df_small) - min_valid_size, int(len(df_small) * (1 - valid_frac)))
    split_idx = min(max(1, split_idx), len(df_small) - 1)

    train_df, valid_df = df_small.iloc[:split_idx], df_small.iloc[split_idx:]
    X_tr, y_tr = train_df[sp_candidates], train_df[target_col].astype("float32")
    X_va, y_va = valid_df[sp_candidates], valid_df[target_col].astype("float32")

    lgbm = lgb.LGBMRegressor(
        n_estimators=lgb_n_estimators,
        learning_rate=lgb_learning_rate,
        max_depth=lgb_max_depth,
        subsample=lgb_subsample,
        colsample_bytree=lgb_colsample_bytree,
        min_child_samples=lgb_min_child_samples,
        random_state=random_state,
        n_jobs=-1,
        # 일부 버전 호환: 모델 레벨에서 로깅 억제
        verbosity=-1,
    )
    # fit()에서 verbose 인자 제거, 대신 콜백으로 로그 억제
    lgbm.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="l1",                # 필요시 "mae"로 바꿔도 동일
        callbacks=[lgb.log_evaluation(period=0)]
    )

    gain = lgbm.booster_.feature_importance(importance_type="gain")
    gain_map = dict(zip(sp_candidates, gain))
    gain_sorted = sorted(gain_map.items(), key=lambda kv: kv[1], reverse=True)
    top_gain_feats = [c for c, g in gain_sorted if g > 0][:top_k_after_gain]
    if len(top_gain_feats) == 0:
        top_gain_feats = sp_candidates[:top_k_after_gain]

    # 6) 중복 제거
    keep, removed = [], set()
    corr_mat = df[top_gain_feats].corr(method="spearman").abs()
    for c in sorted(top_gain_feats, key=lambda x: gain_map.get(x, 0), reverse=True):
        if c in removed:
            continue
        if keep:
            max_corr = corr_mat.loc[c, keep].max()
            if pd.notna(max_corr) and max_corr > dedupe_corr_threshold:
                removed.add(c)
                continue
        keep.append(c)

    # 7) 진단 테이블
    def _get(d, k, default=0.0): return d.get(k, default)
    rows = []
    for c in set(sp_candidates) | set(top_gain_feats):
        rows.append({
            "feature": c,
            "spearman": _get(spearman_scores, c),
            "kendall": _get(kendall_scores, c),
            "gain": float(gain_map.get(c, 0.0)),
            "selected": c in keep
        })
    ranking_df = (
        pd.DataFrame(rows)
        .sort_values(["selected", "gain", "spearman"], ascending=[False, False, False])
        .reset_index(drop=True)
    )
    return keep, ranking_df



In [7]:
if 'pv_id' in df_train.columns:
    df_train['pv_id'] = df_train['pv_id'].astype('category')

selected_feats, ranking_tbl = select_features_standard(
    df_train,
    target_col="nins", time_col="time", id_col="pv_id",
    corr_threshold_first=0.15, use_kendall=True,
    sample_every_n=6, daytime_only=True,
    lgb_n_estimators=300, lgb_max_depth=6,
    lgb_subsample=0.7, lgb_colsample_bytree=0.7,
    lgb_min_child_samples=100,
    valid_frac=0.2, min_valid_size=2000,
    top_k_after_gain=22, dedupe_corr_threshold=0.90,
    random_state=42
)

print(f"[Selected {len(selected_feats)}] :", selected_feats[:15], ("..." if len(selected_feats)>15 else ""))
ranking_tbl.head(20)



[Selected 10] : ['uv_idx', 'humidity', 'ceiling', 'real_feel_temp', 'cloud_a', 'rain', 'cloud_b', 'rel_hum', 'wind_spd_b', 'precip_1h'] 


Unnamed: 0,feature,spearman,kendall,gain,selected
0,uv_idx,0.826333,0.642233,88411750000.0,True
1,humidity,-0.473633,-0.325915,8614741000.0,True
2,ceiling,0.156705,0.104189,6022503000.0,True
3,real_feel_temp,0.226509,0.151725,2258850000.0,True
4,cloud_a,-0.287468,-0.20186,2066018000.0,True
5,rain,-0.272738,-0.218076,1919652000.0,True
6,cloud_b,-0.255506,-0.174136,1127631000.0,True
7,rel_hum,-0.399342,-0.271889,1013326000.0,True
8,wind_spd_b,0.175633,0.117112,378218700.0,True
9,precip_1h,-0.286726,-0.226478,331941800.0,True


In [8]:
print(ranking_tbl.shape)      # (행, 열) → 현재 15행일 것
print(ranking_tbl.tail())     # 끝부분 확인


(15, 5)
            feature  spearman   kendall          gain  selected
10         temp_min  0.177673  0.118607  1.115947e+09     False
11  wind_chill_temp  0.169904  0.113319  1.045595e+09     False
12           temp_b  0.176816  0.118020  7.935110e+08     False
13         temp_max  0.176225  0.117564  5.029253e+08     False
14           temp_a  0.177217  0.118257  3.409236e+08     False


In [9]:
all_feats = [c for c in df_train.columns if c not in ["nins","time","type","energy"]]
all_feats = [("pv_id_code" if c=="pv_id" else c) for c in all_feats]
in_table = set(ranking_tbl["feature"].tolist())
dropped = [c for c in all_feats if c not in in_table]
len(dropped), dropped[:10]


(14,
 ['pv_id_code',
  'appr_temp',
  'dew_point',
  'pressure',
  'real_feel_temp_shade',
  'vis',
  'wind_dir_b',
  'wind_gust_spd',
  'ground_press',
  'snow'])

In [10]:
#====================================================
#2. **원본에서 보간 필요 없는 파생만 생성**
#    - `hr_sin/cos, doy_sin/cos, elev_sin, AM` 같이 시간·좌표 기반 파생(결측과 무관).
#    - 구름·가시거리·습윤 파생은 나중(보간 후)로 미룸.
#=====================================================
import numpy as np
import pandas as pd

def add_time_coord_features(
    df: pd.DataFrame,
    time_col: str = "time",
    lat_col: str = "coord1",   # 위도(deg)라고 가정
    lon_col: str = "coord2",   # 경도(deg, 동경 +, 서경 -)라고 가정
) -> pd.DataFrame:
    """
    원본(미보간) 상태에서 바로 만들어도 안전한 파생만 추가:
      - hr_sin, hr_cos (하루 주기 24h)
      - doy_sin, doy_cos (연 주기 365d)
      - elev_sin        (태양 고도각의 sin; 음수는 0으로 클리핑)
      - AM              (Air Mass; elev<=0이면 NaN)
    좌표가 없거나 time이 없으면 가능한 항목만 만듭니다.
    """
    out = df.copy()

    # --- time 처리 ---
    if time_col not in out.columns:
        raise KeyError(f"'{time_col}' 컬럼이 필요합니다.")
    if not np.issubdtype(out[time_col].dtype, np.datetime64):
        out[time_col] = pd.to_datetime(out[time_col], errors="coerce")

    t = out[time_col]
    # 1) 하루 주기: hour + minute/60 + second/3600
    hour_float = t.dt.hour.astype("float32") + (t.dt.minute.astype("float32")/60.0) + (t.dt.second.astype("float32")/3600.0)
    out["hr_sin"] = np.sin(2.0*np.pi * hour_float/24.0).astype("float32")
    out["hr_cos"] = np.cos(2.0*np.pi * hour_float/24.0).astype("float32")

    # 2) 연 주기: dayofyear (1~365/366) -> 365로 정규화(윤년 영향 미세)
    doy = t.dt.dayofyear.astype("float32")
    out["doy_sin"] = np.sin(2.0*np.pi * doy/365.0).astype("float32")
    out["doy_cos"] = np.cos(2.0*np.pi * doy/365.0).astype("float32")

    # --- 좌표 기반(있을 때만) 태양고도계산 ---
    has_latlon = (lat_col in out.columns) and (lon_col in out.columns)
    if has_latlon:
        # 위도/경도 (deg -> rad)
        lat_deg = pd.to_numeric(out[lat_col], errors="coerce").astype("float32")
        lon_deg = pd.to_numeric(out[lon_col], errors="coerce").astype("float32")
        lat = np.deg2rad(lat_deg.values)
        lon = np.deg2rad(lon_deg.values)

        # 시간(UTC/로컬 상관없이 일관된 특징이면 OK) → 일단 'as-is'로 처리
        # (고정 타임존 데이터셋이면 큰 문제 없음. 더 정확히 하려면 UTC기준+방정식 적용)
        # 간단 NOAA 근사: 태양 적위/시각 방정식으로 고도각 근사
        # 1) 줄리안일(연중일) 기반 (0-index 보정용 -1)
        n = (t.dt.dayofyear.astype("float32") - 1.0).values

        # 2) 경사각/각주파수
        # 태양 적위 δ 근사
        # δ ≈ 23.44° * sin( 2π*(284+n)/365 )
        decl = np.deg2rad(23.44) * np.sin(2.0*np.pi*(284.0 + n)/365.0)

        # 3) 균시차(EoT) 근사 → 분 단위
        B = 2.0*np.pi*(n/365.0)
        eot_min = 229.18*(0.000075 + 0.001868*np.cos(B) - 0.032077*np.sin(B)
                          - 0.014615*np.cos(2*B) - 0.040849*np.sin(2*B))

        # 4) 지역표준시 기준의 시각(시) + 균시차/경도 보정으로 "태양시" 얻기
        # 간단화를 위해 현지 시각을 사용한다고 가정(데이터셋 일관성 중요)
        # 태양시 = 현지시 + (EoT + 4*(표준경도 - 경도deg))/60
        # 표준경도는 대략 경도 근처의 시간대 중앙(한국은 135E)로 근사 가능
        std_long_deg = np.where(np.isfinite(lon_deg.values), 135.0, np.nan).astype("float32")  # KR 가정
        solar_time = hour_float.values + (eot_min + 4.0*(std_long_deg - lon_deg.values))/60.0

        # 5) 시각각 H (라디안): 15° * (태양시 - 12)
        H = np.deg2rad(15.0*(solar_time - 12.0))

        # 6) 태양 고도각(alt) = asin( sinφ sinδ + cosφ cosδ cosH )
        sin_alt = np.sin(lat)*np.sin(decl) + np.cos(lat)*np.cos(decl)*np.cos(H)
        # 수치 안정/야간 처리
        sin_alt = np.clip(sin_alt, -1.0, 1.0)
        # 음수(지평선 아래)는 0으로 클리핑(일사량 모델 관점)
        sin_alt_clipped = np.where(sin_alt > 0, sin_alt, 0.0).astype("float32")
        out["elev_sin"] = sin_alt_clipped

        # 7) Air Mass (Kasten & Young 1989) — 고도각(deg) 필요
        elev_deg = np.degrees(np.arcsin(np.clip(sin_alt, -1.0, 1.0))).astype("float32")
        # elev<=0이면 AM은 NaN
        with np.errstate(invalid="ignore", divide="ignore"):
            am = 1.0 / (np.sin(np.deg2rad(elev_deg)) + 0.50572 * (6.07995 + elev_deg)**-1.6364)
        am = am.astype("float32")
        am[(elev_deg <= 0) | ~np.isfinite(am)] = np.nan
        out["AM"] = am
    else:
        # 좌표 없으면 elev_sin/AM은 만들지 않음
        pass

    # 메모리 정리(float32)
    for c in ["hr_sin","hr_cos","doy_sin","doy_cos","elev_sin","AM"]:
        if c in out.columns:
            out[c] = out[c].astype("float32")

    return out


In [11]:
# === 사용 예시 ===
df_train = add_time_coord_features(df_train, time_col="time", lat_col="coord1", lon_col="coord2")
df_train[["hr_sin","hr_cos","doy_sin","doy_cos","elev_sin","AM"]].head()

Unnamed: 0,hr_sin,hr_cos,doy_sin,doy_cos,elev_sin,AM
0,0.021815,0.999762,-0.516062,-0.856551,0.659357,1.51452
1,0.043619,0.999048,-0.516062,-0.856551,0.673916,1.481922
2,0.065403,0.997859,-0.516062,-0.856551,0.688149,1.451379
3,0.087156,0.996195,-0.516062,-0.856551,0.702049,1.422738
4,0.108867,0.994056,-0.516062,-0.856551,0.71561,1.395863


In [12]:
#=======================================================
#3. **시간순으로 Train/Valid/Test 스플릿**(또는 블록 CV)
#    - 여기서부터 **분할 간 정보가 섞이지 않게** 고정.
#======================================================
import pandas as pd
import numpy as np

def split_time_ordered(
    df: pd.DataFrame,
    time_col: str = "time",
    id_col: str | None = "pv_id",
    fracs: tuple[float, float, float] = (0.7, 0.15, 0.15),
    gap: str | None = "1H",
    per_id: bool = False,
    return_indices: bool = False,
    print_stats: bool = False,
):
    """
    시간 순서로 Train/Valid/Test 스플릿.
    - fracs: (train, valid, test) 합=1.0
    - gap: 경계 누설 방지 버퍼(예: "30min","1H","2H"). None이면 미사용.
    - per_id=True: pv_id별로 각자 비율 스플릿. False: 전체 시간축 기준 글로벌 스플릿.
    - return_indices=True: 인덱스 배열만 반환(메모리 절약). False면 잘린 DataFrame 반환.
    """
    a, b, c = fracs
    assert abs(a + b + c - 1.0) < 1e-9, "fracs 합이 1.0이어야 합니다."

    if time_col not in df.columns:
        raise KeyError(f"'{time_col}' 컬럼이 필요합니다.")
    t = pd.to_datetime(df[time_col], errors="coerce")

    if per_id and (id_col is not None) and (id_col in df.columns):
        tr_idx_parts, va_idx_parts, te_idx_parts = [], [], []
        # 그룹별로 자체 분위수 계산 → 인덱스만 축적(메모리↓)
        for _, g in df[[time_col, id_col]].assign(_idx=np.arange(len(df))).groupby(id_col, observed=True, sort=False):
            g = g.sort_values(time_col)
            tg = pd.to_datetime(g[time_col], errors="coerce")

            q1 = tg.quantile(a)
            q2 = tg.quantile(a + b)
            if gap:
                d = pd.to_timedelta(gap)
                tr = g.loc[tg <= q1, "_idx"].to_numpy()
                va = g.loc[(tg >= q1 + d) & (tg <= q2), "_idx"].to_numpy()
                te = g.loc[tg >= q2 + d, "_idx"].to_numpy()
            else:
                tr = g.loc[tg <= q1, "_idx"].to_numpy()
                va = g.loc[(tg > q1) & (tg <= q2), "_idx"].to_numpy()
                te = g.loc[tg > q2, "_idx"].to_numpy()

            if tr.size: tr_idx_parts.append(tr)
            if va.size: va_idx_parts.append(va)
            if te.size: te_idx_parts.append(te)

        tr_idx = np.concatenate(tr_idx_parts) if tr_idx_parts else np.array([], dtype=int)
        va_idx = np.concatenate(va_idx_parts) if va_idx_parts else np.array([], dtype=int)
        te_idx = np.concatenate(te_idx_parts) if te_idx_parts else np.array([], dtype=int)

    else:
        # 글로벌 시간축 기준 한 번만 분위수 계산
        q1 = t.quantile(a)
        q2 = t.quantile(a + b)
        if gap:
            d = pd.to_timedelta(gap)
            tr_idx = np.nonzero((t <= q1).to_numpy())[0]
            va_idx = np.nonzero(((t >= q1 + d) & (t <= q2)).to_numpy())[0]
            te_idx = np.nonzero((t >= q2 + d).to_numpy())[0]
        else:
            tr_idx = np.nonzero((t <= q1).to_numpy())[0]
            va_idx = np.nonzero(((t > q1) & (t <= q2)).to_numpy())[0]
            te_idx = np.nonzero((t > q2).to_numpy())[0]

    # 정렬(시간순 보기 좋게)
    tr_idx = np.sort(tr_idx)
    va_idx = np.sort(va_idx)
    te_idx = np.sort(te_idx)

    if print_stats:
        print(f"Train: {tr_idx.size:,}  Valid: {va_idx.size:,}  Test: {te_idx.size:,}")

    if return_indices:
        return tr_idx, va_idx, te_idx
    else:
        df_tr = df.iloc[tr_idx].sort_values(time_col)
        df_va = df.iloc[va_idx].sort_values(time_col)
        df_te = df.iloc[te_idx].sort_values(time_col)
        return df_tr, df_va, df_te


In [13]:
# 1) 인덱스만 받아 메모리 최소화
tr_idx, va_idx, te_idx = split_time_ordered(
    df_train, time_col="time", id_col="pv_id",
    fracs=(0.70, 0.15, 0.15), gap="1H",
    per_id=False, return_indices=True, print_stats=True
)

# 필요할 때만 잘라 쓰기
df_tr = df_train.iloc[tr_idx]
df_va = df_train.iloc[va_idx]
df_te = df_train.iloc[te_idx]

# 2) 혹시 발전소별 개별 스플릿이 필요하면:
# df_tr, df_va, df_te = split_time_ordered(
#     df_train, time_col="time", id_col="pv_id",
#     fracs=(0.70, 0.15, 0.15), gap="1H",
#     per_id=True, return_indices=False, print_stats=True
# )


  d = pd.to_timedelta(gap)


Train: 13,465,864  Valid: 2,883,531  Test: 2,883,527


In [14]:
#=============================================
#4. **분할별로(Train/Valid/Test 각각) 피처 보간**
#    - `pv_id`별로 선형/ffill/bfill 등 적용(경계 넘어 보간 금지).
#    - 타깃(nins/예측값)은 **보간 금지**.
#    - 스케일링·중앙값 대체가 필요하면 **Train 통계로만** 계산해 Valid/Test에 적용.
#==============================================
import numpy as np
import pandas as pd

TARGET = "nins"
TIME   = "time"
ID     = "pv_id"

def null_report(df, top=20):
    rep = (df.isna().sum()
             .sort_values(ascending=False)
             .head(top))
    print(rep)


In [15]:
#Step 1) 보간/보존 컬럼 정의 (자동 + 수동 오버라이드 가능)
# 1) 수치형 피처들(타깃/시간/비수치 제외)
numeric_cols = [c for c in df_train.columns
                if c not in [TARGET, TIME]
                and pd.api.types.is_numeric_dtype(df_train[c])]

# 2) 각도형(원형) → 선형보간 금지, ffill→bfill 로만
angle_cols = [c for c in ["wind_dir_a","wind_dir_b"] if c in df_train.columns]

# 3) 의미 있는 NaN 보존 (야간 AM 등)
preserve_nan_cols = [c for c in ["AM"] if c in df_train.columns]

# 4) 선형 보간 대상 = 수치형 - (각도형 ∪ 보존 컬럼)
linear_cols = [c for c in numeric_cols
               if c not in set(angle_cols) | set(preserve_nan_cols)]

print("linear_cols:", len(linear_cols), linear_cols[:10], "...")
print("angle_cols :", angle_cols)
print("preserve   :", preserve_nan_cols)


linear_cols: 32 ['appr_temp', 'ceiling', 'cloud_b', 'dew_point', 'precip_1h', 'pressure', 'real_feel_temp', 'real_feel_temp_shade', 'rel_hum', 'temp_b'] ...
angle_cols : ['wind_dir_a', 'wind_dir_b']
preserve   : ['AM']


In [16]:
#Step 2) 분할별 슬라이스(뷰)와 사전 점검
# 인덱스는 이미 있음: tr_idx, va_idx, te_idx
df_tr = df_train.iloc[tr_idx].copy()
df_va = df_train.iloc[va_idx].copy()
df_te = df_train.iloc[te_idx].copy()

# 시간/PV 정렬 (그룹 보간 전 기본)
def _sort(df):
    if ID in df.columns:
        return df.sort_values([ID, TIME])
    else:
        return df.sort_values(TIME)

df_tr = _sort(df_tr)
df_va = _sort(df_va)
df_te = _sort(df_te)

print("BEFORE nulls (top):")
null_report(df_tr)


BEFORE nulls (top):
appr_temp               12343708
cloud_b                 12343708
ceiling                 12343708
dew_point               12343708
precip_1h               12343708
real_feel_temp          12343708
pressure                12343708
temp_a                  12343708
temp_max                12343708
real_feel_temp_shade    12343708
rel_hum                 12343708
temp_b                  12343708
uv_idx                  12343708
vis                     12343708
wind_chill_temp         12343708
wind_dir_b              12343708
wind_gust_spd           12343708
wind_spd_b              12343708
cloud_a                 12343708
ground_press            12343708
dtype: int64


In [17]:
#Step 3) 각도형 ⇒ ffill→bfill (그룹 내부)
def fill_ffill_bfill_by_group(df, cols, id_col=ID):
    if not cols: 
        return df
    if id_col in df.columns:
        df[cols] = (df.groupby(id_col, observed=True, sort=False)[cols]
                      .apply(lambda g: g.ffill().bfill())
                      .reset_index(level=id_col, drop=True))
    else:
        df[cols] = df[cols].ffill().bfill()
    return df

# 실행
df_tr = fill_ffill_bfill_by_group(df_tr, angle_cols, ID)
df_va = fill_ffill_bfill_by_group(df_va, angle_cols, ID)
df_te = fill_ffill_bfill_by_group(df_te, angle_cols, ID)


In [18]:
#Step 4) 연속형 ⇒ 선형 보간 (그룹 내부, 경계 넘지 않음)
def linear_interpolate_by_group(df, cols, id_col=ID):
    if not cols:
        return df
    if id_col in df.columns:
        def _lin(g):
            # limit_direction="both" : 그룹 내부에서만 양방향
            return g.interpolate(method="linear", limit_direction="both")
        df[cols] = (df.groupby(id_col, observed=True, sort=False)[cols]
                      .apply(_lin)
                      .reset_index(level=id_col, drop=True))
    else:
        df[cols] = df[cols].interpolate(method="linear", limit_direction="both")
    return df

# 실행 (타깃은 포함 X)
df_tr = linear_interpolate_by_group(df_tr, linear_cols, ID)
df_va = linear_interpolate_by_group(df_va, linear_cols, ID)
df_te = linear_interpolate_by_group(df_te, linear_cols, ID)


In [19]:
#Step 5) (옵션) Train 중앙값으로 잔여 결측 일괄 대체
fill_candidates = [c for c in numeric_cols if c not in preserve_nan_cols]
train_medians = df_tr[fill_candidates].median(numeric_only=True)

def fill_with_train_median(df, cols, med):
    cols = [c for c in cols if c in df.columns]
    df[cols] = df[cols].fillna(med[cols])
    return df

df_tr = fill_with_train_median(df_tr, fill_candidates, train_medians)
df_va = fill_with_train_median(df_va, fill_candidates, train_medians)
df_te = fill_with_train_median(df_te, fill_candidates, train_medians)


In [21]:
#Step 6) 다운캐스트 + 최종 점검
# 메모리 절약: float → float32
for df_part in (df_tr, df_va, df_te):
    for c in numeric_cols:
        if c in df_part.columns:
            df_part[c] = pd.to_numeric(df_part[c], errors="coerce", downcast="float")

print("\nAFTER nulls (top):")
null_report(df_tr)
null_report(df_va)
null_report(df_te)



AFTER nulls (top):
AM                      6726669
pv_id                         0
time                          0
ceiling                       0
cloud_b                       0
dew_point                     0
appr_temp                     0
precip_1h                     0
pressure                      0
real_feel_temp_shade          0
real_feel_temp                0
temp_b                        0
uv_idx                        0
vis                           0
rel_hum                       0
wind_dir_b                    0
wind_gust_spd                 0
wind_spd_b                    0
cloud_a                       0
ground_press                  0
dtype: int64
AM                      1440438
pv_id                         0
time                          0
ceiling                       0
cloud_b                       0
dew_point                     0
appr_temp                     0
precip_1h                     0
pressure                      0
real_feel_temp_shade          0
real_fe

In [23]:
# 분할별 컬럼 개수
print("train cols:", len(df_tr.columns))
print("valid cols:", len(df_va.columns))
print("test  cols:", len(df_te.columns))

# 전체 결측 리포트(전 컬럼)
rep_all_tr = df_tr.isna().sum().sort_values(ascending=False)
rep_all_va = df_va.isna().sum().sort_values(ascending=False)
rep_all_te = df_te.isna().sum().sort_values(ascending=False)

print("Train - 결측>0:")
print(rep_all_tr[rep_all_tr>0])
print("\nValid - 결측>0:")
print(rep_all_va[rep_all_va>0])
print("\nTest - 결측>0:")
print(rep_all_te[rep_all_te>0])


train cols: 39
valid cols: 39
test  cols: 39
Train - 결측>0:
AM    6726669
dtype: int64

Valid - 결측>0:
AM    1440438
dtype: int64

Test - 결측>0:
AM    1449040
dtype: int64


In [25]:
#1) 야간 플래그 추가

for dfp in (df_tr, df_va, df_te):
    dfp["is_night"] = ((dfp["AM"].isna()) | (dfp.get("elev_sin", 0) <= 0)).astype("int8")


In [26]:
#2) 모델별 AM 처리
for dfp in (df_tr, df_va, df_te):
    dfp["AM_filled"] = dfp["AM"].fillna(dfp["AM"].median())  # 필요할 때만 사용


In [27]:
#3) 간단 일관성 체크(옵션)
def check_am_elev(df):
    if "elev_sin" in df.columns:
        m = df["AM"].isna()
        print("mean elev_sin at AM NaN:", float(df.loc[m, "elev_sin"].mean()))

for part in (df_tr, df_va, df_te):
    check_am_elev(part)


mean elev_sin at AM NaN: 0.0
mean elev_sin at AM NaN: 0.0
mean elev_sin at AM NaN: 0.0


In [28]:
#=========================================
#5. **보간 끝난 데이터에서 나머지 파생 생성 + 변수 드롭 확정**
#    - `cloud_mean/diff, vis_log, haze, dew_spread, is_rain/snow, ceil_inv, u/v풍향` 등 생성.
#    - 1단계에서 약하다고 본 변수 + 중복 변수(`appr_temp, real_feel_*, wind_chill_temp, temp_b` 등) 드롭.
#=========================================
#셀 1) 설정 + 유틸 (안전 캐스팅, 존재하면 만드는 헬퍼)
import numpy as np
import pandas as pd

# 수치형은 가볍게 float32로, 플래그는 int8로
def _as_f32(x): return pd.to_numeric(x, errors="coerce", downcast="float")
def _as_i8(x):  return x.astype("int8")

def _has(df, cols: set[str]):  # 모든 컬럼이 존재하는지
    return cols.issubset(df.columns)

# 새 컬럼을 안전하게 덧붙이는 헬퍼 (체이닝 경고 회피)
def _add_col(df: pd.DataFrame, name: str, values):
    df.loc[:, name] = values  # in-place, 복사 최소화


In [29]:
#셀 2) 파생 생성 함수 (보간 끝난 각 분할에 공통 적용)
def add_post_interp_features_inplace(df: pd.DataFrame, *, eps: float = 1e-3):
    """
    보간 완료 데이터에 일사량 관련 핵심 파생을 in-place로 추가.
    - cloud_mean/diff
    - vis_log/haze
    - dew_spread  (만들면 temp_a는 나중에 드롭)
    - is_rain/is_snow/precip_1h_pos
    - ceil_inv
    - u_b/v_b ( + u_a/v_a 있으면)
    - uv_elev
    - (이미 만들어뒀다면 is_night는 건너뜀)
    """
    # 1) 구름
    if _has(df, {"cloud_a","cloud_b"}):
        _add_col(df, "cloud_mean", _as_f32((df["cloud_a"] + df["cloud_b"]) / 2.0))
        _add_col(df, "cloud_diff", _as_f32((df["cloud_a"] - df["cloud_b"]).abs()))

    # 2) 가시거리
    if "vis" in df.columns:
        vis_f32 = _as_f32(df["vis"])
        _add_col(df, "vis_log",  _as_f32(np.log(vis_f32 + eps)))
        _add_col(df, "haze",     _as_f32(1.0 / (vis_f32 + eps)))

    # 3) 습윤(건조도)
    if _has(df, {"temp_a","dew_point"}):
        _add_col(df, "dew_spread", _as_f32(df["temp_a"]) - _as_f32(df["dew_point"]))

    # 4) 강수/강설 플래그
    if "rain" in df.columns:
        _add_col(df, "is_rain", _as_i8(df["rain"] > 0))
    if "snow" in df.columns:
        _add_col(df, "is_snow", _as_i8(df["snow"] > 0))
    if "precip_1h" in df.columns:
        _add_col(df, "precip_1h_pos", _as_i8(df["precip_1h"] > 0))

    # 5) 천장 역수 (천장 낮을수록 감쇠↑)
    if "ceiling" in df.columns:
        _add_col(df, "ceil_inv", _as_f32(1.0 / (_as_f32(df["ceiling"]) + eps)))

    # 6) 바람 벡터 (b, a 각각)
    if _has(df, {"wind_spd_b","wind_dir_b"}):
        rad_b = np.deg2rad(_as_f32(df["wind_dir_b"]))
        sp_b  = _as_f32(df["wind_spd_b"])
        _add_col(df, "u_b", _as_f32(sp_b * np.cos(rad_b)))
        _add_col(df, "v_b", _as_f32(sp_b * np.sin(rad_b)))

    if _has(df, {"wind_spd_a","wind_dir_a"}):
        rad_a = np.deg2rad(_as_f32(df["wind_dir_a"]))
        sp_a  = _as_f32(df["wind_spd_a"])
        _add_col(df, "u_a", _as_f32(sp_a * np.cos(rad_a)))
        _add_col(df, "v_a", _as_f32(sp_a * np.sin(rad_a)))

    # 7) 태양고도 상호작용
    if _has(df, {"uv_idx","elev_sin"}):
        _add_col(df, "uv_elev", _as_f32(_as_f32(df["uv_idx"]) * _as_f32(df["elev_sin"])))

    # 8) 야간 플래그가 없으면 만들어 주기(트리모델에서 유용)
    if "is_night" not in df.columns:
        has_am = "AM" in df.columns
        if has_am and "elev_sin" in df.columns:
            _add_col(df, "is_night", _as_i8(df["AM"].isna() | (df["elev_sin"] <= 0)))
        elif has_am:
            _add_col(df, "is_night", _as_i8(df["AM"].isna()))
        elif "elev_sin" in df.columns:
            _add_col(df, "is_night", _as_i8(df["elev_sin"] <= 0))


In [30]:
#셀 3) 세 분할에 in-place 적용 (복사 없음)
add_post_interp_features_inplace(df_tr)
add_post_interp_features_inplace(df_va)
add_post_interp_features_inplace(df_te)

new_cols = ["cloud_mean","cloud_diff","vis_log","haze","dew_spread",
            "is_rain","is_snow","precip_1h_pos","ceil_inv","u_b","v_b","u_a","v_a","uv_elev","is_night"]
{ part: [c for c in new_cols if c in df_.columns]
  for part, df_ in [("train", df_tr), ("valid", df_va), ("test", df_te)] }


{'train': ['cloud_mean',
  'cloud_diff',
  'vis_log',
  'haze',
  'dew_spread',
  'is_rain',
  'is_snow',
  'precip_1h_pos',
  'ceil_inv',
  'u_b',
  'v_b',
  'u_a',
  'v_a',
  'uv_elev',
  'is_night'],
 'valid': ['cloud_mean',
  'cloud_diff',
  'vis_log',
  'haze',
  'dew_spread',
  'is_rain',
  'is_snow',
  'precip_1h_pos',
  'ceil_inv',
  'u_b',
  'v_b',
  'u_a',
  'v_a',
  'uv_elev',
  'is_night'],
 'test': ['cloud_mean',
  'cloud_diff',
  'vis_log',
  'haze',
  'dew_spread',
  'is_rain',
  'is_snow',
  'precip_1h_pos',
  'ceil_inv',
  'u_b',
  'v_b',
  'u_a',
  'v_a',
  'uv_elev',
  'is_night']}

In [31]:
#셀 4) 빠른 검증 (결측/타입/범위)
def quick_check(df, name):
    cols = [c for c in ["cloud_mean","cloud_diff","vis_log","haze","dew_spread",
                        "is_rain","is_snow","precip_1h_pos","ceil_inv","u_b","v_b",
                        "u_a","v_a","uv_elev","is_night"] if c in df.columns]
    nulls = df[cols].isna().sum().sort_values(ascending=False)
    print(f"[{name}] #cols={len(df.columns)}  new_cols={len(cols)}")
    print("  top nulls:\n", nulls.head(10))

quick_check(df_tr, "train")
quick_check(df_va, "valid")
quick_check(df_te, "test")


[train] #cols=55  new_cols=15
  top nulls:
 cloud_mean       0
cloud_diff       0
vis_log          0
haze             0
dew_spread       0
is_rain          0
is_snow          0
precip_1h_pos    0
ceil_inv         0
u_b              0
dtype: int64
[valid] #cols=55  new_cols=15
  top nulls:
 cloud_mean       0
cloud_diff       0
vis_log          0
haze             0
dew_spread       0
is_rain          0
is_snow          0
precip_1h_pos    0
ceil_inv         0
u_b              0
dtype: int64
[test] #cols=55  new_cols=15
  top nulls:
 cloud_mean       0
cloud_diff       0
vis_log          0
haze             0
dew_spread       0
is_rain          0
is_snow          0
precip_1h_pos    0
ceil_inv         0
u_b              0
dtype: int64


In [32]:
#셀 1) 최종 사용할 피처 리스트(베이스라인)
TARGET = "nins"

# (확정 포함 + 모델에 바로 쓰는 파생)
feature_keep = [
    # 확정 포함(원시)
    "ceiling","cloud_b","precip_1h","real_feel_temp","rel_hum","uv_idx","wind_spd_b","cloud_a","humidity","rain",
    # 시간/좌표 파생(보간 전 만들었던 것)
    "hr_sin","hr_cos","doy_sin","doy_cos","elev_sin","AM","is_night",
    # 보간 후 파생
    "cloud_mean","cloud_diff","vis_log","haze","dew_spread","ceil_inv","u_b","v_b","uv_elev",
    # (있을 경우) 추가 파생
    "u_a","v_a","is_rain","is_snow","precip_1h_pos"
]

# 실제 존재하는 것만
feature_keep = [c for c in feature_keep if c in df_tr.columns]
print("keep features:", len(feature_keep))


keep features: 31


In [33]:
#셀 2) 드롭 후보(원시/중복/재료) 정의
# 원시/중복/재료 열(학습에서 제외)
drop_raw = [
    # 유의미하지 않다고 합의한 원시들
    "temp_b","wind_chill_temp","temp_a","temp_max","temp_min",
    # 보류 원시(파생으로 대체되므로 원시는 드롭 권장)
    "appr_temp","real_feel_temp_shade","pressure","ground_press","dew_point","vis",
    "wind_dir_b","wind_spd_a","wind_dir_a","wind_gust_spd",
    # 메타/누설/불필요
    "type","energy","coord1","coord2","pv_id"
]
# 남아있다면만 드롭하게 필터
drop_raw = [c for c in drop_raw if c in df_tr.columns]
print("drop candidates:", len(drop_raw), drop_raw[:10], "...")


drop candidates: 20 ['temp_b', 'wind_chill_temp', 'temp_a', 'temp_max', 'temp_min', 'appr_temp', 'real_feel_temp_shade', 'pressure', 'ground_press', 'dew_point'] ...


In [34]:
#셀 3) 학습용 테이블 구성(필요 열만 슬림화)
def build_train_table(df, target=TARGET, keep_feats=feature_keep):
    # target + keep_feats 외는 버리고, 순서를 고정
    cols = [target] + keep_feats
    use_cols = [c for c in cols if c in df.columns]
    out = df[use_cols].copy()
    return out

df_tr_slim = build_train_table(df_tr)
df_va_slim = build_train_table(df_va)
df_te_slim = build_train_table(df_te)

print(df_tr_slim.shape, df_va_slim.shape, df_te_slim.shape)


(13465864, 32) (2883531, 32) (2883527, 32)


In [35]:
#셀 4) (선택) 안전 드롭 수행 후 재확인
def drop_inplace(df, cols):
    exist = [c for c in cols if c in df.columns]
    if exist:
        df.drop(columns=exist, inplace=True, errors="ignore")

# 원본에서도 정리하고 싶다면(선택):
for _df in (df_tr, df_va, df_te):
    drop_inplace(_df, drop_raw)

# 슬림 테이블에서도 혹시 남아있다면 제거(보통 없음)
for _df in (df_tr_slim, df_va_slim, df_te_slim):
    drop_inplace(_df, drop_raw)

# 최종 확인
print("train/use cols:", len(df_tr_slim.columns))
print("valid/use cols:", len(df_va_slim.columns))
print("test /use cols:", len(df_te_slim.columns))
assert TARGET in df_tr_slim.columns and TARGET in df_va_slim.columns


train/use cols: 32
valid/use cols: 32
test /use cols: 32


In [36]:
#셀 5) 학습 직전 입력/타깃 분리
X_tr = df_tr_slim.drop(columns=[TARGET])
y_tr = df_tr_slim[TARGET].astype("float32")

X_va = df_va_slim.drop(columns=[TARGET])
y_va = df_va_slim[TARGET].astype("float32")

X_te = df_te_slim.drop(columns=[TARGET])  # 테스트 예측용
print(X_tr.shape, X_va.shape, X_te.shape)


(13465864, 31) (2883531, 31) (2883527, 31)


In [37]:
#==============================
#6..학습(베이스라인) LightGBM 학습
#=================================
#셀 A) 라이브러리 & 유틸
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import gc

def brief(dfX, name="X"):
    print(f"{name}: shape={dfX.shape}, float32={sum(dfX.dtypes==np.float32)} / total={len(dfX.columns)}")


In [38]:
#셀 B) 데이터 점검(형/결측) — 가벼운 확인
# 형식(가능한 float32) 확인
brief(X_tr, "X_tr"); brief(X_va, "X_va")
print("y_tr:", y_tr.shape, y_tr.dtype, " y_va:", y_va.shape, y_va.dtype)

# 트리 모델은 NaN 허용하지만 요약만 확인
print("Top NaNs (valid):")
print(X_va.isna().sum().sort_values(ascending=False).head(8))


X_tr: shape=(13465864, 31), float32=27 / total=31
X_va: shape=(2883531, 31), float32=27 / total=31
y_tr: (13465864,) float32  y_va: (2883531,) float32
Top NaNs (valid):
AM                1440438
cloud_b                 0
ceiling                 0
real_feel_temp          0
rel_hum                 0
uv_idx                  0
precip_1h               0
cloud_a                 0
dtype: int64


In [39]:
#셀 C) 모델 파라미터 설정 (빠르고 안정적인 기본값)
lgb_params = dict(
    n_estimators=2000,          # 조기종료로 자동 컷
    learning_rate=0.035,
    max_depth=7,
    num_leaves=96,
    min_child_samples=120,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=0.0,
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)
model = lgb.LGBMRegressor(**lgb_params)
early_stop = lgb.early_stopping(stopping_rounds=200, verbose=False)
log_eval   = lgb.log_evaluation(period=200)


In [40]:
#셀 D) 학습 + 검증 (조기종료 & MAE)
# 학습
model.fit(
    X_tr, y_tr,
    eval_set=[(X_va, y_va)],
    eval_metric="l1",
    callbacks=[early_stop, log_eval]
)

# 검증 예측(경진대회 룰 고려해 음수 클리핑)
pred_va = model.predict(X_va).astype("float32")
pred_va = np.maximum(pred_va, 0.0)

mae_va = mean_absolute_error(y_va, pred_va)
print(f"Validation MAE: {mae_va:.5f}  |  best_iteration: {model.best_iteration_}")


[200]	valid_0's l1: 65.58	valid_0's l2: 13981
Validation MAE: 63.91954  |  best_iteration: 158


In [41]:
#셀 E) 피처 중요도(빠른 요약) — split & gain
imp_split = pd.DataFrame({
    "feature": X_tr.columns,
    "split":   model.booster_.feature_importance(importance_type="split")
}).sort_values("split", ascending=False)

imp_gain = pd.DataFrame({
    "feature": X_tr.columns,
    "gain":    model.booster_.feature_importance(importance_type="gain")
}).sort_values("gain", ascending=False)

print("Top by gain:")
display(imp_gain.head(20))
print("Top by split:")
display(imp_split.head(20))


Top by gain:


Unnamed: 0,feature,gain
5,uv_idx,10354890000000.0
11,hr_cos,2185814000000.0
10,hr_sin,395220200000.0
17,cloud_mean,298971200000.0
15,AM,229130000000.0
8,humidity,198660900000.0
13,doy_cos,162572200000.0
12,doy_sin,111677700000.0
21,dew_spread,104147100000.0
7,cloud_a,101776400000.0


Top by split:


Unnamed: 0,feature,split
12,doy_sin,1592
13,doy_cos,1560
11,hr_cos,1110
5,uv_idx,966
10,hr_sin,946
27,v_a,784
3,real_feel_temp,771
8,humidity,639
0,ceiling,597
7,cloud_a,569


In [42]:
#셀 F) (선택) 테스트 예측 준비 — 추후 제출/평가용
# 필요 시 테스트 예측(음수 클리핑) 저장
pred_te = model.predict(X_te).astype("float32")
pred_te = np.maximum(pred_te, 0.0)

# 인덱스 기준으로 보관(시간/발전소 정보는 df_te에서 결합 가능)
pred_te_df = pd.DataFrame({"nins_pred": pred_te}, index=X_te.index)
print(pred_te_df.shape, pred_te_df.head())

# 메모리 청소(선택)
gc.collect()


(2883527, 1)         nins_pred
89363  172.246979
89364  228.610916
89365  239.319931
89366  252.521027
89367  264.355804


2917

In [43]:
#셀 1) 검증셋 Permutation importance(+gain) 계산
import numpy as np, pandas as pd
from sklearn.metrics import mean_absolute_error

# 이미 model, X_va, y_va, X_tr 존재
base_pred = model.predict(X_va)
base_mae  = mean_absolute_error(y_va, base_pred)

def perm_importance_mae(model, X, y, n_repeats=3, seed=42):
    rng = np.random.default_rng(seed)
    feats = list(X.columns)
    rows  = []
    Xc = X.copy()
    for f in feats:
        incs = []
        for _ in range(n_repeats):
            saved = Xc[f].to_numpy().copy()
            shuf  = saved.copy(); rng.shuffle(shuf)
            Xc[f] = shuf
            incs.append(mean_absolute_error(y, model.predict(Xc)) - base_mae)
            Xc[f] = saved
        rows.append((f, float(np.mean(incs)), float(np.std(incs))))
    imp = pd.DataFrame(rows, columns=["feature","mae_inc_mean","mae_inc_std"])
    imp["mae_inc_pct"] = (imp["mae_inc_mean"] / base_mae * 100.0) if base_mae>0 else 0.0
    return imp.sort_values("mae_inc_mean", ascending=False).reset_index(drop=True)

imp_tbl = perm_importance_mae(model, X_va, y_va, n_repeats=3, seed=42)

# gain도 합쳐서 보수적으로 판단
gain_tbl = pd.DataFrame({
    "feature": X_tr.columns,
    "gain": model.booster_.feature_importance(importance_type="gain")
})
imp_tbl = imp_tbl.merge(gain_tbl, on="feature", how="left").fillna(0)
imp_tbl.head(20)


Unnamed: 0,feature,mae_inc_mean,mae_inc_std,mae_inc_pct,gain
0,uv_idx,125.115291,0.10638,195.679582,10354890000000.0
1,hr_cos,69.950481,0.038776,109.402142,2185814000000.0
2,hr_sin,4.823385,0.006468,7.543745,395220200000.0
3,humidity,4.182632,0.011545,6.541611,198660900000.0
4,rain,3.93162,0.008054,6.14903,46002320000.0
5,cloud_mean,3.926894,0.002647,6.141639,298971200000.0
6,AM,3.017289,0.006669,4.719022,229130000000.0
7,dew_spread,2.036395,0.004704,3.18491,104147100000.0
8,cloud_a,2.029026,0.005575,3.173385,101776400000.0
9,ceiling,1.8218,0.003539,2.849285,49446690000.0


In [44]:
#셀 2) 약한 피처 자동 제거(임계치+중복쌍 정리)
# 1) 약한 피처 기준: mae_inc_pct가 너무 작고(노이즈 수준) & gain도 거의 0
LOW_PCT  = 0.20      # 0.2% 미만이면 약한 편
LOW_GAIN = 1e-6      # gain ~ 0
weak = set(imp_tbl.loc[(imp_tbl["mae_inc_pct"] <= LOW_PCT) & (imp_tbl["gain"] <= LOW_GAIN), "feature"].tolist())

# 2) 중복쌍에서 약한 쪽만 제거 (둘 다 약하면 둘 다 제거)
pairs = [
    ("humidity","rel_hum"),
    ("rain","precip_1h"),
    ("cloud_a","cloud_b"),
]
to_drop = set(weak)
for a,b in pairs:
    if a in X_tr.columns and b in X_tr.columns:
        row_a = imp_tbl.loc[imp_tbl.feature==a]
        row_b = imp_tbl.loc[imp_tbl.feature==b]
        if not row_a.empty and not row_b.empty:
            if (row_a.mae_inc_pct.values[0] < row_b.mae_inc_pct.values[0]) and (a in weak):
                to_drop.add(a)
            elif (b in weak):
                to_drop.add(b)

# 3) 최종 keep 목록
final_keep = [f for f in X_tr.columns if f not in to_drop]
print("Removed (auto):", sorted(list(to_drop)))
print("Final keep (#{}):".format(len(final_keep)), final_keep[:20], "...")


Removed (auto): []
Final keep (#31): ['ceiling', 'cloud_b', 'precip_1h', 'real_feel_temp', 'rel_hum', 'uv_idx', 'wind_spd_b', 'cloud_a', 'humidity', 'rain', 'hr_sin', 'hr_cos', 'doy_sin', 'doy_cos', 'elev_sin', 'AM', 'is_night', 'cloud_mean', 'cloud_diff', 'vis_log'] ...


In [45]:
#셀 3) 최종 피처로 재학습 + 검증
Xtr2 = X_tr[final_keep]; Xva2 = X_va[final_keep]

model2 = lgb.LGBMRegressor(
    n_estimators=2000, learning_rate=0.035,
    max_depth=7, num_leaves=96,
    min_child_samples=120,
    subsample=0.8, colsample_bytree=0.8,
    reg_alpha=0.0, reg_lambda=0.0,
    random_state=42, n_jobs=-1, verbosity=-1
)
early_stop = lgb.early_stopping(200, verbose=False)
log_eval   = lgb.log_evaluation(200)

model2.fit(
    Xtr2, y_tr,
    eval_set=[(Xva2, y_va)],
    eval_metric="l1",
    callbacks=[early_stop, log_eval]
)

pred_va2 = np.maximum(model2.predict(Xva2).astype("float32"), 0.0)
from sklearn.metrics import mean_absolute_error
mae_va2 = mean_absolute_error(y_va, pred_va2)
print(f"Validation MAE (before → after): {base_mae:.5f} → {mae_va2:.5f}  | best_iter={model2.best_iteration_}")


[200]	valid_0's l1: 65.595	valid_0's l2: 13981.6
Validation MAE (before → after): 63.93886 → 63.91912  | best_iter=158


In [47]:
#셀 4) 테스트 예측 + 최종 제출 파일 생성 (덮어쓰기)
# Xte2, pred_te2, df_train, df_te 가 존재한다고 가정

# 1) 테스트 인덱스 기준의 베이스 프레임 생성
idx = Xte2.index  # 예측과 동일한 순서/인덱스
sub = pd.DataFrame(index=idx)

# 2) 원본(df_train)에서 time/pv_id를 인덱스로 복구
sub["time"] = pd.to_datetime(df_train.loc[idx, "time"]).values
if "pv_id" in df_train.columns:
    sub["pv_id"] = df_train.loc[idx, "pv_id"].values

# 3) type 열이 없으면 'test'로 채움
if "type" in df_te.columns:
    sub["type"] = df_te.loc[idx, "type"].values
else:
    sub["type"] = "test"

# 4) 예측 붙이고, 규격에 맞게 정렬/저장
sub["nins"] = pred_te2.astype("float32")

cols = [c for c in ["time","pv_id","type","nins"] if c in sub.columns]
sub = sub[cols]

# pv_id가 있다면 time, pv_id 순으로, 없다면 time만 기준 정렬
sort_keys = [k for k in ["time","pv_id"] if k in sub.columns]
if sort_keys:
    sub = sub.sort_values(sort_keys)
sub = sub.reset_index(drop=True)

sub.to_csv("result_submission.csv", index=False)
print(sub.shape)
print(sub.head())



(2883527, 4)
                 time      pv_id  type        nins
0 2025-06-06 22:00:00    PV_ID_0  test  172.246979
1 2025-06-06 22:00:00    PV_ID_1  test  175.901306
2 2025-06-06 22:00:00  PV_ID_100  test  127.055817
3 2025-06-06 22:00:00  PV_ID_101  test   95.268211
4 2025-06-06 22:00:00  PV_ID_102  test  173.793716


In [49]:
#셀 1) 공식 테스트/샘플 제출 로드 + 기본 점검
import pandas as pd
import numpy as np

# 파일명은 대회 제공 파일명에 맞게 조정하세요
df_test_raw = pd.read_csv("test.csv")                 # 공식 테스트셋
sub_tmpl    = pd.read_csv("submission_sample.csv")    # 제출 템플릿

print("test.csv :", df_test_raw.shape, df_test_raw.columns.tolist()[:8], "...")
print("sample   :", sub_tmpl.shape, sub_tmpl.columns.tolist())
# 보통 sample의 컬럼은 ['time','pv_id','type','nins'] 입니다.



test.csv : (2838240, 32) ['time', 'pv_id', 'appr_temp', 'ceiling', 'cloud_b', 'dew_point', 'precip_1h', 'pressure'] ...
sample   : (2838240, 4) ['time', 'pv_id', 'type', 'nins']


In [50]:
#셀 2) 시간 파생(보간 불필요) 생성 – test 전용
import numpy as np
import pandas as pd

def add_time_covariates(df, time_col="time"):
    df = df.copy()
    if not np.issubdtype(df[time_col].dtype, np.datetime64):
        df[time_col] = pd.to_datetime(df[time_col], errors="coerce")

    # 시간각
    hr = df[time_col].dt.hour + df[time_col].dt.minute/60.0
    df["hr_sin"] = np.sin(2*np.pi*hr/24).astype("float32")
    df["hr_cos"] = np.cos(2*np.pi*hr/24).astype("float32")

    # 연중일
    doy = df[time_col].dt.dayofyear.astype("float32")
    df["doy_sin"] = np.sin(2*np.pi*doy/365.0).astype("float32")
    df["doy_cos"] = np.cos(2*np.pi*doy/365.0).astype("float32")

    # 태양 고도 근사(elev_sin) / AM(야간 NaN) — 간단 근사 버전
    # (정밀 태양위치식을 못 쓰는 환경이므로, 낮/밤 분리 정도로만 사용)
    # local solar elevation 대용으로: 낮 시간(h in [6,18])이면 양수, 아니면 0
    elev_proxy = ((hr >= 6) & (hr <= 18)).astype("float32")
    df["elev_sin"] = elev_proxy  # 간단 프록시
    df["AM"] = np.where(elev_proxy > 0, 2.0, np.nan).astype("float32")  # 야간 NaN

    return df

df_test_t = add_time_covariates(df_test_raw)


In [51]:
#셀 3) 테스트셋 보간(분할 없이 테스트 내부에서만) + Train 중앙값 대체
def interpolate_test_like_train(
    df, time_col="time", id_col="pv_id", target_col="nins",
    ffill_bfill_cols=("wind_dir_a","wind_dir_b"),
    preserve_nan_cols=("AM",),
    train_medians=None
):
    df = df.copy()
    # 정렬
    if id_col in df.columns:
        df = df.sort_values([id_col, time_col])
    else:
        df = df.sort_values(time_col)

    # 수치형 후보
    numeric_cols = [c for c in df.columns
                    if c not in [time_col, target_col] and pd.api.types.is_numeric_dtype(df[c])]
    linear_cols = [c for c in numeric_cols if c not in set(ffill_bfill_cols)|set(preserve_nan_cols)]

    # 1) 각도형 ffill→bfill
    ffill_bfill_cols = [c for c in ffill_bfill_cols if c in df.columns]
    if ffill_bfill_cols:
        if id_col in df.columns:
            df[ffill_bfill_cols] = (df.groupby(id_col, observed=True, sort=False)[ffill_bfill_cols]
                                      .apply(lambda g: g.ffill().bfill())
                                      .reset_index(level=id_col, drop=True))
        else:
            df[ffill_bfill_cols] = df[ffill_bfill_cols].ffill().bfill()

    # 2) 연속형 선형 보간
    if linear_cols:
        if id_col in df.columns:
            def _lin(g): return g.interpolate(method="linear", limit_direction="both")
            df[linear_cols] = (df.groupby(id_col, observed=True, sort=False)[linear_cols]
                                 .apply(_lin).reset_index(level=id_col, drop=True))
        else:
            df[linear_cols] = df[linear_cols].interpolate(method="linear", limit_direction="both")

    # 3) 남은 결측은 Train 중앙값으로만 채우기
    if train_medians is not None:
        fill_candidates = [c for c in numeric_cols if c not in preserve_nan_cols]
        exist = [c for c in fill_candidates if c in df.columns and c in train_medians.index]
        df[exist] = df[exist].fillna(train_medians[exist])

    # 다운캐스트
    for c in numeric_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce", downcast="float")
    return df

# train_medians 는 앞서 분할 보간 단계에서 저장해둔 그 변수 사용
df_test_bi = interpolate_test_like_train(
    df_test_t, time_col="time", id_col="pv_id", target_col="nins",
    ffill_bfill_cols=("wind_dir_a","wind_dir_b"),
    preserve_nan_cols=("AM",),
    train_medians=train_medians  # ← 중요: Train 통계로만 채움
)


In [52]:
#셀 4) 보간 후 파생(테스트 전용) + 열 슬림화(최종 피처만)
# 앞서 쓰던 함수 재사용(우리가 만든 in-place 버전)
add_post_interp_features_inplace(df_test_bi)

# 최종 피처(perm 정리 후)만 선택
X_test_off = df_test_bi[[c for c in final_keep if c in df_test_bi.columns]]
print("X_test_off:", X_test_off.shape)


X_test_off: (2838240, 31)


In [53]:
#셀 5) 공식 테스트 예측 + 제출 파일 생성(샘플과 행수/순서 일치)
# 모델 예측
pred_off = np.maximum(model2.predict(X_test_off).astype("float32"), 0.0)

# sample_submission의 순서를 '그대로' 따라가며 채운다 (가장 안전)
sub = sub_tmpl.copy()
sub["nins"] = pred_off

# (안전 점검) 행수/순서 일치 여부
assert sub.shape[0] == pred_off.shape[0], "행수가 다릅니다. test 전처리 파이프라인을 점검하세요."

# 저장
sub.to_csv("result_submission.csv", index=False)
print(sub.shape)
print(sub.head())


(2838240, 4)
                        time    pv_id  type       nins
0  2024-08-01 00:05:00+09:00  PV_ID_7  test  18.681427
1  2024-08-01 00:10:00+09:00  PV_ID_7  test  18.681427
2  2024-08-01 00:15:00+09:00  PV_ID_7  test  18.681427
3  2024-08-01 00:20:00+09:00  PV_ID_7  test  18.681427
4  2024-08-01 00:25:00+09:00  PV_ID_7  test  18.681427
