In [1]:
#셀 1) 기본 세팅 & 임포트
# === Speed/Memory oriented setup ===
import os, gc, math, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# 핵심 모델/평가
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

# 재현성
SEED = 42
rng = np.random.default_rng(SEED)

# 경로: 필요시 수정
DATA_DIR = "./"  # train.csv, test.csv, sample_submission.csv 있는 폴더
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")
TEST_CSV  = os.path.join(DATA_DIR, "test.csv")
SUB_CSV   = os.path.join(DATA_DIR, "submission_sample.csv")

TIME_COL = "time"
PV_COL   = "pv_id"
TARGET   = "nins"

# downcast 유틸
def downcast_df(df: pd.DataFrame) -> pd.DataFrame:
    for c in df.select_dtypes(include=["float64"]).columns:
        df[c] = pd.to_numeric(df[c], downcast="float")
    for c in df.select_dtypes(include=["int64"]).columns:
        df[c] = pd.to_numeric(df[c], downcast="integer")
    return df

pd.options.display.max_columns = 200
print("Setup done.")


Setup done.


In [3]:
#셀 2) 데이터 로드 (얇게 읽기 + 다운캐스트)
# === (패치) 데이터 로드: pv_id는 문자열 → category로 ===
CAND_COLS = [
    TIME_COL, PV_COL, TARGET,
    "temp_a","temp_max","temp_min","humidity","cloud","rain",
    "wind_spd_a","wind_spd_b","wind_gust_spd","ground_press",
    "wind_dir_a","wind_dir_b",
    "coord1","coord2"
]

def read_csv_smart(path, with_target):
    head = pd.read_csv(path, nrows=10)
    cols = [c for c in CAND_COLS if c in head.columns]

    # pv_id는 문자열로 읽고, 나중에 category로 변환
    dtype_map = {}
    if PV_COL in cols:
        dtype_map[PV_COL] = "string"   # <-- 여기!

    df = pd.read_csv(
        path,
        usecols=cols,
        parse_dates=[TIME_COL] if TIME_COL in cols else None,
        dtype=dtype_map if dtype_map else None,
        low_memory=True,
        memory_map=True
    )

    # 문자열 정리
    if PV_COL in df.columns:
        df[PV_COL] = df[PV_COL].astype("string").str.strip()
        # 메모리 절약을 위해 카테고리화 (계산시에는 문자열처럼 동작)
        df[PV_COL] = df[PV_COL].astype("category")

    # 수치 다운캐스트
    df = downcast_df(df)

    if with_target and TARGET in df.columns:
        df[TARGET] = pd.to_numeric(df[TARGET], errors="coerce").astype("float32")

    return df

train = read_csv_smart(TRAIN_CSV, with_target=True)
test  = read_csv_smart(TEST_CSV,  with_target=False)
sub   = pd.read_csv(SUB_CSV)

print(train.shape, test.shape, "pv_id dtype:", train[PV_COL].dtype)



(19236948, 16) (2838240, 16) pv_id dtype: category


In [5]:
#셀 3) 정렬 & 기본 전처리(짧은 결측 보간) + 결측행 제거
# 시간 정렬 (안전)
train = train.sort_values([PV_COL, TIME_COL]).reset_index(drop=True)
test  = test.sort_values([PV_COL, TIME_COL]).reset_index(drop=True)

# --- 결측 처리 전략 ---
# 1) 같은 pv_id 내부에서 짧은 구간만 interpolate (limit=6) 하고,
# 2) train은 잔여 결측행 제거, test는 남으면 bfill/ffill로 마지막 방어
num_cols = [c for c in train.columns if c not in [TIME_COL, PV_COL] and pd.api.types.is_numeric_dtype(train[c])]
num_cols_test = [c for c in test.columns  if c not in [TIME_COL, PV_COL] and pd.api.types.is_numeric_dtype(test[c])]

def short_interp_group(df, cols, group=PV_COL, limit=6):
    # 그룹 내 ffill/bfill + 선형보간(짧은 구간만)
    df = df.copy()
    g = df.groupby(group, group_keys=False)
    for c in cols:
        if c not in df.columns: 
            continue
        s = g[c].apply(lambda x: x.ffill().bfill().interpolate(limit=limit, limit_direction="both"))
        df[c] = s.astype(df[c].dtype)
    return df

train = short_interp_group(train, [c for c in num_cols if c != TARGET], PV_COL, limit=6)
test  = short_interp_group(test,  num_cols_test, PV_COL, limit=6)

# train은 남은 결측 제거 (룰 준수)
before = len(train)
train = train.dropna(subset=[c for c in num_cols if c != TARGET]).reset_index(drop=True)
print(f"train dropna: {before} -> {len(train)}")

# test는 최종 안전장치
test = test.fillna(method="bfill").fillna(method="ffill")


train dropna: 19236948 -> 19236948


In [7]:
#셀 4) 태양고도 기반 cosZ 생성 (+ 야간 마스크)
import numpy as np
import pandas as pd

# 한국 중앙값 근사 (coord1/2 없을 때 사용)
DEFAULT_LAT = 36.5
DEFAULT_LON = 127.9
KST_TZNAME  = "Asia/Seoul"  # 참고용

def solar_cos_zenith(ts: pd.Series, lat=DEFAULT_LAT, lon=DEFAULT_LON, tz_offset_hours=9):
    """
    간단 NOAA 근사.
    - 입력 ts가 tz-aware면 Asia/Seoul로 변환 후 tz 제거
    - tz-naive면 그대로 사용(로컬 표준시 가정)  → 추가적인 -9시간 보정은 하지 않음
    """
    s = pd.to_datetime(ts, errors="coerce")

    # tz 처리: tz-aware면 KST로 변환 후 naive로
    try:
        if getattr(s.dt, "tz", None) is not None:
            s = s.dt.tz_convert(KST_TZNAME).dt.tz_localize(None)
        else:
            # tz-naive → 그대로 사용(로컬 표준시 가정)
            s = s.dt.tz_localize(None)
    except Exception:
        # 어떤 경우에도 datetime64[ns] naive 보장
        s = pd.to_datetime(s).dt.tz_localize(None)

    # ----- 천문 각도 계산 -----
    doy  = s.dt.dayofyear.values.astype("int32")
    hour = s.dt.hour.values + s.dt.minute.values/60.0 + s.dt.second.values/3600.0
    # Local Standard Time (LST) 계산용 보정항
    B   = 2*np.pi*(doy - 81)/364.0
    EoT = 9.87*np.sin(2*B) - 7.53*np.cos(B) - 1.5*np.sin(B)  # 분(min)
    LSTM = 15 * tz_offset_hours  # 경도 기준(한국=135° 기준이 아님에 주의, 근사식)
    TC   = 4*(lon - LSTM) + EoT  # 분(min)

    LST  = hour + TC/60.0        # 시간 단위
    HRA  = np.deg2rad(15*(LST - 12))  # Hour Angle

    delta = np.deg2rad(23.45) * np.sin(2*np.pi*(284 + doy)/365.0)  # 태양 적위
    latr  = np.deg2rad(lat)

    cosZ = (np.sin(latr)*np.sin(delta) + np.cos(latr)*np.cos(delta)*np.cos(HRA))
    return np.clip(cosZ, 0, 1).astype("float32")

def attach_cosZ(df: pd.DataFrame) -> pd.DataFrame:
    lat = float(pd.to_numeric(df["coord1"], errors="coerce").median()) if "coord1" in df.columns else DEFAULT_LAT
    lon = float(pd.to_numeric(df["coord2"], errors="coerce").median()) if "coord2" in df.columns else DEFAULT_LON
    out = df.copy()
    out["cosZ"] = solar_cos_zenith(out[TIME_COL], lat=lat, lon=lon, tz_offset_hours=9)
    return out


In [18]:
print("train cosZ zero ratio:", (train["cosZ"]==0).mean(), "  min/max:", float(train["cosZ"].min()), float(train["cosZ"].max()))
print("test  cosZ zero ratio:", (test["cosZ"]==0).mean(),  "  min/max:", float(test["cosZ"].min()),  float(test["cosZ"].max()))
print(train[[TIME_COL,"cosZ"]].head())


train cosZ zero ratio: 0.5000193377868465   min/max: 0.0 0.9999527335166931
test  cosZ zero ratio: 0.49997146118721464   min/max: 0.0 0.9999555945396423
                       time      cosZ
0 2024-08-01 00:05:00+09:00  0.678613
1 2024-08-01 00:10:00+09:00  0.663813
2 2024-08-01 00:15:00+09:00  0.648699
3 2024-08-01 00:20:00+09:00  0.633277
4 2024-08-01 00:25:00+09:00  0.617554


In [None]:
#셀 5) 시간/주기 + 풍향 sin/cos + lag/rolling (필수 최소셋)
def add_time_features(df):
    df = df.copy()
    tt = df[TIME_COL]
    df["hour"] = tt.dt.hour.astype("int16")
    df["dow"]  = tt.dt.dayofweek.astype("int16")
    df["doy"]  = tt.dt.dayofyear.astype("int16")
    df["is_weekend"] = (df["dow"]>=5).astype("int8")

    # 사이클릭(1차만, 과잉 방지)
    df["hr_sin1"] = np.sin(2*np.pi*df["hour"]/24).astype("float32")
    df["hr_cos1"] = np.cos(2*np.pi*df["hour"]/24).astype("float32")
    df["doy_sin1"]= np.sin(2*np.pi*df["doy"]/365).astype("float32")
    df["doy_cos1"]= np.cos(2*np.pi*df["doy"]/365).astype("float32")
    return df

def add_wind_dir(df):
    df = df.copy()
    for dcol in ["wind_dir_a","wind_dir_b"]:
        if dcol in df.columns:
            rad = np.deg2rad(pd.to_numeric(df[dcol], errors="coerce"))
            df[dcol+"_sin"] = np.sin(rad).astype("float32")
            df[dcol+"_cos"] = np.cos(rad).astype("float32")
    return df

def add_group_lag_roll(df, cols, group=PV_COL):
    df = df.copy()
    g = df.groupby(group, sort=False)
    for c in cols:
        if c not in df.columns: 
            continue
        # lag (1,2), diff(1)
        df[f"{c}_lag1"] = g[c].shift(1)
        df[f"{c}_lag2"] = g[c].shift(2)
        df[f"{c}_d1"]   = g[c].diff(1)
        # 30분/60분 rolling: 5분 간격 가정 → 6,12
        df[f"{c}_r30"]  = g[c].rolling(6, min_periods=1).mean().reset_index(level=0, drop=True)
        df[f"{c}_r60"]  = g[c].rolling(12, min_periods=1).median().reset_index(level=0, drop=True)
    # 다운캐스트
    return downcast_df(df)

# 적용 대상(핵심만)
BASE_NUMS = [c for c in ["temp_a","humidity","cloud","rain","wind_spd_a","wind_gust_spd","ground_press","cosZ"] if c in train.columns]

train = add_time_features(train)
test  = add_time_features(test)

train = add_wind_dir(train)
test  = add_wind_dir(test)

train = add_group_lag_roll(train, BASE_NUMS, PV_COL)
test  = add_group_lag_roll(test,  BASE_NUMS, PV_COL)

# 생성 후 생긴 결측(lag 등)은 train에서 제거, test는 ffill/bfill
before = len(train)
train = train.dropna().reset_index(drop=True)
test  = test.fillna(method="bfill").fillna(method="ffill")

print(f"post feature-engineering train dropna: {before} -> {len(train)}")
gc.collect()


In [8]:
train = attach_cosZ(train)
test  = attach_cosZ(test)


In [9]:
#셀 6) 특징 목록 구성 + 단조 제약 준비
# 학습에 쓸 최종 FEATURES
exclude = {TIME_COL, PV_COL, TARGET}
FEATURES = [c for c in train.columns if c not in exclude and pd.api.types.is_numeric_dtype(train[c])]

# monotone constraints: cosZ만 +1, 나머지 0
mono = [1 if f=="cosZ" else 0 for f in FEATURES]
print("n_features:", len(FEATURES), "cosZ in:", ("cosZ" in FEATURES))


n_features: 14 cosZ in: True


In [10]:
#셀 7) 라벨 변환 (sqrt) + 시간 기반 Folds (Purged Forward)
# sqrt 변환
train_y = np.sqrt(train[TARGET].values.astype("float32"))
train_X = train[FEATURES].astype("float32")
test_X  = test[FEATURES].astype("float32")

# Purged forward CV by day: fold i를 검증으로, 이전 모든 날을 학습
days = train[TIME_COL].dt.floor("D")
uniq_days = np.sort(days.unique())
N_SPLITS = 5
fold_sizes = np.array_split(uniq_days, N_SPLITS)

folds = []
used_train_days = set()
for i in range(N_SPLITS):
    va_days = set(fold_sizes[i])
    tr_days = set(uniq_days[:np.where(uniq_days==fold_sizes[i][0])[0][0]])  # i 블록 시작 이전의 모든 날
    if len(tr_days)==0:
        # 첫 블록은 과거가 없으니 스킵 (유효하지 않은 검증)
        continue
    va_idx = train.index[days.isin(va_days)].values
    tr_idx = train.index[days.isin(tr_days)].values
    if len(va_idx)==0 or len(tr_idx)==0: 
        continue
    folds.append((tr_idx, va_idx))

print("usable folds:", len(folds))


usable folds: 4


In [19]:
#셀 8) LightGBM 학습 (규제+EarlyStop+단조제약) + OOF/테스트 예측
# 셀 8) LightGBM 학습 (MAE, 단조제약 제거)
params = dict(
    objective="mae",          # regression_l1
    metric="mae",
    learning_rate=0.03,
    num_leaves=127,
    min_data_in_leaf=200,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=2.0,
    lambda_l2=2.0,
    random_state=SEED,
    verbose=-1,
    n_jobs=-1
    # monotone_constraints 제거!
)

oof = np.zeros(len(train), dtype="float32")
pred_te_accum = np.zeros(len(test), dtype="float32")
models = []

for fi, (tr_idx, va_idx) in enumerate(folds, 1):
    if len(tr_idx) == 0 or len(va_idx) == 0:
        print(f"[fold {fi}] skip: empty index")
        continue

    Xtr, ytr = train_X.iloc[tr_idx], train_y[tr_idx]
    Xva, yva = train_X.iloc[va_idx], train_y[va_idx]

    lgb_tr = lgb.Dataset(Xtr, label=ytr, free_raw_data=True)
    lgb_va = lgb.Dataset(Xva, label=yva, reference=lgb_tr, free_raw_data=True)

    model = lgb.train(
        params=params,
        train_set=lgb_tr,
        num_boost_round=5000,
        valid_sets=[lgb_va],
        valid_names=["valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=200, verbose=True),
            lgb.log_evaluation(period=200)
        ]
    )
    models.append(model)

    oof[va_idx] = model.predict(Xva, num_iteration=model.best_iteration)
    pred_te_accum += model.predict(test_X, num_iteration=model.best_iteration).astype("float32")

# --- 역변환 유틸 ---
def inv_sqrt(yhat):
    # sqrt-space 예측을 원래 스케일로 복원: 음수는 0으로 클립 후 제곱
    return np.square(np.clip(yhat, 0, None), dtype="float32")

# OOF MAE (역변환)
oof_pred = inv_sqrt(oof)
oof_mae = mean_absolute_error(train[TARGET].values, oof_pred)
print(f"OOF MAE: {oof_mae:.5f}")

# 테스트 예측 (역변환)
pred_te = inv_sqrt(pred_te_accum / max(1, len(models)))




Training until validation scores don't improve for 200 rounds
[200]	valid's l1: 5.08152
Early stopping, best iteration is:
[153]	valid's l1: 5.07081
Training until validation scores don't improve for 200 rounds
[200]	valid's l1: 4.27749
[400]	valid's l1: 4.27345
Early stopping, best iteration is:
[340]	valid's l1: 4.25944
Training until validation scores don't improve for 200 rounds
[200]	valid's l1: 5.15098
[400]	valid's l1: 5.13524
[600]	valid's l1: 5.1401
Early stopping, best iteration is:
[505]	valid's l1: 5.13045
Training until validation scores don't improve for 200 rounds


KeyboardInterrupt: 

In [14]:
#셀 9) 야간 0 고정 + 일출/일몰 스무딩 + 음수 0 클리핑
# 야간 = 0
cosZ_te = test["cosZ"].values.astype("float32")
pred_te[cosZ_te==0] = 0.0

# 일출/일몰 스무딩 (완만하게)
# cosZ < τ이면 pred *= (cosZ/τ)
tau = 0.05
mask = (cosZ_te > 0) & (cosZ_te < tau)
pred_te[mask] = pred_te[mask] * (cosZ_te[mask] / tau)

# 음수 방지
pred_te = np.clip(pred_te, 0, None)

print("postprocess done.")


postprocess done.


In [None]:
# 빠른 sanity check
print("OOF MAE (reported above) 확인:", "OK" if np.isfinite(oof_mae) else "NaN!")
print("test pred basic stats:", float(pred_te.min()), float(pred_te.max()), float(np.median(pred_te)))
print("postprocess cosZ<0.05 count:", int((test["cosZ"]<0.05).sum()))
print("예측 0인 개수:", int((pred_te==0).sum()), "/", len(pred_te))


In [15]:
#셀 10) 플랜트별 1차 캘리브레이션 (선택) — OOF로 a,b 보정
# === (패치) 캘리브레이션: 키를 문자열로 통일 ===
"""
calib = {}
df_oof = pd.DataFrame({
    "pv_id": train[PV_COL].astype("string").values,            # <-- 문자열화
    "y": train[TARGET].values.astype("float32"),
    "yhat": oof_pred.astype("float32")
})

# 전역 a,b
A_global, B_global = 1.0, 0.0
try:
    A_global = np.cov(df_oof["yhat"], df_oof["y"])[0,1] / (np.var(df_oof["yhat"]) + 1e-6)
    B_global = df_oof["y"].mean() - A_global * df_oof["yhat"].mean()
except Exception:
    pass

for pid, g in df_oof.groupby("pv_id"):
    if len(g) < 200:
        calib[str(pid)] = (A_global, B_global)                 # <-- str(pid)
    else:
        try:
            A = np.cov(g["yhat"], g["y"])[0,1] / (np.var(g["yhat"]) + 1e-6)
            B = g["y"].mean() - A * g["yhat"].mean()
            calib[str(pid)] = (float(A), float(B))             # <-- str(pid)
        except Exception:
            calib[str(pid)] = (A_global, B_global)

pid_arr = test[PV_COL].astype("string").values                 # <-- 문자열화
A_vec = np.array([calib.get(p, (A_global, B_global))[0] for p in pid_arr], dtype="float32")
B_vec = np.array([calib.get(p, (A_global, B_global))[1] for p in pid_arr], dtype="float32")

pred_te_cal = A_vec * pred_te + B_vec
pred_te_cal = np.clip(pred_te_cal, 0, None).astype("float32")
print("Calibration applied.")
"""

Calibration applied.


In [None]:
# 셀 10) (임시) 캘리브레이션 OFF → 그대로 통과
pred_te_cal = pred_te.copy()
print("Calibration skipped (temporary).")


In [None]:
# 셀 10) (임시) 캘리브레이션 OFF → 그대로 통과
pred_te_cal = pred_te.copy()
print("Calibration skipped (temporary).")


In [17]:
#셀 11) 제출파일 생성 (키 머지 안전)
# 셀 11) 제출파일 생성 (키 정규화 + 안전 병합)  [patched]

def norm_time(s):
    # 모두 naive datetime64[ns]로 통일
    s = pd.to_datetime(s, errors="coerce")
    try:
        # tz-aware면 KST로 변환 후 tz 제거
        if getattr(s.dt, "tz", None) is not None:
            s = s.dt.tz_convert("Asia/Seoul").dt.tz_localize(None)
        else:
            s = s.dt.tz_localize(None)  # naive인 경우에도 형식 통일
    except Exception:
        s = pd.to_datetime(s, errors="coerce")
    return s

def norm_keys(df):
    out = df.copy()
    if "time" in out.columns:
        out["time"] = norm_time(out["time"])
    if "pv_id" in out.columns:
        out["pv_id"] = out["pv_id"].astype("string").str.strip()
    if "type" in out.columns:
        out["type"] = out["type"].astype("string").str.strip()
    return out

# 1) 키 정규화
sub_norm  = norm_keys(sub)
test_norm = norm_keys(test)

# 2) 참조 테이블 만들기
merge_keys = [c for c in ["time","pv_id","type"] if c in sub_norm.columns]
ref = test_norm[[TIME_COL, PV_COL]].copy()

if "type" in sub_norm.columns and "type" not in ref.columns:
    # (time, pv_id) 조합을 sample의 type과 매칭
    ref = ref.merge(
        sub_norm[merge_keys].drop_duplicates(),
        on=["time","pv_id"], how="left"
    )

# 3) 예측치 붙이기
ref["nins"] = pred_te_cal.astype("float32")

# 4) 안전 병합 (중복 컬럼 제거)
def safe_merge(left: pd.DataFrame, right: pd.DataFrame, on, how="left") -> pd.DataFrame:
    right = right.loc[:, ~right.columns.duplicated()]
    return left.merge(right, on=on, how=how, copy=False)

out = safe_merge(
    sub_norm.drop(columns=["nins"]) if "nins" in sub_norm.columns else sub_norm,
    ref,
    on=merge_keys, how="left"
)

SAVE_PATH = os.path.join(DATA_DIR, "result_submission.csv")
out.to_csv(SAVE_PATH, index=False)
print("Saved:", SAVE_PATH)
print("dtypes:", out.dtypes.loc[["time","pv_id","type","nins"]].to_dict() if set(["time","pv_id","type"]).issubset(out.columns) else out.dtypes.head())


Saved: ./result_submission.csv
dtypes: {'time': dtype('<M8[ns]'), 'pv_id': string[python], 'type': string[python], 'nins': dtype('float32')}


In [20]:
# 디버그 셀 A — 기본 분포/정렬/키 점검
print("OOF MAE:", oof_mae if 'oof_mae' in globals() else 'N/A')
print("train cosZ zero ratio:", (train["cosZ"]==0).mean(), "min/max:", float(train["cosZ"].min()), float(train["cosZ"].max()))
print("test  cosZ zero ratio:", (test["cosZ"]==0).mean(),  "min/max:", float(test["cosZ"].min()),  float(test["cosZ"].max()))

print("pred_te stats:", float(pred_te.min()), float(pred_te.max()), float(np.median(pred_te)))
print("pred_te==0:", int((pred_te==0).sum()), "/", len(pred_te))

# 디버그 셀 B — 제출 키 정합성
print("sub keys:", sub[['time','pv_id','type']].dtypes.to_dict())
print("test keys:", test[['time','pv_id']].dtypes.to_dict())

# 제출 병합 후 누락/중복 검사
print("submission nins null:", out["nins"].isna().sum())
dups = out.duplicated(subset=['time','pv_id','type']).sum()
print("submission duplicate key rows:", dups)


OOF MAE: 133.07325744628906
train cosZ zero ratio: 0.5000193377868465 min/max: 0.0 0.9999527335166931
test  cosZ zero ratio: 0.49997146118721464 min/max: 0.0 0.9999555945396423
pred_te stats: 0.0 741.2813720703125 4.3305945635063736e-11
pred_te==0: 1419039 / 2838240
sub keys: {'time': dtype('O'), 'pv_id': dtype('O'), 'type': dtype('O')}
test keys: {'time': datetime64[ns, UTC+09:00], 'pv_id': CategoricalDtype(categories=['PV_ID_10', 'PV_ID_114', 'PV_ID_117', 'PV_ID_121',
                  'PV_ID_122', 'PV_ID_134', 'PV_ID_165', 'PV_ID_173',
                  'PV_ID_175', 'PV_ID_180', 'PV_ID_182', 'PV_ID_183',
                  'PV_ID_192', 'PV_ID_194', 'PV_ID_204', 'PV_ID_23',
                  'PV_ID_29', 'PV_ID_39', 'PV_ID_48', 'PV_ID_49', 'PV_ID_64',
                  'PV_ID_7', 'PV_ID_72', 'PV_ID_78', 'PV_ID_8', 'PV_ID_82',
                  'PV_ID_95'],
, ordered=False, categories_dtype=string)}
submission nins null: 0
submission duplicate key rows: 0
