In [1]:
# ==============================================
# Step 1 ‚Äî Import & Feature Definition
# ==============================================
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
import joblib

# ‚úÖ ÏµúÏ¢Ö 17Í∞ú feature (VIF ÏïàÏ†ïÌôî Î≤ÑÏ†Ñ)
final_feats = [
    'humidity', 'uv_idx', 'ceiling', 'appr_temp', 'uv_cloud_adj',
    'wind_spd_b', 'is_rain', 'rain', 'dow',
    'doy_cos', 'hour_cos', 'snow', 'hour_sin', 'doy_sin',
    'coord1', 'coord2', 'haze'
]
target_col = "nins"
print("‚úÖ Using 17 features:", len(final_feats))


‚úÖ Using 17 features: 17


In [5]:
# ==============================================
# Step 2 ‚Äî Define add_features()
# ==============================================
def add_features(df):
    df = df.copy()
    df["time"] = pd.to_datetime(df["time"])

    # ‚ë† ÏãúÍ∞Ñ Í∏∞Î∞ò ÌååÏÉù
    df["hour"] = df["time"].dt.hour
    df["dow"] = df["time"].dt.dayofweek
    df["doy"] = df["time"].dt.dayofyear

    # ‚ë° Ï£ºÍ∏∞ Ïù∏ÏΩîÎî©
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
    df["doy_sin"] = np.sin(2 * np.pi * df["doy"] / 365)
    df["doy_cos"] = np.cos(2 * np.pi * df["doy"] / 365)

    # ‚ë¢ Í∞ÄÏãúÍ±∞Î¶¨ Í∏∞Î∞ò haze
    if "visibility" in df.columns:
        df["haze"] = 1 / (df["visibility"] + 1)
    else:
        df["haze"] = 0

    # ‚ë£ Í∞ïÏàò Ïó¨Î∂Ä (is_rain)
    if "rain" in df.columns:
        df["is_rain"] = (df["rain"] > 0).astype(int)
    else:
        df["is_rain"] = 0

    # ‚ë§ ÏûêÏô∏ÏÑ† √ó Íµ¨Î¶Ñ Î≥¥Ï†ï (uv_cloud_adj)
    if {"uv_idx", "cloud"} <= set(df.columns):
        df["uv_cloud_adj"] = df["uv_idx"] * (1 - df["cloud"] / 100)
    else:
        df["uv_cloud_adj"] = df.get("uv_idx", 0)

    return df



In [6]:
# ==============================================
# Step 3 ‚Äî Load train/test
# ==============================================
print("üöÄ Loading train & test CSVs ...")

train = pd.read_csv("train.csv", low_memory=True, memory_map=True)
test  = pd.read_csv("test.csv",  low_memory=True, memory_map=True)
print(f"‚úÖ Train shape: {train.shape} | Test shape: {test.shape}")

# ÌååÏÉùÎ≥ÄÏàò ÏÉùÏÑ±
train = add_features(train)
test  = add_features(test)


üöÄ Loading train & test CSVs ...
‚úÖ Train shape: (19236948, 33) | Test shape: (2838240, 32)


In [7]:
# ==============================================
# Step 4 ‚Äî Feature Engineering
# ==============================================
train = train.dropna(subset=[target_col])
X = train[final_feats]
y = train[target_col]
print("‚úÖ Features ready:", X.shape)


‚úÖ Features ready: (19236948, 17)


In [8]:
# ==============================================
# Step 5 ‚Äî VIF Sampling Check
# ==============================================
sample_frac = 0.07
df_vif_sample = train[final_feats].sample(frac=sample_frac, random_state=42)
df_vif_sample = df_vif_sample.fillna(0).astype("float32")

print(f"‚öôÔ∏è Using sample of {len(df_vif_sample):,} rows (‚âà {sample_frac*100:.1f}%) for FINAL VIF check")

def compute_vif(df):
    X = df.select_dtypes(include=[np.number])
    vif_df = pd.DataFrame({
        "feature": X.columns,
        "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    })
    return vif_df.sort_values("VIF", ascending=False).reset_index(drop=True)

vif_final = compute_vif(df_vif_sample)
print("\nüîç FINAL VIF results (sample-based, 17 features):")
display(vif_final)


‚öôÔ∏è Using sample of 1,346,586 rows (‚âà 7.0%) for FINAL VIF check


  vif = 1. / (1. - r_squared_i)



üîç FINAL VIF results (sample-based, 17 features):


  return 1 - self.ssr/self.uncentered_tss


Unnamed: 0,feature,VIF
0,uv_idx,inf
1,uv_cloud_adj,inf
2,humidity,6.782844
3,ceiling,4.551765
4,appr_temp,4.178122
5,wind_spd_b,2.463356
6,is_rain,1.82335
7,rain,1.448978
8,dow,1.060158
9,doy_cos,1.053712


In [9]:
# ==============================================
# Step 6 ‚Äî Train/Valid Split
# ==============================================
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"‚úÖ Train={X_train.shape}, Valid={X_valid.shape}")


‚úÖ Train=(15389558, 17), Valid=(3847390, 17)


In [11]:
# ==============================================
# Step 7 ‚Äî Model Training
# ==============================================
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.04,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=4,
    random_state=42,
    tree_method="hist",
    n_jobs=-1
)

print("üöÄ Training start ...")
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=200)


üöÄ Training start ...
[0]	validation_0-rmse:298.50138
[200]	validation_0-rmse:128.29964
[400]	validation_0-rmse:118.86529
[600]	validation_0-rmse:112.33741
[800]	validation_0-rmse:108.82772
[999]	validation_0-rmse:106.19465


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [12]:
# ==============================================
# Step 8 ‚Äî Validation
# ==============================================
y_pred = model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
print(f"‚úÖ Validation MAE: {mae:.4f}")


‚úÖ Validation MAE: 58.0535


In [13]:
# ==============================================
# Step 9 ‚Äî Full-data Train & Save
# ==============================================
X_full = train[final_feats]
y_full = train[target_col]

final_model = XGBRegressor(
    n_estimators=2500,
    learning_rate=0.04,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=4,
    random_state=42,
    tree_method="hist",
    n_jobs=-1
)

print("\nüöÄ Full-data training start ...")
final_model.fit(X_full, y_full)
print("‚úÖ Full-data training complete")

joblib.dump(final_model, "xgb_full_final.pkl")
print("üíæ Saved ‚Üí xgb_full_final.pkl")



üöÄ Full-data training start ...
‚úÖ Full-data training complete
üíæ Saved ‚Üí xgb_full_final.pkl


In [19]:
# ==============================================
# Step 10 ‚Äî Test ÏòàÏ∏° + (ÏÑ†ÌÉù) Ïã¨Ïïº ÌÅ¥Îû®ÌîÑ + ÏïàÏ†Ñ Î≥ëÌï© + Ï†ÄÏû•
# ==============================================
print("\nüöÄ Inference on test set ...")
X_test = test[final_feats].fillna(0)
pred_test = final_model.predict(X_test)
pred_test = np.clip(pred_test, 0, None)  # ÏùåÏàò ÌÅ¥Î¶¨Ìïë

# (ÏÑ†ÌÉù) Ïã¨Ïïº/Î¨¥ÏûêÏô∏ÏÑ† Íµ¨Í∞Ñ 0ÏúºÎ°ú Î≥¥Ï†ï ‚Äî ÏòàÏ∏° ÏßÅÌõÑ Ïó¨Í∏∞!
try:
    # uv_idxÍ∞Ä 0(ÎòêÎäî Îß§Ïö∞ ÎÇÆÏùå)Ïù¥Í±∞ÎÇò, Ïã¨Ïïº ÏãúÍ∞ÑÎåÄÎäî 0ÏúºÎ°ú
    uv_mask = (test.get('uv_idx', 0) <= 0)
    hour_mask = (test['hour'].between(0, 5) | test['hour'].between(20, 23))
    mask_night = (uv_mask | hour_mask).values  # numpy boolean
    pred_test[mask_night] = 0.0
    print(f"üåô night clamp applied: {mask_night.sum()} rows set to 0")
except Exception as e:
    print("‚ö†Ô∏è night clamp skipped:", e)

print(f"‚úÖ Test prediction complete | min={pred_test.min():.4f}, max={pred_test.max():.4f}")

# sample_submission Î°úÎìú
sub = pd.read_csv("submission_sample.csv")
print(f"üìÑ sample_submission shape: {sub.shape} | test shape: {test.shape}")

# Í∏∏Ïù¥ Í∞ôÏúºÎ©¥ ÏßÅÏ†ë Ï£ºÏûÖ
if len(sub) == len(pred_test):
    if "nins" not in sub.columns:
        sub["nins"] = 0.0
    sub["nins"] = pred_test.astype("float32")
    out = sub.copy()
    print("üîó Applied direct assignment (len match).")
else:
    # (Î≥¥Ìò∏) ÌÇ§ Î≥ëÌï© Í≤ΩÎ°ú
    merge_keys = [c for c in ["time", "pv_id", "type"] if c in test.columns and c in sub.columns]
    for df_ in (test, sub):
        if "time" in df_.columns:
            df_["time"] = df_["time"].astype(str)
    ref = test[merge_keys].copy()
    ref["nins"] = pred_test.astype("float32")
    ref = ref.drop_duplicates(subset=merge_keys, keep="last")
    out = sub.drop(columns=["nins"], errors="ignore").merge(ref, on=merge_keys, how="left")

nan_before = out["nins"].isna().sum()
out["nins"] = out["nins"].fillna(0.0).astype("float32")
out.to_csv("result_submission.csv", index=False)
print("üíæ Saved ‚Üí result_submission.csv")

# Ï≤¥ÌÅ¨
try:
    from IPython.display import display
    print("\n‚úÖ Submission preview:")
    display(out.head())
except Exception:
    print(out.head())
print(f"nins null before fill: {nan_before} | after fill: {out['nins'].isna().sum()}")
key_cols = [c for c in ["time", "pv_id", "type"] if c in out.columns]
print("duplicate key rows:", out.duplicated(key_cols).sum() if key_cols else "(skip)")





üöÄ Inference on test set ...
üåô night clamp applied: 1208943 rows set to 0
‚úÖ Test prediction complete | min=0.0000, max=1266.2826
üìÑ sample_submission shape: (2838240, 4) | test shape: (2838240, 42)
üîó Applied direct assignment (len match).
üíæ Saved ‚Üí result_submission.csv

‚úÖ Submission preview:


Unnamed: 0,time,pv_id,type,nins
0,2024-08-01 00:05:00+09:00,PV_ID_7,test,0.0
1,2024-08-01 00:10:00+09:00,PV_ID_7,test,0.0
2,2024-08-01 00:15:00+09:00,PV_ID_7,test,0.0
3,2024-08-01 00:20:00+09:00,PV_ID_7,test,0.0
4,2024-08-01 00:25:00+09:00,PV_ID_7,test,0.0


nins null before fill: 0 | after fill: 0
duplicate key rows: 0
