In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/china-real-estate-demand-prediction/sample_submission.csv
/kaggle/input/china-real-estate-demand-prediction/test.csv
/kaggle/input/china-real-estate-demand-prediction/train/city_search_index.csv
/kaggle/input/china-real-estate-demand-prediction/train/land_transactions_nearby_sectors.csv
/kaggle/input/china-real-estate-demand-prediction/train/new_house_transactions_nearby_sectors.csv
/kaggle/input/china-real-estate-demand-prediction/train/city_indexes.csv
/kaggle/input/china-real-estate-demand-prediction/train/pre_owned_house_transactions.csv
/kaggle/input/china-real-estate-demand-prediction/train/new_house_transactions.csv
/kaggle/input/china-real-estate-demand-prediction/train/land_transactions.csv
/kaggle/input/china-real-estate-demand-prediction/train/sector_POI.csv
/kaggle/input/china-real-estate-demand-prediction/train/pre_owned_house_transactions_nearby_sectors.csv


In [2]:
# =========================
# Real Estate Demand — Metric-aware, Leakage-safe Ensemble
# =========================
import numpy as np
import pandas as pd
from pathlib import Path
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [3]:
# ---------- Paths ----------
PTH = Path("/kaggle/input/china-real-estate-demand-prediction")
OUT = Path("/kaggle/working")

In [4]:
# ---------- Helpers ----------
MONTH2NUM = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6,
                 Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)


In [5]:
def parse_time(df, month_col="month"):
    # df['month'] like '2020-Jan' or '2024 Aug' in test id
    if month_col == "month":
        yr = df[month_col].str.slice(0, 4).astype(int)
        mo_txt = df[month_col].str.slice(5)
        mo = mo_txt.map(MONTH2NUM).astype(int)
    else:
        # for test: month_text like '2024 Aug'
        yr = df[month_col].str.slice(0, 4).astype(int)
        mo_txt = df[month_col].str.slice(5)
        mo = mo_txt.map(MONTH2NUM).astype(int)
    df["year"] = yr
    df["month_num"] = mo
    df["time"] = (df["year"] - 2019) * 12 + df["month_num"] - 1
    return df

In [6]:
def split_test_id(test):
    parts = test["id"].str.split("_", expand=True)
    test["month_text"] = parts[0]
    test["sector"] = parts[1]
    test["sector_id"] = test["sector"].str.replace("sector ", "", regex=False).astype(int)
    test = parse_time(test.rename(columns={"month_text":"month_parsed"}), "month_parsed")
    return test

In [7]:
# Lunar New Year (LNY) month indicator for 2019–2024 (simple, robust flag)
# source: public calendars; encoded explicitly to avoid internet calls.
LNY_MONTH_BY_YEAR = {2019:2, 2020:1, 2021:2, 2022:2, 2023:1, 2024:2}

In [8]:
def add_calendar_feats(df):
    m = df["month_num"]
    df["sin12"] = np.sin(2*np.pi*(m-1)/12)
    df["cos12"] = np.cos(2*np.pi*(m-1)/12)
    df["sin6"]  = np.sin(2*np.pi*(m-1)/6)
    df["cos6"]  = np.cos(2*np.pi*(m-1)/6)
    df["qtr"]   = ((m-1)//3 + 1).astype(int)

    # LNY month flag and the month before LNY
    df["lny_month"] = df["year"].map(LNY_MONTH_BY_YEAR).fillna(0).astype(int)
    df["is_lny"]    = (df["month_num"] == df["lny_month"]).astype(int)
    df["is_pre_lny"]= ((df["month_num"] % 12) == ((df["lny_month"]-1-1) % 12)+1).astype(int)
    return df


In [9]:
def ewgm(x, n_lags=6, alpha=0.6):
    """Exponentially Weighted Geometric Mean on tail values > 0."""
    vals = x[-n_lags:]
    if len(vals)==0 or (vals>0).sum()==0: 
        return 0.0
    w = np.array([alpha**(n_lags-1-i) for i in range(n_lags)], float)
    w = w / w.sum()
    pos = vals>0
    if pos.sum()==0: 
        return 0.0
    logv = np.log(np.where(vals[pos]<=0, 1e-12, vals[pos]))
    ww = w[pos] / w[pos].sum()
    return float(np.exp((ww*logv).sum()))

In [10]:
def december_multipliers(wide_amount, clip=(0.85, 1.4)):
    idx = wide_amount.index
    is_dec = (idx % 12)==11
    dec_mean = wide_amount[is_dec].mean(0)
    other_mean = wide_amount[~is_dec].mean(0)
    overall = dec_mean.mean()/(other_mean.mean()+1e-12)
    mult = dec_mean/(other_mean+1e-12)
    mult = mult.fillna(overall).replace([np.inf,-np.inf], 1.0)
    return mult.clip(*clip)

In [11]:
def custom_score(y_true, y_pred, eps=1e-12):
    y_true = np.asarray(y_true, float)
    y_pred = np.maximum(np.asarray(y_pred, float), 0.0)
    ape = np.abs(y_true - y_pred) / np.maximum(y_true, eps)
    if (ape>1).mean() > 0.30:
        return 0.0
    good = ape<=1
    if not np.any(good):
        return 0.0
    mape = ape[good].mean()
    fraction = good.mean()
    return max(0.0, 1.0 - mape/(fraction+eps))

In [12]:
# ---------- Load ----------
nht = pd.read_csv(PTH/"train/new_house_transactions.csv")
pht = pd.read_csv(PTH/"train/pre_owned_house_transactions.csv")
lt  = pd.read_csv(PTH/"train/land_transactions.csv")
nht_ns = pd.read_csv(PTH/"train/new_house_transactions_nearby_sectors.csv")
pht_ns = pd.read_csv(PTH/"train/pre_owned_house_transactions_nearby_sectors.csv")
lt_ns  = pd.read_csv(PTH/"train/land_transactions_nearby_sectors.csv")
poi = pd.read_csv(PTH/"train/sector_POI.csv")
city_idx = pd.read_csv(PTH/"train/city_indexes.csv")
test = pd.read_csv(PTH/"test.csv")

In [13]:
# ---------- Parse basics ----------
for df in [nht, pht, lt, nht_ns, pht_ns, lt_ns]:
    df["sector_id"] = df["sector"].str.replace("sector ", "", regex=False).astype(int)
    parse_time(df)

parse_time(poi.assign(month="2019-Jan"))  # inject year for join-safety
poi["sector_id"] = poi["sector"].str.replace("sector ", "", regex=False).astype(int)

In [14]:
# join city indexes by year (no .head(6) truncation)
city_idx.rename(columns={"city_indicator_data_year":"year"}, inplace=True)

In [15]:
# ---------- Build base panel ----------
# one row per (sector_id, time)
times = nht["time"].unique()
sectors = np.arange(1, 97)
base = (pd.MultiIndex.from_product([times, sectors], names=["time","sector_id"])
        .to_frame(index=False)
        .sort_values(["sector_id","time"]))

In [16]:
# attach month/year & season features
base["year"] = 2019 + (base["time"]//12)
base["month_num"] = (base["time"]%12) + 1
add_calendar_feats(base)

Unnamed: 0,time,sector_id,year,month_num,sin12,cos12,sin6,cos6,qtr,lny_month,is_lny,is_pre_lny
0,0,1,2019,1,0.000000e+00,1.000000e+00,0.000000e+00,1.0,1,2,0,1
96,1,1,2019,2,5.000000e-01,8.660254e-01,8.660254e-01,0.5,1,2,1,0
192,2,1,2019,3,8.660254e-01,5.000000e-01,8.660254e-01,-0.5,1,2,0,0
288,3,1,2019,4,1.000000e+00,6.123234e-17,1.224647e-16,-1.0,2,2,0,0
384,4,1,2019,5,8.660254e-01,-5.000000e-01,-8.660254e-01,-0.5,2,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6047,62,96,2024,3,8.660254e-01,5.000000e-01,8.660254e-01,-0.5,1,2,0,0
6143,63,96,2024,4,1.000000e+00,6.123234e-17,1.224647e-16,-1.0,2,2,0,0
6239,64,96,2024,5,8.660254e-01,-5.000000e-01,-8.660254e-01,-0.5,2,2,0,0
6335,65,96,2024,6,5.000000e-01,-8.660254e-01,-8.660254e-01,0.5,2,2,0,0


In [17]:
# attach targets and raw features for current month (no future joins)
def safe_merge(df, cols, suffix):
    keep = ["time","sector_id"] + cols
    out = base.merge(df[keep], on=["time","sector_id"], how="left", suffixes=("",""))
    out.rename(columns={c: f"{c}{suffix}" for c in cols}, inplace=True)
    return out


In [18]:
# ---------- 从这里替换：safe_merge + 各数据表 merge ----------

# 需要排除的列（已经在 base 里，或不是我们想拼接的特征）
EXCLUDE = ["time", "sector_id", "sector", "month", "year", "month_num"]

def safe_merge(df, cols, suffix):
    """
    右表先把要保留的特征列重命名为带后缀的名字，然后再和 base 按键合并，避免列名冲突。
    """
    right = df[["time", "sector_id"] + cols].copy()
    right.rename(columns={c: f"{c}{suffix}" for c in cols}, inplace=True)
    out = base.merge(right, on=["time", "sector_id"], how="left")
    return out


In [19]:
# 先合 new house（本月的真实值，用来构造 y 的未来月标签）
base = safe_merge(nht, ["amount_new_house_transactions"], "_nht")

# 注意：下面各表的 cols 都排除了 EXCLUDE，避免 year / month_num 等冲突
cols_nhtns = nht_ns.columns.difference(EXCLUDE).tolist()
base = safe_merge(nht_ns, cols_nhtns, "_nhtns")

cols_pht = pht.columns.difference(EXCLUDE).tolist()
base = safe_merge(pht, cols_pht, "_pht")

cols_phtns = pht_ns.columns.difference(EXCLUDE).tolist()
base = safe_merge(pht_ns, cols_phtns, "_phtns")

cols_lt = lt.columns.difference(EXCLUDE).tolist()
base = safe_merge(lt, cols_lt, "_lt")

cols_ltns = lt_ns.columns.difference(EXCLUDE).tolist()
base = safe_merge(lt_ns, cols_ltns, "_ltns")

In [20]:
# POI 只按 sector_id 合并（没有时间维度），不会冲突
base = base.merge(poi.drop(columns=["sector","month"], errors="ignore"), on="sector_id", how="left")

# 城市指数按 year 合并（base 已自带 year）
base = base.merge(city_idx, on="year", how="left")

# ---------- 下面与先前版本保持一致：排序、构造标签与特征 ----------

base = base.sort_values(["sector_id","time"]).reset_index(drop=True)

In [21]:
def add_lags_rollings(df, col, group, lags=(1,2,3,6,12), windows=(3,6,12)):
    df = df.copy()
    g = df.groupby(group)[col]
    for L in lags:
        df[f"{col}_lag{L}"] = g.shift(L)
    for W in windows:
        df[f"{col}_ma{W}"] = g.shift(1).rolling(W, min_periods=1).mean()
        df[f"{col}_med{W}"] = g.shift(1).rolling(W, min_periods=1).median()
    return df


In [22]:

def detect_amount_col_in_nht(df):
    """
    在 new_house_transactions.csv 中自动寻找金额列。
    优先匹配：'amount_new_house_transactions'、'new_house_transaction_amount'
    否则：选择包含 'amount' 或 'transaction' 且不包含 'area'/'price' 的数值列。
    """
    cols = df.columns.tolist()
    preferred = ["amount_new_house_transactions", "new_house_transaction_amount"]
    for name in preferred:
        if name in df.columns and pd.api.types.is_numeric_dtype(df[name]):
            return name

    # 打分选择
    cand = []
    for c in cols:
        lc = c.lower()
        if not pd.api.types.is_numeric_dtype(df[c]):
            continue
        score = 0
        if "amount" in lc: score += 10
        if "transact" in lc or "transaction" in lc or "txn" in lc: score += 8
        if "house" in lc and "new" in lc: score += 4
        # 惩罚面积/价格类
        if "area" in lc or "price" in lc or "per_area" in lc or "avg" in lc:
            score -= 6
        if score > 0:
            cand.append((score, c))
    if cand:
        cand.sort(reverse=True)
        return cand[0][1]
    # 实在没有就选一个最可能的数值列（最后兜底）
    num_cols = [c for c in cols if pd.api.types.is_numeric_dtype(df[c])]
    if not num_cols:
        raise KeyError("在 nht 表中未找到任何数值型列，无法识别目标列。")
    return num_cols[0]

# 1) 如果 base 中没有任何 *_nht 列，就从 nht 回填一个统一名字 target_nht
nht_amount_src = detect_amount_col_in_nht(nht)
if not any(col.endswith("_nht") for col in base.columns):
    base = base.merge(
        nht[["time", "sector_id", nht_amount_src]].rename(columns={nht_amount_src: "target_nht"}),
        on=["time", "sector_id"], how="left"
    )
    TARGET_COL = "target_nht"
else:
    # 有 *_nht 就优先用匹配度最高的
    def choose_best_nht_col(df):
        cands = [c for c in df.columns if c.endswith("_nht")]
        def score(c):
            s = c.lower()
            sc = 0
            if "amount_new_house_transactions" in s: sc += 10
            if "new_house_transaction_amount" in s: sc += 9
            if "amount" in s: sc += 5
            if "transaction" in s or "transact" in s or "txn" in s: sc += 4
            if "area" in s or "price" in s: sc -= 3
            return sc
        cands.sort(key=lambda x: (score(x), len(x)), reverse=True)
        for c in cands:
            if pd.api.types.is_numeric_dtype(df[c]):
                return c
        return cands[0]
    TARGET_COL = choose_best_nht_col(base)

print("Using TARGET_COL =", TARGET_COL)

Using TARGET_COL = amount_new_house_transactions_nht


In [23]:
# 2) 可能用到的辅助列（存在才用）
def find_optional_col(df, must_end=None, contains_any=()):
    for c in df.columns:
        if must_end and not c.endswith(must_end): 
            continue
        s = c.lower()
        if all(kw in s for kw in contains_any) and pd.api.types.is_numeric_dtype(df[c]):
            return c
    return None

NEIGHBOR_NHT_COL = find_optional_col(base, must_end="_nhtns", contains_any=("amount",))
PHT_BUILDING_COL = (find_optional_col(base, must_end="_pht", contains_any=("building","area"))
                    or find_optional_col(base, must_end="_pht", contains_any=("area",)))
LT_TRANS_AREA_COL = (find_optional_col(base, must_end="_ltns", contains_any=("transacted","area"))
                     or find_optional_col(base, must_end="_lt", contains_any=("transacted","area")))

print("Aux columns:",
      "\n  NEIGHBOR_NHT_COL =", NEIGHBOR_NHT_COL,
      "\n  PHT_BUILDING_COL =", PHT_BUILDING_COL,
      "\n  LT_TRANS_AREA_COL =", LT_TRANS_AREA_COL)


Aux columns: 
  NEIGHBOR_NHT_COL = amount_new_house_transactions_nearby_sectors_nhtns 
  PHT_BUILDING_COL = area_pre_owned_house_transactions_pht 
  LT_TRANS_AREA_COL = None


In [24]:
# 3) 排序 & 构造标签（预测下个月 → shift(-1)）
base = base.sort_values(["sector_id","time"]).reset_index(drop=True)
base["y"] = base.groupby("sector_id")[TARGET_COL].shift(-1)


In [25]:
# 4) 构造滞后/滚动特征（仅对存在列）
def add_lags_rollings(df, col, group, lags=(1,2,3,6,12), windows=(3,6,12)):
    df = df.copy()
    g = df.groupby(group)[col]
    for L in lags:
        df[f"{col}_lag{L}"] = g.shift(L)
    for W in windows:
        df[f"{col}_ma{W}"] = g.shift(1).rolling(W, min_periods=1).mean()
        df[f"{col}_med{W}"] = g.shift(1).rolling(W, min_periods=1).median()
    return df

base = add_lags_rollings(base, TARGET_COL, ["sector_id"], lags=(1,2,3,6,12), windows=(3,6,12))
for col in [NEIGHBOR_NHT_COL, PHT_BUILDING_COL, LT_TRANS_AREA_COL]:
    if col and (col in base.columns):
        base = add_lags_rollings(base, col, ["sector_id"], lags=(1,2,6,12), windows=(3,6))

In [26]:
# 5) 划分训练/测试
train_df = base[base["time"]<=66].copy()
test_df  = base[(base["time"]>=67)&(base["time"]<=78)].copy()

In [27]:
# 6) 类别列
CAT_COLS = ["month_num","qtr"]
for c in CAT_COLS:
    if c in train_df.columns:
        train_df[c] = train_df[c].astype("category")
    if c in test_df.columns:
        test_df[c]  = test_df[c].astype("category")

In [28]:
# 7) 特征列（排除目标与键）
DROP_COLS = {"y", TARGET_COL, "time", "sector_id", "year"}
FEATURES = [c for c in train_df.columns if c not in DROP_COLS]

In [29]:
# 8) 二阶段目标
# —— 训练 / 测试划分（已有）
train_df = base[base["time"]<=66].copy()
test_df  = base[(base["time"]>=67)&(base["time"]<=78)].copy()

# === 新增：清理标签与特征中的 NaN/Inf（在计算 y_pos 之前）===
# 把正负无穷替换为 NaN
for df in (train_df, test_df):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

# 只保留 y 有真实值的训练样本（y 是由 shift(-1) 得来）
train_df = train_df[train_df["y"].notna()].copy()

# （可选）若仍担心，有极端小概率 y < 0 的脏值时做裁剪
train_df["y"] = train_df["y"].clip(lower=0)

# 现在再生成二阶段分类标签就不会有警告了
train_df["y_pos"] = (train_df["y"] > 0).astype(int)



In [30]:
# Baselines
def ewgm(x, n_lags=6, alpha=0.6):
    vals = np.asarray(x)[-n_lags:]
    if len(vals)==0 or (vals>0).sum()==0:
        return 0.0
    w = np.array([alpha**(n_lags-1-i) for i in range(n_lags)], float)
    w = w / w.sum()
    pos = vals>0
    if pos.sum()==0: 
        return 0.0
    logv = np.log(np.where(vals[pos]<=0, 1e-12, vals[pos]))
    ww = w[pos] / w[pos].sum()
    return float(np.exp((ww*logv).sum()))

In [31]:
def december_multipliers(wide_amount, clip=(0.85, 1.4)):
    idx = wide_amount.index
    is_dec = (idx % 12)==11
    dec_mean = wide_amount[is_dec].mean(0)
    other_mean = wide_amount[~is_dec].mean(0)
    overall = dec_mean.mean()/(other_mean.mean()+1e-12)
    mult = dec_mean/(other_mean+1e-12)
    mult = mult.fillna(overall).replace([np.inf,-np.inf], 1.0)
    return mult.clip(*clip)

In [32]:
def build_baselines(df_all, df_te, value_col):
    # ---- 1) 只保留需要的三列，并做去重聚合 ----
    core = df_all.loc[:, ["time", "sector_id", value_col]].copy()
    core = core.dropna(subset=["time", "sector_id"])
    core["time"] = core["time"].astype(int)
    core["sector_id"] = core["sector_id"].astype(int)
    core[value_col] = pd.to_numeric(core[value_col], errors="coerce")

    # 对 (time, sector_id) 去重聚合：取最大值（通常等同于“非空优先”）
    # 也可以换成 .mean()，两者差别很小；max 对于含 NaN 的重复更稳
    core = (
        core.groupby(["time", "sector_id"], as_index=False)[value_col]
            .max()
    )

    # ---- 2) 再 pivot，保证唯一性 ----
    wide = core.pivot(index="time", columns="sector_id", values=value_col).fillna(0.0)

    # ---- 3) 计算季节基线（与原逻辑一致）----
    dec_mult = december_multipliers(wide)
    idx_h = np.arange(67, 79)

    # EWGM
    ewgm_pred = pd.DataFrame(0.0, index=idx_h, columns=wide.columns)
    for s in wide.columns:
        ewgm_pred.loc[idx_h, s] = ewgm(wide[s].values, n_lags=12, alpha=0.6)

    # Holt-Winters
    from statsmodels.tsa.holtwinters import ExponentialSmoothing
    hw_pred = pd.DataFrame(0.0, index=idx_h, columns=wide.columns)
    for s in wide.columns:
        arr = wide[s].values
        try:
            hw = ExponentialSmoothing(
                arr, trend="add", seasonal="add", seasonal_periods=12,
                initialization_method="estimated"
            ).fit()
            f = hw.forecast(12)
            hw_pred.loc[idx_h, s] = np.maximum(f, 0.0)
        except Exception:
            hw_pred.loc[idx_h, s] = np.maximum(arr[-1], 0.0)

    # December bump
    is_dec = (hw_pred.index % 12) == 11
    hw_pred.loc[is_dec] = (hw_pred.loc[is_dec] * dec_mult).values
    ewgm_pred.loc[is_dec] = (ewgm_pred.loc[is_dec] * dec_mult).values

    # 展平成测试行顺序；df_te 即使有重复键也能对应映射
    te_key = df_te[["time", "sector_id"]].copy()
    hw_flat = te_key.merge(
        hw_pred.stack().rename("hw").reset_index().rename(columns={"level_1": "sector_id"}),
        on=["time", "sector_id"], how="left"
    )["hw"].values
    ew_flat = te_key.merge(
        ewgm_pred.stack().rename("ew").reset_index().rename(columns={"level_1": "sector_id"}),
        on=["time", "sector_id"], how="left"
    )["ew"].values

    return hw_flat, ew_flat


In [33]:
# --- 统一清洗：替换inf为NaN，丢掉无标签行，裁剪负值 ---
for df in (train_df, test_df):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

# 训练集只保留 y 有值的样本
train_df = train_df[train_df["y"].notna()].copy()
train_df["y"] = pd.to_numeric(train_df["y"], errors="coerce").clip(lower=0)

In [34]:
# --- 面板去重聚合：避免一键多行 ---
def dedupe_panel(df, key=("time","sector_id"), cat_cols=("month_num","qtr")):
    df = df.copy()
    # 保证键是整数
    df[key[0]] = df[key[0]].astype(int)
    df[key[1]] = df[key[1]].astype(int)

    # 数值列和类别列识别
    num_cols = [c for c in df.columns if c not in key and pd.api.types.is_numeric_dtype(df[c])]
    # —— 关键：把“看起来是数字但变成了 object”的列也转回数值 —— #
    for c in df.columns:
        if c not in key and df[c].dtype == "object":
            # 尝试转为数值；失败则变 NaN（不会影响聚合）
            df[c] = pd.to_numeric(df[c], errors="coerce")

    num_cols = [c for c in df.columns if c not in key and pd.api.types.is_numeric_dtype(df[c])]
    cat_cols = [c for c in cat_cols if c in df.columns]

    agg = {c: "mean" for c in num_cols}
    for c in cat_cols:
        agg[c] = "first"

    df = df.groupby(list(key), as_index=False).agg(agg)
    return df


In [35]:
train_df = dedupe_panel(train_df)
test_df  = dedupe_panel(test_df)


In [36]:
import warnings, numpy as np, pandas as pd

# 遇到 NaN 与比较时不再警告（只针对这类 runtime）
np.seterr(invalid="ignore")

# 把这类警告都忽略（仅显示层面）
warnings.filterwarnings("ignore", category=RuntimeWarning, message="invalid value encountered.*")

# 统一浮点显示（可选）
pd.options.display.float_format = lambda x: f"{x:.6f}"


In [37]:
import warnings, numpy as np, pandas as pd
np.seterr(invalid="ignore")  # 避免 NaN 比较时的 runtime 警告
warnings.filterwarnings("ignore", category=RuntimeWarning, message="invalid value encountered.*")
pd.options.display.float_format = lambda x: f"{x:.6f}"


In [38]:
# ---- 更稳的去重聚合（将 object 数值列转回 numeric 再 groupby）----
def dedupe_panel(df, key=("time","sector_id"), cat_cols=("month_num","qtr")):
    df = df.copy()
    df[key[0]] = df[key[0]].astype(int)
    df[key[1]] = df[key[1]].astype(int)
    # 把“看起来是数字却是 object”的列转为数值
    for c in df.columns:
        if c not in key and df[c].dtype == "object":
            df[c] = pd.to_numeric(df[c], errors="coerce")
    # 聚合：数值均值、分类取第一条
    num_cols = [c for c in df.columns if c not in key and pd.api.types.is_numeric_dtype(df[c])]
    agg = {c: "mean" for c in num_cols}
    for c in cat_cols:
        if c in df.columns:
            agg[c] = "first"
    return df.groupby(list(key), as_index=False).agg(agg)

In [39]:
# ---- 基础清洗 ----
for df in (train_df, test_df):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [40]:
# 训练集仅保留有标签的样本，并裁掉负值
train_df = train_df[train_df["y"].notna()].copy()
train_df["y"] = pd.to_numeric(train_df["y"], errors="coerce").clip(lower=0)


In [41]:
# 去重聚合，确保一键一行
train_df = dedupe_panel(train_df)
test_df  = dedupe_panel(test_df)

In [42]:
# ---- 类别列 ----
CAT_COLS = [c for c in ["month_num","qtr"] if c in train_df.columns]
for c in CAT_COLS:
    train_df[c] = train_df[c].astype("category")
    if c in test_df.columns:
        test_df[c] = test_df[c].astype("category")

In [43]:
# ---- 特征列（去掉目标与主键）----
DROP_COLS = {"y", "time", "sector_id", "year"}
if TARGET_COL in train_df.columns:
    DROP_COLS.add(TARGET_COL)
FEATURES = [c for c in train_df.columns if c not in DROP_COLS]

In [44]:
def build_baselines(df_all, df_te, value_col):
    """
    先对 (time, sector_id) 去重聚合，再 pivot 得 wide；
    生成 EWGM 和 Holt-Winters 12 期预测；
    用 melt 显式产出带 ['time','sector_id'] 的长表，与 test_df 键合并。
    """
    # ---- 1) 去重聚合，保证一键一值 ----
    core = df_all.loc[:, ["time", "sector_id", value_col]].copy()
    core = core.dropna(subset=["time", "sector_id"])
    core["time"] = core["time"].astype(int)
    core["sector_id"] = core["sector_id"].astype(int)
    core[value_col] = pd.to_numeric(core[value_col], errors="coerce")

    # 聚合：max 或 mean 都可；max 对含 NaN 的重复更稳
    core = core.groupby(["time", "sector_id"], as_index=False)[value_col].max()

    # ---- 2) 透视成 wide[time x sector] ----
    wide = core.pivot(index="time", columns="sector_id", values=value_col).fillna(0.0)
    wide = wide.sort_index()
    wide.columns = wide.columns.astype(int)

    # ---- 3) 计算 December bump ----
    dec_mult = december_multipliers(wide)
    idx_h = np.arange(67, 79)

    # ---- 4) EWGM 预测 ----
    ewgm_pred = pd.DataFrame(0.0, index=idx_h, columns=wide.columns, dtype=float)
    for s in wide.columns:
        ewgm_pred.loc[idx_h, s] = ewgm(wide[s].values, n_lags=12, alpha=0.6)
    # 命名 index/columns，避免后续列名缺失
    ewgm_pred.index.name = "time"
    ewgm_pred.columns.name = "sector_id"

    # ---- 5) Holt-Winters 预测（不收敛时回退为最后值）----
    from statsmodels.tsa.holtwinters import ExponentialSmoothing
    hw_pred = pd.DataFrame(0.0, index=idx_h, columns=wide.columns, dtype=float)
    for s in wide.columns:
        arr = wide[s].astype(float).values
        try:
            model = ExponentialSmoothing(
                arr, trend="add", seasonal="add", seasonal_periods=12,
                initialization_method="estimated"
            ).fit()
            f = model.forecast(12)
            hw_pred.loc[idx_h, s] = np.maximum(f, 0.0)
        except Exception:
            hw_pred.loc[idx_h, s] = np.maximum(arr[-1], 0.0)
    hw_pred.index.name = "time"
    hw_pred.columns.name = "sector_id"

    # ---- 6) 应用 December bump（已按列对齐）----
    is_dec = (ewgm_pred.index % 12) == 11
    # 将 dec_mult 对齐列
    dec_mult = dec_mult.reindex(hw_pred.columns).fillna(1.0)
    if is_dec.any():
        # 行向量 * 列向量的广播：给 12 月的行乘以每个 sector 的 multiplier
        hw_pred.loc[is_dec, :] = hw_pred.loc[is_dec, :].values * dec_mult.values
        ewgm_pred.loc[is_dec, :] = ewgm_pred.loc[is_dec, :].values * dec_mult.values

    # ---- 7) 展平成带键的长表（显式列名），与 test_df 键合并 ----
    hw_long = (hw_pred.reset_index()
                      .melt(id_vars="time", var_name="sector_id", value_name="hw"))
    ew_long = (ewgm_pred.reset_index()
                        .melt(id_vars="time", var_name="sector_id", value_name="ew"))
    hw_long["sector_id"] = hw_long["sector_id"].astype(int)
    ew_long["sector_id"] = ew_long["sector_id"].astype(int)

    te_key = df_te[["time", "sector_id"]].copy()
    te_key["time"] = te_key["time"].astype(int)
    te_key["sector_id"] = te_key["sector_id"].astype(int)

    hw_flat = te_key.merge(hw_long, on=["time", "sector_id"], how="left")["hw"].to_numpy()
    ew_flat = te_key.merge(ew_long, on=["time", "sector_id"], how="left")["ew"].to_numpy()

    # 缺失就补 0（很少见，通常键全能对上）
    hw_flat = np.nan_to_num(hw_flat, nan=0.0, posinf=0.0, neginf=0.0)
    ew_flat = np.nan_to_num(ew_flat, nan=0.0, posinf=0.0, neginf=0.0)
    return hw_flat, ew_flat

In [45]:
# 生成季节基线
hw_te, ew_te = build_baselines(base, test_df, TARGET_COL)

# ……你之前的后处理与提交流程保持不变……
# 二分类门控 + 收缩 + caps → final_pred
# 按键合并到 test.csv 行序 → 保存 submission.csv


  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)


In [46]:
# ====== 安全收尾：若缺少预测则即时训练，然后完成后处理与提交 ======
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from catboost import CatBoostRegressor, CatBoostClassifier, Pool

# 1) 如未计算季节基线，则先计算
if "hw_te" not in globals() or "ew_te" not in globals():
    hw_te, ew_te = build_baselines(base, test_df, TARGET_COL)


In [47]:
def train_two_stage_and_predict(train_df, test_df, FEATURES=None, CAT_COLS=None,
                                n_splits=4, seed=42):
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import TimeSeriesSplit
    from catboost import CatBoostRegressor, CatBoostClassifier, Pool

    # 1) 特征/类别列（更稳：仅用 train/test 公共列；显式排除非特征）
    if FEATURES is None:
        drop = {"y", "y_pos", "time", "sector_id", "year", "pred"}
        if "TARGET_COL" in globals() and TARGET_COL in train_df.columns:
            drop.add(TARGET_COL)
        common_cols = [c for c in train_df.columns if c in test_df.columns]
        FEATURES = [c for c in common_cols if c not in drop]

    if CAT_COLS is None:
        # 只保留也在 FEATURES 里的类别列
        guess_cats = [c for c in ["month_num", "qtr"] if c in FEATURES]
        CAT_COLS = guess_cats

    # 统一类别 dtype
    for c in CAT_COLS:
        if c in train_df.columns and not pd.api.types.is_categorical_dtype(train_df[c]):
            train_df[c] = train_df[c].astype("category")
        if c in test_df.columns and not pd.api.types.is_categorical_dtype(test_df[c]):
            test_df[c] = test_df[c].astype("category")

    # 2) 构造 X/X_te：数值与类别分开处理
    def make_X(df):
        X = df[FEATURES].copy()

        # object→numeric（能转就转，失败置 NaN）
        for c in X.columns:
            if c in CAT_COLS:
                continue
            if X[c].dtype == "object":
                X[c] = pd.to_numeric(X[c], errors="coerce")

        # 数值列填 -2
        num_cols = [c for c in X.columns if (c not in CAT_COLS) and pd.api.types.is_numeric_dtype(X[c])]
        if num_cols:
            X[num_cols] = (X[num_cols]
                           .replace([np.inf, -np.inf], np.nan)
                           .astype(float)
                           .fillna(-2.0))

        # 类别列填 "__MISSING__"
        for c in CAT_COLS:
            if c in X.columns:
                X[c] = X[c].cat.add_categories(["__MISSING__"]).fillna("__MISSING__")

        return X

    X    = make_X(train_df)
    X_te = make_X(test_df)

    # 3) 列对齐（补缺/删多，顺序一致）
    for c in X.columns:
        if c not in X_te.columns:
            if c in CAT_COLS:
                s = pd.Series(["__MISSING__"]*len(X_te), index=X_te.index, dtype="object")
                s = s.astype("category")
                X_te[c] = s
                X_te[c] = X_te[c].cat.add_categories(["__MISSING__"]).fillna("__MISSING__")
            else:
                X_te[c] = -2.0
    # 仅保留 X 中的列并按顺序排列
    X_te = X_te[X.columns]

    y     = train_df["y"].to_numpy()
    y_pos = (train_df["y"] > 0).astype(int).to_numpy()

    # 4) 时序 CV 训练与预测
    tscv = TimeSeriesSplit(n_splits=n_splits)
    pred_test_reg  = np.zeros((len(X_te), n_splits))
    pred_test_posp = np.zeros((len(X_te), n_splits))

    for k, (tr_idx, va_idx) in enumerate(tscv.split(X, y)):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        p_tr, p_va = y_pos[tr_idx], y_pos[va_idx]

        # —— 分类器：P(y>0)
        clf = CatBoostClassifier(
            depth=6, learning_rate=0.04, iterations=4000,
            loss_function="Logloss", eval_metric="AUC",
            l2_leaf_reg=3.0, random_seed=seed, verbose=False
        )
        clf.fit(Pool(X_tr, p_tr, cat_features=CAT_COLS),
                eval_set=Pool(X_va, p_va, cat_features=CAT_COLS),
                verbose=False)
        pred_test_posp[:, k] = clf.predict_proba(X_te)[:, 1]

        # —— 回归器：log1p(y) on positives
        mask_tr = y_tr > 0
        reg = CatBoostRegressor(
            depth=8, learning_rate=0.03, iterations=6000,
            loss_function="MAE", eval_metric="MAE",
            l2_leaf_reg=2.0, random_seed=seed + k, verbose=False
        )
        reg.fit(Pool(X_tr[mask_tr], np.log1p(y_tr[mask_tr]), cat_features=CAT_COLS),
                eval_set=Pool(X_va, np.log1p(np.maximum(y_va, 0)), cat_features=CAT_COLS),
                verbose=False)
        pred_test_reg[:, k] = np.expm1(np.maximum(reg.predict(X_te), 0.0))

    reg_te  = pred_test_reg.mean(axis=1)
    posp_te = pred_test_posp.mean(axis=1)
    return reg_te, posp_te


In [48]:
# ====================== 一键收尾：从此格开始跑到生成 submission.csv ======================
import warnings, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import TimeSeriesSplit
from catboost import CatBoostRegressor, CatBoostClassifier, Pool

In [49]:
# ---- 显示层与数值清理（压掉无害警告）----
np.seterr(invalid="ignore")
warnings.filterwarnings("ignore", category=RuntimeWarning, message="invalid value encountered.*")
warnings.filterwarnings("ignore", category=UserWarning, message="Optimization failed to converge.*")


In [50]:
# ---- 路径兜底 ----
try:
    OUT
except NameError:
    OUT = Path("/kaggle/working")

In [51]:
# ---- 兜底工具：去重聚合、拆分 test id、December bump、EWGM ----
def dedupe_panel(df, key=("time","sector_id"), cat_cols=("month_num","qtr")):
    df = df.copy()
    df[key[0]] = df[key[0]].astype(int)
    df[key[1]] = df[key[1]].astype(int)
    # object→numeric
    for c in df.columns:
        if c not in key and df[c].dtype == "object":
            df[c] = pd.to_numeric(df[c], errors="coerce")
    num_cols = [c for c in df.columns if c not in key and pd.api.types.is_numeric_dtype(df[c])]
    agg = {c: "mean" for c in num_cols}
    for c in cat_cols:
        if c in df.columns:
            agg[c] = "first"
    return df.groupby(list(key), as_index=False).agg(agg)


In [52]:
def split_test_id(test):
    MONTH2NUM = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6,
                     Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
    parts = test["id"].str.split("_", expand=True)
    test["month_text"] = parts[0]
    test["sector"] = parts[1]
    test["sector_id"] = test["sector"].str.replace("sector ", "", regex=False).astype(int)
    test["year"] = test["month_text"].str.slice(0, 4).astype(int)
    test["month_num"] = test["month_text"].str.slice(5).map(MONTH2NUM).astype(int)
    test["time"] = (test["year"] - 2019) * 12 + test["month_num"] - 1
    return test

In [53]:
def ewgm(arr, n_lags=12, alpha=0.6):
    v = np.asarray(arr)[-n_lags:]
    if v.size == 0 or (v>0).sum()==0:
        return 0.0
    w = np.array([alpha**(n_lags-1-i) for i in range(n_lags)], float)
    w = w / w.sum()
    pos = v>0
    if pos.sum()==0: 
        return 0.0
    logv = np.log(np.where(v[pos]<=0, 1e-12, v[pos]))
    ww = w[pos] / w[pos].sum()
    return float(np.exp((ww*logv).sum()))

In [54]:
def december_multipliers(wide, clip=(0.85, 1.4)):
    is_dec = (wide.index % 12) == 11
    dm = wide[is_dec].mean(0)
    om = wide[~is_dec].mean(0)
    overall = dm.mean()/(om.mean()+1e-12)
    mult = (dm/(om+1e-12)).fillna(overall).replace([np.inf,-np.inf], 1.0)
    return mult.clip(*clip)

In [55]:
# ---- 若未定义 build_baselines，则定义为“melt + 显式键名”的稳健版 ----
if "build_baselines" not in globals():
    from statsmodels.tsa.holtwinters import ExponentialSmoothing
    def build_baselines(df_all, df_te, value_col):
        core = df_all.loc[:, ["time", "sector_id", value_col]].copy()
        core = core.dropna(subset=["time","sector_id"])
        core["time"] = core["time"].astype(int)
        core["sector_id"] = core["sector_id"].astype(int)
        core[value_col] = pd.to_numeric(core[value_col], errors="coerce")
        core = core.groupby(["time","sector_id"], as_index=False)[value_col].max()

        wide = core.pivot(index="time", columns="sector_id", values=value_col).fillna(0.0)
        wide = wide.sort_index()
        wide.columns = wide.columns.astype(int)

        dec_mult = december_multipliers(wide)
        idx_h = np.arange(67,79)

        ewgm_pred = pd.DataFrame(0.0, index=idx_h, columns=wide.columns, dtype=float)
        for s in wide.columns:
            ewgm_pred.loc[idx_h, s] = ewgm(wide[s].values, n_lags=12, alpha=0.6)
        ewgm_pred.index.name = "time"; ewgm_pred.columns.name = "sector_id"

        hw_pred = pd.DataFrame(0.0, index=idx_h, columns=wide.columns, dtype=float)
        for s in wide.columns:
            arr = wide[s].astype(float).values
            try:
                model = ExponentialSmoothing(arr, trend="add", seasonal="add",
                                             seasonal_periods=12,
                                             initialization_method="estimated").fit()
                f = model.forecast(12)
                hw_pred.loc[idx_h, s] = np.maximum(f, 0.0)
            except Exception:
                hw_pred.loc[idx_h, s] = np.maximum(arr[-1], 0.0)
        hw_pred.index.name = "time"; hw_pred.columns.name = "sector_id"

        is_dec = (ewgm_pred.index % 12) == 11
        dec_mult = dec_mult.reindex(hw_pred.columns).fillna(1.0)
        if is_dec.any():
            hw_pred.loc[is_dec, :] = hw_pred.loc[is_dec, :].values * dec_mult.values
            ewgm_pred.loc[is_dec, :] = ewgm_pred.loc[is_dec, :].values * dec_mult.values

        hw_long = hw_pred.reset_index().melt(id_vars="time", var_name="sector_id", value_name="hw")
        ew_long = ewgm_pred.reset_index().melt(id_vars="time", var_name="sector_id", value_name="ew")
        hw_long["sector_id"] = hw_long["sector_id"].astype(int)
        ew_long["sector_id"] = ew_long["sector_id"].astype(int)

        te_key = df_te[["time","sector_id"]].copy().astype(int)
        hw_flat = te_key.merge(hw_long, on=["time","sector_id"], how="left")["hw"].to_numpy()
        ew_flat = te_key.merge(ew_long, on=["time","sector_id"], how="left")["ew"].to_numpy()
        hw_flat = np.nan_to_num(hw_flat, nan=0.0, posinf=0.0, neginf=0.0)
        ew_flat = np.nan_to_num(ew_flat, nan=0.0, posinf=0.0, neginf=0.0)
        return hw_flat, ew_flat

In [56]:
# ---- 训练函数（修好类别/数值填充 & 列对齐）----
def train_two_stage_and_predict(train_df, test_df, FEATURES=None, CAT_COLS=None,
                                n_splits=4, seed=42,
                                default_reg_pred=None,  # 建议传 ew_te
                                default_posp=None):     # 可不传，用全局阳性率
    import pandas as pd, numpy as np
    from sklearn.model_selection import TimeSeriesSplit
    from catboost import CatBoostRegressor, CatBoostClassifier, Pool

    # 1) 特征/类别列（仅取 train/test 交集，显式排除非特征）
    if FEATURES is None:
        drop = {"y", "y_pos", "time", "sector_id", "year", "pred"}
        if "TARGET_COL" in globals() and TARGET_COL in train_df.columns:
            drop.add(TARGET_COL)
        common = [c for c in train_df.columns if c in test_df.columns]
        FEATURES = [c for c in common if c not in drop]
    if CAT_COLS is None:
        CAT_COLS = [c for c in ["month_num", "qtr"] if c in FEATURES]

    # 统一类别 dtype
    for c in CAT_COLS:
        if c in train_df and not pd.api.types.is_categorical_dtype(train_df[c]):
            train_df[c] = train_df[c].astype("category")
        if c in test_df and not pd.api.types.is_categorical_dtype(test_df[c]):
            test_df[c] = test_df[c].astype("category")

    # 2) 构造特征矩阵（数值/类别分开处理）
    def make_X(df):
        X = df[FEATURES].copy()
        for c in X.columns:
            if c in CAT_COLS: 
                continue
            if X[c].dtype == "object":
                X[c] = pd.to_numeric(X[c], errors="coerce")
        num_cols = [c for c in X.columns if (c not in CAT_COLS) and pd.api.types.is_numeric_dtype(X[c])]
        if num_cols:
            X[num_cols] = (X[num_cols].replace([np.inf,-np.inf], np.nan).astype(float).fillna(-2.0))
        for c in CAT_COLS:
            if c in X.columns:
                X[c] = X[c].cat.add_categories(["__MISSING__"]).fillna("__MISSING__")
        return X

    X    = make_X(train_df)
    X_te = make_X(test_df)

    # 列对齐
    for c in X.columns:
        if c not in X_te.columns:
            if c in CAT_COLS:
                s = pd.Series(["__MISSING__"]*len(X_te), index=X_te.index, dtype="object").astype("category")
                X_te[c] = s.cat.add_categories(["__MISSING__"]).fillna("__MISSING__")
            else:
                X_te[c] = -2.0
    X_te = X_te[X.columns]

    y      = train_df["y"].to_numpy()
    y_pos  = (train_df["y"] > 0).astype(int).to_numpy()
    pos_gl = float(y_pos.mean()) if default_posp is None else float(default_posp)
    # 保证不是 0 或 1 的极端概率（避免完全屏蔽）
    pos_gl = min(max(pos_gl, 0.01), 0.99)

    tscv = TimeSeriesSplit(n_splits=n_splits)
    pred_test_reg  = np.zeros((len(X_te), n_splits))
    pred_test_posp = np.zeros((len(X_te), n_splits))

    for k, (tr_idx, va_idx) in enumerate(tscv.split(X, y)):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        p_tr, p_va = y_pos[tr_idx], y_pos[va_idx]

        # ========== 分类器：若目标单一，跳过训练 ==========
        if np.unique(p_tr).size < 2:
            # 使用全局阳性率作为该折预测
            pred_test_posp[:, k] = pos_gl
        else:
            clf = CatBoostClassifier(
                depth=6, learning_rate=0.04, iterations=4000,
                loss_function="Logloss", eval_metric="AUC",
                l2_leaf_reg=3.0, random_seed=seed, verbose=False
            )
            clf.fit(Pool(X_tr, p_tr, cat_features=CAT_COLS),
                    eval_set=Pool(X_va, p_va, cat_features=CAT_COLS),
                    verbose=False)
            pred_test_posp[:, k] = clf.predict_proba(X_te)[:, 1]

        # ========== 回归器：若无正样本，跳过训练 ==========
        mask_tr = y_tr > 0
        if mask_tr.sum() == 0:
            if default_reg_pred is not None:
                pred_test_reg[:, k] = np.asarray(default_reg_pred, float)
            else:
                # 退化为全局均值
                pred_test_reg[:, k] = max(float(y.mean()), 0.0)
        else:
            reg = CatBoostRegressor(
                depth=8, learning_rate=0.03, iterations=6000,
                loss_function="MAE", eval_metric="MAE",
                l2_leaf_reg=2.0, random_seed=seed + k, verbose=False
            )
            reg.fit(Pool(X_tr[mask_tr], np.log1p(y_tr[mask_tr]), cat_features=CAT_COLS),
                    eval_set=Pool(X_va, np.log1p(np.maximum(y_va, 0)), cat_features=CAT_COLS),
                    verbose=False)
            pred_test_reg[:, k] = np.expm1(np.maximum(reg.predict(X_te), 0.0))

    reg_te  = pred_test_reg.mean(axis=1)
    posp_te = pred_test_posp.mean(axis=1)
    return reg_te, posp_te


In [57]:
# ================= 数据清理 & 去重（若你之前已做，这几行再次执行也安全） =================
for df in (train_df, test_df):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
train_df = train_df[train_df["y"].notna()].copy()
train_df["y"] = pd.to_numeric(train_df["y"], errors="coerce").clip(lower=0)
train_df = dedupe_panel(train_df)
test_df  = dedupe_panel(test_df)

In [58]:
# ================= 季节基线（若未计算则计算） =================
if "hw_te" not in globals() or "ew_te" not in globals():
    hw_te, ew_te = build_baselines(base, test_df, TARGET_COL)


In [59]:
# 若缺失则训练
if "reg_te" not in globals() or "posp_te" not in globals():
    # default_reg_pred 用季节性基线 ew_te；default_posp 用全局阳性率（可不传）
    reg_te, posp_te = train_two_stage_and_predict(
        train_df, test_df,
        FEATURES=None, CAT_COLS=None,
        n_splits=4, seed=42,
        default_reg_pred=ew_te
    )


  if c in train_df and not pd.api.types.is_categorical_dtype(train_df[c]):
  if c in test_df and not pd.api.types.is_categorical_dtype(test_df[c]):
No objects info loaded
No objects info loaded
No objects info loaded
No objects info loaded


In [60]:
thr    = 0.65
z_gate = (posp_te >= thr).astype(int)

shrink = 0.35
pred_te = shrink * reg_te + (1 - shrink) * ew_te

caps = (train_df.groupby("sector_id")["y"]
        .quantile([0.01, 0.99]).unstack()
        .rename(columns={0.01:"lo", 0.99:"hi"})
        .reset_index())
caps = test_df[["sector_id"]].merge(caps, on="sector_id", how="left").fillna(0.0)
hi_cap = np.maximum(caps["hi"].values * 1.25, 1.0)
pred_te = np.minimum(np.maximum(pred_te, 0.0), hi_cap)

final_pred = pred_te * z_gate


In [61]:
# 生成提交（按键合并）
test2 = split_test_id(test.copy())
pred_df = test_df[["time","sector_id"]].copy()
pred_df["pred"] = final_pred
pred_df = pred_df.groupby(["time","sector_id"], as_index=False)["pred"].mean()

submit = test2.merge(pred_df, on=["time","sector_id"], how="left")
submit["new_house_transaction_amount"] = submit["pred"].fillna(0.0).astype(float)
submit = submit[["id", "new_house_transaction_amount"]]
submit.to_csv(OUT/"submission.csv", index=False)
print("Saved:", OUT/"submission.csv")
print(submit.head(10).to_string(index=False))

Saved: /kaggle/working/submission.csv
                id  new_house_transaction_amount
 2024 Aug_sector 1                      0.000000
 2024 Aug_sector 2                      0.000000
 2024 Aug_sector 3                      0.000000
 2024 Aug_sector 4                      0.000000
 2024 Aug_sector 5                      0.000000
 2024 Aug_sector 6                      0.000000
 2024 Aug_sector 7                      0.000000
 2024 Aug_sector 8                      0.000000
 2024 Aug_sector 9                      0.000000
2024 Aug_sector 10                      0.000000
