In [1]:
import numpy as np
import pandas as pd
import zipfile
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb


# ==============================
# Configuration
# ==============================
# Local path to the BlogFeedback zip archive.
ZIP_PATH = Path("blogfeedback.zip")
# Name of the CSV member to read from inside the archive.
CSV_NAME = "blogData_train.csv"

# Fraction of the non-x0 samples that is held out for PPCI.
# NOTE(review): the name is spelled "PPPI" although it is used as the PPCI
# fraction -- confirm before renaming, it is referenced further down.
PPPI_TEST_FRACTION = 0.3
# Seed reused by the NumPy generator, both splits and the LightGBM model.
RANDOM_STATE = 2025
# Number of x0 points drawn from the cleaned data before any splitting.
N_X0 = 50

# Output files: the PPCI table and the x0 coordinates.
OUT_CSV_PPCI = "blogfeedback_ppci.csv"
OUT_CSV_X0 = "blogfeedback_ppci_x0.csv"


# ==============================
# 1. Read the raw data from the zip archive
# ==============================
# Fail early with a clear message when the archive itself is missing.
if not ZIP_PATH.exists():
    raise FileNotFoundError(f"找不到 zip 文件: {ZIP_PATH}")

with zipfile.ZipFile(ZIP_PATH, "r") as archive:
    member_names = archive.namelist()
    print("Files in zip:", member_names)
    if CSV_NAME not in member_names:
        raise FileNotFoundError(f"zip 里找不到 {CSV_NAME}，请检查文件名。")
    # The CSV is read without a header row, so columns get integer labels.
    with archive.open(CSV_NAME) as csv_file:
        df_raw = pd.read_csv(csv_file, header=None)

print("原始 df_raw 形状:", df_raw.shape)  # e.g. (52397, 281)


# ==============================
# 2. Remove duplicated data
#    (1) drop rows that are exact duplicates (same X and same Y)
#    (2) drop every row whose X occurs more than once, so only samples
#        with a unique X remain
# ==============================

# (1) Exact row duplicates: the first occurrence of each row is kept.
df_nodup_all = df_raw.drop_duplicates()
print("去掉整行重复后行数:", len(df_nodup_all))

# (2) Features are all columns but the last; the target is the last column.
X_all = df_nodup_all.iloc[:, :-1]
Y_all = df_nodup_all.iloc[:, -1]

# keep=False flags every member of a duplicate group, so the negation is
# True only for rows whose X occurs exactly once.
x_is_repeated = X_all.duplicated(keep=False)
mask_uniqueX = ~x_is_repeated
df_clean = df_nodup_all.loc[mask_uniqueX].reset_index(drop=True)

print("最终去掉 X 重复后行数:", len(df_clean))

# Clean features and target as float arrays.
features_clean = df_clean.iloc[:, :-1]
target_clean = df_clean.iloc[:, -1]
X_clean_raw = features_clean.to_numpy(dtype=float)
Y_clean_raw = target_clean.to_numpy(dtype=float)

print("X_clean_raw shape:", X_clean_raw.shape)
print("Y_clean_raw shape:", Y_clean_raw.shape)
print("Y_clean_raw min/max:", Y_clean_raw.min(), Y_clean_raw.max())


# ==============================
# 3. Standardize X over the whole clean set; transform Y with log1p
# ==============================
X_mean = np.mean(X_clean_raw, axis=0)
X_std = np.std(X_clean_raw, axis=0)

# Columns with zero spread would divide by zero; scale those by 1 instead.
X_std_safe = X_std.copy()
X_std_safe[X_std == 0] = 1.0

X_clean_std = (X_clean_raw - X_mean) / X_std_safe
Y_log = np.log1p(Y_clean_raw)

n_clean, d = X_clean_std.shape
print("标准化后 X_clean_std 形状:", X_clean_std.shape)


# ==============================
# 4. Draw N_X0 x0 points from the whole clean set first,
#    then split the remaining samples into train / PPCI parts
# ==============================
rng = np.random.default_rng(RANDOM_STATE)

if N_X0 >= n_clean:
    raise ValueError("N_X0 太大，没有足够样本留给训练和 PPCI。")

# 4.1 Row indices of the x0 points, sampled without replacement.
idx_all = np.arange(n_clean)
idx_x0 = rng.choice(idx_all, size=N_X0, replace=False)

# 4.2 Every row that is not an x0 row (pool for LightGBM training + PPCI).
mask_rem = np.isin(idx_all, idx_x0, invert=True)
idx_rem = idx_all[mask_rem]

X_rem, Y_rem_log, Y_rem_raw = (
    X_clean_std[idx_rem],
    Y_log[idx_rem],
    Y_clean_raw[idx_rem],
)

print(f"用于后续 train/test split 的剩余样本数: {X_rem.shape[0]}")

# 4.3 Split the remaining pool:
#     - X_train_extra: additional samples used to train LightGBM
#     - X_ppci:        samples reserved for PPCI
# The log and raw targets are passed together so they are split identically.
split_parts = train_test_split(
    X_rem,
    Y_rem_log,
    Y_rem_raw,
    test_size=PPPI_TEST_FRACTION,
    random_state=RANDOM_STATE,
)
(
    X_train_extra,
    X_ppci,
    Y_train_extra_log,
    Y_ppci_log,
    Y_train_extra_raw,
    Y_ppci_raw,
) = split_parts

print(f"LightGBM 额外训练样本数: {X_train_extra.shape[0]}")
print(f"PPCI 样本数:             {X_ppci.shape[0]}")

# 4.4 Standardized coordinates of the x0 points.
X_x0_std = X_clean_std[idx_x0, :]

# 4.5 LightGBM training set = x0 samples followed by the extra training samples.
X_pred = np.vstack((X_x0_std, X_train_extra))
Y_pred_log = np.concatenate((Y_log[idx_x0], Y_train_extra_log))
Y_pred_raw = np.concatenate((Y_clean_raw[idx_x0], Y_train_extra_raw))

print(f"LightGBM 总训练样本数 (含 x0): {X_pred.shape[0]}")


# ==============================
# 5. Train the LightGBM regressor (on the log scale)
# ==============================
lgbm = lgb.LGBMRegressor(
    objective="regression",
    boosting_type="gbdt",
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=63,
    subsample=0.8,
    # LightGBM only performs row bagging when the bagging frequency is
    # non-zero. With the wrapper's default (0) the subsample=0.8 setting
    # above is silently ignored, so resample the rows at every iteration.
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# Hold out 20% of the training pool as a validation set. It is only passed
# as eval_set for monitoring; no early stopping is configured here.
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X_pred,
    Y_pred_log,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

print("Fitting LightGBM ...")
lgbm.fit(
    X_tr,
    Y_tr,
    eval_set=[(X_val, Y_val)],
    eval_metric="l2",
)

# Predict on the PPCI subset (log scale).
Yhat_ppci_log = lgbm.predict(X_ppci)

# Model quality on the PPCI subset, on the log scale the model was fit on.
mse_log = mean_squared_error(Y_ppci_log, Yhat_ppci_log)
r2_log = r2_score(Y_ppci_log, Yhat_ppci_log)
print(f"[Log scale]  MSE = {mse_log:.4f}, R^2 = {r2_log:.4f}")

# Same metrics after mapping predictions back with expm1 (inverse of log1p).
Yhat_ppci_raw = np.expm1(Yhat_ppci_log)
mse_raw = mean_squared_error(Y_ppci_raw, Yhat_ppci_raw)
r2_raw = r2_score(Y_ppci_raw, Yhat_ppci_raw)
print(f"[Raw scale]  MSE = {mse_raw:.2f}, R^2 = {r2_raw:.4f}")


# ==============================
# 6. Write the CSV consumed by PPCI
#    Columns: x1,...,xD, logy, logyhat
# ==============================
cols_X = [f"x{j}" for j in range(1, d + 1)]

# Features first, then the true and the predicted log-scale targets.
df_ppci = pd.DataFrame(X_ppci, columns=cols_X).assign(
    logy=Y_ppci_log,
    logyhat=Yhat_ppci_log,
)

df_ppci.to_csv(OUT_CSV_PPCI, index=False)
print(f"PPCI 数据已写入: {OUT_CSV_PPCI}")
print(df_ppci.head())


# ==============================
# 7. Write the x0 CSV (drawn from the whole clean set and, by construction,
#    never part of the PPCI samples)
# ==============================
df_x0 = pd.DataFrame(data=X_x0_std, columns=cols_X)
df_x0.to_csv(OUT_CSV_X0, index=False)
print(f"{N_X0} 个 x0 已写入: {OUT_CSV_X0}")
print(df_x0.head())


# ==============================
# 8. Check 1: does X in the PPCI set contain duplicated rows?
# ==============================
# Wrap X_ppci in a DataFrame so duplicated() can be used for the check.
X_ppci_df = pd.DataFrame(X_ppci, columns=cols_X)

# keep=False marks every row that belongs to a duplicate group.
dup_mask_ppci = X_ppci_df.duplicated(keep=False)
n_dup_ppci = dup_mask_ppci.sum()
n_unique_ppci = X_ppci_df.drop_duplicates().shape[0]
n_rows_ppci = len(X_ppci_df)

check1_report = [
    "\n[Check 1] PPCI 中 X 的重复情况：",
    f"  PPCI 总行数: {n_rows_ppci}",
    f"  去重后行数: {n_unique_ppci}",
    f"  属于重复 X 的行数: {n_dup_ppci}",
    f"  重复比例: {n_dup_ppci / n_rows_ppci:.4f}",
]
print("\n".join(check1_report))

if n_dup_ppci > 0:
    print("  示例重复的几行 X：")
    print(X_ppci_df.loc[dup_mask_ppci].head())


# ==============================
# 9. Check 2: does any x0 row also appear among the PPCI X rows?
# ==============================
matches = 0
match_detail = []

for i, x0_row in enumerate(X_x0_std):
    # Compare this x0 against every PPCI row, coordinate by coordinate.
    # rtol must be set to 0 explicitly: np.isclose defaults to rtol=1e-5,
    # which would report rows that differ by up to 1e-5 (relative) as
    # identical. Only the tiny absolute tolerance for float noise is kept.
    same_rows = np.all(
        np.isclose(X_ppci, x0_row, rtol=0.0, atol=1e-12),
        axis=1,
    )
    idxs = np.flatnonzero(same_rows)
    if idxs.size:
        matches += 1
        match_detail.append((i, idxs[:5]))  # keep only the first few hits

print("\n[Check 2] x0 与 PPCI 中 X 的交集情况：")
print(f"  x0 的个数: {N_X0}")
print(f"  在 PPCI 中能找到完全相同 X 的 x0 个数: {matches}")

if matches > 0:
    print("  其中一些匹配示例 (x0_index -> PPCI 行索引的前几个):")
    for x0_idx, p_idx in match_detail:
        print(f"    x0[{x0_idx}] -> PPCI rows {p_idx}")


Files in zip: ['blogData_test-2012.02.01.00_00.csv', 'blogData_test-2012.02.02.00_00.csv', 'blogData_test-2012.02.03.00_00.csv', 'blogData_test-2012.02.04.00_00.csv', 'blogData_test-2012.02.05.00_00.csv', 'blogData_test-2012.02.06.00_00.csv', 'blogData_test-2012.02.07.00_00.csv', 'blogData_test-2012.02.08.00_00.csv', 'blogData_test-2012.02.09.00_00.csv', 'blogData_test-2012.02.10.00_00.csv', 'blogData_test-2012.02.11.00_00.csv', 'blogData_test-2012.02.12.00_00.csv', 'blogData_test-2012.02.13.00_00.csv', 'blogData_test-2012.02.14.00_00.csv', 'blogData_test-2012.02.15.00_00.csv', 'blogData_test-2012.02.16.00_00.csv', 'blogData_test-2012.02.17.00_00.csv', 'blogData_test-2012.02.18.00_00.csv', 'blogData_test-2012.02.19.00_00.csv', 'blogData_test-2012.02.20.00_00.csv', 'blogData_test-2012.02.21.00_00.csv', 'blogData_test-2012.02.22.00_00.csv', 'blogData_test-2012.02.23.00_00.csv', 'blogData_test-2012.02.24.00_00.csv', 'blogData_test-2012.02.25.00_00.csv', 'blogData_test-2012.02.26.00_00.csv

KeyboardInterrupt: 

In [2]:
import numpy as np
import pandas as pd
import zipfile
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb


# ==============================
# Config
# ==============================
# Local path to the BlogFeedback zip archive.
ZIP_PATH = Path("blogfeedback.zip")
# Name of the CSV member to read from inside the archive.
CSV_NAME = "blogData_train.csv"

# Fraction of the non-x0 samples that is held out for PPCI.
# NOTE(review): the name is spelled "PPPI" although it is used as the PPCI
# fraction -- confirm before renaming, it is referenced further down.
PPPI_TEST_FRACTION = 0.3
# Seed reused by the NumPy generator, both splits and the LightGBM model.
RANDOM_STATE = 2025
# Number of x0 points sampled from the clean pool before any splitting.
N_X0 = 50

# Output files: the PPCI table and the x0 coordinates.
OUT_CSV_PPCI = "blogfeedback_ppci.csv"
OUT_CSV_X0 = "blogfeedback_ppci_x0.csv"


# ==============================
# 1. Read raw data from zip
# ==============================
# Fail early with a clear message when the archive itself is missing.
if not ZIP_PATH.exists():
    raise FileNotFoundError(f"Zip file not found: {ZIP_PATH}")

with zipfile.ZipFile(ZIP_PATH, "r") as archive:
    member_names = archive.namelist()
    print("Files in zip:", member_names)
    if CSV_NAME not in member_names:
        raise FileNotFoundError(f"{CSV_NAME} not found in zip. Please check the filename.")
    # The CSV is read without a header row, so columns get integer labels.
    with archive.open(CSV_NAME) as csv_file:
        df_raw = pd.read_csv(csv_file, header=None)

print("Raw df_raw shape:", df_raw.shape)  # e.g. (52397, 281)


# ==============================
# 2. Remove duplicates
#    (1) Drop rows that are exact duplicates (same X and same Y)
#    (2) Drop every row whose X occurs more than once, so only samples
#        with a unique X remain
# ==============================

# (1) Exact row duplicates: the first occurrence of each row is kept.
df_nodup_all = df_raw.drop_duplicates()
print("Rows after dropping full-row duplicates:", len(df_nodup_all))

# (2) Features are all columns but the last; the target is the last column.
X_all = df_nodup_all.iloc[:, :-1]
Y_all = df_nodup_all.iloc[:, -1]

# keep=False flags every member of a duplicate group, so the negation is
# True only for rows whose X occurs exactly once.
x_is_repeated = X_all.duplicated(keep=False)
mask_uniqueX = ~x_is_repeated
df_clean = df_nodup_all.loc[mask_uniqueX].reset_index(drop=True)

print("Final rows after dropping duplicated X:", len(df_clean))

# Clean features and target as float arrays.
features_clean = df_clean.iloc[:, :-1]
target_clean = df_clean.iloc[:, -1]
X_clean_raw = features_clean.to_numpy(dtype=float)
Y_clean_raw = target_clean.to_numpy(dtype=float)

print("X_clean_raw shape:", X_clean_raw.shape)
print("Y_clean_raw shape:", Y_clean_raw.shape)
print("Y_clean_raw min/max:", Y_clean_raw.min(), Y_clean_raw.max())


# ==============================
# 3. Standardize X over the whole clean set; transform Y with log1p
# ==============================
X_mean = np.mean(X_clean_raw, axis=0)
X_std = np.std(X_clean_raw, axis=0)

# Columns with zero spread would divide by zero; scale those by 1 instead.
X_std_safe = X_std.copy()
X_std_safe[X_std == 0] = 1.0

X_clean_std = (X_clean_raw - X_mean) / X_std_safe
Y_log = np.log1p(Y_clean_raw)

n_clean, d = X_clean_std.shape
print("Standardized X_clean_std shape:", X_clean_std.shape)


# ==============================
# 4. Draw N_X0 x0 points from the clean pool first,
#    then split the remaining samples into extra-train / PPCI parts.
# ==============================
rng = np.random.default_rng(RANDOM_STATE)

if N_X0 >= n_clean:
    raise ValueError("N_X0 is too large; not enough samples left for training and PPCI.")

# 4.1 Row indices of the x0 points, sampled without replacement.
idx_all = np.arange(n_clean)
idx_x0 = rng.choice(idx_all, size=N_X0, replace=False)

# 4.2 Every row that is not an x0 row (pool for LightGBM training + PPCI).
mask_rem = np.isin(idx_all, idx_x0, invert=True)
idx_rem = idx_all[mask_rem]

X_rem, Y_rem_log, Y_rem_raw = (
    X_clean_std[idx_rem],
    Y_log[idx_rem],
    Y_clean_raw[idx_rem],
)

print(f"Remaining samples for train/test split: {X_rem.shape[0]}")

# 4.3 Split the remaining pool:
#     - X_train_extra: additional samples used to train LightGBM
#     - X_ppci:        samples reserved for PPCI
# The log and raw targets are passed together so they are split identically.
split_parts = train_test_split(
    X_rem,
    Y_rem_log,
    Y_rem_raw,
    test_size=PPPI_TEST_FRACTION,
    random_state=RANDOM_STATE,
)
(
    X_train_extra,
    X_ppci,
    Y_train_extra_log,
    Y_ppci_log,
    Y_train_extra_raw,
    Y_ppci_raw,
) = split_parts

print(f"Extra training samples for LightGBM: {X_train_extra.shape[0]}")
print(f"PPCI samples:                      {X_ppci.shape[0]}")

# 4.4 Standardized coordinates of the x0 points.
X_x0_std = X_clean_std[idx_x0, :]

# 4.5 LightGBM training set = x0 samples followed by the extra training samples.
X_pred = np.vstack((X_x0_std, X_train_extra))
Y_pred_log = np.concatenate((Y_log[idx_x0], Y_train_extra_log))
Y_pred_raw = np.concatenate((Y_clean_raw[idx_x0], Y_train_extra_raw))

print(f"Total LightGBM training samples (including x0): {X_pred.shape[0]}")


# ==============================
# 5. Train LightGBM regressor (on log scale)
# ==============================
lgbm = lgb.LGBMRegressor(
    objective="regression",
    boosting_type="gbdt",
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=63,
    subsample=0.8,
    # LightGBM only performs row bagging when the bagging frequency is
    # non-zero. With the wrapper's default (0) the subsample=0.8 setting
    # above is silently ignored, so resample the rows at every iteration.
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# Hold out 20% of the training pool as a validation set. It is only passed
# as eval_set for monitoring; no early stopping is configured here.
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X_pred,
    Y_pred_log,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

print("Fitting LightGBM ...")
lgbm.fit(
    X_tr,
    Y_tr,
    eval_set=[(X_val, Y_val)],
    eval_metric="l2",
)

# Predict on the PPCI subset (log scale).
Yhat_ppci_log = lgbm.predict(X_ppci)

# Model quality on the PPCI subset, on the log scale the model was fit on.
mse_log = mean_squared_error(Y_ppci_log, Yhat_ppci_log)
r2_log = r2_score(Y_ppci_log, Yhat_ppci_log)
print(f"[Log scale]  MSE = {mse_log:.4f}, R^2 = {r2_log:.4f}")

# Same metrics after mapping predictions back with expm1 (inverse of log1p).
Yhat_ppci_raw = np.expm1(Yhat_ppci_log)
mse_raw = mean_squared_error(Y_ppci_raw, Yhat_ppci_raw)
r2_raw = r2_score(Y_ppci_raw, Yhat_ppci_raw)
print(f"[Raw scale]  MSE = {mse_raw:.2f}, R^2 = {r2_raw:.4f}")


# ==============================
# 6. Write PPCI CSV
#    Columns: x1,...,xD, logy, logyhat
# ==============================
cols_X = [f"x{j}" for j in range(1, d + 1)]

# Features first, then the true and the predicted log-scale targets.
df_ppci = pd.DataFrame(X_ppci, columns=cols_X).assign(
    logy=Y_ppci_log,
    logyhat=Yhat_ppci_log,
)

df_ppci.to_csv(OUT_CSV_PPCI, index=False)
print(f"PPCI data written to: {OUT_CSV_PPCI}")
print(df_ppci.head())


# ==============================
# 7. Write x0 CSV (sampled from the clean pool and, by construction,
#    never part of the PPCI samples)
# ==============================
df_x0 = pd.DataFrame(data=X_x0_std, columns=cols_X)
df_x0.to_csv(OUT_CSV_X0, index=False)
print(f"{N_X0} x0 points written to: {OUT_CSV_X0}")
print(df_x0.head())


# ==============================
# 8. Check 1: does X in the PPCI set contain duplicated rows?
# ==============================
# Wrap X_ppci in a DataFrame so duplicated() can be used for the check.
X_ppci_df = pd.DataFrame(X_ppci, columns=cols_X)

# keep=False marks every row that belongs to a duplicate group.
dup_mask_ppci = X_ppci_df.duplicated(keep=False)
n_dup_ppci = int(dup_mask_ppci.sum())
n_unique_ppci = X_ppci_df.drop_duplicates().shape[0]
n_rows_ppci = len(X_ppci_df)

check1_report = [
    "\n[Check 1] Duplicates in PPCI X:",
    f"  PPCI total rows:    {n_rows_ppci}",
    f"  Rows after dedup:   {n_unique_ppci}",
    f"  Rows in duplicate groups: {n_dup_ppci}",
    f"  Duplicate ratio:    {n_dup_ppci / n_rows_ppci:.4f}",
]
print("\n".join(check1_report))

if n_dup_ppci > 0:
    print("  Example duplicate X rows:")
    print(X_ppci_df.loc[dup_mask_ppci].head())


# ==============================
# 9. Check 2 (FAST): does any x0 row also appear among the PPCI X rows?
#    Row membership is tested byte-wise instead of with a per-x0 Python loop.
#    X_ppci and X_x0_std are both slices of the same standardized array, so
#    identical rows have identical bytes and exact matching is appropriate.
# ==============================
# The void view below needs each row to occupy one contiguous run of bytes.
X_ppci_c = np.ascontiguousarray(X_ppci)
X_x0_c = np.ascontiguousarray(X_x0_std)

# One opaque (void) element per row, sized to hold all of that row's values.
row_nbytes = X_ppci_c.shape[1] * X_ppci_c.dtype.itemsize
row_dtype = np.dtype((np.void, row_nbytes))
ppci_rows = X_ppci_c.view(row_dtype).reshape(-1)
x0_rows = X_x0_c.view(row_dtype).reshape(-1)

# Count the x0 rows whose bytes equal those of at least one PPCI row.
matches = int(np.count_nonzero(np.isin(x0_rows, ppci_rows)))

check2_report = [
    "\n[Check 2 - FAST] Intersection between x0 and PPCI X:",
    f"  Number of x0 points: {N_X0}",
    f"  Number of x0 rows found in PPCI X: {matches}",
]
print("\n".join(check2_report))

Files in zip: ['blogData_test-2012.02.01.00_00.csv', 'blogData_test-2012.02.02.00_00.csv', 'blogData_test-2012.02.03.00_00.csv', 'blogData_test-2012.02.04.00_00.csv', 'blogData_test-2012.02.05.00_00.csv', 'blogData_test-2012.02.06.00_00.csv', 'blogData_test-2012.02.07.00_00.csv', 'blogData_test-2012.02.08.00_00.csv', 'blogData_test-2012.02.09.00_00.csv', 'blogData_test-2012.02.10.00_00.csv', 'blogData_test-2012.02.11.00_00.csv', 'blogData_test-2012.02.12.00_00.csv', 'blogData_test-2012.02.13.00_00.csv', 'blogData_test-2012.02.14.00_00.csv', 'blogData_test-2012.02.15.00_00.csv', 'blogData_test-2012.02.16.00_00.csv', 'blogData_test-2012.02.17.00_00.csv', 'blogData_test-2012.02.18.00_00.csv', 'blogData_test-2012.02.19.00_00.csv', 'blogData_test-2012.02.20.00_00.csv', 'blogData_test-2012.02.21.00_00.csv', 'blogData_test-2012.02.22.00_00.csv', 'blogData_test-2012.02.23.00_00.csv', 'blogData_test-2012.02.24.00_00.csv', 'blogData_test-2012.02.25.00_00.csv', 'blogData_test-2012.02.26.00_00.csv