In [1]:
!pip install mapie -qq

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.2/173.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m97.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.7.0 requires scikit-learn<1.6.0,>=1.0.0, but you have scikit-learn 1.7.0 which is incompatible.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.0 which is incompatible.[0m[31m
[0m

In [2]:
pip install lightgbm --install-option=--gpu


Usage:   
  /usr/bin/python3 -m pip install [options] <requirement specifier> [package-index-options] ...
  /usr/bin/python3 -m pip install [options] -r <requirements file> [package-index-options] ...
  /usr/bin/python3 -m pip install [options] [-e] <vcs project url> ...
  /usr/bin/python3 -m pip install [options] [-e] <local project path> ...
  /usr/bin/python3 -m pip install [options] <archive url/path> ...

no such option: --install-option
Note: you may need to restart the kernel to use updated packages.


In [3]:
# --- 0. Import Libraries ---
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder
from mapie.regression import ConformalizedQuantileRegressor
import warnings
import os

In [4]:
warnings.filterwarnings('ignore')

In [5]:
class CFG:
    # Global settings
    SEED = 42
    N_SPLITS = 5  # Number of folds for cross-validation
    CONFIDENCE_LEVEL = 0.9  # Target confidence level (90%)
    ALPHA = 1 - CONFIDENCE_LEVEL

    # File paths
    try:
        # Kaggle environment
        DATA_PATH = '/kaggle/input/prediction-interval-competition-ii-house-price/'
        if not os.path.exists(os.path.join(DATA_PATH, 'dataset.csv')):
            raise FileNotFoundError
    except FileNotFoundError:
        print("Kaggle path not found or files missing, switching to local path './'.")
        DATA_PATH = './'
    OUTPUT_PATH = './'

    # Base parameters for the quantile models
    LGBM_PARAMS = {
        'objective': 'quantile',
        'metric': 'quantile',
        'n_estimators': 2000, # Increased for early stopping
        'subsample': 0.8,
        'colsample_bytree': 0.5,
        'learning_rate': 0.05,
        'max_depth': -1,
        'min_child_samples': 150,
        'n_jobs': -1,
        'random_state': SEED,
        'verbose': -1,
        'device': 'gpu',  # Enable GPU acceleration
        'max_bin':255,
        'gpu_platform_id': 0,  # Optional: specify which platform to use, usually 0 if only one GPU
        'gpu_device_id': 0,  
    }

In [6]:
def winkler_score_func(y_true, lower, upper, alpha=CFG.ALPHA):
    """Utility function to calculate the Winkler score."""
    score = np.mean(upper - lower)
    score += np.mean(np.where(y_true < lower, (2 / alpha) * (lower - y_true), 0))
    score += np.mean(np.where(y_true > upper, (2 / alpha) * (y_true - upper), 0))
    return score


In [7]:
# --- 2. Data Loading and Preprocessing ---
print("\n--- Phase 1: Loading and Preprocessing Data ---")
try:
    train_df_raw = pd.read_csv(os.path.join(CFG.DATA_PATH, 'dataset.csv'))
    test_df_raw = pd.read_csv(os.path.join(CFG.DATA_PATH, 'test.csv'))
except FileNotFoundError:
    print("Error: dataset.csv or test.csv not found.")
    print("Creating dummy data for demonstration purposes.")
    train_df_raw = pd.DataFrame({
        'id': range(1000), 'sale_price': np.random.rand(1000) * 500000 + 100000,
        'sale_date': pd.to_datetime(pd.date_range(start='2022-01-01', periods=1000)),
        'category_feature': np.random.choice(['A', 'B', 'C'], 1000)
    })
    test_df_raw = pd.DataFrame({
        'id': range(1000, 1200),
        'sale_date': pd.to_datetime(pd.date_range(start='2024-09-01', periods=200)),
        'category_feature': np.random.choice(['A', 'B', 'C'], 200)
    })



--- Phase 1: Loading and Preprocessing Data ---


In [8]:
def feature_engineer(df):
    """Simple feature engineering."""
    data = df.copy()
    if 'sale_date' in data.columns:
        data['sale_date'] = pd.to_datetime(data['sale_date'])
        data['sale_year'] = data['sale_date'].dt.year
        data['sale_month'] = data['sale_date'].dt.month
        data['sale_dayofweek'] = data['sale_date'].dt.dayofweek
        first_sale_month = data['sale_date'].dt.to_period('M').min()
        data['months_since_first_sale'] = (data['sale_date'].dt.to_period('M') - first_sale_month).apply(lambda x: x.n)
        data = data.drop('sale_date', axis=1)
    cat_cols = data.select_dtypes(include=['object']).columns
    for col in cat_cols:
        data[col] = pd.Categorical(data[col])#给特征编码
    return data


In [9]:
train_df = feature_engineer(train_df_raw)
test_df_processed = feature_engineer(test_df_raw)

In [10]:
max_bin_size = 255
def get_bin_size(data):
    bin_sizes = {}
    for col in data.columns:
        # 获取每个特征的唯一值数量
        unique_values = data[col].nunique()
        bin_sizes[col] = unique_values
    return bin_sizes

# 2. 改造 bin size 过大的特征
def change_large_bin_features(X, max_bin_size=255):
    bin_sizes = get_bin_size(X)
    categorical_features = X.select_dtypes(include=['object', 'category']).columns

    for col in categorical_features:
        if bin_sizes[col] > max_bin_size:
            X[col] = pd.Categorical(X[col]).codes
            print(f"Encoded column: {col}")

    return X


In [11]:
features = [col for col in train_df.columns if col not in ['id', 'sale_price']]
y= train_df['sale_price']
X = train_df[features]
X_test = test_df_processed[features]
print(f"Training with {len(features)} features. Train shape: {X.shape}, Test shape: {X_test.shape}")

Training with 48 features. Train shape: (200000, 48), Test shape: (200000, 48)


In [12]:
X = change_large_bin_features(X, max_bin_size)
X_test= change_large_bin_features(X_test, max_bin_size)

Encoded column: zoning
Encoded column: subdivision
Encoded column: zoning
Encoded column: subdivision


In [13]:
# --- 3. Cross-Validation Training with prefit=True ---
print(f"\n--- Phase 2: Training with {CFG.N_SPLITS}-Fold CV and prefit CQR ---")
kf = KFold(n_splits=CFG.N_SPLITS, shuffle=True, random_state=CFG.SEED)

oof_preds_lower = np.zeros(len(train_df))
oof_preds_upper = np.zeros(len(train_df))
test_preds_lower_sum = np.zeros(len(test_df_raw))
test_preds_upper_sum = np.zeros(len(test_df_raw))
fold_scores = []



--- Phase 2: Training with 5-Fold CV and prefit CQR ---


In [14]:
# Define parameters for the three quantile models
params_lower = {**CFG.LGBM_PARAMS, 'alpha': CFG.ALPHA / 2}
params_median = {**CFG.LGBM_PARAMS, 'alpha': 0.5}
params_upper = {**CFG.LGBM_PARAMS, 'alpha': 1 - (CFG.ALPHA / 2)}


In [15]:
for fold, (fit_idx, calib_idx) in enumerate(kf.split(X, y)):
    print(f"\n--- Fold {fold+1}/{CFG.N_SPLITS} ---")
    X_fit, X_calib = X.iloc[fit_idx], X.iloc[calib_idx]
    y_fit, y_calib = y.iloc[fit_idx], y.iloc[calib_idx]

    # Step 1: Fit the three quantile models on the fitting dataset
    print("Fitting lower, median, and upper models...")
    model_lower = lgb.LGBMRegressor(**params_lower)
    model_median = lgb.LGBMRegressor(**params_median)
    model_upper = lgb.LGBMRegressor(**params_upper)

    callbacks = [lgb.early_stopping(100, verbose=False)]
    model_lower.fit(X_fit, y_fit, eval_set=[(X_calib, y_calib)], callbacks=callbacks)
    model_median.fit(X_fit, y_fit, eval_set=[(X_calib, y_calib)], callbacks=callbacks)
    model_upper.fit(X_fit, y_fit, eval_set=[(X_calib, y_calib)], callbacks=callbacks)

    # Step 2: Conformalize using the pre-fitted models and the calibration dataset
    print("Conformalizing models...")
    mapie_cqr = ConformalizedQuantileRegressor(
        estimator=[model_lower, model_upper, model_median], # [lower, upper, median] order
        confidence_level=CFG.CONFIDENCE_LEVEL,
        prefit=True
    ).conformalize(X_calib, y_calib)

    # Step 3: Generate OOF predictions for the calibration set
    _, oof_pis = mapie_cqr.predict_interval(X_calib)
    oof_preds_lower[calib_idx] = oof_pis[:, 0, 0]
    oof_preds_upper[calib_idx] = oof_pis[:, 1, 0]

    fold_score = winkler_score_func(y_calib, oof_pis[:, 0, 0], oof_pis[:, 1, 0])
    fold_scores.append(fold_score)
    print(f"Fold {fold+1} Winkler Score: {fold_score:,.2f}")

    # Step 4: Generate predictions for the test set and accumulate them
    print("Predicting on test data...")
    _, test_pis = mapie_cqr.predict_interval(X_test)
    test_preds_lower_sum += test_pis[:, 0, 0]
    test_preds_upper_sum += test_pis[:, 1, 0]



--- Fold 1/5 ---
Fitting lower, median, and upper models...




Conformalizing models...
Fold 1 Winkler Score: 326,238.01
Predicting on test data...

--- Fold 2/5 ---
Fitting lower, median, and upper models...
Conformalizing models...
Fold 2 Winkler Score: 324,707.61
Predicting on test data...

--- Fold 3/5 ---
Fitting lower, median, and upper models...
Conformalizing models...
Fold 3 Winkler Score: 331,748.56
Predicting on test data...

--- Fold 4/5 ---
Fitting lower, median, and upper models...
Conformalizing models...
Fold 4 Winkler Score: 324,312.12
Predicting on test data...

--- Fold 5/5 ---
Fitting lower, median, and upper models...
Conformalizing models...
Fold 5 Winkler Score: 327,849.28
Predicting on test data...


In [16]:
# --- 4. Final Evaluation and Submission ---
print("\n--- Phase 3: Final Evaluation and Submission ---")

overall_oof_score = winkler_score_func(y, oof_preds_lower, oof_preds_upper)
print(f"\nFold Scores: {[f'{s:,.2f}' for s in fold_scores]}")
print(f"Overall OOF Winkler Score: {overall_oof_score:,.2f}")


--- Phase 3: Final Evaluation and Submission ---

Fold Scores: ['326,238.01', '324,707.61', '331,748.56', '324,312.12', '327,849.28']
Overall OOF Winkler Score: 326,971.11


In [17]:
test_preds_lower = test_preds_lower_sum / CFG.N_SPLITS
test_preds_upper = test_preds_upper_sum / CFG.N_SPLITS

In [18]:
submission_df = pd.DataFrame({
    'id': test_df_raw['id'],
    'pi_lower': test_preds_lower,
    'pi_upper': test_preds_upper
})
submission_df['pi_lower'] = np.minimum(submission_df['pi_lower'], submission_df['pi_upper'])
submission_df.to_csv(os.path.join(CFG.OUTPUT_PATH, 'submission_baseline_cqr_prefit.csv'), index=False)

In [19]:
print("\nSubmission file 'submission_baseline_cqr_prefit.csv' has been created.")
print(submission_df.head())


Submission file 'submission_baseline_cqr_prefit.csv' has been created.
       id       pi_lower      pi_upper
0  200000  789256.343558  1.039679e+06
1  200001  530110.526735  8.082509e+05
2  200002  448209.170878  6.818468e+05
3  200003  294700.918156  4.347813e+05
4  200004  398325.817184  7.019698e+05
